In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import io
import sys

In [13]:
from collections import Counter

In [22]:
def read_conll(filepath):
    all_toks = []
    all_cats = []
    sents = []
    with io.open(filepath, encoding='utf-8') as ip:
        sent = []
        for line in ip:
            if line == '-DOCSTART-\n':
                continue
            if line == '\n':
                sents.append(sent)
                sent = []
            else:
                line = line.strip()
                parts = line.split()
                #print(parts)
                token = parts[0]
                if not len(parts)>1:
                    print(line)
                cat = line.split()[1]
                all_toks.append(token)
                all_cats.append(cat)
                sent.append((token, cat))
    return sents, all_toks, all_cats

def get_tags_ents(all_cats):
    tags = []
    ents = []
    for cat in set(all_cats):
        parts = cat.split('-')
        tags.append(parts[0])
        if len(parts) > 1:
            ents.append(parts[1])
        
    return set(tags), set(ents)

In [15]:
print('processing train file...')
train_sents, train_all_toks, train_all_cats = read_conll('../../../resources/data/NER/wikigold/train.txt')

print('processing validation file...')
valid_sents, valid_all_toks, valid_all_cats = read_conll('../../../resources/data/NER/wikigold/valid.txt')

print('processing testing file...')
test_sents, test_all_toks, test_all_cats = read_conll('../../../resources/data/NER/wikigold/test.txt')

processing train file...
processing validation file...
processing testing file...


In [16]:
print('# training sentences: ', len(train_sents))
print('# validation sentences: ', len(valid_sents))
print('# testing sentences: ', len(test_sents))

# training sentences:  1187
# validation sentences:  169
# testing sentences:  339


In [18]:
print('# unique training tokens: ', len(set(train_all_toks)))
print('# unique_validation tokens: ', len(set(valid_all_toks)))
print('# unique_testing tokens: ', len(set(test_all_toks)))

# unique training tokens:  6877
# unique_validation tokens:  1801
# unique_testing tokens:  2829


In [19]:
print('# training categories: ', len(set(train_all_cats)))
print('# validation categories: ', len(set(valid_all_cats)))
print('# testing categories: ', len(set(test_all_cats)))

# training categories:  16
# validation categories:  11
# testing categories:  11


In [20]:
train_tags, train_ents = get_tags_ents(train_all_cats)
print('training tags, entities: ', train_tags, train_ents, len(train_ents))

valid_tags, valid_ents = get_tags_ents(valid_all_cats)
print('validation tags, entities: ', valid_tags, valid_ents, len(valid_ents))

test_tags, test_ents = get_tags_ents(test_all_cats)
print('training tags, entities: ', test_tags, test_ents, len(test_ents))

training tags, entities:  {'Â\x84ski', 'Â\x82awa', 'B', 'Â\x82aw', 'Â\x9fti', 'Ã\x8eÂ³Ã\x8eÂºÃ\x8eÂ¿Ã\x8eÂ¹Ã\x8eÂ½Ã\x8fÂ\x89Ã\x8eÂ½Ã\x8eÂ¹Ã\x8fÂ\x8eÃ\x8eÂ½', 'O', 'ibenik', 't', 'I'} {'ORG', 'Knin', 'MISC', 'PER', 'LOC'} 5
validation tags, entities:  {'Â\x84', 'B', 'I', 'O', 'Â\x81Ã\x84Â\x99czyca'} {'MISC', 'ORG', 'LOC', 'PER'} 4
training tags, entities:  {'Â\x84', 'Â\x9fescu', 'B', 'O', 'I'} {'MISC', 'ORG', 'LOC', 'PER'} 4


In [9]:
print('training distribution')
print(Counter(train_all_cats))
print('\n')

print('validation distribution')
print(Counter(valid_all_cats))
print('\n')

print('testing distribution')
print(Counter(test_all_cats))
print('\n')

training distribution
Counter({'O': 22486, 'I-ORG': 768, 'B-LOC': 684, 'B-PER': 644, 'B-ORG': 626, 'I-PER': 493, 'B-MISC': 482, 'I-MISC': 471, 'I-LOC': 295, 'Â\x82aw': 2, 'Â\x82awa': 1, 'Â\x84ski': 1, 't': 1, 'Ã\x8eÂ³Ã\x8eÂºÃ\x8eÂ¿Ã\x8eÂ¹Ã\x8eÂ½Ã\x8fÂ\x89Ã\x8eÂ½Ã\x8eÂ¹Ã\x8fÂ\x8eÃ\x8eÂ½': 1, 'Â\x9fti': 1, 'ibenik-Knin': 1})


validation distribution
Counter({'O': 3593, 'B-LOC': 107, 'I-ORG': 97, 'B-PER': 91, 'B-ORG': 90, 'I-PER': 73, 'B-MISC': 66, 'I-MISC': 63, 'I-LOC': 60, 'Â\x81Ã\x84Â\x99czyca': 1, 'Â\x84': 1})


testing distribution
Counter({'O': 6491, 'B-LOC': 217, 'I-ORG': 199, 'B-PER': 193, 'B-ORG': 176, 'B-MISC': 157, 'I-MISC': 152, 'I-PER': 133, 'I-LOC': 80, 'Â\x84': 1, 'Â\x9fescu': 1})


