In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import io
import sys

In [4]:
from collections import Counter

In [5]:
def read_conll(filepath):
    all_toks = []
    all_cats = []
    sents = []
    with io.open(filepath, encoding='utf-8') as ip:
        sent = []
        for line in ip:
            if line == '\n':
                sents.append(sent)
                sent = []
            else:
                line = line.strip()
                token = line.split('\t')[0]
                cat = line.split('\t')[1]
                all_toks.append(token)
                all_cats.append(cat)
                sent.append((token, cat))
    return sents, all_toks, all_cats

def get_tags_ents(all_cats):
    tags = []
    ents = []
    for cat in set(all_cats):
        parts = cat.split('-')
        tags.append(parts[0])
        if len(parts) > 1:
            ents.append(parts[1])
        
    return set(tags), set(ents)

In [8]:
print('processing train file...')
train_sents, train_all_toks, train_all_cats = read_conll('../../../resources/data/NER/twitter_wnut16/train')

print('processing validation file...')
valid_sents, valid_all_toks, valid_all_cats = read_conll('../../../resources/data/NER/twitter_wnut16/dev')

print('processing testing file...')
test_sents, test_all_toks, test_all_cats = read_conll('../../../resources/data/NER/twitter_wnut16/test')

processing train file...
processing validation file...
processing testing file...


In [9]:
print('# training sentences: ', len(train_sents))
print('# validation sentences: ', len(valid_sents))
print('# testing sentences: ', len(test_sents))

# training sentences:  2394
# validation sentences:  1000
# testing sentences:  3856


In [10]:
print('# unique training tokens: ', len(set(train_all_toks)))
print('# unique_validation tokens: ', len(set(valid_all_toks)))
print('# unique_testing tokens: ', len(set(test_all_toks)))

# unique training tokens:  10586
# unique_validation tokens:  6255
# unique_testing tokens:  18320


In [11]:
print('# training categories: ', len(set(train_all_cats)))
print('# validation categories: ', len(set(valid_all_cats)))
print('# testing categories: ', len(set(test_all_cats)))

# training categories:  21
# validation categories:  20
# testing categories:  21


In [12]:
train_tags, train_ents = get_tags_ents(train_all_cats)
print('training tags, entities: ', train_tags, train_ents, len(train_ents))

valid_tags, valid_ents = get_tags_ents(valid_all_cats)
print('validation tags, entities: ', valid_tags, valid_ents, len(valid_ents))

test_tags, test_ents = get_tags_ents(test_all_cats)
print('training tags, entities: ', test_tags, test_ents, len(test_ents))

training tags, entities:  {'O', 'B', 'I'} {'sportsteam', 'movie', 'person', 'musicartist', 'geo', 'tvshow', 'other', 'company', 'facility', 'product'} 10
validation tags, entities:  {'O', 'B', 'I'} {'sportsteam', 'movie', 'person', 'musicartist', 'geo', 'tvshow', 'company', 'facility', 'other', 'product'} 10
training tags, entities:  {'O', 'B', 'I'} {'sportsteam', 'movie', 'person', 'musicartist', 'geo', 'tvshow', 'other', 'company', 'facility', 'product'} 10


In [13]:
print('training distribution')
print(Counter(train_all_cats))
print('\n')

print('validation distribution')
print(Counter(valid_all_cats))
print('\n')

print('testing distribution')
print(Counter(test_all_cats))
print('\n')

training distribution
Counter({'O': 44007, 'B-person': 449, 'I-other': 320, 'B-geo-loc': 276, 'B-other': 225, 'I-person': 215, 'B-company': 171, 'I-facility': 105, 'B-facility': 104, 'B-product': 97, 'I-product': 80, 'I-musicartist': 61, 'B-musicartist': 55, 'B-sportsteam': 51, 'I-geo-loc': 49, 'I-movie': 46, 'I-company': 36, 'B-movie': 34, 'B-tvshow': 34, 'I-tvshow': 31, 'I-sportsteam': 23})


validation distribution
Counter({'O': 15133, 'B-person': 171, 'B-other': 132, 'I-product': 121, 'B-geo-loc': 116, 'I-other': 97, 'I-person': 95, 'B-sportsteam': 70, 'I-geo-loc': 42, 'B-musicartist': 41, 'I-facility': 39, 'B-company': 39, 'B-facility': 38, 'B-product': 37, 'I-musicartist': 35, 'B-movie': 15, 'I-movie': 15, 'I-sportsteam': 13, 'I-company': 10, 'B-tvshow': 2})


testing distribution
Counter({'O': 55953, 'B-geo-loc': 882, 'B-company': 621, 'B-other': 584, 'I-other': 556, 'I-product': 500, 'B-person': 482, 'I-facility': 366, 'I-person': 300, 'I-company': 265, 'B-facility': 253, 'B-pr