In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import io
import sys

In [2]:
from collections import Counter

In [6]:
def read_conll(filepath):
    """Read a CoNLL-style, space-separated token/label file.

    The file is decoded as ISO-8859-1. Every non-blank line is split on a
    single space; the first field is the token and the second field is its
    category label. Blank (or whitespace-only) lines mark sentence boundaries
    and lines whose first field is ``-DOCSTART-`` are ignored.

    Parameters
    ----------
    filepath : str
        Path of the file to read.

    Returns
    -------
    sents : list of list of (token, cat) tuples
        One inner list per non-empty sentence, in file order.
    all_toks : list of str
        Every token in file order (flattened over sentences).
    all_cats : list of str
        The category of every token, aligned with ``all_toks``.

    Notes
    -----
    Lines that carry fewer than two fields are reported with ``print`` and
    skipped instead of aborting the whole read with an ``IndexError``.
    """
    all_toks = []
    all_cats = []
    sents = []
    sent = []
    with io.open(filepath, encoding='ISO-8859-1') as ip:
        for line_no, line in enumerate(ip, start=1):
            line = line.strip()
            if not line:
                # Sentence boundary. Only flush when something was collected so
                # that repeated blank lines do not yield empty sentences.
                if sent:
                    sents.append(sent)
                    sent = []
                continue
            parts = line.split(' ')
            token = parts[0]
            if token == '-DOCSTART-':
                # Document marker, with or without trailing columns.
                continue
            if len(parts) < 2:
                print('skipping malformed line {}: {!r}'.format(line_no, line))
                continue
            cat = parts[1]
            all_toks.append(token)
            all_cats.append(cat)
            sent.append((token, cat))
    # Flush the last sentence when the file does not end with a blank line.
    if sent:
        sents.append(sent)
    return sents, all_toks, all_cats

def get_tags_ents(all_cats):
    """Collect the distinct tag prefixes and entity types used in the labels.

    Each distinct label is split on '-'. The first piece is recorded as a tag;
    when a second piece exists it is recorded as an entity type.

    Parameters
    ----------
    all_cats : iterable of str
        Category labels, e.g. 'O' or 'B-geo'. Duplicates are allowed.

    Returns
    -------
    (set of str, set of str)
        The set of tags and the set of entity types.
    """
    split_labels = [label.split('-') for label in set(all_cats)]
    tag_set = {pieces[0] for pieces in split_labels}
    # Labels without a '-' (such as 'O') contribute no entity type.
    ent_set = {pieces[1] for pieces in split_labels if len(pieces) > 1}
    return tag_set, ent_set

In [7]:
# Directory that holds the train/valid/test split files. Kept in one place so
# the location only has to be changed once.
GMB_DIR = '../../../resources/data/NER/GMB'

# Each call returns (sentences, flat token list, flat category list).
print('processing train file...')
train_sents, train_all_toks, train_all_cats = read_conll(os.path.join(GMB_DIR, 'train.txt'))
print('processing valid file...')
valid_sents, valid_all_toks, valid_all_cats = read_conll(os.path.join(GMB_DIR, 'valid.txt'))
print('processing test file...')
test_sents, test_all_toks, test_all_cats = read_conll(os.path.join(GMB_DIR, 'test.txt'))

processing train file...
processing valid file...
processing test file...


In [14]:
# Summarise the training split: sizes, label inventory and label frequencies.
train_tags, train_ents = get_tags_ents(train_all_cats)
train_label_counts = Counter(train_all_cats)

print('# training sentences: ', len(train_sents))
print('# unique training tokens: ', len(set(train_all_toks)))
# A Counter has one key per distinct label, so its length is the label count.
print('# training categories: ', len(train_label_counts))
print('training tags, entities: ', train_tags, train_ents, len(train_ents))
print('training distribution')
print(train_label_counts)
print('\n')

# training sentences:  47958
# unique training tokens:  35178
# training categories:  17
training tags, entities:  {'B', 'I', 'O'} {'per', 'tim', 'art', 'nat', 'geo', 'gpe', 'eve', 'org'} 8
training distribution
Counter({'O': 887908, 'B-geo': 37644, 'B-tim': 20333, 'B-org': 20143, 'I-per': 17251, 'B-per': 16990, 'I-org': 16784, 'B-gpe': 15870, 'I-geo': 7414, 'I-tim': 6528, 'B-art': 402, 'B-eve': 308, 'I-art': 297, 'I-eve': 253, 'B-nat': 201, 'I-gpe': 198, 'I-nat': 51})




In [8]:
# Summarise the validation split: sizes, label inventory and label frequencies.
valid_tags, valid_ents = get_tags_ents(valid_all_cats)
valid_label_counts = Counter(valid_all_cats)

print('# validation sentences: ', len(valid_sents))
print('# unique validation tokens: ', len(set(valid_all_toks)))
# A Counter has one key per distinct label, so its length is the label count.
print('# validation categories: ', len(valid_label_counts))
print('validation tags, entities: ', valid_tags, valid_ents, len(valid_ents))
print('validation distribution')
print(valid_label_counts)
print('\n')

# validation sentences:  4795
# unique validation tokens:  11982
# validation categories:  17
validation tags, entities:  {'O', 'B', 'I'} {'gpe', 'tim', 'nat', 'org', 'eve', 'per', 'geo', 'art'} 8
validation distribution
Counter({'O': 89054, 'B-geo': 3780, 'B-tim': 2083, 'B-org': 1932, 'I-per': 1745, 'B-per': 1676, 'I-org': 1644, 'B-gpe': 1641, 'I-geo': 806, 'I-tim': 672, 'B-art': 46, 'I-art': 40, 'B-nat': 31, 'B-eve': 29, 'I-eve': 15, 'I-gpe': 14, 'I-nat': 10})




In [9]:
# Summarise the testing split: sizes, label inventory and label frequencies.
print('# testing sentences: ', len(test_sents))
print('# unique testing tokens: ', len(set(test_all_toks)))
print('# testing categories: ', len(set(test_all_cats)))
test_tags, test_ents = get_tags_ents(test_all_cats)
# These two labels describe the test split, so they are worded "testing".
print('testing tags, entities: ', test_tags, test_ents, len(test_ents))
print('testing distribution')
print(Counter(test_all_cats))
print('\n')

# testing sentences:  9591
# unique testing tokens:  16872
# testing categories:  17
training tags, entities:  {'O', 'B', 'I'} {'gpe', 'tim', 'nat', 'org', 'eve', 'per', 'geo', 'art'} 8
training distribution
Counter({'O': 177390, 'B-geo': 7500, 'B-org': 4047, 'B-tim': 3984, 'I-per': 3495, 'I-org': 3486, 'B-per': 3459, 'B-gpe': 3200, 'I-geo': 1553, 'I-tim': 1230, 'B-art': 84, 'B-eve': 58, 'I-art': 57, 'I-eve': 48, 'B-nat': 45, 'I-gpe': 40, 'I-nat': 11})




In [10]:
# Load the GMB NER dataset CSV into a DataFrame, decoding the file as ISO-8859-1.
data = pd.read_csv('../../../resources/data/NER/GMB/ner_dataset.csv', encoding='ISO-8859-1')

In [11]:
# Preview the first rows to check the column layout of the CSV.
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [12]:
# List the distinct labels that occur in the 'Tag' column.
data['Tag'].unique()

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

In [13]:
# List the distinct values of 'Sentence #'. The preview above shows the label
# only on the first row of a sentence, which is why NaN appears in the result.
data['Sentence #'].unique()

array(['Sentence: 1', nan, 'Sentence: 2', ..., 'Sentence: 47957',
       'Sentence: 47958', 'Sentence: 47959'], dtype=object)