In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import io
import sys

In [4]:
from collections import Counter

In [5]:
def read_conll(filepath):
    """Read a space-separated CoNLL-style file of ``token tag`` lines.

    Every non-blank line is split on single spaces; the first field is taken
    as the token and the second as its tag (any further fields are ignored).
    A blank or whitespace-only line closes the current sentence.

    Lines that carry no tag field are printed and skipped instead of raising
    ``IndexError``, and a final sentence that is not followed by a blank line
    is still returned.

    Args:
        filepath: Path of the file to read. It is decoded as ISO-8859-1.

    Returns:
        A tuple ``(sents, all_toks, all_cats)``:
          * ``sents`` -- list of sentences, each a list of ``(token, tag)``
            tuples. Consecutive blank lines yield empty sentences.
          * ``all_toks`` -- flat list of every token, in file order.
          * ``all_cats`` -- flat list of every tag, aligned with ``all_toks``.
    """
    all_toks = []
    all_cats = []
    sents = []
    with io.open(filepath, encoding='ISO-8859-1') as ip:
        sent = []
        for raw_line in ip:
            line = raw_line.strip()
            if not line:
                # Sentence boundary. Whitespace-only lines are treated like
                # blank ones; they used to fall through and crash below.
                sents.append(sent)
                sent = []
                continue
            parts = line.split(' ')
            if len(parts) < 2:
                # No tag column: report the offending line and move on
                # rather than failing on parts[1].
                print(line)
                continue
            token, cat = parts[0], parts[1]
            all_toks.append(token)
            all_cats.append(cat)
            sent.append((token, cat))
        if sent:
            # The file ended without a trailing blank line; keep the
            # sentence that was still being collected.
            sents.append(sent)
    return sents, all_toks, all_cats

def get_tags_ents(all_cats):
    """Collect the distinct tag prefixes and entity types from a list of labels.

    Each label is split at its first hyphen only: the text before it is the
    tag prefix and everything after it is the entity type, so entity types
    that themselves contain hyphens (e.g. ``B-WORK-OF-ART``) are kept whole.
    Labels without a hyphen (e.g. ``O``) contribute a prefix but no entity.

    Args:
        all_cats: Iterable of label strings such as ``'B-PER'`` or ``'O'``.

    Returns:
        A tuple ``(tags, ents)`` of two sets: the distinct prefixes and the
        distinct entity types.
    """
    tags = set()
    ents = set()
    for cat in set(all_cats):
        prefix, hyphen, entity = cat.partition('-')
        tags.add(prefix)
        if hyphen:
            # Only labels that actually contain a hyphen name an entity type.
            ents.add(entity)
    return tags, ents

In [7]:
# Parse the wikigold data file into sentences plus flat token and tag lists.
# NOTE(review): the path is relative to the notebook's working directory --
# confirm it resolves before running this from another location.
print('processing train file...')
train_sents, train_all_toks, train_all_cats = read_conll('../../../resources/data/NER/wikigold/data.txt')

processing train file...


In [8]:
# Summary statistics for the parsed training data.
print('# training sentences: ', len(train_sents))
# Distinct token strings (compared exactly, so case-sensitive).
print('# unique training tokens: ', len(set(train_all_toks)))
# Distinct full labels, e.g. 'B-PER' and 'I-PER' count separately.
print('# training categories: ', len(set(train_all_cats)))
# Split the labels into their tag prefixes and entity types.
train_tags, train_ents = get_tags_ents(train_all_cats)
print('training tags, entities: ', train_tags, train_ents, len(train_ents))
# Per-label token counts.
print('training distribution')
print(Counter(train_all_cats))
print('\n')

# training sentences:  1695
# unique training tokens:  8504
# training categories:  9
training tags, entities:  {'O', 'I', 'B'} {'LOC', 'MISC', 'PER', 'ORG'} 4
training distribution
Counter({'O': 32576, 'I-ORG': 1066, 'B-LOC': 1011, 'B-PER': 932, 'B-ORG': 892, 'B-MISC': 706, 'I-PER': 702, 'I-MISC': 686, 'I-LOC': 436})




In [7]:
# Display the parsed sentences as lists of (token, tag) tuples.
# NOTE(review): this renders the entire list; consider slicing
# (e.g. the first few sentences) to keep the saved output small.
train_sents

[[('010', 'B-MISC'),
  ('is', 'O'),
  ('the', 'O'),
  ('tenth', 'O'),
  ('album', 'O'),
  ('from', 'O'),
  ('Japanese', 'B-MISC'),
  ('Punk', 'O'),
  ('Techno', 'O'),
  ('band', 'O'),
  ('The', 'B-ORG'),
  ('Mad', 'I-ORG'),
  ('Capsule', 'I-ORG'),
  ('Markets', 'I-ORG'),
  ('.', 'O')],
 [('This', 'O'),
  ('album', 'O'),
  ('proved', 'O'),
  ('to', 'O'),
  ('be', 'O'),
  ('more', 'O'),
  ('commercial', 'O'),
  ('and', 'O'),
  ('more', 'O'),
  ('techno-based', 'O'),
  ('than', 'O'),
  ('Osc-Dis', 'B-MISC'),
  (',', 'O'),
  ('with', 'O'),
  ('heavily', 'O'),
  ('synthesized', 'O'),
  ('songs', 'O'),
  ('like', 'O'),
  ('Introduction', 'B-MISC'),
  ('010', 'I-MISC'),
  ('and', 'O'),
  ('Come', 'B-MISC'),
  ('.', 'O')],
 [('Founding', 'O'),
  ('member', 'O'),
  ('Kojima', 'B-PER'),
  ('Minoru', 'I-PER'),
  ('played', 'O'),
  ('guitar', 'O'),
  ('on', 'O'),
  ('Good', 'B-MISC'),
  ('Day', 'I-MISC'),
  (',', 'O'),
  ('and', 'O'),
  ('Wardanceis', 'B-MISC'),
  ('cover', 'O'),
  ('of', 'O'),
  