In [1]:
# Destination of the relabeled CoNLL training data that a later cell writes out.
TRAIN_FILE_PATH = '/tmp/eng.train2'
# Location of a tagged CoNLL file; later cells read this same location to
# post-process it and to merge it with another tagger's output.
TAG_FILE_PATH = '/tmp/my_eng2.testa'
# Directory handed to parse_brown_clusters_directory to load the Brown cluster files.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
BROWN_DIR = '/Users/konix/Workspace/nertagger/resources/brown-clusters'

from collections import defaultdict, Counter
from nertagger.parser import parse_conll_tagged, format_conll_tagged, parse_conll_train, format_conll_train
from nertagger.word import parsed_documents_to_words, words_to_parsed_documents
from nertagger.annotators.brown_clusters import parse_brown_clusters_directory

In [2]:
# Load the CoNLL training file and flatten the parsed documents into a word list.
# The context manager guarantees the file handle is closed once the data is read.
with open('/Users/konix/Workspace/nertagger/data/eng.train', 'rb') as train_file:
    raw_data = train_file.read()
training_words = parsed_documents_to_words(parse_conll_train(raw_data))

# Pick one specific Brown clustering out of everything found in BROWN_DIR.
all_clusters_dict = parse_brown_clusters_directory(BROWN_DIR)
rcv1_dict = all_clusters_dict['brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt']

# For every cluster, count the gold labels of the non-'O' training words in it.
# NOTE(review): the keys are full gold labels (e.g. 'I-MISC' and 'B-MISC'), so one
# entity type can be split over two keys - confirm whether purity should be
# measured on the bare type instead.
cluster_to_type_histogram = defaultdict(Counter)
for word_ in training_words:
    if word_.gold_label != 'O':
        # First element of the cluster entry is used as the cluster id;
        # words missing from the clustering are skipped.
        word_cluster = rcv1_dict[word_.text][0] if word_.text in rcv1_dict else None
        if word_cluster:
            cluster_to_type_histogram[word_cluster][word_.gold_label] += 1

# A cluster is only judged when it has more than this many labeled words ...
MIN_LABELED_WORDS_FOR_PURITY = 20
# ... and it counts as "pure" when its most common label exceeds this share.
PURE_MAJORITY_FRACTION = 0.9

cluster_to_purity = {}
for cluster, cluster_type_histogram in cluster_to_type_histogram.iteritems():
    all_count = sum(cluster_type_histogram.values())
    common_count = cluster_type_histogram.most_common(1)[0][1]
    if all_count > MIN_LABELED_WORDS_FOR_PURITY:
        # float() keeps this a true division under Python 2.
        common_fraction = common_count / float(all_count)
        cluster_to_purity[cluster] = common_fraction > PURE_MAJORITY_FRACTION

In [3]:
import random

def convert_entity_type(entity):
    """Return the entity type to use for ``entity`` in the relabeled training data.

    The type is returned unchanged, except that a 'MISC' entity whose first
    word belongs to a cluster flagged as pure in ``cluster_to_purity`` gets
    the suffix 'pure' appended.
    """
    original_type = entity.entity_type
    head_text = entity.words[0].text

    # Cluster id of the entity's first word, or None when the word is unknown.
    if head_text in rcv1_dict:
        head_cluster = rcv1_dict[head_text][0]
    else:
        head_cluster = None

    is_pure_misc = (
        head_cluster
        and cluster_to_purity.get(head_cluster, False)
        and original_type == 'MISC'
    )
    if not is_pure_misc:
        return original_type
    return original_type + 'pure'

In [4]:
class Entity(object):
    """A group of word objects treated as a single named entity.

    ``words`` is the sequence of word objects; ``tag_attr`` names the word
    attribute that holds the label from which the entity type is read.
    """

    def __init__(self, words, tag_attr='gold_label'):
        self._tag_attr = tag_attr
        self.words = words

    @property
    def entity_type(self):
        """Label of the first word with its two leading characters removed."""
        head_label = getattr(self.words[0], self._tag_attr)
        return head_label[2:]

    @property
    def text(self):
        """Texts of all words joined by single spaces."""
        return ' '.join(token.text for token in self.words)

    def __repr__(self):
        return str(self)

    def __str__(self):
        return "<%s: '%s'>" % (self.entity_type, self.text)


def _get_word_tag(word, tag_attr='gold_label'):
    return getattr(word, tag_attr)[0] if word is not None else None

def _get_word_type(word, tag_attr='gold_label'):
    if word is None:
        return None
    word_tag = getattr(word, tag_attr)
    return None if word_tag == 'O' else word_tag[2:]

def get_entity_list(word_list, tag_attr='gold_label'):
    """Group the labeled words of ``word_list`` into ``Entity`` objects.

    A word continues the currently open entity only when its label starts
    with 'I' and its type equals the type of the previous word in the same
    sentence; any other word closes the open entity. Every word whose label
    does not start with 'O' is added to the (possibly new) open entity.

    Args:
        word_list: iterable of word objects exposing ``sentence``,
            ``sentence_index`` and the attribute named by ``tag_attr``.
        tag_attr: name of the word attribute holding the label; defaults to
            'gold_label', which is what the function always used before.

    Returns:
        List of ``Entity`` objects in order of appearance.
    """
    entity_list = []
    current_entity_words = []
    for word in word_list:
        word_tag = _get_word_tag(word, tag_attr)
        word_type = _get_word_type(word, tag_attr)

        # The first word of a sentence has no predecessor, so an entity can
        # never continue across a sentence boundary.
        prev_word = None if word.sentence_index == 0 else word.sentence[word.sentence_index - 1]
        prev_word_type = _get_word_type(prev_word, tag_attr)

        if word_tag != 'I' or word_type != prev_word_type:
            if current_entity_words:
                entity_list.append(Entity(current_entity_words, tag_attr=tag_attr))
                current_entity_words = []
        if word_tag != 'O':
            current_entity_words.append(word)

    # Flush the entity that is still open when the input ends; without this
    # an entity made of the final words of ``word_list`` would be lost.
    if current_entity_words:
        entity_list.append(Entity(current_entity_words, tag_attr=tag_attr))
    return entity_list

In [None]:
# Re-read the training file so that the in-place relabeling below works on
# freshly parsed word objects.
with open('/Users/konix/Workspace/nertagger/data/eng.train', 'rb') as train_file:
    raw_data = train_file.read()
training_words = parsed_documents_to_words(parse_conll_train(raw_data))
entity_list = get_entity_list(training_words)

# Clear the predicted tag on every training word.
for word_ in training_words:
    word_.tag = None

# Rewrite the gold label of every word of an entity so that it carries the
# converted entity type (the label prefix before the type is left untouched).
for entity in entity_list:
    entity_type = entity.entity_type
    new_entity_type = convert_entity_type(entity)
    for word in entity.words:
        word.gold_label = word.gold_label.replace(entity_type, new_entity_type)

converted_raw_data = format_conll_train(words_to_parsed_documents(training_words))
# Closing the handle explicitly makes sure the data is flushed to disk
# before anything else reads TRAIN_FILE_PATH.
with open(TRAIN_FILE_PATH, 'wb') as converted_file:
    converted_file.write(converted_raw_data)

In [7]:
import re
# Matches a space followed by a B-/I- label of one of the four entity types
# carrying the 'pure' suffix.
pure_re = re.compile(' [BI]-(?:PER|ORG|LOC|MISC)pure')

# NOTE(review): `y` is reset here and rebound by another cell; the purpose of
# this reset is not evident from this cell - confirm it is still needed.
y = None
def pure_repl(match):
    """Replacement callback: return the full matched text prefixed with 'pure'."""
    matched_text = match.group(0)
    return 'pure' + matched_text

# Map the split 'MISCpure' label back to plain 'MISC' in the tagged file, in place.
# The file is fully read and its handle closed before it is reopened for
# writing, and the path comes from TAG_FILE_PATH instead of a repeated literal.
with open(TAG_FILE_PATH, 'rb') as tagged_file:
    x = tagged_file.read()
# Disabled alternative that would rewrite every '...pure' label via the regex:
# x = pure_re.sub(pure_repl, x)
x = x.replace('MISCpure', 'MISC')
with open(TAG_FILE_PATH, 'wb') as tagged_file:
    tagged_file.write(x)

In [8]:
# Load the output of the two taggers as flat word lists.
with open('/Users/konix/Workspace/nertagger/data/my_eng.testa.l1', 'rb') as normal_file:
    raw_data = normal_file.read()
normal_tagger_words = parsed_documents_to_words(parse_conll_tagged(raw_data))
with open(TAG_FILE_PATH, 'rb') as split_file:
    raw_data = split_file.read()
split_tagger_words = parsed_documents_to_words(parse_conll_tagged(raw_data))

# zip() stops at the shorter list, so a length mismatch would silently leave
# the tail of the normal tagger's output unmerged; fail loudly instead.
assert len(normal_tagger_words) == len(split_tagger_words), \
    'tagger outputs differ in length: %d vs %d' % (
        len(normal_tagger_words), len(split_tagger_words))

# PER decisions are taken from the split tagger only: copy its tag wherever it
# predicts PER, and drop a PER predicted solely by the normal tagger to 'O'.
# All other tags of the normal tagger are kept.
for normal_word, split_word in zip(normal_tagger_words, split_tagger_words):
    if 'PER' in split_word.tag:
        normal_word.tag = split_word.tag
    elif 'PER' in normal_word.tag:
        normal_word.tag = 'O'

merged_raw_data = format_conll_tagged(words_to_parsed_documents(normal_tagger_words))
with open('/tmp/blabla.a', 'wb') as merged_file:
    merged_file.write(merged_raw_data)

In [None]:
# Show the number of words in each tagger's output side by side.
tuple(len(words) for words in (normal_tagger_words, split_tagger_words))

In [None]:
# NOTE(review): `word` is not assigned in this cell; the only visible binding is
# the loop variable of a `for word in entity.words` loop in another cell, which
# holds a single word object. Presumably a word list was meant to be indexed
# here - confirm the intended variable or remove this scratch cell.
word[0].tag

In [10]:
# Keep the training words whose gold label mentions MISC but not 'pure'.
x = []
for word_ in training_words:
    if 'pure' in word_.gold_label or 'MISC' not in word_.gold_label:
        continue
    x.append(word_)

In [25]:
# Count the training words whose text is one of these four names
# (each matching word contributes True, i.e. 1, to the sum).
sum([z.text in ['Ferrari', 'McLaren', 'Leeds', 'Wimbledon'] for z in training_words])

65

In [12]:
# Distinct texts of the words collected in `x`.
y = set()
for x_ in x:
    y.add(x_.text)

In [24]:
# Group the texts in `y` by cluster, keeping only clusters whose label
# histogram holds more than 20 labeled words.
x = defaultdict(list)
for y_ in y:
    y_cluster = rcv1_dict[y_][0] if y_ in rcv1_dict else None
    # .get() rather than [] : indexing the defaultdict would insert an empty
    # Counter for any cluster it has not seen, silently growing the shared
    # cluster_to_type_histogram while we are only reading from it.
    y_cluster_type_hist = cluster_to_type_histogram.get(y_cluster) if y_cluster else None
    if y_cluster and y_cluster_type_hist and sum(y_cluster_type_hist.values()) > 20:
        x[y_cluster].append(y_)

# Print each retained cluster with its label histogram, followed by its
# member texts, one per tab-indented line.
for y_cluster, y_list in x.iteritems():
    # Every key of `x` passed the histogram check above, so it is present here.
    y_cluster_type_hist = cluster_to_type_histogram[y_cluster]
    print('%s %s' % (y_cluster, y_cluster_type_hist))
    for y_ in y_list:
        print('\t' + y_)

1101100011 Counter({'I-LOC': 21, 'I-MISC': 6, 'I-ORG': 6, 'I-PER': 6})
	Indian-ruled
	Pakistan-ruled
	British-ruled
	Palestinian-ruled
	Shabwa
1101100010 Counter({'I-PER': 31, 'I-MISC': 7, 'I-LOC': 5, 'I-ORG': 4})
	Serb-controlled
	Kurdish-controlled
	Serb-held
111111010 Counter({'I-MISC': 35, 'B-MISC': 1})
	100
	310
	95
	154
	215
	150
	4.0
11011011110 Counter({'I-MISC': 32, 'I-LOC': 11, 'I-ORG': 8, 'I-PER': 1, 'B-MISC': 1})
	Communist-led
	Arab
	Baltic
	Himalayan
	Islamic
	Andean
11101101111111 Counter({'I-ORG': 1126, 'I-LOC': 239, 'I-PER': 233, 'I-MISC': 26})
	Ferrari
	McLaren
	Leeds
	Wimbledon
11101101111110 Counter({'I-ORG': 18, 'I-LOC': 3, 'I-MISC': 1})
	Congress
1101100110 Counter({'I-LOC': 95, 'I-ORG': 49, 'I-MISC': 8})
	New
111011111010 Counter({'I-PER': 83, 'I-ORG': 54, 'I-LOC': 15, 'I-MISC': 11})
	du
	et
	de
	al
1101011010 Counter({'I-MISC': 36, 'I-ORG': 33, 'I-PER': 9, 'I-LOC': 8})
	Apertura
	PGA
	Bundesliga
	Aryan
	Test
	Supercup
	Tour
	Subaru
11100000 Counter({'I-LOC': 20,