## Necessary Imports

In [None]:
import pandas as pd
import numpy as np
import os
import sys
import nltk
# Download the NLTK tagger resource; presumably this is the model that the
# nltk.pos_tag call in parse_data relies on (confirm for the installed NLTK
# version, newer releases may expect a differently named resource).
nltk.download('averaged_perceptron_tagger')

## Helper Functions for Data Cleaning


In [None]:
def merge_words(df):
    """Group a token-per-row frame into space-joined sentences and tag strings.

    Rows are read in order from the ``word`` and ``label`` columns. A row whose
    ``word`` stringifies to ``"nan"`` (i.e. a missing value) closes the current
    sentence. Tokens that follow the last separator row are emitted as a final
    sentence instead of being dropped.

    Args:
        df: DataFrame with ``word`` and ``label`` columns, one token per row.
            The index is not used, so it may be non-contiguous or duplicated.

    Returns:
        A ``(sentences, tags)`` tuple of two equal-length lists.
        ``sentences[k]`` is the k-th sentence with its words joined by single
        spaces; ``tags[k]`` is the matching labels joined the same way. Two
        consecutive separator rows yield an empty string in both lists.
    """
    sentences = []
    tags = []

    sentence = []
    labels = []
    # Walk the two columns positionally: this avoids building a row Series per
    # token via ``df.loc[i]`` and does not require a 0..n-1 integer index.
    for word, label in zip(df["word"], df["label"]):
        # Same separator test as before: a float NaN stringifies to "nan".
        # (A literal string "nan" is therefore also treated as a separator.)
        if str(word) != "nan":
            sentence.append(word)
            labels.append(label)
        else:
            sentences.append(" ".join(sentence))
            tags.append(" ".join(labels))
            sentence = []
            labels = []

    # The input may not end with a separator row; flush whatever is pending so
    # the last sentence is not lost.
    if sentence:
        sentences.append(" ".join(sentence))
        tags.append(" ".join(labels))
    return sentences, tags

In [None]:
def split_sentences(df):
    """Explode a sentence-per-row frame back into one row per token.

    Each row's ``Sentence``, ``Tag`` and ``POS`` strings are split on
    whitespace, and the tokens of all rows are laid out one per output row.
    Every token is labelled ``"Sentence: <k>"`` where ``k`` is the zero-based
    position of its source row in ``df``.

    Args:
        df: DataFrame with string columns ``Sentence``, ``Tag`` and ``POS``.
            The three strings of a row are expected to split into the same
            number of tokens; otherwise the output columns differ in length
            and pandas raises ``ValueError`` when they are assembled.

    Returns:
        DataFrame with columns ``Sentence Number``, ``Word``, ``Tag`` and
        ``POS`` (in that order), one row per token.
    """
    words = []
    tags = []
    pos_tags = []
    sentence_ids = []
    # Extend the flat lists directly instead of collecting nested lists and
    # flattening them afterwards; this needs no itertools (which this file
    # never imports) and iterates positionally rather than through df.loc[i].
    rows = zip(df["Sentence"], df["Tag"], df["POS"])
    for i, (sentence, tag, pos) in enumerate(rows):
        sentence_tokens = sentence.split()
        words.extend(sentence_tokens)
        tags.extend(tag.split())
        pos_tags.extend(pos.split())
        # One identical sentence label per word of this sentence.
        sentence_ids.extend(len(sentence_tokens) * ["Sentence: %s" % i])

    df_final = pd.DataFrame()
    df_final["Sentence Number"] = sentence_ids
    df_final["Word"] = words
    df_final["Tag"] = tags
    df_final["POS"] = pos_tags
    return df_final
   

In [90]:
 
def parse_data(df):
    """Turn a raw token/label frame into a token-level frame with POS tags.

    The tokens are first merged into sentences with ``merge_words``, each
    sentence is tagged with ``nltk.pos_tag``, and the result is expanded back
    to one row per token with ``split_sentences``.

    Args:
        df: frame accepted by ``merge_words`` (``word`` and ``label`` columns).

    Returns:
        The frame produced by ``split_sentences``.
    """
    def _pos_string(text):
        # Keep only the last element of every item nltk.pos_tag returns
        # (the tag) and join the tags with single spaces.
        tagged = nltk.pos_tag(text.split())
        return " ".join(item[-1] for item in tagged)

    merged_sentences, merged_tags = merge_words(df)

    per_sentence = pd.DataFrame()
    per_sentence["Sentence"] = merged_sentences
    per_sentence["Tag"] = merged_tags
    per_sentence["POS"] = per_sentence.Sentence.apply(_pos_string)

    return split_sentences(per_sentence)

## Data Processing

In [102]:
DATA_PATH = "/home/ilke/Desktop/ner-interview-question/data/"

# Read options shared by the four tab-separated, header-less BIO files.
_BIO_READ_KWARGS = {
    "sep": "\t",
    "header": None,
    # Blank lines must survive as all-NaN rows: merge_words uses a NaN word as
    # the sentence boundary.
    "skip_blank_lines": False,
    # By default pandas also parses tokens such as "null", "nan", "n/a" or
    # "NA" as missing values, which would look like sentence boundaries.
    # Restrict missing-value detection to genuinely empty fields.
    "keep_default_na": False,
    "na_values": [""],
}
# NOTE(review): if tokens can contain a double-quote character, the reader may
# additionally need quoting=csv.QUOTE_NONE; verify against the data files.
engtrain = pd.read_csv("%sengtrain.bio.txt" % DATA_PATH, **_BIO_READ_KWARGS)
engtest = pd.read_csv("%sengtest.bio.txt" % DATA_PATH, **_BIO_READ_KWARGS)
trivia_train = pd.read_csv("%strivia10k13train.bio.txt" % DATA_PATH, **_BIO_READ_KWARGS)
trivia_test = pd.read_csv("%strivia10k13test.bio.txt" % DATA_PATH, **_BIO_READ_KWARGS)

In [105]:
# The files were read without a header; name the two columns of every frame
# (first column: label, second column: word).
for _frame in (engtrain, engtest, trivia_train, trivia_test):
    _frame.columns = ["label", "word"]

## Cross Check Datasets for Merging

In [107]:
# Distinct label values in the trivia training set.
trivia_train["label"].unique()

array(['B-Actor', 'I-Actor', 'O', 'B-Plot', 'I-Plot', 'B-Opinion',
       'I-Opinion', nan, 'B-Award', 'I-Award', 'B-Year', 'B-Genre',
       'B-Origin', 'I-Origin', 'B-Director', 'I-Director', 'I-Genre',
       'I-Year', 'B-Soundtrack', 'I-Soundtrack', 'B-Relationship',
       'I-Relationship', 'B-Character_Name', 'I-Character_Name',
       'B-Quote', 'I-Quote'], dtype=object)

In [108]:
# Distinct label values in the eng training set, for comparison with trivia.
engtrain["label"].unique()

array(['O', 'B-ACTOR', 'I-ACTOR', nan, 'B-YEAR', 'B-TITLE', 'B-GENRE',
       'I-GENRE', 'B-DIRECTOR', 'I-DIRECTOR', 'B-SONG', 'I-SONG',
       'B-PLOT', 'I-PLOT', 'B-REVIEW', 'B-CHARACTER', 'I-CHARACTER',
       'B-RATING', 'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'I-TITLE',
       'I-RATING', 'B-TRAILER', 'I-TRAILER', 'I-REVIEW', 'I-YEAR'],
      dtype=object)

In [111]:
# Rename the trivia-specific character and soundtrack labels to the names the
# eng* sets use, so both sources share one label per entity type.
_TRIVIA_LABEL_RENAMES = {
    "B-Character_Name": "B-CHARACTER",
    "I-Character_Name": "I-CHARACTER",
    "B-Soundtrack": "B-SONG",
    "I-Soundtrack": "I-SONG",
}
trivia_train["label"] = trivia_train["label"].replace(_TRIVIA_LABEL_RENAMES)
trivia_test["label"] = trivia_test["label"].replace(_TRIVIA_LABEL_RENAMES)

## Merge Datasets

In [113]:
# Stack the eng and trivia frames row-wise, eng rows first, once for the
# training split and once for the test split.
train, test = (
    pd.concat([eng_part, trivia_part])
    for eng_part, trivia_part in ((engtrain, trivia_train), (engtest, trivia_test))
)

In [128]:
# Give both merged frames a fresh 0..n-1 index in place; the previous index
# is kept as an extra "index" column.
for _merged in (train, test):
    _merged.reset_index(inplace=True)

## Parse Datasets for Training

In [129]:
# Build the token-level training frame (sentence number, word, tag, POS).
train_final = parse_data(train)

In [130]:
# Build the token-level test frame the same way as the training frame.
test_final = parse_data(test)

## Finishing Touches

In [131]:
# Preview the first rows of the parsed training frame.
train_final.head()

Unnamed: 0,Sentence Number,Word,Tag,POS
0,Sentence: 0,what,O,WP
1,Sentence: 0,movies,O,NNS
2,Sentence: 0,star,O,VBP
3,Sentence: 0,bruce,B-ACTOR,NN
4,Sentence: 0,willis,I-ACTOR,NN


In [133]:
# Upper-case every tag so the mixed-case trivia labels (e.g. "B-Actor") and
# the upper-case eng labels (e.g. "B-ACTOR") collapse into one spelling.
for _parsed in (train_final, test_final):
    _parsed["Tag"] = _parsed["Tag"].str.upper()

In [134]:
# Distinct tags remaining in the training frame after upper-casing.
train_final["Tag"].unique()

array(['O', 'B-ACTOR', 'I-ACTOR', 'B-YEAR', 'B-TITLE', 'B-GENRE',
       'I-GENRE', 'B-DIRECTOR', 'I-DIRECTOR', 'B-SONG', 'I-SONG',
       'B-PLOT', 'I-PLOT', 'B-REVIEW', 'B-CHARACTER', 'I-CHARACTER',
       'B-RATING', 'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'I-TITLE',
       'I-RATING', 'B-TRAILER', 'I-TRAILER', 'I-REVIEW', 'I-YEAR',
       'B-OPINION', 'I-OPINION', 'B-AWARD', 'I-AWARD', 'B-ORIGIN',
       'I-ORIGIN', 'B-RELATIONSHIP', 'I-RELATIONSHIP', 'B-QUOTE',
       'I-QUOTE'], dtype=object)

In [135]:
# Fold the RATINGS_AVERAGE tags into the plain RATING tags in both splits.
_RATING_TAG_RENAMES = {
    "B-RATINGS_AVERAGE": "B-RATING",
    "I-RATINGS_AVERAGE": "I-RATING",
}
train_final["Tag"] = train_final["Tag"].replace(_RATING_TAG_RENAMES)
test_final["Tag"] = test_final["Tag"].replace(_RATING_TAG_RENAMES)

In [136]:
# Write both parsed frames as Excel workbooks next to the input data.
_exports = (
    (train_final, "%sner_train.xlsx" % DATA_PATH),
    (test_final, "%sner_test.xlsx" % DATA_PATH),
)
for _parsed, _target in _exports:
    _parsed.to_excel(_target)