In [1]:
import pandas as pd
import nltk
import io

In [2]:
def file_reader(filename):
    """Stream the lines of a .bio file with explicit sentence boundary rows.

    Yields a ``<START>`` marker row first, then every non-blank line of the
    file unchanged. Each blank line (a bare newline) is replaced by an
    ``<END>`` marker row immediately followed by a new ``<START>`` marker row.

    Args:
        filename: path of the tab-separated file to read.

    Yields:
        str: marker rows and the original lines, in file order.
    """
    start_row = '<START>\t<START>\n'
    boundary_rows = '<END>\t<END>\n' + start_row
    with open(filename) as handle:
        yield start_row
        for raw_line in handle:
            # A bare newline separates two sentences in the input file.
            is_separator = raw_line in ('', '\n')
            yield boundary_rows if is_separator else raw_line

def _add_context_features(frame):
    """Return a copy of ``frame`` with neighbouring-word feature columns added.

    Adds three columns derived from the ``word`` column:

    * ``prevword``: the word of the preceding row,
    * ``postword``: the word of the following row,
    * ``feature_set``: a dict of the three word features, the input format
      expected by the nltk classifiers.

    The first and last rows have no neighbour, so they wrap around: the first
    row's ``prevword`` is the frame's own last word and the last row's
    ``postword`` is the frame's own first word.

    Args:
        frame: DataFrame with at least a ``word`` column and at least one row.

    Returns:
        A new DataFrame; the input is not modified.
    """
    # Work on a copy so columns are never assigned onto an ``iloc`` slice.
    frame = frame.copy()
    words = frame['word']
    # ``fill_value`` must be a scalar, and it must come from the frame being
    # processed (the test frame previously borrowed the training frame's rows).
    frame['prevword'] = words.shift(1, fill_value=words.iloc[-1])
    frame['postword'] = words.shift(-1, fill_value=words.iloc[0])
    # column with data in appropriate format for nltk
    frame['feature_set'] = frame.apply(
        lambda x: {'word': x['word'], 'prevword': x['prevword'], 'postword': x['postword']},
        axis=1,
    )
    return frame


# load the data
# keep_default_na=False: tokens such as "null", "nan" or "n/a" are ordinary
# words here and must stay strings instead of being parsed as missing values.
data = io.StringIO(''.join(file_reader('trivia10k13train.bio')))
df = pd.read_csv(data, sep='\t', header=None, names=['label', 'word'], keep_default_na=False)
# Drop the dangling <START> row that file_reader emits after the final blank line.
# NOTE(review): assumes the file ends with a blank line -- TODO confirm.
df = _add_context_features(df.iloc[:-1])

# do same for test set
data = io.StringIO(''.join(file_reader('trivia10k13test.bio')))
df_test = pd.read_csv(data, sep='\t', header=None, names=['label', 'word'], keep_default_na=False)
df_test = _add_context_features(df_test.iloc[:-1])


In [29]:
# Preview the first rows of the training frame.
# NOTE(review): the saved output below already contains a `feature_set_embed`
# column, which is only created further down the notebook, and this cell's
# execution count is out of sequence -- the output presumably comes from an
# out-of-order run and will differ after Restart & Run All.
df.head()

Unnamed: 0,label,word,prevword,postword,feature_set,feature_set_embed
0,<START>,<START>,<END>,steve,"{'word': '<START>', 'prevword': '<END>', 'post...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,B-Actor,steve,<START>,mcqueen,"{'word': 'steve', 'prevword': '<START>', 'post...","[0.3767693, -0.46156234, -0.14322689, -0.19135..."
2,I-Actor,mcqueen,steve,provided,"{'word': 'mcqueen', 'prevword': 'steve', 'post...","[0.043367274, -0.046646018, -0.021394843, -0.0..."
3,O,provided,mcqueen,a,"{'word': 'provided', 'prevword': 'mcqueen', 'p...","[-0.020892624, -0.08252325, -0.17344192, -0.06..."
4,O,a,provided,thrilling,"{'word': 'a', 'prevword': 'provided', 'postwor...","[-1.5991454, 1.7220784, -0.26430577, -1.092318..."


In [3]:
# nltk classifiers consume (feature_dict, label) pairs
train_set = [(features, label) for features, label in zip(df['feature_set'], df['label'])]
test_set = [(features, label) for features, label in zip(df_test['feature_set'], df_test['label'])]

# Fit Naive Bayes on the training pairs, then label every test token
clf = nltk.NaiveBayesClassifier.train(train_set)
predictions = clf.classify_many(df_test['feature_set'])

In [8]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

# Support-weighted scores for the Naive Bayes predictions.
# NOTE(review): the test labels include the synthetic '<START>'/'<END>' marker
# rows, so those rows are counted in every score below.
y_true = df_test['label'].to_list()
weighted = {'average': 'weighted'}

f1 = f1_score(y_true, predictions, **weighted)
prec = precision_score(y_true, predictions, **weighted)
recall = recall_score(y_true, predictions, **weighted)
accuracy = accuracy_score(y_true, predictions)
print("f1:", f1, "| precision:", prec, "| recall:", recall, "| accuracy:", accuracy)

f1: 0.7898015166767505 | precision: 0.7916808840683861 | recall: 0.8051512540462495 | accuracy: 0.8051512540462495


In [12]:
# Confusion matrix of gold test labels against the Naive Bayes predictions,
# shown as percentages and limited to the 10 most frequent labels.
gold_labels = df_test['label'].to_list()
cm = nltk.ConfusionMatrix(gold_labels, predictions)
cm_report = cm.pretty_format(truncate=10, show_percents=True, sort_by_count=True)
print(cm_report)

         |                                                       I               |
         |                           <             I      B      -      B        |
         |      I                    S      B      -      -      O      -      B |
         |      -             <      T      -      A      A      r      G      - |
         |      P             E      A      P      c      c      i      e      Y |
         |      l             N      R      l      t      t      g      n      e |
         |      o             D      T      o      o      o      i      r      a |
         |      t      O      >      >      t      r      r      n      e      r |
---------+-----------------------------------------------------------------------+
  I-Plot | <29.9%>  2.7%   0.0%   0.0%   0.7%   0.1%   0.1%   0.2%   0.1%   0.1% |
       O |   4.3% <27.2%>  0.0%   0.0%   0.6%   0.1%   0.1%   0.2%   0.2%   0.1% |
   <END> |      .      .  <4.5%>     .      .      .      .      .      .      . |
 <ST

In [15]:
from nltk.metrics.scores import (precision, recall, f_measure)

# Fit a decision tree and a Maximum Entropy classifier on the same training
# pairs and report each one's accuracy on the test pairs.
clf_dt = nltk.DecisionTreeClassifier.train(train_set)
dt_accuracy = nltk.classify.accuracy(clf_dt, test_set)
print("Decision tree accuracy:", dt_accuracy)

# NOTE(review): the saved training log below ends with a `nan` log likelihood;
# worth checking before relying on this model.
clf_me = nltk.MaxentClassifier.train(train_set)
me_accuracy = nltk.classify.accuracy(clf_me, test_set)
print("Maxent accuracy:", me_accuracy)

Decision tree accuracy: 0.7413660604084674
  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -3.29584        0.320
             2          -0.76494        0.836


  exp_nf_delta = 2 ** nf_delta
  sum1 = numpy.sum(exp_nf_delta * A, axis=0)
  sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)


         Final               nan        0.864
Maxent accuracy: 0.8086444190866537


In [16]:
# Distinct labels present in the test set. As the output below shows, this
# includes the synthetic '<START>'/'<END>' sentence markers alongside the
# B-/I-/O entity tags.
df_test['label'].unique()

array(['<START>', 'O', 'B-Plot', 'I-Plot', '<END>', 'B-Genre', 'I-Genre',
       'B-Opinion', 'B-Relationship', 'B-Director', 'I-Director',
       'B-Actor', 'I-Actor', 'B-Origin', 'I-Origin', 'B-Year', 'B-Award',
       'I-Award', 'B-Quote', 'I-Quote', 'B-Character_Name',
       'I-Character_Name', 'I-Opinion', 'I-Year', 'I-Relationship',
       'B-Soundtrack', 'I-Soundtrack'], dtype=object)

# Part 2 - Using word embeddings as features

In [21]:
import gensim
from nltk.corpus import movie_reviews
# Make sure the NLTK resources are available locally. Per the log below, an
# already-installed package is reported as up to date rather than re-fetched.
nltk.download('movie_reviews')
nltk.download('punkt')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/rob/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /home/rob/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
# generate word embeddings based on movie reviews corpus
# slow - commented out
#model = gensim.models.Word2Vec(movie_reviews.sents())
#model.save('movie.embedding')


In [30]:
# load model saved earlier
# 'movie.embedding' is the file written by the commented-out Word2Vec training
# cell above; the relative path means it is looked up in the working directory.
new_model = gensim.models.Word2Vec.load('movie.embedding')

In [23]:
import numpy as np
# Kept for any cell that still references it; tryembed no longer returns this
# shared array (see below).
zeros = np.zeros(100)

def tryembed(word, model=None):
    """Look up the embedding vector of ``word``, falling back to zeros.

    Args:
        word: token to look up in the model's vocabulary.
        model: object exposing a ``wv`` keyed-vector store with
            ``__getitem__`` and ``vector_size``. Defaults to the notebook-level
            ``new_model``.

    Returns:
        The stored vector for ``word``; if the word cannot be looked up, a
        fresh all-zero vector with the model's own dimensionality.
    """
    if model is None:
        model = new_model
    vectors = model.wv
    try:
        return vectors[word]
    except (KeyError, TypeError):
        # KeyError: out-of-vocabulary word. TypeError: a key that is not a
        # valid token at all (e.g. a NaN float produced by CSV parsing).
        # Any other error is a real problem and is allowed to propagate.
        # A fresh array per call avoids every unknown word aliasing one
        # shared mutable buffer, and its size follows the model instead of a
        # hard-coded 100.
        return np.zeros(vectors.vector_size)
    
def _embedding_columns(frame):
    """Embed the 'word' of every row of ``frame``.

    Returns:
        A pair ``(vectors, matrix)``: a Series holding one embedding array per
        row, and a DataFrame with those arrays expanded into numeric columns.
    """
    vectors = frame.apply(lambda row: tryembed(row['word']), axis=1)
    matrix = pd.DataFrame(vectors.values.tolist())
    return vectors, matrix


# Word embeddings replace the word-identity features used so far
train_vectors, features_embed = _embedding_columns(df)
df['feature_set_embed'] = train_vectors

# Copy first so the new column is written to an independent test frame
df_test = df_test.copy()
test_vectors, features_embed_test = _embedding_columns(df_test)
df_test['feature_set_embed'] = test_vectors

In [24]:
# Widen each feature matrix with the embeddings of the previous and the
# following row; rows without a neighbour get zeros.
# NOTE(review): `features_embed` / `features_embed_test` are overwritten with
# their widened versions, so running this cell twice widens them again.
features_embed_prev, features_embed_post = (
    features_embed.shift(periods=offset, fill_value=0) for offset in (1, -1)
)
features_embed = pd.concat(
    [features_embed, features_embed_prev, features_embed_post],
    axis=1,
)

features_embed_prev_test, features_embed_post_test = (
    features_embed_test.shift(periods=offset, fill_value=0) for offset in (1, -1)
)
features_embed_test = pd.concat(
    [features_embed_test, features_embed_prev_test, features_embed_post_test],
    axis=1,
)

In [26]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the embedding features; the fixed random_state keeps runs repeatable
rf = RandomForestClassifier(random_state=0).fit(features_embed, df['label'])
predictions_rf = rf.predict(features_embed_test)
rf_accuracy = accuracy_score(df_test['label'], predictions_rf)
print("Random forest accuracy:", rf_accuracy)



Random forest accuracy: 0.7998882187187071


In [117]:
# sample of actual test labels vs. predicted test labels
#list(zip(df_test['label'][:100], predictions[:100]))

# Part 3 - Sequence model (using previous predicted label)

In [119]:
def _features_with_prevlabel(row):
    """Build the nltk feature dict for one row, including the previous label."""
    return {
        'word': row['word'],
        'prevword': row['prevword'],
        'postword': row['postword'],
        'prevlabel': row['prevlabel'],
    }


df2 = df.copy()
df2_test = df_test.copy()

# add feature for previous word label
# The first row has no predecessor, so it wraps around to the last row's
# label. The fill value must be a scalar taken from the 'label' column; the
# previous positional lookup read column 1, which is 'word'.
df2['prevlabel'] = df2['label'].shift(1, fill_value=df2['label'].iloc[-1])
df2['feature_set'] = df2.apply(_features_with_prevlabel, axis=1)
train_set2 = list(zip(df2['feature_set'], df2['label']))

# Test rows start with a placeholder previous label; the real value is only
# known once the preceding token has been predicted.
df2_test['prevlabel'] = '<END>' #placeholder
df2_test['feature_set'] = df2_test.apply(_features_with_prevlabel, axis=1)
# test_set2 holds the very same dict objects as df2_test['feature_set'], so
# in-place edits to those dicts are visible through both.
test_set2 = list(zip(df2_test['feature_set'], df2_test['label']))

#train NB model
clf2 = nltk.NaiveBayesClassifier.train(train_set2)


In [121]:
# Preview the training frame with the new `prevlabel` column: each row carries
# the label of the row before it, and row 0 shows '<END>' in the output below.
df2.head()

Unnamed: 0,label,word,prevword,postword,feature_set,feature_set_embed,prevlabel
0,<START>,<START>,<END>,steve,"{'word': '<START>', 'prevword': '<END>', 'post...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",<END>
1,B-Actor,steve,<START>,mcqueen,"{'word': 'steve', 'prevword': '<START>', 'post...","[0.3767693, -0.46156234, -0.14322689, -0.19135...",<START>
2,I-Actor,mcqueen,steve,provided,"{'word': 'mcqueen', 'prevword': 'steve', 'post...","[0.043367274, -0.046646018, -0.021394843, -0.0...",B-Actor
3,O,provided,mcqueen,a,"{'word': 'provided', 'prevword': 'mcqueen', 'p...","[-0.020892624, -0.08252325, -0.17344192, -0.06...",I-Actor
4,O,a,provided,thrilling,"{'word': 'a', 'prevword': 'provided', 'postwor...","[-1.5991454, 1.7220784, -0.26430577, -1.092318...",O


In [122]:
# Greedy left-to-right decoding: each token is classified using the label
# predicted for the token before it.
# The feature dicts are the same objects stored in df2_test['feature_set'],
# so updating 'prevlabel' here updates the frame's dicts in place. The
# separate `prevlabel` column is not touched and keeps its placeholder.
feature_dicts = df2_test['feature_set'].tolist()
predicted_labels = []
for position, features in enumerate(feature_dicts):
    if position > 0:
        # The first token keeps the '<END>' placeholder as its previous label.
        features['prevlabel'] = predicted_labels[-1]
    predicted_labels.append(clf2.classify(features))

# One column assignment instead of one `.loc` write per row.
df2_test['predicted'] = predicted_labels

df2_test.head(10)

Unnamed: 0,label,word,prevword,postword,feature_set,feature_set_embed,prevlabel,predicted
0,<START>,<START>,<END>,i,"{'word': '<START>', 'prevword': '<END>', 'post...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",<END>,<START>
1,O,i,<START>,need,"{'word': 'i', 'prevword': '<START>', 'postword...","[-1.4769813, -0.8428824, -1.3867565, 0.8732244...",<END>,O
2,O,need,i,that,"{'word': 'need', 'prevword': 'i', 'postword': ...","[-0.20602795, -0.68958354, -1.057523, 0.941065...",<END>,O
3,O,that,need,movie,"{'word': 'that', 'prevword': 'need', 'postword...","[-1.1211042, 0.47837305, -0.71875197, -0.16998...",<END>,O
4,O,movie,that,which,"{'word': 'movie', 'prevword': 'that', 'postwor...","[-0.50112474, 0.3242098, -1.7734414, -0.276343...",<END>,O
5,O,which,movie,involves,"{'word': 'which', 'prevword': 'movie', 'postwo...","[-1.2074406, 1.5997456, -0.63699865, -0.219116...",<END>,O
6,O,involves,which,aliens,"{'word': 'involves', 'prevword': 'which', 'pos...","[0.38131714, -0.05572253, 0.14211194, -0.44351...",<END>,O
7,B-Plot,aliens,involves,invading,"{'word': 'aliens', 'prevword': 'involves', 'po...","[-0.28205568, 0.020349815, 0.11402833, 0.14295...",<END>,B-Plot
8,I-Plot,invading,aliens,earth,"{'word': 'invading', 'prevword': 'aliens', 'po...","[-0.031538054, 0.0007638845, -0.0256259, -0.05...",<END>,I-Plot
9,I-Plot,earth,invading,in,"{'word': 'earth', 'prevword': 'invading', 'pos...","[-1.1734735, 0.2517501, -0.35981947, 0.4552904...",<END>,I-Plot


In [123]:
# Token-level accuracy of the sequential Naive Bayes tagger on the test set
gold_sequence = df2_test['label'].to_list()
predicted_sequence = df2_test['predicted'].to_list()
accuracy_clf2 = accuracy_score(gold_sequence, predicted_sequence)
print(accuracy_clf2)

0.8123704617964184


In [125]:
# sample of actual test labels vs. predicted test labels
#list(zip(df_test['label'][:100],df2_test['predicted'].to_list()[:100], predictions[:100]))

In [126]:
# Confusion matrix of gold test labels against the sequential predictions,
# shown as percentages and limited to the 14 most frequent labels.
gold_labels = df_test['label'].to_list()
sequence_predictions = df2_test['predicted'].to_list()
cm2 = nltk.ConfusionMatrix(gold_labels, sequence_predictions)
cm2_report = cm2.pretty_format(truncate=14, show_percents=True, sort_by_count=True)
print(cm2_report)

           |                                                                                   B      I        |
           |                                                                                   -      -        |
           |                                                       I                           D      D        |
           |                           <             I      B      -      B             I      i      i      I |
           |      I                    S      B      -      -      O      -      B      -      r      r      - |
           |      -             <      T      -      A      A      r      G      -      G      e      e      Q |
           |      P             E      A      P      c      c      i      e      Y      e      c      c      u |
           |      l             N      R      l      t      t      g      n      e      n      t      t      o |
           |      o             D      T      o      o      o      i      r      a      r      o