In [1]:
import csv
import io

import nltk
import pandas as pd

In [2]:
def file_reader(filename):
    """Stream a BIO file with explicit sentence-boundary rows added.

    Yields a `<START>` row first, then every line of `filename` unchanged,
    except that each blank line is replaced by an `<END>` row followed by a
    new `<START>` row. All yielded rows are tab-separated and end with a
    newline, so the joined output can be parsed as a two-column table.

    Note that nothing is emitted at end of file: the final sentence is only
    closed if the file itself ends with a blank line.
    """
    sentence_break = '<END>\t<END>\n<START>\t<START>\n'
    with open(filename) as handle:
        yield '<START>\t<START>\n'
        for raw in handle:
            # A bare newline separates two sentences; anything else is a token row.
            yield sentence_break if raw == '\n' else raw

def _load_bio_frame(filename):
    """Load one BIO file into a token-level frame with context features.

    Parameters
    ----------
    filename : str
        Path of a tab-separated `label<TAB>word` file with blank lines
        between sentences (it is read through `file_reader`).

    Returns
    -------
    pandas.DataFrame
        One row per token (including the `<START>`/`<END>` marker rows) with
        columns `label`, `word`, `prevword`, `postword` and `feature_set`,
        where `feature_set` is the feature dict handed to the NLTK classifiers.
    """
    buffer = io.StringIO(''.join(file_reader(filename)))
    frame = pd.read_csv(
        buffer,
        sep='\t',
        header=None,
        names=['label', 'word'],
        # Read tokens literally: a `"` token must not open a quoted field and
        # tokens such as "null" or "nan" must not be converted to NaN.
        quoting=csv.QUOTE_NONE,
        keep_default_na=False,
    )

    # file_reader opens a new <START> after every blank line, so when the file
    # ends with a blank line the last row is a dangling <START>; drop it.
    # (Assumes the file does end with a blank line - otherwise this drops the
    # final token.) The copy avoids assigning new columns onto a slice.
    frame = frame.iloc[:-1].copy()

    # Neighbouring words, wrapping around at the edges of the frame. The fill
    # values must be scalars taken from this frame, not one-element Series
    # and not values borrowed from another frame.
    words = frame['word']
    frame['prevword'] = words.shift(1, fill_value=words.iloc[-1])
    frame['postword'] = words.shift(-1, fill_value=words.iloc[0])

    # Feature dicts in the format the NLTK classifiers expect.
    frame['feature_set'] = [
        {'word': word, 'prevword': before, 'postword': after}
        for word, before, after in zip(words, frame['prevword'], frame['postword'])
    ]
    return frame


# load the training data and, with identical preprocessing, the test data
df = _load_bio_frame('trivia10k13train.bio')
df_test = _load_bio_frame('trivia10k13test.bio')


In [3]:
# NLTK trainers consume (feature_dict, label) pairs, one per token.
train_set = [(feats, tag) for feats, tag in zip(df['feature_set'], df['label'])]
test_set = [(feats, tag) for feats, tag in zip(df_test['feature_set'], df_test['label'])]

# Fit Naive Bayes on the training pairs, then label every test token.
clf = nltk.NaiveBayesClassifier.train(train_set)
predictions = clf.classify_many(df_test['feature_set'])

In [8]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

# Gold labels of the test tokens, extracted once and shared by all metrics.
# NOTE: the synthetic <START>/<END> rows are part of df_test and are scored too.
y_true = df_test['label'].to_list()

# Support-weighted averages over all labels for the Naive Bayes predictions.
f1, prec, recall = (
    scorer(y_true, predictions, average='weighted')
    for scorer in (f1_score, precision_score, recall_score)
)
accuracy = accuracy_score(y_true, predictions)
print("f1:", f1, "| precision:", prec, "| recall:", recall, "| accuracy:", accuracy)

f1: 0.7898015166767505 | precision: 0.7916808840683861 | recall: 0.8051512540462495 | accuracy: 0.8051512540462495


In [12]:
# Confusion matrix of gold vs. Naive Bayes labels, limited to the 10 most
# frequent labels and shown as percentages.
gold_labels = df_test['label'].to_list()
cm = nltk.ConfusionMatrix(gold_labels, predictions)
print(cm.pretty_format(truncate=10, sort_by_count=True, show_percents=True))

         |                                                       I               |
         |                           <             I      B      -      B        |
         |      I                    S      B      -      -      O      -      B |
         |      -             <      T      -      A      A      r      G      - |
         |      P             E      A      P      c      c      i      e      Y |
         |      l             N      R      l      t      t      g      n      e |
         |      o             D      T      o      o      o      i      r      a |
         |      t      O      >      >      t      r      r      n      e      r |
---------+-----------------------------------------------------------------------+
  I-Plot | <29.9%>  2.7%   0.0%   0.0%   0.7%   0.1%   0.1%   0.2%   0.1%   0.1% |
       O |   4.3% <27.2%>  0.0%   0.0%   0.6%   0.1%   0.1%   0.2%   0.2%   0.1% |
   <END> |      .      .  <4.5%>     .      .      .      .      .      .      . |
 <ST

In [15]:
from nltk.metrics.scores import (precision, recall, f_measure)

# Train a decision tree and a maximum entropy classifier on the same
# (feature_dict, label) pairs and report their token accuracy on the test set.
clf_dt = nltk.DecisionTreeClassifier.train(train_set)
dt_accuracy = nltk.classify.accuracy(clf_dt, test_set)
print("Decision tree accuracy:", dt_accuracy)

# Maxent training prints its own per-iteration log before the accuracy line.
clf_me = nltk.MaxentClassifier.train(train_set)
me_accuracy = nltk.classify.accuracy(clf_me, test_set)
print("Maxent accuracy:", me_accuracy)

Decision tree accuracy: 0.7413660604084674
  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -3.29584        0.320
             2          -0.76494        0.836


  exp_nf_delta = 2 ** nf_delta
  sum1 = numpy.sum(exp_nf_delta * A, axis=0)
  sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)


         Final               nan        0.864
Maxent accuracy: 0.8086444190866537


In [16]:
# Distinct labels found in the test set, in order of first appearance
# (entity tags plus the <START>/<END> boundary markers).
pd.unique(df_test['label'])

array(['<START>', 'O', 'B-Plot', 'I-Plot', '<END>', 'B-Genre', 'I-Genre',
       'B-Opinion', 'B-Relationship', 'B-Director', 'I-Director',
       'B-Actor', 'I-Actor', 'B-Origin', 'I-Origin', 'B-Year', 'B-Award',
       'I-Award', 'B-Quote', 'I-Quote', 'B-Character_Name',
       'I-Character_Name', 'I-Opinion', 'I-Year', 'I-Relationship',
       'B-Soundtrack', 'I-Soundtrack'], dtype=object)

In [21]:
import gensim
from nltk.corpus import movie_reviews
# Fetch the NLTK resources needed for the embedding step.
# Corpus used by the (commented-out) Word2Vec training further down.
nltk.download('movie_reviews')
# NOTE(review): presumably the sentence tokenizer behind movie_reviews.sents() - confirm.
nltk.download('punkt')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/rob/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /home/rob/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
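# Generate word embeddings from the NLTK movie_reviews corpus.
# Training is slow, so the two lines below stay commented out; run them once
# to (re)build the 'movie.embedding' file that is loaded afterwards.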
#model = gensim.models.Word2Vec(movie_reviews.sents())
#model.save('movie.embedding')


In [22]:
# Load the cached Word2Vec model. If the cache file does not exist yet (for
# example on a fresh checkout), train it on the movie_reviews corpus once and
# save it, so the notebook runs top to bottom without manual steps.
try:
    new_model = gensim.models.Word2Vec.load('movie.embedding')
except FileNotFoundError:
    new_model = gensim.models.Word2Vec(movie_reviews.sents())
    new_model.save('movie.embedding')

In [23]:
import numpy as np
# Legacy all-zero vector; tryembed no longer uses it, but the name is kept in
# case other cells still refer to it.
zeros = np.zeros(100)


def tryembed(word, model=None):
    """Return the embedding of `word`, or a zero vector when it has none.

    Parameters
    ----------
    word : object
        Token to look up. Non-string values (for example NaN from a parsed
        file) are treated like unknown words.
    model : optional
        Embedding model exposing `.wv[...]` and `.vector_size`. Defaults to
        the notebook-level `new_model`.

    Returns
    -------
    The stored vector for `word`, or a fresh all-zero numpy array of length
    `model.vector_size` if the word cannot be looked up.
    """
    embedder = new_model if model is None else model
    try:
        return embedder.wv[word]
    except (KeyError, TypeError):
        # KeyError: word is not in the vocabulary; TypeError: word is not a
        # valid key at all. Any other error is a real problem and propagates.
        return np.zeros(embedder.vector_size)
    
# Look up one embedding per token (tryembed supplies zeros for unknown words),
# then expand the per-row vectors into a numeric frame with one column per
# embedding dimension.
df['feature_set_embed'] = df['word'].map(tryembed)
features_embed = pd.DataFrame(df['feature_set_embed'].to_list())

# Same for the test set; the copy makes df_test an independent frame before
# a new column is added to it.
df_test = df_test.copy()
df_test['feature_set_embed'] = df_test['word'].map(tryembed)
features_embed_test = pd.DataFrame(df_test['feature_set_embed'].to_list())

In [24]:
def _neighbour_frames(frame):
    """Return (previous-row, next-row) copies of `frame`, zero-filled at the edges."""
    return frame.shift(1, fill_value=0), frame.shift(-1, fill_value=0)


# Widen each token's features with the embeddings of the previous and the
# following token: [current | previous | next].
# NOTE: this re-binds features_embed / features_embed_test, so running the
# cell a second time would widen the already widened frames again.
features_embed_prev, features_embed_post = _neighbour_frames(features_embed)
features_embed = pd.concat(
    (features_embed, features_embed_prev, features_embed_post), axis=1
)

features_embed_prev_test, features_embed_post_test = _neighbour_frames(features_embed_test)
features_embed_test = pd.concat(
    (features_embed_test, features_embed_prev_test, features_embed_post_test), axis=1
)

In [26]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the embedding features; the fixed seed keeps runs repeatable.
rf = RandomForestClassifier(random_state=0).fit(features_embed, df['label'])
predictions_rf = rf.predict(features_embed_test)
rf_accuracy = accuracy_score(df_test['label'], predictions_rf)
print("Random forest accuracy:", rf_accuracy)



Random forest accuracy: 0.7998882187187071


In [28]:
# (gold label, Naive Bayes prediction) pairs for the first 500 test tokens
list(zip(df_test['label'].head(500), predictions[:500]))

[('<START>', '<START>'),
 ('O', 'O'),
 ('O', 'O'),
 ('O', 'O'),
 ('O', 'O'),
 ('O', 'O'),
 ('O', 'O'),
 ('B-Plot', 'B-Plot'),
 ('I-Plot', 'I-Plot'),
 ('I-Plot', 'I-Plot'),
 ('I-Plot', 'I-Plot'),
 ('I-Plot', 'I-Plot'),
 ('I-Plot', 'I-Plot'),
 ('I-Plot', 'I-Plot'),
 ('I-Plot', 'I-Plot'),
 ('I-Plot', 'I-Plot'),
 ('I-Plot', 'I-Plot'),
 ('I-Plot', 'I-Plot'),
 ('<END>', '<END>'),
 ('<START>', '<START>'),
 ('O', 'O'),
 ('B-Genre', 'B-Year'),
 ('I-Genre', 'B-Genre'),
 ('I-Genre', 'I-Genre'),
 ('B-Opinion', 'I-Genre'),
 ('O', 'O'),
 ('B-Plot', 'B-Plot'),
 ('I-Plot', 'I-Plot'),
 ('I-Plot', 'I-Plot'),
 ('O', 'O'),
 ('O', 'O'),
 ('B-Relationship', 'O'),
 ('O', 'O'),
 ('B-Director', 'B-Director'),
 ('I-Director', 'I-Director'),
 ('O', 'O'),
 ('B-Actor', 'B-Actor'),
 ('I-Actor', 'I-Actor'),
 ('<END>', '<END>'),
 ('<START>', '<START>'),
 ('O', 'O'),
 ('B-Genre', 'B-Genre'),
 ('I-Genre', 'I-Genre'),
 ('O', 'O'),
 ('O', 'I-Origin'),
 ('B-Origin', 'I-Origin'),
 ('I-Origin', 'I-Origin'),
 ('I-Origin', 'I