In [1]:
import gensim
import spacy
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV # For optimization
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Change this to your own path of a word embedding model
google_news_path = "../embedding_models/GoogleNews-vectors-negative300.bin.gz"

# Load google news embeddings using gensim
word_embedding_model = gensim.models.KeyedVectors.load_word2vec_format(google_news_path, binary=True)

In [2]:
# Set this to the amount of dimensions in the used word embedding model
num_features = 300

def find_embed(word):
    '''
    Returns the word embedding for a word if it exists, otherwise returns a list full of zeros
    '''
    try:
        return(word_embedding_model[word.lower()])
    except:
        return [0]*num_features

In [3]:
# Use pandas to read in txt file into pandas dataframe
df_train = pd.read_csv('SEM-2012-SharedTask-CD-SCO-training-simple.v2.txt', sep="\t", 
                 names=["story", "sent_index", "token_index", "token", "bio"])

# Find corresponding functions for every token
df_train['vector'] = df_train['token'].apply(find_embed)

# Show dimensions of embeddings
print(len(df_train.iloc[0]['vector']))

# Show structure of data
df_train.head(1)

300


Unnamed: 0,story,sent_index,token_index,token,bio,vector
0,baskervilles01,0,0,Chapter,O,"[0.00065231323, -0.083496094, 0.1328125, -0.09..."


In [4]:
X = np.array(df_train['vector'].tolist())
y = np.array(df_train['bio'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [5]:
# Use GridSearch to find the best parameters
mlp = MLPClassifier(max_iter=5000)
parameter_space = {
    'solver': ['lbfgs', 'sgd', 'adam'],
    'alpha': [0.0001, 0.05, 0.01],
}

clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(X_train, y_train)

# Print best paramete settings found in the grid search
print('Best parameters found:\n', clf.best_params_)

Best parameters found:
 {'alpha': 0.0001, 'solver': 'adam'}


In [14]:
best_alpha = clf.best_params_['alpha']
best_solver = clf.best_params_['solver']

mlp = MLPClassifier(max_iter=10000,alpha = best_alpha, solver = best_solver)
mlp.fit(X_train, y_train)
predictions = mlp.predict(X_test)

print(classification_report(y_test, predictions, digits=5))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-NEG    0.91016   0.88258   0.89615       264
       I-NEG    0.00000   0.00000   0.00000         3
           O    0.99801   0.99870   0.99835     16096

    accuracy                        0.99664     16363
   macro avg    0.63606   0.62709   0.63150     16363
weighted avg    0.99641   0.99664   0.99652     16363



  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
mlp

MLPClassifier(alpha=0.01, max_iter=10000)

In [11]:
clf.best_params_['alpha']

0.0001