In [1]:
import numpy as np
import scipy

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import os

import gensim
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import pandas as pd

from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
# Hyperparameter: dimensionality of the document embedding vectors.
# Keep this equal to the `vector_size` given to the Doc2Vec model.
EMBED_DIM = 300

In [3]:
# Load the CSV whose `content` and `sentiment` columns feed create_tagged_docs.
df = pd.read_csv(
    "data/text_emotion.csv",
    sep=",",
    engine="python",
)
# TODO: should filter out empty sentiment?

In [4]:
# Label vocabulary: sentiment name -> integer class id.
tag2int = {
    "surprise": 0,
    "happiness": 1,
    "sadness": 2,
    "anger": 3,
    "fun": 4,
    "worry": 5,
    "love": 6,
    "hate": 7,
    "enthusiasm": 8,
    "boredom": 9,
    "relief": 10,
    "empty": 11,
    "neutral": 12,
}
# Reverse lookup (class id -> sentiment name), derived from tag2int so the
# two mappings always agree.
int2tag = {class_id: name for name, class_id in tag2int.items()}

In [5]:
def create_tagged_docs(df):
    """Convert every row of ``df`` into a gensim ``TaggedDocument``.

    For each row, the ``content`` value is tokenized with ``word_tokenize`` to
    form the document's words, and the ``sentiment`` value is looked up in
    ``tag2int`` to produce the document's single integer tag.

    Args:
        df: DataFrame with ``content`` and ``sentiment`` columns.

    Returns:
        A list with one ``TaggedDocument`` per row, in row order.

    Raises:
        KeyError: if a row's ``sentiment`` is not a key of ``tag2int``.
    """
    return [
        TaggedDocument(
            words=word_tokenize(record["content"]),
            tags=[tag2int[record["sentiment"]]],
        )
        for _row_label, record in df.iterrows()
    ]

In [6]:
# Build one TaggedDocument per row, then split 64% train / 16% validation /
# 20% test (0.8 * 0.8, 0.8 * 0.2, 0.2).
# A fixed seed keeps the partition identical across kernel restarts; without
# it every run reshuffles the data and no reported metric can be reproduced.
SPLIT_SEED = 42
all_tagged_docs = create_tagged_docs(df)
train_val_docs, test_docs = train_test_split(
    all_tagged_docs, test_size=0.2, random_state=SPLIT_SEED
)
train_docs, val_docs = train_test_split(
    train_val_docs, test_size=0.2, random_state=SPLIT_SEED
)
# TODO: check class balance

In [7]:
# Sanity check: show the first training TaggedDocument (its words and its tag list).
print("Train example: ", train_docs[0])

Train example:  TaggedDocument(['On', 'the', 'plane', 'to', 'yosemite', '.', 'Forgot', 'my', 'laptop', 'at', 'home'], [5])


In [8]:
# Report how many documents ended up in each partition.
print("Train size = {}".format(len(train_docs)))
print("Val size = {}".format(len(val_docs)))
print("Test size = {}".format(len(test_docs)))

Train size = 25600
Val size = 6400
Test size = 8000


In [10]:
# TODO: fine tune the embedding model
# Distributed-memory Doc2Vec (dm=1) trained with negative sampling only
# (hs=0, negative=5). vector_size comes from EMBED_DIM rather than a repeated
# literal, so the model and the feature matrices built from it cannot
# disagree on dimensionality.
doc2vec_model = Doc2Vec(
    dm=1,
    vector_size=EMBED_DIM,
    negative=5,
    hs=0,
    min_count=2,
    sample=0,
    alpha=0.025,
    min_alpha=0.001,
    epochs=50,
)
# Both the vocabulary and the training pass use the training split only.
doc2vec_model.build_vocab(train_docs)
doc2vec_model.train(
    train_docs,
    total_examples=doc2vec_model.corpus_count,
    epochs=doc2vec_model.epochs,
)

Show a sample embedded vector for a test post.

In [59]:
# Take the token list of the first test document and infer its embedding;
# the bare last expression makes the notebook display the resulting vector.
test_example_text = test_docs[0].words
doc2vec_model.infer_vector(test_example_text)

array([ 2.83844601e-02,  4.08873826e-01, -5.48948884e-01, -2.23083198e-01,
       -1.86418891e-01, -4.39181000e-01, -4.36206609e-01,  2.58653730e-01,
        2.91305065e-01, -1.17004864e-01, -1.71481282e-01,  1.82668641e-01,
       -1.98771074e-01,  2.58236945e-01, -4.81589764e-01,  2.96151459e-01,
        6.38123810e-01, -2.32414976e-01,  9.78083387e-02, -2.01625198e-01,
        2.00216249e-01,  1.14549085e-01, -1.14083886e-01,  3.25587213e-01,
        4.40092891e-01,  1.22033380e-01,  1.84285536e-01,  7.87060633e-02,
       -6.26463950e-01,  2.41776835e-02,  1.67207658e-01,  8.04566666e-02,
       -4.10289556e-01, -8.20667371e-02, -7.49115720e-02,  1.18457168e-01,
       -1.73431143e-01,  5.16544282e-01, -4.27971601e-01, -3.89754653e-01,
       -1.81273654e-01, -2.21461639e-01, -7.75295347e-02,  1.81137443e-01,
       -8.33418667e-01,  9.87765938e-02, -5.24490811e-02, -2.06260309e-01,
        3.40102911e-01, -1.74491957e-01, -2.48825982e-01,  3.00210603e-02,
        2.39798933e-01, -

## Baseline Classifiers

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Each baseline is declared next to its display name, and both lists below
# are derived from these pairs, so they stay aligned index-for-index.
_named_baselines = [
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("Linear SVM", SVC(kernel="linear", C=0.025)),
    ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0))),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000)),
    ("AdaBoost", AdaBoostClassifier()),
]

baseline_models = [pair[1] for pair in _named_baselines]
model_names = [pair[0] for pair in _named_baselines]

In [13]:
def embed_docs(tagged_docs, model):
    """Infer an embedding for every document and collect the labels.

    Args:
        tagged_docs: Sequence of objects exposing ``.words`` (the token list)
            and ``.tags`` (a list whose first entry is the class label).
        model: Trained model exposing ``infer_vector(words)`` and a
            ``vector_size`` attribute (e.g. a gensim ``Doc2Vec``).

    Returns:
        A tuple ``(X, y)``: ``X`` is a float array of shape
        ``(len(tagged_docs), model.vector_size)`` holding one inferred vector
        per document, and ``y`` is a float array of shape
        ``(len(tagged_docs),)`` holding each document's first tag.
    """
    num_docs = len(tagged_docs)
    # Size the matrix from the model itself instead of a separate module
    # constant, so any embedding width works without a shape mismatch.
    X = np.zeros((num_docs, model.vector_size))
    y = np.zeros(num_docs)
    for i, doc in enumerate(tagged_docs):
        X[i] = model.infer_vector(doc.words)
        y[i] = doc.tags[0]  # TODO: May change if we want multi labels
    return X, y

In [14]:
# Embed each partition with the trained Doc2Vec model: X_* are the feature
# matrices of inferred vectors, y_* the matching label arrays.
X_train, y_train = embed_docs(train_docs, doc2vec_model)
X_val, y_val  = embed_docs(val_docs, doc2vec_model)
X_test, y_test  = embed_docs(test_docs, doc2vec_model)

In [12]:
# Fit every baseline on the training embeddings and report its accuracy and
# weighted F1 on the test embeddings.
for name, estimator in zip(model_names, baseline_models):
    print(name)
    model = estimator.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print('Testing accuracy = {}'.format(test_accuracy))
    test_f1 = f1_score(y_test, y_pred, average='weighted')
    print('Testing F1 score = {}'.format(test_f1))
    print()

Nearest Neighbors


NameError: name 'X_train' is not defined