In [80]:
%load_ext autoreload
%autoreload 2

import math
import gensim.models as gs
import pickle as pk
import sklearn.metrics as met
import scipy.stats as stats
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

import twitter_sentiment_dataset as tsd
import phrase2vec as p2v
from twitter_sentiment_dataset import TweetTrainingExample
from model import ModelParams

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Setup
Load the three vector representations from files. In general, any variable with the word 'none' in it refers to Google News word2vec w/o any emoji vectors, 'ours' to Google News word2vec w/ vectors we trained, and 'theirs' to Google News word2vec with the vectors trained by Barbieri et. al.

In [81]:
w2v_path='./data/word2vec/GoogleNews-vectors-negative300.bin'

in_dim = 300   # Length of word2vec vectors
out_dim = 300  # Desired dimension of output vectors
pos_ex = 4
neg_ratio = 1
max_epochs = 40
dropout = 0.1

params = ModelParams(in_dim=in_dim, out_dim=out_dim, pos_ex=pos_ex, max_epochs=max_epochs,
                    neg_ratio=neg_ratio, learning_rate=0.001, dropout=dropout, class_threshold=0.5, model="dan")



e2v_ours_path = params.model_folder('unicode') + '/emoji2vec.bin'
e2v_theirs_path = './data/word2vec/emoji_subset_theirs.bin'

In [82]:
w2v = gs.KeyedVectors.load_word2vec_format(w2v_path, binary=True)
e2v_ours = gs.KeyedVectors.load_word2vec_format(e2v_ours_path, binary=True)
e2v_theirs = gs.KeyedVectors.load_word2vec_format(e2v_ours_path, binary=True)

In [83]:
p2v_no_emoji = p2v.Phrase2Vec(out_dim, w2v, e2v=None)
p2v_our_emoji = p2v.Phrase2Vec(out_dim, w2v, e2v=e2v_ours)
p2v_their_emoji = p2v.Phrase2Vec(out_dim, w2v, e2v=e2v_theirs)

Using stats scraped from emojitracker.com at a certain point in time, we generate two sets of emoji: the top 173 most frequently used emoji, whose usage constitutes 90% of emoji usage on Twitter, and the bottom 612 least frequently used emoji, whose usage constitutes 10% of emoji usage on Twitter.

Subsequently, 'common' will refer to the former group, while 'rare' will refer to the latter.

In [84]:
p = open('./data/tweets/frequencies_w_emoji.txt', 'r')
ems = p.readlines()
ems = [l.split('\t')[0] for l in ems]
p.close()
top90 = set(ems[:173])
bottom10 = set(ems[173:])
p.close()

In [85]:
def emoji_dataset_stats(tweets):
    total_tweets = len(tweets)
    total_emoji = tsd.num_tweets_with_emoji(tweets, e2v_ours, e2v_theirs, ems)
    top_90_total = tsd.num_tweets_with_emoji(tweets, set(), set(), top90)
    bottom_10_total = tsd.num_tweets_with_emoji(tweets, set(), set(), bottom10)
    return total_tweets, total_emoji, top_90_total, bottom_10_total

Statistics for the entire Twitter corpus. Counts refer to # of tweets containing emoji of a type.

In [86]:
train_tweets, test_tweets = tsd.load_training_test_sets()
print('All Tweets in corpus: %s, total emoji: %s, common emoji: %s, rare emoji: %s' % emoji_dataset_stats(tsd.get_all_examples()))
print('Training set: total tweets: %s, total emoji: %s, common emoji: %s, rare emoji: %s' % emoji_dataset_stats(train_tweets))
print('Test set: total tweets: %s, total emoji: %s, common emoji: %s, rare emoji: %s' % emoji_dataset_stats(test_tweets))

All Tweets in corpus: 64599, total emoji: 11701, common emoji: 11137, rare emoji: 1576
Training set: total tweets: 51679, total emoji: 9405, common emoji: 8950, rare emoji: 1268
Test set: total tweets: 12920, total emoji: 2296, common emoji: 2187, rare emoji: 308


In [87]:
def emoji_dataset_label_stats(tweets):
    res = dict()
    res['Positive'] = 0
    res['Negative'] = 0
    res['Neutral'] = 0
    for tweet in tweets:
        res[tweet.label] += 1/len(tweets)
    print(res)

In [88]:
emoji_dataset_label_stats(train_tweets)
emoji_dataset_label_stats(test_tweets)

{'Positive': 0.2882215213142463, 'Negative': 0.25207531105472025, 'Neutral': 0.45970316763111974}
{'Positive': 0.2876160990712081, 'Negative': 0.25069659442723424, 'Neutral': 0.4616873065016006}


## Prepare Training and Testing Vectors
Given the raw training and test tweets, calculate the vector representations for each tweet for each model.

In [89]:
train_none, train_y = tsd.prepare_tweet_vector_averages(train_tweets, p2v_no_emoji)
train_ours, _ = tsd.prepare_tweet_vector_averages(train_tweets, p2v_our_emoji)
train_theirs, _ = tsd.prepare_tweet_vector_averages(train_tweets, p2v_their_emoji)

In [90]:
test_none, test_y = tsd.prepare_tweet_vector_averages(test_tweets, p2v_no_emoji)
test_ours, _ = tsd.prepare_tweet_vector_averages(test_tweets, p2v_our_emoji)
test_theirs, _ = tsd.prepare_tweet_vector_averages(test_tweets, p2v_their_emoji)

# Classification

In [91]:
classifiers = {
    'SGD (n_iter=50)' : SGDClassifier(),
    'Random Forest (n_estimators=60)' : RandomForestClassifier(n_estimators=60)
}

In [92]:
def train_all_with_cross_validation(train_none, train_ours, train_theirs, train_y, clf, clf_name, cv=5):
    scores_none = cross_val_score(clf, train_none, train_y, cv=cv)
    print("None: %s Train Accuracy: %0.2f (+/- %0.3f)" % (clf_name, scores_none.mean(), scores_none.std() * 2))
    
    scores_ours = cross_val_score(clf, train_ours, train_y, cv=cv)
    print("Ours: %s Train Accuracy: %0.2f (+/- %0.3f)" % (clf_name, scores_ours.mean(), scores_ours.std() * 2))
    
    scores_theirs = cross_val_score(clf, train_theirs, train_y, cv=cv)
    print("Theirs: %s Train Accuracy: %0.2f (+/- %0.3f)" % (clf_name, scores_theirs.mean(), scores_theirs.std() * 2))

In [93]:
def train_and_predict(train_data, train_y, test_data, test_y, clf):
    clf.fit(train_data, train_y)
    predictions = clf.predict(test_data)
    score = met.accuracy_score(test_y, predictions)
    f1 = met.f1_score(test_y, predictions, average='weighted')
    return predictions, score, f1

In [94]:
def train_and_predict_all(train_none, test_none, train_ours, test_ours, train_theirs, test_theirs, test_y, clf, clf_name):
    none_pred, none_acc, none_f1 = train_and_predict(train_none, train_y, test_none, test_y, clf)
    print('None: %s Test Accuracy: %0.5f, f1=%0.5f' % (clf_name, none_acc, none_f1))
    
    ours_pred, ours_acc, ours_f1 = train_and_predict(train_ours, train_y, test_ours, test_y, clf)
    ours_p = tsd.calculate_mcnemars(none_pred, ours_pred, test_y)
    print('Ours: %s Test Accuracy: %0.5f, p=%0.5f, f1=%0.5f' % (clf_name, ours_acc, ours_p, ours_f1))
    
    theirs_pred, theirs_acc, theirs_f1 = train_and_predict(train_theirs, train_y, test_theirs, test_y, clf)
    theirs_p = tsd.calculate_mcnemars(none_pred, theirs_pred, test_y)
    print('Theirs: %s Test Accuracy: %0.5f, p=%0.5f, f1=%0.5f' % (clf_name, theirs_acc, theirs_p, theirs_f1))
    
    ours_theirs_p = tsd.calculate_mcnemars(ours_pred, theirs_pred, test_y)
    print('Significance between ours and theirs: p=%0.5f' % ours_theirs_p)

## Performance on Training Set and Complete Test Set
For each classifier, we calculate the average performance of the classifier on the training set when cross validation is applied, as well as the accuracy on the complete test set.

In [95]:
for clf_name, clf in classifiers.items():
    print(clf_name)
    
    print()
    
    print('Cross Validation Accuracy on Training Set\n')
    train_all_with_cross_validation(train_none, train_ours, train_theirs, train_y, clf, clf_name, cv=5)
    
    print()
    
    print('Accuracy on Test Set\n')
    train_and_predict_all(train_none, test_none, train_ours, test_ours, train_theirs, test_theirs, test_y, clf, clf_name)
    
    print()

SGD (n_iter=50)

Cross Validation Accuracy on Training Set

None: SGD (n_iter=50) Train Accuracy: 0.61 (+/- 0.008)
Ours: SGD (n_iter=50) Train Accuracy: 0.62 (+/- 0.010)
Theirs: SGD (n_iter=50) Train Accuracy: 0.62 (+/- 0.014)

Accuracy on Test Set

None: SGD (n_iter=50) Test Accuracy: 0.61455, f1=0.60247
Ours: SGD (n_iter=50) Test Accuracy: 0.61641, p=0.52332, f1=0.60075
Theirs: SGD (n_iter=50) Test Accuracy: 0.62732, p=0.00000, f1=0.61819
Significance between ours and theirs: p=0.00000

Random Forest (n_estimators=60)

Cross Validation Accuracy on Training Set

None: Random Forest (n_estimators=60) Train Accuracy: 0.58 (+/- 0.009)
Ours: Random Forest (n_estimators=60) Train Accuracy: 0.60 (+/- 0.010)
Theirs: Random Forest (n_estimators=60) Train Accuracy: 0.59 (+/- 0.006)

Accuracy on Test Set

None: Random Forest (n_estimators=60) Test Accuracy: 0.58266, f1=0.56610
Ours: Random Forest (n_estimators=60) Test Accuracy: 0.59892, p=0.00006, f1=0.58577
Theirs: Random Forest (n_estimators

In [75]:
def train_and_predict_all_on_test_subset(test_tweets, clf, clf_name):
    test_none, test_y = tsd.prepare_tweet_vector_averages(test_tweets, p2v_no_emoji)
    test_ours, _ = tsd.prepare_tweet_vector_averages(test_tweets, p2v_our_emoji)
    test_theirs, _ = tsd.prepare_tweet_vector_averages(test_tweets, p2v_their_emoji)

    train_and_predict_all(train_none, test_none, train_ours, test_ours, train_theirs, test_theirs, test_y, clf, clf_name)

In [96]:
emoji_test_tweets = tsd.get_tweets_with_emoji(test_tweets, e2v_ours, e2v_theirs, ems)
emoji_test_tweets_top90 = tsd.get_tweets_with_emoji(test_tweets, set(), set(), top90)
emoji_test_tweets_bottom10 = tsd.get_tweets_with_emoji(test_tweets, set(), set(), bottom10)

## Test Subset - All Tweets with Emoji
For each classifier, we calculate the accuracy on the subset of test examples that contain emoji.

In [97]:
for clf_name, clf in classifiers.items():
    print(clf_name)
    train_and_predict_all_on_test_subset(emoji_test_tweets, clf, clf_name)
    print()

SGD (n_iter=50)
None: SGD (n_iter=50) Test Accuracy: 0.52134, f1=0.53301
Ours: SGD (n_iter=50) Test Accuracy: 0.59451, p=0.00000, f1=0.60263
Theirs: SGD (n_iter=50) Test Accuracy: 0.61890, p=0.00000, f1=0.61937
Significance between ours and theirs: p=0.00013

Random Forest (n_estimators=60)
None: Random Forest (n_estimators=60) Test Accuracy: 0.47300, f1=0.48541
Ours: Random Forest (n_estimators=60) Test Accuracy: 0.58014, p=0.00000, f1=0.58005


KeyboardInterrupt: 

## Test Subset - All Tweets with Common Emoji
For each classifier, we calculate the accuracy on the subset of test examples that contain common (Top 90%) emoji.

In [98]:
for clf_name, clf in classifiers.items():
    print(clf_name)
    train_and_predict_all_on_test_subset(emoji_test_tweets_top90, clf, clf_name)
    print()

SGD (n_iter=50)
None: SGD (n_iter=50) Test Accuracy: 0.49657, f1=0.50814
Ours: SGD (n_iter=50) Test Accuracy: 0.60128, p=0.00000, f1=0.61132
Theirs: SGD (n_iter=50) Test Accuracy: 0.60219, p=0.00000, f1=0.61112
Significance between ours and theirs: p=0.80554

Random Forest (n_estimators=60)
None: Random Forest (n_estimators=60) Test Accuracy: 0.47737, f1=0.49023
Ours: Random Forest (n_estimators=60) Test Accuracy: 0.59351, p=0.00000, f1=0.59494


KeyboardInterrupt: 

## Test Subset - All Tweets with Rare Emoji
For each classifier, we calculate the accuracy on the subset of test examples that contain rare (Bottom 10%) emoji.

In [99]:
for clf_name, clf in classifiers.items():
    print(clf_name)
    train_and_predict_all_on_test_subset(emoji_test_tweets_bottom10, clf, clf_name)
    print()

SGD (n_iter=50)
None: SGD (n_iter=50) Test Accuracy: 0.46429, f1=0.45852
Ours: SGD (n_iter=50) Test Accuracy: 0.53571, p=0.02938, f1=0.53963
Theirs: SGD (n_iter=50) Test Accuracy: 0.54545, p=0.01664, f1=0.54354
Significance between ours and theirs: p=0.43858

Random Forest (n_estimators=60)
None: Random Forest (n_estimators=60) Test Accuracy: 0.44805, f1=0.43681
Ours: Random Forest (n_estimators=60) Test Accuracy: 0.55519, p=0.00513, f1=0.55268


KeyboardInterrupt: 