In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import pickle
import nltk
import scipy.sparse as sparse

#NLP libraries
import spacy, nltk, gensim, sklearn

#Vader
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#Scikit imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.svm import NuSVC

In [2]:
seed = 42
np.random.seed(seed)

In [3]:
def count_parenthesis(tweet):
    count = 0
    for c in tweet:
        if c == '(':
            count += 1
        if c == ')':
            count -= 1
    return count

# importing Data

In [4]:
DATA_FOLDER = 'data/'
TWITTER_FOLDER = DATA_FOLDER + 'twitter-datasets/'
EN_CORE_WEB_SM = DATA_FOLDER + 'en_core_web_sm-3.0.0'

In [5]:
pos = pd.read_csv(TWITTER_FOLDER + 'train_pos_full.txt', sep='\r\t', header=None, names=['tweet'], engine='python')
neg = pd.read_csv(TWITTER_FOLDER + 'train_neg_full.txt', sep='\r\t', header=None, names=['tweet'], engine='python', on_bad_lines='skip')

In [6]:
test_data = pd.read_csv(TWITTER_FOLDER + 'test_data.txt', sep='\t', header=None, names=['tweet'])
test_data['tweet'] = test_data['tweet'].apply(lambda t : t[t.find(',')+1:])

# Creating Test/Train

#### Adding some processing

In [7]:
def combine_pos_neg(pos, neg, n=None):
    pos = pos.drop_duplicates()[:n]
    neg = neg.drop_duplicates()[:n]
    pos['is_pos'] = 1
    neg['is_pos'] = 0
    return pd.concat([pos, neg])

In [8]:
def adding_metadata(tweets):
    analyzer = SentimentIntensityAnalyzer()
    polarity_scores = tweets['tweet'].apply(analyzer.polarity_scores)

    tweets['neg']       = polarity_scores.apply(lambda d : d['neg'])
    tweets['neu']       = polarity_scores.apply(lambda d : d['neu'])
    tweets['pos']       = polarity_scores.apply(lambda d : d['pos'])
    tweets['compound']  = polarity_scores.apply(lambda d : d['compound'])

    tweets['par_count'] = tweets["tweet"].apply(count_parenthesis)
    tweets['len_tweet'] = tweets["tweet"].apply(len)
    return tweets

#### Split test / train

In [9]:
def split_test_train(tweets):
    X = tweets.drop(['is_pos'], axis=1)
    Y = tweets.drop(['tweet'],  axis=1)
    X, Y = shuffle(X, Y, random_state=42)
    X_tr_df, X_te_df, Y_tr_df, Y_te_df = train_test_split(X, Y, test_size=10000, random_state=42)
    return X_tr_df, X_te_df, Y_tr_df, Y_te_df

#### Vectorization

In [10]:
def vectorize_tweets(vectorizer, X_tr_df, X_te_df):
    X_tr_vec = vectorizer.fit_transform(X_tr_df['tweet'])
    X_te_vec = vectorizer.transform(X_te_df['tweet'])
    return X_tr_vec, X_te_vec

In [11]:
def scale_data(scaler, X_tr, X_te):
    X_tr = scaler.fit_transform(X_tr)
    X_te = scaler.transform(X_te)
    return X_tr, X_te

In [12]:
def get_scaled_data(scaler, features):
    X_tr                     = X_tr_df[features].to_numpy()
    X_te                     = X_te_df[features].to_numpy()
    X_tr_scaled, X_te_scaled = scale_data(scaler, X_tr, X_te)
    return X_tr_scaled, X_te_scaled

# PreProcessing

In [13]:
tweets = combine_pos_neg(pos, neg)
tweets = adding_metadata(tweets)

In [14]:
X_tr_df, X_te_df, Y_tr_df, Y_te_df = split_test_train(tweets)

In [15]:
vectorizer = TfidfVectorizer(min_df=5, max_df=0.75, ngram_range=(1,5), strip_accents='unicode')

In [16]:
Y_tr = np.array(Y_tr_df['is_pos'])
Y_te = np.array(Y_te_df['is_pos'])

In [17]:
scaler = StandardScaler()

par_count_tr,        par_count_te        = get_scaled_data(scaler, ['par_count'])
pol_score_tr,        pol_score_te        = get_scaled_data(scaler, ['neg', 'neu', 'pos', 'compound'])
metadata_tr,         metadata_te         = get_scaled_data(scaler, ['neg', 'neu', 'pos', 'compound', 'par_count', 'len_tweet'])


X_tr_vec,            X_te_vec            = vectorize_tweets(vectorizer, X_tr_df, X_te_df)
X_tr_with_pol_score, X_te_with_pol_score = sparse.hstack((X_tr_vec, pol_score_tr)), sparse.hstack((X_te_vec, pol_score_te))
X_tr_with_par_cont,  X_te_with_par_cont  = sparse.hstack((X_tr_vec, par_count_tr)), sparse.hstack((X_te_vec, par_count_te))
X_tr_with_metadata,  X_te_with_metadata  = sparse.hstack((X_tr_vec, metadata_tr)),  sparse.hstack((X_te_vec, metadata_te))

# Predictions

In [18]:
def fit_and_get_score(model, X_tr, X_te, Y_tr=Y_tr, Y_te=Y_te):
    model.fit(X_tr, Y_tr)
    acc_tr = model.score(X_tr, Y_tr)
    acc_te = model.score(X_te, Y_te)
    return acc_tr, acc_te

In [19]:
log_reg = LogisticRegression(solver='liblinear')
svc     = LinearSVC()
rfc     = RandomForestClassifier(max_depth=100, random_state=42)

In [20]:
pol_score_acc      = fit_and_get_score( log_reg,  pol_score_tr,        pol_score_te)
par_count_acc      = fit_and_get_score( log_reg,  par_count_tr,        par_count_te)
metadata_acc       = fit_and_get_score( log_reg,  metadata_tr,         metadata_te)
X_tr_vec_acc       = fit_and_get_score( log_reg,  X_tr_vec,            X_te_vec)
with_pol_score_acc = fit_and_get_score( log_reg,  X_tr_with_pol_score, X_te_with_pol_score)
with_par_cont_acc  = fit_and_get_score( log_reg,  X_tr_with_par_cont,  X_te_with_par_cont)
with_metadata_acc  = fit_and_get_score( log_reg,  X_tr_with_metadata,  X_te_with_metadata)

In [46]:
svc_acc            = fit_and_get_score( svc,      X_tr_vec,            X_tr_vec)
rfc_acc            = fit_and_get_score( rfc,      X_tr_vec,            X_tr_vec)

ValueError: Found input variables with inconsistent numbers of samples: [10000, 2222156]

In [21]:
print('Accuracy test/train for prediction using logistic regression with :')
print('only polarity score                     ',pol_score_acc)
print('only parenthesis count                  ',par_count_acc)
print('only metadata                           ',metadata_acc)
print('only vectorized tweet                   ',X_tr_vec_acc)
print('vectorized tweet plus polarity score    ',with_pol_score_acc)
print('vectorized tweet plus parenthesis count ',with_par_cont_acc)
print('vectorized tweet plus tweet metadata    ',with_metadata_acc)

Accuracy test/train for prediction using logistic regression with :
only polarity score                      (0.644183851054775, 0.6413)
only parenthesis count                   (0.6553398788399996, 0.6524)
only metadata                            (0.737303371581813, 0.7372)
only vectorized tweet                    (0.8782626891078982, 0.8433)
vectorized tweet plus polarity score     (0.8782626891078982, 0.8449)
vectorized tweet plus parenthesis count  (0.8856726131860373, 0.8548)
vectorized tweet plus tweet metadata     (0.8855266266221098, 0.8547)


In [None]:
print('accuracy test/train for prediction using')
print('Linear Support Vector Machine classifier', svc_acc)
print('Random Forest classifier                ', rfc_acc)

## Deeper Analysis

In [20]:
X_tr = sparse.csr_matrix(X_tr_with_metadata)
X_te = sparse.csr_matrix(X_te_with_metadata)

In [30]:
subset_acc = fit_and_get_score(log_reg, X_tr[X_tr_df.par_count > 0], X_te[X_te_df.par_count > 0],
                                        Y_tr[Y_tr_df.par_count > 0], Y_te[Y_te_df.par_count > 0])

full_acc   = fit_and_get_score(log_reg, X_tr, X_te[X_te_df.par_count > 0],
                                        Y_tr, Y_te[Y_te_df.par_count > 0])

print('Accuracy test/train for prediction using logistic regression on the subset of tweets with positive parenthesis count : ')
print('when trained on the corresponding subset ({:.4}, {:.4})'.format(subset_acc[0], subset_acc[1]))
print('when trained on the whole dataset        ({:.4}, {:.4})'.format(full_acc[0],   full_acc[1]  ))

Accuracy test/train for prediction using logistic regression on the subset of tweets with positive parenthesis count : 
when trained on the corresponding subset (0.9824, 0.9816)
when trained on the whole dataset        (0.8855, 0.9834)


In [24]:
subset_acc = fit_and_get_score(log_reg, X_tr[X_tr_df.par_count < 0], X_te[X_te_df.par_count < 0],
                                        Y_tr[Y_tr_df.par_count < 0], Y_te[Y_te_df.par_count < 0])

full_acc   = fit_and_get_score(log_reg, X_tr, X_te[X_te_df.par_count < 0],
                                        Y_tr, Y_te[Y_te_df.par_count < 0])

print('Accuracy test/train for prediction using logistic regression on the subset of tweets with negative parenthesis count : ')
print('when trained on the corresponding subset ({:.4}, {:.4})'.format(subset_acc[0], subset_acc[1]))
print('when trained on the whole dataset        ({:.4}, {:.4})'.format(full_acc[0],   full_acc[1]  ))

Accuracy test/train for prediction using logistic regression on the subset of tweets with negative parenthesis count : 
when trained on the corresponding subset (0.9031, 0.896)
when trained on the whole dataset        (0.8855, 0.8893)


In [25]:
subset_acc = fit_and_get_score(log_reg, X_tr[X_tr_df.par_count == 0], X_te[X_te_df.par_count == 0],
                                        Y_tr[Y_tr_df.par_count == 0], Y_te[Y_te_df.par_count == 0])

full_acc   = fit_and_get_score(log_reg, X_tr, X_te[X_te_df.par_count == 0],
                                        Y_tr, Y_te[Y_te_df.par_count == 0])

print('Accuracy test/train for prediction using logistic regression on the subset of tweets with balanced parenthesis count : ')
print('when trained on the corresponding subset ({:.4}, {:.4})'.format(subset_acc[0], subset_acc[1]))
print('when trained on the whole dataset        ({:.4}, {:.4})'.format(full_acc[0],   full_acc[1]  ))

Accuracy test/train for prediction using logistic regression on the subset of tweets with balanced parenthesis count : 
when trained on the corresponding subset (0.8654, 0.8288)
when trained on the whole dataset        (0.8855, 0.8264)


In [23]:
log_reg.fit(X_tr[X_tr_df.par_count > 0], Y_tr[Y_tr_df.par_count > 0]);

In [22]:
test_data_df = adding_metadata(test_data)

In [26]:
test_data_vec = vectorizer.transform(test_data_df['tweet'])

In [24]:
scaler.fit(X_te_df[['neg', 'neu', 'pos', 'compound', 'par_count', 'len_tweet']])
test_metadata = scaler.transform(test_data_df[['neg', 'neu', 'pos', 'compound', 'par_count', 'len_tweet']])

In [28]:
test_with_metadata  = sparse.hstack((test_data_vec, test_metadata))

In [34]:
test_data_df['pred'] = log_reg.predict(test_with_metadata)

In [39]:
translation = test_data_df[test_data_df.par_count > 0]['pred'].to_dict()

In [38]:
import json

In [40]:
with open('dict.txt', 'w') as convert_file:
    convert_file.write(json.dumps(translation))