## Baselines

In [1]:
%load_ext autoreload
%autoreload 2

import time

import numpy as np
import pandas as pd
%matplotlib inline

import scipy.sparse as sparse

#Vader
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#Scikit imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB

In [2]:
# Fix NumPy's global random state so that NumPy-based randomness is repeatable across runs.
seed = 42
np.random.seed(seed)

# Importing Data

In [3]:
# Paths are relative to the working directory of the notebook.
DATA_FOLDER = 'data/'
# Folder holding the tweet text files read below.
TWITTER_FOLDER = DATA_FOLDER + 'twitter-datasets/'
# NOTE(review): presumably the location of a spaCy model; it is not referenced in the visible part of the notebook.
EN_CORE_WEB_SM = DATA_FOLDER + 'en_core_web_sm-3.0.0'

In [4]:
# Load each file into a one-column DataFrame whose column is named 'tweet'.
# The separator '\r\t' is presumably chosen because it never occurs inside a tweet, so that every
# line stays in a single field -- TODO confirm.
# NOTE(review): only the negative file is read with on_bad_lines='skip'; confirm the asymmetry is intended.
pos = pd.read_csv(TWITTER_FOLDER + 'train_pos_full.txt', sep='\r\t', header=None, names=['tweet'], engine='python')
neg = pd.read_csv(TWITTER_FOLDER + 'train_neg_full.txt', sep='\r\t', header=None, names=['tweet'], engine='python', on_bad_lines='skip')

# Creating Test/Train

#### Split test / train

In [5]:
def split_test_train(tweets, test_size=10000, random_state=42):
    """
    Split the labelled tweets in a train set and a test set.

    The rows are shuffled and then split with the same seed, so the partition is reproducible.

    :param tweets: Labelled tweets in a Dataframe holding at least the columns 'tweet' and 'is_pos'
    :param test_size: The size of the test set (forwarded to train_test_split)
    :param random_state: Seed used for both the shuffle and the split; the default 42 gives the
                         same partition as the previously hard-coded value
    :return: X_tr_df, X_te_df, Y_tr_df, Y_te_df. The X frames hold every column except 'is_pos',
             the Y frames hold every column except 'tweet' (the label plus any metadata column)
    """
    X = tweets.drop(columns=['is_pos'])
    Y = tweets.drop(columns=['tweet'])
    # The explicit shuffle is kept in addition to train_test_split: removing it would change
    # which rows land in each set.
    X, Y = shuffle(X, Y, random_state=random_state)
    X_tr_df, X_te_df, Y_tr_df, Y_te_df = train_test_split(X, Y, test_size=test_size, random_state=random_state)
    return X_tr_df, X_te_df, Y_tr_df, Y_te_df

#### Adding some processing

In [10]:
def combine_pos_neg(pos, neg):
    """
    Label the positive and negative tweets and stack them in one Dataframe.

    The inputs are left untouched: the 'is_pos' column is added to copies, not to `pos` / `neg`.
    The index labels of the inputs are kept, so a label can appear more than once in the result.

    :param pos: Dataframe containing positive tweets
    :param neg: Dataframe containing negative tweets
    :return: the combination of the two sets with labels in one Dataframe (without duplicates);
             'is_pos' is 1 for rows coming from `pos` and 0 for rows coming from `neg`
    """
    labelled_pos = pos.assign(is_pos=1)
    labelled_neg = neg.assign(is_pos=0)
    # Duplicates are judged on every column, label included.
    return pd.concat([labelled_pos, labelled_neg]).drop_duplicates()

In [7]:
def count_parenthesis(tweet):
    """
    Compute the parenthesis balance of a tweet.

    :param tweet: a tweet
    :return: the number of '(' minus the number of ')' (negative when closing ones dominate)
    """
    # Each character contributes +1 for '(', -1 for ')' and 0 otherwise.
    return sum((char == '(') - (char == ')') for char in tweet)


def adding_metadata(tweets):
    """
    Enrich the dataframe of tweets with sentiment scores and the parenthesis balance.

    The columns 'neg', 'neu', 'pos', 'compound' (one per key of the analyzer's polarity scores)
    and 'par_count' are written into the given dataframe itself.
    :param tweets: A Dataframe of tweet, with the text in the 'tweet' column
    :return: The same dataframe with additional metadata
    """
    analyzer = SentimentIntensityAnalyzer()

    # the sentiment analysis pass is timed and its duration reported
    tic = time.time()
    scores = tweets['tweet'].apply(analyzer.polarity_scores)
    elapsed = time.time() - tic
    print('Time to apply sentiment analysis on the dataset {:.4} seconde'.format(elapsed))

    # one column per entry of the score dictionaries
    for key in ('neg', 'neu', 'pos', 'compound'):
        tweets[key] = scores.apply(lambda score: score[key])

    # opening minus closing parenthesis of each tweet
    tweets['par_count'] = tweets["tweet"].apply(count_parenthesis)
    return tweets

In [8]:
def scale_data(scaler, X_tr, X_te):
    """
    Fit the scaler on the train features, then apply it to the train and the test features.

    :param scaler: Object exposing fit_transform and transform
    :param X_tr: Train features
    :param X_te: Test features
    :return: the scaled train features and the scaled test features
    """
    scaled_train = scaler.fit_transform(X_tr)
    return scaled_train, scaler.transform(X_te)

def get_scaled_data(scaler, X_tr_df, X_te_df, features):
    """
    Extract the given columns from the train / test dataframes and scale them.

    :param scaler: Object exposing fit_transform and transform (fitted on the train columns)
    :param X_tr_df: Train Dataframe
    :param X_te_df: Test Dataframe
    :param features: List of the column names to extract
    :return: the scaled train columns and the scaled test columns
    """
    train_values, test_values = (frame[features].to_numpy() for frame in (X_tr_df, X_te_df))
    return scale_data(scaler, train_values, test_values)

#### Vectorization

In [9]:
def vectorize_tweets(vectorizer, X_tr_df, X_te_df):
    """
    Fit the vectorizer on the train tweets, then vectorize the train and the test tweets.

    :param vectorizer: Object exposing fit_transform and transform
    :param X_tr_df: Train Dataframe, with the text in the 'tweet' column
    :param X_te_df: Test Dataframe, with the text in the 'tweet' column
    :return: the vectorized train tweets and the vectorized test tweets
    """
    train_vectors = vectorizer.fit_transform(X_tr_df['tweet'])
    return train_vectors, vectorizer.transform(X_te_df['tweet'])

# PreProcessing

In [13]:
# Merge both classes into one labelled frame, then add the VADER scores and the parenthesis balance.
tweets = combine_pos_neg(pos, neg)
tweets = adding_metadata(tweets)

Time to apply sentiment analysis on the dataset 118.4 seconde


In [14]:
# Hold out 100,000 tweets for testing; the rest is the training set.
X_tr_df, X_te_df, Y_tr_df, Y_te_df = split_test_train(tweets, test_size=100000)

In [None]:
# TF-IDF over 1- to 5-grams, ignoring terms found in fewer than 5 tweets; the step is timed.
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=5, ngram_range=(1,5), strip_accents='unicode')
X_tr_vec,            X_te_vec            = vectorize_tweets(vectorizer, X_tr_df, X_te_df)
vectorization_time = time.time() - start_time

# get_scaled_data takes the train and test frames before the list of columns.
# The same scaler instance is refitted by each call, so after this cell it holds the
# statistics of the polarity columns only.
scaler = StandardScaler()
par_count_tr,        par_count_te        = get_scaled_data(scaler, X_tr_df, X_te_df, ['par_count'])
pol_score_tr,        pol_score_te        = get_scaled_data(scaler, X_tr_df, X_te_df, ['neg', 'neu', 'pos', 'compound'])

In [None]:
# Report how long the vectorization measured in the previous cell took.
print('Time to vectorize the dataset {:.4} seconde'.format(vectorization_time))

In [None]:
# Labels as NumPy arrays (1 = positive tweet, 0 = negative tweet), in the same row order as X_tr_df / X_te_df.
Y_tr = np.array(Y_tr_df['is_pos'])
Y_te = np.array(Y_te_df['is_pos'])

# Predictions

In [None]:
# All-True masks with one entry per row, meaning "keep every sample".
# FULL_TRAIN_SET is the default value of `subset_train` in fit_and_store_res.
# NOTE(review): FULL_TEST_SET is not used in the visible part of the notebook.
FULL_TRAIN_SET = [True]*len(X_tr_df)
FULL_TEST_SET  = [True]*len(X_te_df)

In [None]:
def create_inputs(has_vectorized_tweet, has_par_count, has_pol_score):
    """
    Assemble the train / test feature matrices from the selected groups of features.

    Reads the notebook-level variables par_count_tr / par_count_te, pol_score_tr / pol_score_te
    and X_tr_vec / X_te_vec. The columns always come in the order: parenthesis count,
    polarity scores, vectorized tweets.

    :param has_vectorized_tweet: Whether to include the vectorized tweets
    :param has_par_count: Whether to include the scaled parenthesis count
    :param has_pol_score: Whether to include the scaled polarity scores
    :return: the train and the test feature matrices, both in CSR format
    :raises ValueError: if no group of features is selected
    """
    blocks_tr = []
    blocks_te = []

    # Every block is converted to a sparse matrix first, so that the stacking only ever
    # receives sparse inputs instead of a mix of None, dense arrays and sparse matrices.
    if has_par_count:
        blocks_tr.append(sparse.csr_matrix(par_count_tr))
        blocks_te.append(sparse.csr_matrix(par_count_te))
    if has_pol_score:
        blocks_tr.append(sparse.csr_matrix(pol_score_tr))
        blocks_te.append(sparse.csr_matrix(pol_score_te))
    if has_vectorized_tweet:
        blocks_tr.append(X_tr_vec)
        blocks_te.append(X_te_vec)

    if not blocks_tr:
        raise ValueError('At least one group of features must be selected')

    # A single stacking per set, directly in the CSR format needed for row selection.
    return sparse.hstack(blocks_tr, format='csr'), sparse.hstack(blocks_te, format='csr')

In [None]:
def fit_and_store_res(df_score, model, model_name, has_vectorized_tweet, has_par_count, has_pol_score,
                      subset_train=FULL_TRAIN_SET, subset_train_name='Full train set'):
    """
    Fit a model on the selected features / training rows and add its timings and accuracies to the score table.

    Reads the notebook-level variables Y_tr, Y_te and X_te_df, and builds the inputs with create_inputs.
    The three test subsets are the test tweets whose parenthesis count is respectively
    equal to 0 (subset 1), greater than 0 (subset 2) and lower than 0 (subset 3).

    :param df_score: Dataframe of the results gathered so far (may be empty)
    :param model: Estimator exposing fit(X, y) and score(X, y)
    :param model_name: Label stored in the 'model' column
    :param has_vectorized_tweet: Whether the vectorized tweets are part of the inputs
    :param has_par_count: Whether the scaled parenthesis count is part of the inputs
    :param has_pol_score: Whether the scaled polarity scores are part of the inputs
    :param subset_train: Boolean mask, one entry per training row, selecting the rows used to fit
    :param subset_train_name: Label stored in the 'Train on' column
    :return: a new Dataframe made of df_score followed by one row for this run, indexed from 0
    """
    X_tr, X_te = create_inputs(has_vectorized_tweet, has_par_count, has_pol_score)

    # The mask is applied by position; the training rows are selected once, outside of the
    # timed section, and reused for the training accuracy.
    train_mask = np.asarray(subset_train)
    X_fit, Y_fit = X_tr[train_mask], Y_tr[train_mask]

    start_time      = time.time()
    model.fit(X_fit, Y_fit)
    time_to_fit     = time.time() - start_time

    start_time      = time.time()
    test_acc        = model.score(X_te, Y_te)
    time_to_predict = time.time() - start_time

    # Unscaled parenthesis count of the test tweets, used to select the three test subsets.
    test_par_balance = X_te_df['par_count'].to_numpy()
    test_subsets = (
        ('accuracy on subset test 1', test_par_balance == 0),
        ('accuracy on subset test 2', test_par_balance >  0),
        ('accuracy on subset test 3', test_par_balance <  0),
    )

    # The whole row is described by a single record, in the order of the columns.
    record = {
        'model':                         model_name,
        'Train has vectorized tweet':    has_vectorized_tweet,
        'Train has parenthesis count':   has_par_count,
        'Train has polarity score':      has_pol_score,
        'Train on':                      subset_train_name,
        'time to fit [s]':               time_to_fit,
        'time_to_predict [s]':           time_to_predict,
        'accuracy on the training set':  model.score(X_fit, Y_fit),
        'accuracy on the full test set': test_acc,
    }
    for column, test_mask in test_subsets:
        record[column] = model.score(X_te[test_mask], Y_te[test_mask])

    # ignore_index gives every run its own index label instead of a repeated 0.
    return pd.concat([df_score, pd.DataFrame([record])], ignore_index=True)

## Running the predictions

In [None]:
# Baseline classifiers compared below; the same instances are refitted by every run.
# NOTE(review): only the random forest is given an explicit random_state.
log_reg = LogisticRegression(solver='liblinear')
svc     = LinearSVC()
rfc     = RandomForestClassifier(max_depth=10, random_state=42)
bnb     = BernoulliNB()

In [None]:
# Empty score table; each fit_and_store_res call returns it with one more row.
df_score = pd.DataFrame()

In [None]:
# Logistic regression on the full train set, once per non-empty combination of feature groups.
# Each tuple is (has_vectorized_tweet, has_par_count, has_pol_score).
feature_combinations = [
    (False, False, True ),
    (False, True,  False),
    (True,  False, False),
    (False, True,  True ),
    (True,  True,  False),
    (True,  False, True ),
    (True,  True,  True ),
]
for with_tweet, with_par_count, with_pol_score in feature_combinations:
    df_score = fit_and_store_res(df_score, log_reg, 'Logistic Regression',
                                 with_tweet, with_par_count, with_pol_score)

In [None]:
# Logistic regression with every group of features, trained on subsets of the train set
# selected by the parenthesis count (== 0: subset 1, > 0: subset 2, < 0: subset 3).
train_subsets = [
    (X_tr_df.par_count == 0, 'Subset 1'),
    (X_tr_df.par_count >  0, 'Subset 2'),
    (X_tr_df.par_count <  0, 'Subset 3'),
    (X_tr_df.par_count >= 0, 'Subset 1 and 2'),
    (X_tr_df.par_count <= 0, 'Subset 1 and 3'),
    (X_tr_df.par_count != 0, 'Subset 2 and 3'),
]
for train_mask, train_name in train_subsets:
    df_score = fit_and_store_res(df_score, log_reg, 'Logistic Regression', True, True, True,
                                 subset_train=train_mask, subset_train_name=train_name)

In [None]:
# The remaining classifiers, with every group of features, on the full train set.
other_models = [
    (svc, 'Support Vector Machine'),
    (rfc, 'Random forest'),
    (bnb, 'Bernoulli'),
]
for classifier, classifier_name in other_models:
    df_score = fit_and_store_res(df_score, classifier, classifier_name, True, True, True)

In [None]:
# Save the score table, without its index, in the data folder.
df_score.to_csv(DATA_FOLDER + 'baseline_scores.csv', index=False)

In [None]:
# NOTE(review): same statement as the previous cell; it rewrites the same file.
df_score.to_csv(DATA_FOLDER + 'baseline_scores.csv', index=False)

In [76]:
# Display the score table.
df_score

Unnamed: 0,model,Train has vectorized tweet,Train has parenthesis count,Train has polarity score,Train on,time to fit [s],time_to_predict [s],accuracy on the training set,accuracy on the full test set,accuracy on subset test 1,accuracy on subset test 2,accuracy on subset test 3
0,Logistic Regression,False,False,True,Full train set,1.338913,0.002481,0.644184,0.6413,0.639107,0.66429,0.57047
0,Logistic Regression,False,True,False,Full train set,1.110298,0.00208,0.65534,0.6524,0.577503,0.969697,0.875839
0,Logistic Regression,True,False,False,Full train set,35.563274,0.010992,0.878264,0.8432,0.820801,0.957813,0.798658
0,Logistic Regression,False,True,True,Full train set,1.653476,0.003276,0.73864,0.7378,0.684,0.969697,0.875839
0,Logistic Regression,True,True,False,Full train set,30.441568,0.008271,0.885723,0.8548,0.826537,0.983363,0.889262
0,Logistic Regression,True,False,True,Full train set,51.61027,0.007873,0.878325,0.8447,0.822048,0.958408,0.812081
0,Logistic Regression,True,True,True,Full train set,63.252423,0.010196,0.885728,0.8565,0.828657,0.983363,0.889262
0,Logistic Regression,True,True,True,Subset 1,57.304283,0.007033,0.746165,0.7159,0.829031,0.151515,0.85906
0,Logistic Regression,True,True,True,Subset 2,8.690293,0.005807,0.664643,0.6712,0.607058,0.980986,0.647651
0,Logistic Regression,True,True,True,Subset 3,2.156092,0.006085,0.524703,0.5193,0.606934,0.036245,0.889262
