# Baselines

In [1]:
%load_ext autoreload
%autoreload 2

import time

import numpy as np
import pandas as pd
%matplotlib inline

import scipy.sparse as sparse

#Vader
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#Scikit imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB

In [2]:
# Global seed for NumPy's legacy random generator.
# NOTE(review): the functions and models below pass a literal 42 as their
# random_state instead of reading this variable.
seed = 42
np.random.seed(seed)

## Importing data

In [3]:
# Paths are built by plain string concatenation, so every folder constant
# must keep its trailing '/'.
DATA_FOLDER = 'data/'
TWITTER_FOLDER = DATA_FOLDER + 'twitter-datasets/'
# NOTE(review): EN_CORE_WEB_SM is not referenced by the cells visible here.
EN_CORE_WEB_SM = DATA_FOLDER + 'en_core_web_sm-3.0.0'

In [4]:
# Reader options shared by both training files.
# NOTE(review): sep='\r\t' is presumably chosen because it never occurs in a
# tweet, so each line stays a single 'tweet' field -- confirm.
_tweet_reader_options = dict(sep='\r\t', header=None, names=['tweet'], engine='python')

pos = pd.read_csv(TWITTER_FOLDER + 'train_pos_full.txt', **_tweet_reader_options)
# Only the negative file skips malformed lines; the positive file is read strictly.
neg = pd.read_csv(TWITTER_FOLDER + 'train_neg_full.txt', on_bad_lines='skip', **_tweet_reader_options)

In [5]:
test_data_raw = pd.read_csv(TWITTER_FOLDER + 'test_data.txt', sep='\t', header=None, names=['tweet'])

# Each raw line is cut at its FIRST comma: the text before it becomes 'Id',
# the text after it becomes 'tweet' (later commas stay in the tweet).
raw_lines = test_data_raw['tweet']
comma_positions = [line.find(',') for line in raw_lines]

test_data = pd.DataFrame({
    'Id':    [line[:cut] for line, cut in zip(raw_lines, comma_positions)],
    'tweet': [line[cut + 1:] for line, cut in zip(raw_lines, comma_positions)],
})

## Defining useful functions for pre-processing

In [6]:
def split_test_train(tweets, test_size=10000, random_state=42):
    """
    Split the labelled tweets into a train set and a test set.

    The rows are shuffled and then split, both with the same ``random_state``,
    so the result is reproducible.

    :param tweets: Labelled tweets in a Dataframe; must contain the columns 'tweet' and 'is_pos'
    :param test_size: The size of the test set (forwarded to ``train_test_split``)
    :param random_state: Seed used for both the shuffle and the split (defaults to 42)
    :return: X_tr_df, X_te_df, Y_tr_df, Y_te_df. The X frames hold every column except
             'is_pos'; the Y frames hold every column except 'tweet', so any extra
             (metadata) column of ``tweets`` is present in both the X and the Y frames.
    """
    X = tweets.drop(columns=['is_pos'])
    Y = tweets.drop(columns=['tweet'])
    X, Y = shuffle(X, Y, random_state=random_state)
    X_tr_df, X_te_df, Y_tr_df, Y_te_df = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    return X_tr_df, X_te_df, Y_tr_df, Y_te_df

In [7]:
def combine_pos_neg(pos, neg):
    """
    Label and stack the positive and negative tweets.

    The inputs are left untouched: the 'is_pos' label is added to copies.
    Duplicates are dropped on whole rows, so a tweet present in both inputs
    is kept once per label. The original row indices are kept, so the
    returned index is not unique.

    :param pos: Dataframe containing positive tweets
    :param neg: Dataframe containing negative tweets
    :return: one Dataframe with both sets and an 'is_pos' column (1 for ``pos``, 0 for ``neg``),
             without duplicated rows
    """
    labelled_pos = pos.assign(is_pos=1)
    labelled_neg = neg.assign(is_pos=0)
    return pd.concat([labelled_pos, labelled_neg]).drop_duplicates()

In [8]:
def count_parenthesis(tweet):
    """
    Compute the parenthesis balance of a tweet.

    :param tweet: a tweet (string)
    :return: the number of '(' minus the number of ')'; positive when there are more
             opening parentheses, negative when there are more closing ones
    """
    return tweet.count('(') - tweet.count(')')


def adding_metadata(tweets):
    """
    Add metadata to a dataframe of tweets: the four scores returned by the sentiment
    analyzer ('neg', 'neu', 'pos', 'compound') and the parenthesis count ('par_count').

    The input dataframe is not modified; the columns are added to a copy.

    :param tweets: A Dataframe with a 'tweet' column
    :return: A copy of the dataframe with the additional metadata columns
    """
    # Work on a copy so the caller's frame is not changed as a side effect.
    enriched = tweets.copy()
    analyzer = SentimentIntensityAnalyzer()

    # we keep track on the time to apply the sentiment analysis to the dataset
    start_time = time.time()
    polarity_scores = enriched['tweet'].apply(analyzer.polarity_scores)
    sentiment_analysis_time = time.time() - start_time
    print('Time to apply sentiment analysis on the dataset {:.4} seconde'.format(sentiment_analysis_time))

    # storing the result of the sentiment analysis: one column per score key
    for score_name in ('neg', 'neu', 'pos', 'compound'):
        # score_name is bound as a default so each lambda reads its own key
        enriched[score_name] = polarity_scores.apply(lambda scores, key=score_name: scores[key])

    # storing the parenthesis count
    enriched['par_count'] = enriched['tweet'].apply(count_parenthesis)
    return enriched

In [9]:
def scale_data(scaler, X_tr, X_te):
    """
    Scale a train/test pair with the given scaler.

    The scaler is fitted on the training set only and then applied to the testing set.
    :param scaler: The scaler used for the scaling (must provide fit_transform and transform)
    :param X_tr: training set
    :param X_te: testing set
    :return: the scaled training set and the scaled testing set
    """
    scaled_train = scaler.fit_transform(X_tr)
    scaled_test = scaler.transform(X_te)
    return scaled_train, scaled_test

def get_scaled_data(scaler, X_tr_df, X_te_df, features):
    """
    Select the given feature columns from both dataframes and scale them.

    The scaler is fitted on the training set only (see scale_data).
    :param scaler: The scaler used for the scaling
    :param X_tr_df: training set in Dataframe
    :param X_te_df: testing set in Dataframe
    :param features: list of the columns to extract and scale
    :return: scaled version of the training/testing set with only the corresponding features.
    """
    train_values, test_values = (frame[features].to_numpy() for frame in (X_tr_df, X_te_df))
    return scale_data(scaler, train_values, test_values)

In [10]:
def vectorize_tweets(vectorizer, X_tr_df, X_te_df):
    """
    Vectorize the 'tweet' column of the train and test dataframes.

    The vectorizer is fitted on the training tweets only and then applied to the testing tweets.
    :param vectorizer: The vectorizer used for vectorization
    :param X_tr_df: the training set in Dataframe (needs a 'tweet' column)
    :param X_te_df: the testing set in Dataframe (needs a 'tweet' column)
    :return: the vectorized train tweets and the vectorized test tweets
    """
    train_vectors = vectorizer.fit_transform(X_tr_df['tweet'])
    return train_vectors, vectorizer.transform(X_te_df['tweet'])

## Pre-processing

In [11]:
# Label and merge both training files, then add the sentiment scores and the
# parenthesis count in a single pipeline step.
tweets = adding_metadata(combine_pos_neg(pos, neg))

Time to apply sentiment analysis on the dataset 116.9 seconde


In [12]:
# Hold out 10000 tweets for testing. The metadata columns added above are
# present in both the X_* and the Y_* frames (only 'is_pos' / 'tweet' are dropped).
X_tr_df, X_te_df, Y_tr_df, Y_te_df = split_test_train(tweets, test_size=10000)

In [13]:
# vectorize the tweets (TF-IDF over 1- to 5-grams, terms seen in fewer than 5 tweets dropped);
# only this step is timed
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=5, ngram_range=(1,5), strip_accents='unicode')
X_tr_vec,            X_te_vec            = vectorize_tweets(vectorizer, X_tr_df, X_te_df)
vectorization_time = time.time() - start_time

# scale the metadata
# The same scaler object is re-fitted by each call, so after this cell it
# holds the statistics of the polarity-score columns only.
scaler = StandardScaler()
par_count_tr,        par_count_te        = get_scaled_data(scaler, X_tr_df, X_te_df, ['par_count'])
pol_score_tr,        pol_score_te        = get_scaled_data(scaler, X_tr_df, X_te_df, ['neg', 'neu', 'pos', 'compound'])

In [14]:
# Report the wall-clock time measured around the TF-IDF vectorization.
print('Time to vectorize the dataset {:.4} seconde'.format(vectorization_time))

Time to vectorize the dataset 220.3 seconde


In [15]:
# Extract the 0/1 'is_pos' labels of the train and test splits as NumPy arrays
Y_tr, Y_te = (np.array(label_frame['is_pos']) for label_frame in (Y_tr_df, Y_te_df))

# Predictions

In [16]:
def create_inputs(has_vectorized_tweet, has_par_count, has_pol_score):
    """
    Combine the selected feature groups into one sparse matrix.

    The groups are read from the notebook globals and are always stacked in the
    same column order: parenthesis count, polarity scores, vectorized tweet.

    :param has_vectorized_tweet: Boolean, True if the desired input has the vectorized tweet.
    :param has_par_count: Boolean, True if the desired input has the parenthesis count.
    :param has_pol_score: Boolean, True if the desired input has the polarity score.
    :return: the combined train/test features, each as one CSR sparse matrix.
    :raises ValueError: if no feature group is selected.
    """
    train_blocks = []
    test_blocks = []

    # Every block is converted to a sparse matrix first, so hstack never has
    # to deal with a mix of dense arrays of different widths.
    if has_par_count:
        train_blocks.append(sparse.csr_matrix(par_count_tr))
        test_blocks.append(sparse.csr_matrix(par_count_te))
    if has_pol_score:
        train_blocks.append(sparse.csr_matrix(pol_score_tr))
        test_blocks.append(sparse.csr_matrix(pol_score_te))
    if has_vectorized_tweet:
        train_blocks.append(sparse.csr_matrix(X_tr_vec))
        test_blocks.append(sparse.csr_matrix(X_te_vec))

    if not train_blocks:
        raise ValueError('create_inputs needs at least one feature group to be selected')

    # One hstack per split instead of re-stacking the growing matrix for each group.
    X_tr = sparse.csr_matrix(sparse.hstack(train_blocks))
    X_te = sparse.csr_matrix(sparse.hstack(test_blocks))
    return X_tr, X_te

Below we define a function that fits a model, evaluates it, and stores the results of each run.

In [17]:
# Constant for getting the full training set: a boolean mask with one True
# per row of X_tr_df. It is computed once here, so it must be re-created
# whenever X_tr_df is re-split.
FULL_TRAIN_SET = [True]*len(X_tr_df)

In [18]:
def fit_and_store_res(df_score, model, model_name, has_vectorized_tweet, has_par_count, has_pol_score,
                      subset_train=FULL_TRAIN_SET, subset_train_name='Full train set'):
    """
    Fit a model on (a subset of) the training set, evaluate it, and append the
    result as a new row of the score dataframe.

    The features come from create_inputs; the labels and the test frame are read
    from the notebook globals Y_tr, Y_te and X_te_df. Besides the full test set,
    the model is scored on three test subsets defined by the parenthesis count:
    subset 1 (par_count == 0), subset 2 (par_count > 0), subset 3 (par_count < 0).
    An empty test subset is reported as NaN.

    :param df_score: The dataframe in which we store the score for each run
    :param model: The model used (must provide fit and score)
    :param model_name: The string name of the model
    :param has_vectorized_tweet: Boolean, True if the desired input has the vectorized tweet.
    :param has_par_count: Boolean, True if the desired input has the parenthesis count.
    :param has_pol_score: Boolean, True if the desired input has the polarity score.
    :param subset_train: Row selector (boolean mask) of the training rows the model is fitted on.
                         The default is bound when this function is defined.
    :param subset_train_name: The string name of the subset
    :return: A new dataframe: df_score plus one row for this run, with a fresh 0..n-1 index
    """
    X_tr, X_te = create_inputs(has_vectorized_tweet, has_par_count, has_pol_score)

    # Convert the selector to a NumPy array once and slice the training data
    # once; the slices are reused for fitting and for the training accuracy.
    train_selector = np.asarray(subset_train)
    X_fit = X_tr[train_selector]
    Y_fit = Y_tr[train_selector]

    # we monitor time for fitting the model
    start_time = time.time()
    model.fit(X_fit, Y_fit)
    time_to_fit = time.time() - start_time

    # we monitor the time for predicting the result
    start_time = time.time()
    test_acc = model.score(X_te, Y_te)
    time_to_predict = time.time() - start_time

    def _subset_accuracy(mask):
        """Accuracy on the test rows selected by the boolean mask, NaN if none is selected."""
        mask = np.asarray(mask, dtype=bool)
        if not mask.any():
            return np.nan
        return model.score(X_te[mask], Y_te[mask])

    # The same parenthesis-count column selects the rows of both X_te and Y_te.
    test_par_count = X_te_df['par_count']

    # we create a new row and store each value for the result
    new_row = pd.DataFrame([{
        'model':                         model_name,
        'Train has vectorized tweet':    has_vectorized_tweet,
        'Train has parenthesis count':   has_par_count,
        'Train has polarity score':      has_pol_score,
        'Train on':                      subset_train_name,
        'time to fit [s]':               time_to_fit,
        'time_to_predict [s]':           time_to_predict,
        'accuracy on the training set':  model.score(X_fit, Y_fit),
        'accuracy on the full test set': test_acc,
        'accuracy on subset test 1':     _subset_accuracy(test_par_count == 0),
        'accuracy on subset test 2':     _subset_accuracy(test_par_count > 0),
        'accuracy on subset test 3':     _subset_accuracy(test_par_count < 0),
    }])

    # ignore_index gives every run its own row label instead of a repeated 0.
    return pd.concat([df_score, new_row], ignore_index=True)

## Running the predictions

In [19]:
# definition of our four models tested for baselines
# Only the random forest receives an explicit random_state here.
log_reg = LogisticRegression(solver='liblinear')
svc     = LinearSVC()
rfc     = RandomForestClassifier(max_depth=10, random_state=42)
bnb     = BernoulliNB()

In [20]:
# creating the dataframe of scores
# It starts empty; each fit_and_store_res call returns it with one more row.
# Re-running this cell discards the results collected so far.
df_score = pd.DataFrame()

In [21]:
# testing logistic regression with multiple combination of features for training
# Each tuple is (has_vectorized_tweet, has_par_count, has_pol_score); the runs
# are executed in the listed order.
feature_combinations = [
    (False, False, True),
    (False, True,  False),
    (True,  False, False),
    (False, True,  True),
    (True,  True,  False),
    (True,  False, True),
    (True,  True,  True),
]
for use_vectorized_tweet, use_par_count, use_pol_score in feature_combinations:
    df_score = fit_and_store_res(df_score, log_reg, 'Logistic Regression',
                                 use_vectorized_tweet, use_par_count, use_pol_score)

In [22]:
# testing logistic regression with multiple combination of subset
# Training subsets are defined by the sign of the parenthesis count:
# subset 1 -> == 0, subset 2 -> > 0, subset 3 -> < 0.
train_par_count = X_tr_df.par_count
training_subsets = [
    (train_par_count == 0, 'Subset 1'),
    (train_par_count >  0, 'Subset 2'),
    (train_par_count <  0, 'Subset 3'),
    (train_par_count >= 0, 'Subset 1 and 2'),
    (train_par_count <= 0, 'Subset 1 and 3'),
    (train_par_count != 0, 'Subset 2 and 3'),
]
for subset_mask, subset_label in training_subsets:
    df_score = fit_and_store_res(df_score, log_reg, 'Logistic Regression', True, True, True,
                                 subset_train=subset_mask, subset_train_name=subset_label)

In [23]:
# testing other models
# All of them use every feature group and the full training set.
other_models = [
    (svc, 'Support Vector Machine'),
    (rfc, 'Random forest'),
    (bnb, 'Bernoulli'),
]
for candidate, candidate_name in other_models:
    df_score = fit_and_store_res(df_score, candidate, candidate_name, True, True, True)



In [24]:
# storing the data
# The row index is not written to the file.
df_score.to_csv(DATA_FOLDER + 'baseline_scores.csv', index=False)

In [25]:
# Display the collected scores, one row per run.
df_score

Unnamed: 0,model,Train has vectorized tweet,Train has parenthesis count,Train has polarity score,Train on,time to fit [s],time_to_predict [s],accuracy on the training set,accuracy on the full test set,accuracy on subset test 1,accuracy on subset test 2,accuracy on subset test 3
0,Logistic Regression,False,False,True,Full train set,1.371146,0.00174,0.644184,0.6413,0.639107,0.66429,0.57047
0,Logistic Regression,False,True,False,Full train set,1.086416,0.000961,0.65534,0.6524,0.577503,0.969697,0.875839
0,Logistic Regression,True,False,False,Full train set,33.288939,0.005042,0.878264,0.8432,0.820801,0.957813,0.798658
0,Logistic Regression,False,True,True,Full train set,1.74865,0.000876,0.73864,0.7378,0.684,0.969697,0.875839
0,Logistic Regression,True,True,False,Full train set,32.675965,0.005268,0.885723,0.8548,0.826537,0.983363,0.889262
0,Logistic Regression,True,False,True,Full train set,61.178934,0.006363,0.878325,0.8447,0.822048,0.958408,0.812081
0,Logistic Regression,True,True,True,Full train set,74.933091,0.006318,0.885728,0.8565,0.828657,0.983363,0.889262
0,Logistic Regression,True,True,True,Subset 1,55.33755,0.005222,0.865341,0.7159,0.829031,0.151515,0.85906
0,Logistic Regression,True,True,True,Subset 2,8.603601,0.003918,0.981464,0.6712,0.607058,0.980986,0.647651
0,Logistic Regression,True,True,True,Subset 3,2.322908,0.003162,0.900728,0.5193,0.606934,0.036245,0.889262


# Creating submission for the leaderboard

Fitting our best model to the training set

In [20]:
# Re-fit logistic regression on the full training set with every feature group.
# This is needed because log_reg was last fitted by whichever scoring run came last.
X_tr, X_te = create_inputs(True, True, True)
log_reg.fit(X_tr, Y_tr);

Pre-processing for the testing set

In [21]:
# Add the sentiment scores and the parenthesis count to the unlabelled test tweets.
test_data_df = adding_metadata(test_data)

Time to apply sentiment analysis on the dataset 0.6682 seconde


In [22]:
# vectorize the tweets with the vectorizer already fitted on the training tweets
test_data_vec = vectorizer.transform(test_data['tweet'])

# scale the metadata
# The column order must match create_inputs: parenthesis count first, then the
# polarity scores; the vectorized tweet is stacked last.
metadata_columns = ['par_count', 'neg', 'neu', 'pos', 'compound']
scaler = StandardScaler().fit(X_tr_df[metadata_columns])
test_metadata = scaler.transform(test_data_df[metadata_columns])
X_test = sparse.hstack((test_metadata, test_data_vec))

Prediction for the testing set

In [23]:
# The model predicts the 0/1 'is_pos' label; every label other than 1 is
# written as -1 in the 'Prediction' column.
predicted_labels = log_reg.predict(X_test)
test_data['Prediction'] = np.where(predicted_labels == 1, 1, -1)

Creating the submission

In [24]:
# Write only the 'Id' and 'Prediction' columns, without the row index.
test_data[['Id', 'Prediction']].to_csv(DATA_FOLDER + 'prediction_baseline.csv', index=False)