# 2019 Canadian Election tweets
# OSEMN Step 4: Model
# Sentiment analysis of Sentiment 140 dataset
# Hyperparameter tuning: grid search of text vectorization methods

This notebook describes part of Step 4: Model of the OSEMN methodology.

## Import dependencies

In [1]:
# Single seed reused below for row sampling, the train-test split and the seeded classifiers.
random_state = 0

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import NaiveBayesClassifier
from time import time
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, ComplementNB

sns.set()
pd.options.display.max_colwidth = 200

In [3]:
from nltk import download
# WordNetLemmatizer (used by tokenizer_lemmatizer further down) needs the WordNet corpus and
# fails with a LookupError on a machine where it was never downloaded, so fetch it here too.
# NOTE(review): newer NLTK releases may additionally require 'omw-1.4' -- confirm for the installed version.
download('wordnet')
# Stop-word lists used by the grid search; kept last so the cell still displays this call's result.
download('stopwords')

[nltk_data] Downloading package stopwords to /home/stepan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
os.listdir('../..')

['.git', '.gitignore', 'src', 'notebooks', 'methodology', 'README.md', 'data']

In [5]:
sys.path.append('../../src')
from proc_utils import string_concat, tfm_2class

In [6]:
data_dir = '../../data/sentiment140/'
os.listdir(data_dir)

['testdata.manual.2009.06.14.csv',
 'training.1600000.processed.noemoticon.csv',
 'sentiment140_train_nodup.csv',
 'sentiment140_train_cleaned.csv']

## Load cleaned Sentiment 140 dataset

In [7]:
# Read the training CSV and time how long the load takes.
t = time()
df = pd.read_csv(data_dir + 'sentiment140_train_nodup.csv')
elapsed = time() - t

# Build the report piece by piece; print() inserts a single space between the
# report text and the column Index, exactly as a two-argument print call does.
n_rows, n_cols = df.shape
report = "----- DataFrame loaded" "\nin {0:.2f} seconds".format(elapsed)
report += "\nwith {0:,} rows\nand {1:,} columns".format(n_rows, n_cols)
report += "\n-- Column names:\n"
print(report, df.columns)

----- DataFrame loaded
in 5.04 seconds
with 1,309,540 rows
and 8 columns
-- Column names:
 Index(['sentiment', 'ids', 'date', 'query', 'user', 'text', 'hashtags',
       'handles'],
      dtype='object')


In [8]:
# Recode sentiment label 4 as 1 in place; rows with any other label are left untouched.
mask1 = df['sentiment'] == 4
df.loc[mask1, 'sentiment'] = 1

## Define functions for tokenization / stemming, with examples

In [9]:
def tokenizer(text):
    """Return the whitespace-separated tokens of ``text`` as a list (no stemming)."""
    tokens = text.split()
    return tokens
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [10]:
porter = PorterStemmer()
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token."""
    return list(map(porter.stem, text.split()))
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [11]:
snowball = EnglishStemmer()
def tokenizer_snowball(text):
    """Split ``text`` on whitespace and return the Snowball (English) stem of each token."""
    return list(map(snowball.stem, text.split()))
tokenizer_snowball('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thus', 'they', 'run']

In [12]:
lancaster = LancasterStemmer()
def tokenizer_lancaster(text):
    """Split ``text`` on whitespace and return the Lancaster stem of each token."""
    return list(map(lancaster.stem, text.split()))
tokenizer_lancaster('runners like running and thus they run')

['run', 'lik', 'run', 'and', 'thu', 'they', 'run']

In [13]:
wnl = WordNetLemmatizer()
def tokenizer_lemmatizer(text):
    """Split ``text`` on whitespace and return the WordNet lemma of each token."""
    return list(map(wnl.lemmatize, text.split()))
tokenizer_lemmatizer('runners like running and thus they run')

['runner', 'like', 'running', 'and', 'thus', 'they', 'run']

## Subset for grid search: 1/5 of all records

In [14]:
# Draw a reproducible random fifth of the rows; .copy() makes the subset independent of df.
subset_size = len(df) // 5
s = df.sample(n=subset_size, random_state=random_state).copy()
print("{0:,.0f} rows in the subset".format(len(s)))

261,908 rows in the subset


## Select features and target, perform train-test split

In [15]:
s['sentiment'].value_counts()

0    135523
1    126385
Name: sentiment, dtype: int64

In [16]:
# Features are the tweet texts, the target is the sentiment label.
X, y = s['text'], s['sentiment']

# Hold out 30% of the subset for testing, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=random_state,
)
print("Performed train-test split.")
print('Labels counts in y:', np.bincount(y))
print('Labels counts in y_train:', np.bincount(y_train))
print('Labels counts in y_test:', np.bincount(y_test))

Performed train-test split.
Labels counts in y: [135523 126385]
Labels counts in y_train: [94866 88469]
Labels counts in y_test: [40657 37916]


## Grid search of text vectorization hyperparameters
The WordNet lemmatizer tokenizer (`tokenizer_lemmatizer`) defined above is currently not included in the grid search.

In [17]:
gs_save_dir = 'results/gs_results/'
# Create the output folder up front: when it does not exist yet, os.listdir fails here and
# the to_csv calls further down would fail only after the long-running grid searches finish.
os.makedirs(gs_save_dir, exist_ok=True)
os.listdir(gs_save_dir)

['gs_lr_tfidf.csv',
 'gs_lr_bow.csv',
 'gs_tree_bow.csv',
 'gs_tree_tfidf.csv',
 'gs_complnb_tfidf.csv',
 'gs_lsvc_tfidf.csv',
 'gs_multinb_tfidf.csv',
 'gs_lsvc_bow.csv',
 '.ipynb_checkpoints']

In [18]:
stop = stopwords.words('english')  # corpus of English stopwords needs to be downloaded from NLTK

In [20]:
# Vectorizer settings explored identically under both weighting schemes.
common_grid = {
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vect__stop_words': [stop, None],
    'vect__tokenizer': [tokenizer, tokenizer_porter, tokenizer_snowball, tokenizer_lancaster],
    'vect__binary': [True, False],
}

param_grid = [
    # model based on raw term frequencies
    {**common_grid,
     'vect__use_idf': [False],
     'vect__smooth_idf': [False],
     'vect__norm': [None]},

    # model based on TF-IDF
    {**common_grid,
     'vect__use_idf': [True],
     'vect__smooth_idf': [True],
     'vect__norm': ['l2']},
]

### Logistic regression

In [21]:
t = time()

# Vectorizer options not listed here are supplied by param_grid during the search.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

# penalty='l1' is only supported by the 'liblinear' and 'saga' solvers. scikit-learn releases
# whose default solver is 'lbfgs' reject this configuration, so the solver is pinned explicitly.
lr_tfidf = Pipeline([('vect', tfidf), 
                     ('clf', LogisticRegression(random_state=random_state, penalty='l1', C=1.0,
                                                solver='liblinear'))])

# 5-fold cross-validated accuracy for every candidate in param_grid, on 12 parallel workers.
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=12)
gs_lr_tfidf.fit(X_train, y_train)

elapsed = time() - t
print("Grid search completed! Took {0:,.2f} seconds ({1:,.2f} minutes)".format(elapsed, elapsed / 60))

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  2.2min
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed: 16.7min
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed: 41.3min
[Parallel(n_jobs=12)]: Done 480 out of 480 | elapsed: 47.5min finished


Grid search completed! Took 2,896.60 seconds (48.28 minutes)


In [53]:
gs_lr_tfidf.best_score_

0.8022908882646521

In [22]:
gs_lr_tfidf.best_score_

0.8022908882646521

In [23]:
gs_lr_tfidf.best_params_

{'vect__binary': True,
 'vect__ngram_range': (1, 2),
 'vect__norm': 'l2',
 'vect__smooth_idf': True,
 'vect__stop_words': None,
 'vect__tokenizer': <function __main__.tokenizer_snowball(text)>,
 'vect__use_idf': True}

In [24]:
# GridSearchCV (refit=True by default) has already refitted the best pipeline on all of
# X_train, so it is scored on the held-out test set directly instead of being fitted again.
clf = gs_lr_tfidf.best_estimator_
print('Test accuracy: %.7f' % clf.score(X_test, y_test))



Test accuracy: 0.8052893


In [46]:
# Convert the logistic-regression grid search's cv_results_ to a DataFrame and write it
# to a .csv file under gs_save_dir (without the index column).
lr_tfidf_gs_results = pd.DataFrame(gs_lr_tfidf.cv_results_)
lr_tfidf_gs_results.to_csv(gs_save_dir + 'gs_lr.csv', index=False)
print("DataFrame with grid search results recorded to a .csv file")

DataFrame with grid search results recorded to a .csv file


### Linear Support Vector Classifier

In [26]:
t = time()

# Pipeline: text vectorizer followed by a linear SVM; remaining vectorizer options come from param_grid.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
lsvc_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', LinearSVC(random_state=random_state, penalty='l2', C=0.1)),
])

# 5-fold cross-validated accuracy for every candidate, on 12 parallel workers.
gs_lsvc_tfidf = GridSearchCV(
    estimator=lsvc_tfidf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=12,
)
gs_lsvc_tfidf.fit(X_train, y_train)

elapsed = time() - t
print("Grid search completed! Took {0:,.2f} seconds ({1:,.2f} minutes)".format(elapsed, elapsed / 60))

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  2.2min
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed: 18.5min
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed: 45.9min
[Parallel(n_jobs=12)]: Done 480 out of 480 | elapsed: 51.4min finished


Grid search completed! Took 3,115.76 seconds (51.93 minutes)


In [27]:
gs_lsvc_tfidf.best_score_

0.8052635885128318

In [28]:
gs_lsvc_tfidf.best_params_

{'vect__binary': True,
 'vect__ngram_range': (1, 3),
 'vect__norm': None,
 'vect__smooth_idf': False,
 'vect__stop_words': None,
 'vect__tokenizer': <function __main__.tokenizer(text)>,
 'vect__use_idf': False}

In [29]:
# GridSearchCV (refit=True by default) has already refitted the best pipeline on all of
# X_train, so it is scored on the held-out test set directly instead of being fitted again.
clf = gs_lsvc_tfidf.best_estimator_
print('Test accuracy: %.7f' % clf.score(X_test, y_test))

Test accuracy: 0.8083693


In [47]:
# Convert the linear-SVC grid search's cv_results_ to a DataFrame and write it
# to a .csv file under gs_save_dir (without the index column).
lsvc_tfidf_gs_results = pd.DataFrame(gs_lsvc_tfidf.cv_results_)
lsvc_tfidf_gs_results.to_csv(gs_save_dir + 'gs_lsvc.csv', index=False)
print("DataFrame with grid search results recorded to a .csv file")

DataFrame with grid search results recorded to a .csv file


### Multinomial Naive Bayes

In [31]:
t = time()

# Pipeline: text vectorizer followed by multinomial naive Bayes; remaining vectorizer options come from param_grid.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
multinb_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', MultinomialNB()),
])

# 5-fold cross-validated accuracy for every candidate, on 12 parallel workers.
gs_multinb_tfidf = GridSearchCV(
    estimator=multinb_tfidf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=12,
)
gs_multinb_tfidf.fit(X_train, y_train)
elapsed = time() - t
print("Grid search completed! Took {0:,.2f} seconds ({1:,.2f} minutes)".format(elapsed, elapsed / 60))

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  2.4min
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed: 15.1min
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed: 37.6min
[Parallel(n_jobs=12)]: Done 480 out of 480 | elapsed: 42.9min finished


Grid search completed! Took 2,587.40 seconds (43.12 minutes)


In [32]:
gs_multinb_tfidf.best_score_

0.7848365014863501

In [33]:
gs_multinb_tfidf.best_params_

{'vect__binary': True,
 'vect__ngram_range': (1, 2),
 'vect__norm': None,
 'vect__smooth_idf': False,
 'vect__stop_words': None,
 'vect__tokenizer': <function __main__.tokenizer(text)>,
 'vect__use_idf': False}

In [34]:
# GridSearchCV (refit=True by default) has already refitted the best pipeline on all of
# X_train, so it is scored on the held-out test set directly instead of being fitted again.
clf = gs_multinb_tfidf.best_estimator_
print('Test accuracy: %.7f' % clf.score(X_test, y_test))

Test accuracy: 0.7879424


In [48]:
# Convert the multinomial-NB grid search's cv_results_ to a DataFrame and write it
# to a .csv file under gs_save_dir (without the index column).
multinb_tfidf_gs_results = pd.DataFrame(gs_multinb_tfidf.cv_results_)
multinb_tfidf_gs_results.to_csv(gs_save_dir + 'gs_multinb.csv', index=False)
print("DataFrame with grid search results recorded to a .csv file")

DataFrame with grid search results recorded to a .csv file


### Complement Naive Bayes

In [36]:
t = time()

# Pipeline: text vectorizer followed by complement naive Bayes; remaining vectorizer options come from param_grid.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
complnb_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', ComplementNB()),
])

# 5-fold cross-validated accuracy for every candidate, on 12 parallel workers.
gs_complnb_tfidf = GridSearchCV(
    estimator=complnb_tfidf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=12,
)
gs_complnb_tfidf.fit(X_train, y_train)
elapsed = time() - t
print("Grid search completed! Took {0:,.2f} seconds ({1:,.2f} minutes)".format(elapsed, elapsed / 60))

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  2.1min
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed: 15.2min
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed: 37.5min
[Parallel(n_jobs=12)]: Done 480 out of 480 | elapsed: 42.8min finished


Grid search completed! Took 2,580.27 seconds (43.00 minutes)


In [37]:
gs_complnb_tfidf.best_score_

0.7859437641475987

In [38]:
gs_complnb_tfidf.best_params_

{'vect__binary': True,
 'vect__ngram_range': (1, 2),
 'vect__norm': None,
 'vect__smooth_idf': False,
 'vect__stop_words': None,
 'vect__tokenizer': <function __main__.tokenizer(text)>,
 'vect__use_idf': False}

In [39]:
# GridSearchCV (refit=True by default) has already refitted the best pipeline on all of
# X_train, so it is scored on the held-out test set directly instead of being fitted again.
clf = gs_complnb_tfidf.best_estimator_
print('Test accuracy: %.7f' % clf.score(X_test, y_test))

Test accuracy: 0.7887315


In [49]:
# Convert the complement-NB grid search's cv_results_ to a DataFrame and write it
# to a .csv file under gs_save_dir (without the index column).
complnb_tfidf_gs_results = pd.DataFrame(gs_complnb_tfidf.cv_results_)
complnb_tfidf_gs_results.to_csv(gs_save_dir + 'gs_complnb.csv', index=False)
print("DataFrame with grid search results recorded to a .csv file")

DataFrame with grid search results recorded to a .csv file


### Decision Tree

In [41]:
t = time()

# Pipeline: text vectorizer followed by a depth-limited decision tree; remaining vectorizer options come from param_grid.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
tree_clf = DecisionTreeClassifier(random_state=random_state, criterion='gini', max_depth=40)
tree_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', tree_clf),
])

# 5-fold cross-validated accuracy for every candidate, on 12 parallel workers.
gs_tree_tfidf = GridSearchCV(
    estimator=tree_tfidf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=12,
)
gs_tree_tfidf.fit(X_train, y_train)
elapsed = time() - t
print("Grid search completed! Took {0:,.2f} seconds ({1:,.2f} minutes)".format(elapsed, elapsed / 60))

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  4.0min
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed: 82.9min
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed: 234.3min
[Parallel(n_jobs=12)]: Done 480 out of 480 | elapsed: 293.6min finished


Grid search completed! Took 18,338.24 seconds (305.64 minutes)


In [42]:
gs_tree_tfidf.best_score_

0.7010827174298415

In [43]:
gs_tree_tfidf.best_params_

{'vect__binary': True,
 'vect__ngram_range': (1, 3),
 'vect__norm': None,
 'vect__smooth_idf': False,
 'vect__stop_words': None,
 'vect__tokenizer': <function __main__.tokenizer_lancaster(text)>,
 'vect__use_idf': False}

In [44]:
# GridSearchCV (refit=True by default) has already refitted the best pipeline on all of
# X_train, so it is scored on the held-out test set directly instead of being fitted again.
clf = gs_tree_tfidf.best_estimator_
print('Test accuracy: %.7f' % clf.score(X_test, y_test))

Test accuracy: 0.7039441


In [50]:
# Convert the decision-tree grid search's cv_results_ to a DataFrame and write it
# to a .csv file under gs_save_dir (without the index column).
tree_tfidf_gs_results = pd.DataFrame(gs_tree_tfidf.cv_results_)
tree_tfidf_gs_results.to_csv(gs_save_dir + 'gs_tree.csv', index=False)
print("DataFrame with grid search results recorded to a .csv file")

DataFrame with grid search results recorded to a .csv file
