# 2019 Canadian Election tweets
# OSEMN Step 4: Model
# Sentiment analysis of Sentiment 140 dataset
# Serialize trained model

This notebook describes part of Step 4: Model of the OSEMN methodology. It covers serialization of a classifier trained on the Sentiment 140 dataset.

## Import dependencies

In [10]:
# Seed passed to train_test_split and LogisticRegression in the cells below,
# so the split and the fitted model are repeatable between runs.
random_state = 0

In [29]:
import numpy as np
import pandas as pd
import pickle
import os
import sys
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from time import time

In [3]:
# List the directory two levels above the working directory; the recorded
# output shows it is the repository root (it contains `.git` and `data`).
os.listdir('../..')

['.git',
 '.gitignore',
 'src',
 'notebooks',
 'presentations',
 'methodology',
 'README.md',
 'data']

In [4]:
# Directory holding the Sentiment 140 CSV files, relative to the working
# directory. Keep the trailing slash: the load cell builds the file path by
# plain string concatenation (`data_dir + <file name>`).
data_dir = '../../data/sentiment140/'
# Show which files are available in that directory.
os.listdir(data_dir)

['testdata.manual.2009.06.14.csv',
 'training.1600000.processed.noemoticon.csv',
 'sentiment140_train_nodup.csv',
 'sentiment140_train_cleaned.csv']

## Load cleaned Sentiment 140 dataset

In [5]:
# Time the CSV read so the cost of loading is visible in the cell output.
# NOTE(review): the section heading says "cleaned", but the file read here is
# the `_nodup` CSV -- confirm which of the two files is intended.
t = time()
df = pd.read_csv(data_dir + 'sentiment140_train_nodup.csv')
elapsed = time() - t

# Build the whole report from one template, then print it followed by the
# column index (print inserts a single space between the two arguments).
n_rows, n_cols = df.shape
report = ("----- DataFrame loaded"
          "\nin {0:.2f} seconds"
          "\nwith {1:,} rows\nand {2:,} columns"
          "\n-- Column names:\n").format(elapsed, n_rows, n_cols)
print(report, df.columns)

----- DataFrame loaded
in 4.67 seconds
with 1,309,540 rows
and 8 columns
-- Column names:
 Index(['sentiment', 'ids', 'date', 'query', 'user', 'text', 'hashtags',
       'handles'],
      dtype='object')


In [6]:
# Rewrite label value 4 as 1 in place, leaving every other label untouched.
# Re-running this cell is harmless: after the first run no row equals 4.
mask1 = df['sentiment'].eq(4)
df.loc[mask1, 'sentiment'] = 1

## Train and test a classification algorithm

In [7]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the resulting tokens.

    Leading and trailing whitespace is ignored, so an empty or all-blank
    string yields an empty list. This callable is handed to the
    bag-of-words vectorizer as its ``tokenizer`` later in the notebook.
    """
    tokens = text.split()
    return tokens


# Quick sanity check of the tokenizer on a sample sentence.
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [26]:
# Features are the tweet text; the target is the sentiment label column.
X = df['text']
y = df['sentiment']

# Unigram bag-of-words counts, tokenised with the whitespace `tokenizer`.
bow = CountVectorizer(tokenizer=tokenizer, ngram_range=(1, 1))
X_bow = bow.fit_transform(X)

# Hold out 30% of the rows for testing; stratify on y so that both parts
# keep the class proportions of the full label vector.
X_train, X_test, y_train, y_test = train_test_split(
    X_bow, y,
    test_size=0.3,
    stratify=y,
    random_state=random_state,
)
print("Performed train-test split.")
# Report the per-class row counts for the full, train and test label vectors.
for split_name, labels in (('y', y), ('y_train', y_train), ('y_test', y_test)):
    print('Labels counts in {0}:'.format(split_name), np.bincount(labels))

Performed train-test split.
Labels counts in y: [678182 631358]
Labels counts in y_train: [474727 441951]
Labels counts in y_test: [203455 189407]


In [27]:
# Hyper-parameters are kept in variables because the serialization cell
# reuses `penalty` and `c` to build the model file name.
penalty = 'l1'
c = 1.0
clf = LogisticRegression(
    C=c,
    penalty=penalty,
    solver='liblinear',
    random_state=random_state,
)

# Only the call to fit() is timed; scoring happens after the clock stops.
t = time()
clf.fit(X_train, y_train)
elapsed = time() - t

test_score = clf.score(X_test, y_test)
print("Model fit, score on test set: {0}. Took {1:,.2f} seconds ({2:,.2f} minutes)"
      .format(test_score, elapsed, elapsed / 60))

Model fit, score on test set: 0.8004464672073145. Took 19.42 seconds (0.32 minutes)


## Serialize the fit model

In [40]:
# Name fragments used to build the output file names.
vect_name = 'bow_1gram_tok'
model_name = 'lr_{0}_{1}_{2}'.format(penalty, c, vect_name)

# Refit on the entire dataset (no hold-out): the vectorizer and the
# classifier below see every row of `df`, unlike the evaluation cells above.
X = df['text']
y = df['sentiment']

bow = CountVectorizer(ngram_range=(1,1), tokenizer=tokenizer)
X_bow = bow.fit_transform(X)

clf = LogisticRegression(random_state=random_state, solver='liblinear', penalty=penalty, C=c)
t = time()
clf.fit(X_bow, y)
elapsed = time() - t
print("Model fit, took {0:,.2f} seconds ({1:,.2f} minutes)".format(elapsed, elapsed / 60))

# Output location is relative to the notebook's working directory.
dest = os.path.join('results', 'models')
clf_save_path = os.path.join(dest, '{0}.pkl'.format(model_name))
voc_save_path = os.path.join(dest, '{0}_vocabulary.pkl'.format(vect_name))
# exist_ok=True replaces the separate exists() check, so re-running the cell
# is safe and there is no window between checking and creating.
os.makedirs(dest, exist_ok=True)

# Use context managers so each file is flushed and closed as soon as the dump
# finishes, instead of leaving the handle open until garbage collection.
with open(clf_save_path, 'wb') as clf_file:
    pickle.dump(clf, clf_file, protocol=4)
print("\nModel saved to:\n  {0}".format(clf_save_path))

# Only the fitted vocabulary mapping is pickled, not the vectorizer object.
# NOTE(review): whoever loads it presumably has to rebuild a CountVectorizer
# with this vocabulary and the same `tokenizer` -- confirm in the loading code.
with open(voc_save_path, 'wb') as voc_file:
    pickle.dump(bow.vocabulary_, voc_file, protocol=4)
print("Vectorizer vocabulary saved to:\n  {0}".format(voc_save_path))

Model fit, took 24.67 seconds (0.41 minutes)

Model saved to:
  results/models/lr_l1_1.0_bow_1gram_tok.pkl
Vectorizer vocabulary saved to:
  results/models/bow_1gram_tok_vocabulary.pkl
