# 2019 Canadian Election tweets
# OSEMN Step 3: Explore
# Sentiment analysis of Sentiment 140 dataset
# Comparison of different text tokenization and vectorization techniques

This notebook describes part of Step 3: Explore of OSEMN methodology. It covers exploration of different text vectorization techniques on Sentiment 140 dataset.

## Import dependencies

In [1]:
random_state = 0

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from time import time
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

sns.set()
pd.options.display.max_colwidth = 200

In [3]:
from nltk import download
download('stopwords')
download('wordnet')

[nltk_data] Downloading package stopwords to /home/stepan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/stepan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
os.listdir('../..')

['.git', '.gitignore', 'src', 'notebooks', 'methodology', 'README.md', 'data']

In [5]:
sys.path.append('../../src')
from proc_utils import string_concat, tfm_2class

In [6]:
data_dir = '../../data/sentiment140/'
os.listdir(data_dir)

['testdata.manual.2009.06.14.csv',
 'training.1600000.processed.noemoticon.csv',
 'sentiment140_train_nodup.csv',
 'sentiment140_train_cleaned.csv']

## Load cleaned Sentiment 140 dataset

In [7]:
t = time()
df = pd.read_csv(data_dir + 'sentiment140_train_nodup.csv')
elapsed = time() - t
print("----- DataFrame loaded"
      "\nin {0:.2f} seconds".format(elapsed) +
      "\nwith {0:,} rows\nand {1:,} columns"
      .format(df.shape[0], df.shape[1]) +
      "\n-- Column names:\n", df.columns)

----- DataFrame loaded
in 4.49 seconds
with 1,309,540 rows
and 8 columns
-- Column names:
 Index(['sentiment', 'ids', 'date', 'query', 'user', 'text', 'hashtags',
       'handles'],
      dtype='object')


## Split documents into tokens

### Tokenization

In [8]:
def tokenizer(text):
    return text.split()
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [9]:
t = time()
df['token'] = df['text'].apply(tokenizer)
elapsed = time() - t
print("Documents were split into tokens, took {0:,.2f} seconds ({1:,.2f} minutes)"
      .format(elapsed, elapsed / 60))
df['token'].head()

Documents were split into tokens, took 4.79 seconds (0.08 minutes)


0                             [switchfoot, awww, that, s, a, bummer, you, shoulda, got, david, carr, of, third, day, to, do, it, d;D]
1    [is, upset, that, he, can, t, update, his, facebook, by, texting, it, and, might, cry, as, a, result, school, today, also, blah]
2                            [kenichan, i, dived, many, times, for, the, ball, managed, to, save, 50, the, rest, go, out, of, bounds]
3                                                                           [my, whole, body, feels, itchy, and, like, its, on, fire]
4                                                                                                   [kwesidei, not, the, whole, crew]
Name: token, dtype: object

In [10]:
df['token'].tail()

1309535            [just, woke, up, having, no, school, is, the, best, feeling, ever]
1309536                    [thewdb, com, very, cool, to, hear, old, walt, interviews]
1309537           [are, you, ready, for, your, mojo, makeover, ask, me, for, details]
1309538    [happy, 38th, birthday, to, my, boo, of, alll, time, tupac, amaru, shakur]
1309539               [happy, charitytuesday, thenspcc, sparkscharity, speakinguph4h]
Name: token, dtype: object

### Tokenization with stop word removal

In [11]:
stop = stopwords.words('english')
def tokenizer_sw(text):
    return [w for w in text.split() if w not in stop]
tokenizer_sw('runners like running and thus they run')

['runners', 'like', 'running', 'thus', 'run']

In [12]:
t = time()
df['token_sw'] = df['text'].apply(tokenizer_sw)
elapsed = time() - t
print("Documents were split into tokens, took {0:,.2f} seconds ({1:,.2f} minutes)"
      .format(elapsed, elapsed / 60))
df['token_sw'].head()

Documents were split into tokens, took 54.44 seconds (0.91 minutes)


0               [switchfoot, awww, bummer, shoulda, got, david, carr, third, day, d;D]
1    [upset, update, facebook, texting, might, cry, result, school, today, also, blah]
2            [kenichan, dived, many, times, ball, managed, save, 50, rest, go, bounds]
3                                              [whole, body, feels, itchy, like, fire]
4                                                              [kwesidei, whole, crew]
Name: token_sw, dtype: object

In [13]:
df['token_sw'].tail()

1309535                                [woke, school, best, feeling, ever]
1309536                   [thewdb, com, cool, hear, old, walt, interviews]
1309537                              [ready, mojo, makeover, ask, details]
1309538     [happy, 38th, birthday, boo, alll, time, tupac, amaru, shakur]
1309539    [happy, charitytuesday, thenspcc, sparkscharity, speakinguph4h]
Name: token_sw, dtype: object

### Porter stemmer

In [14]:
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [15]:
t = time()
df['porter'] = df['text'].apply(tokenizer_porter)
elapsed = time() - t
print("Documents were split into tokens, took {0:,.2f} seconds ({1:,.2f} minutes)"
      .format(elapsed, elapsed / 60))
df['porter'].head()

Documents were split into tokens, took 401.12 seconds (6.69 minutes)


0                        [switchfoot, awww, that, s, a, bummer, you, shoulda, got, david, carr, of, third, day, to, do, it, d;d]
1    [is, upset, that, he, can, t, updat, hi, facebook, by, text, it, and, might, cri, as, a, result, school, today, also, blah]
2                            [kenichan, i, dive, mani, time, for, the, ball, manag, to, save, 50, the, rest, go, out, of, bound]
3                                                                        [my, whole, bodi, feel, itchi, and, like, it, on, fire]
4                                                                                              [kwesidei, not, the, whole, crew]
Name: porter, dtype: object

In [16]:
df['porter'].tail()

1309535                 [just, woke, up, have, no, school, is, the, best, feel, ever]
1309536                     [thewdb, com, veri, cool, to, hear, old, walt, interview]
1309537              [are, you, readi, for, your, mojo, makeov, ask, me, for, detail]
1309538    [happi, 38th, birthday, to, my, boo, of, alll, time, tupac, amaru, shakur]
1309539                  [happi, charitytuesday, thenspcc, sparkschar, speakinguph4h]
Name: porter, dtype: object

### Porter stemmer with stop words removal

In [17]:
def remove_sw(tokens):
    return [w for w in tokens if w not in stop]

In [18]:
t = time()
df['porter_sw'] = df['porter'].apply(remove_sw)
elapsed = time() - t
print("Stop words removed, took {0:,.2f} seconds ({1:,.2f} minutes)"
      .format(elapsed, elapsed / 60))
df['porter_sw'].head()

Stop words removed, took 51.11 seconds (0.85 minutes)


0               [switchfoot, awww, bummer, shoulda, got, david, carr, third, day, d;d]
1    [upset, updat, hi, facebook, text, might, cri, result, school, today, also, blah]
2                 [kenichan, dive, mani, time, ball, manag, save, 50, rest, go, bound]
3                                               [whole, bodi, feel, itchi, like, fire]
4                                                              [kwesidei, whole, crew]
Name: porter_sw, dtype: object

In [19]:
df['porter_sw'].tail()

1309535                                  [woke, school, best, feel, ever]
1309536             [thewdb, com, veri, cool, hear, old, walt, interview]
1309537                                [readi, mojo, makeov, ask, detail]
1309538    [happi, 38th, birthday, boo, alll, time, tupac, amaru, shakur]
1309539      [happi, charitytuesday, thenspcc, sparkschar, speakinguph4h]
Name: porter_sw, dtype: object

### Snowball stemmer (ignore_stopwords=True)

In [8]:
snowball = EnglishStemmer()
def tokenizer_snowball(text):
    return [snowball.stem(word) for word in text.split()]
tokenizer_snowball('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thus', 'they', 'run']

In [21]:
t = time()
df['snowball'] = df['text'].apply(tokenizer_snowball)
elapsed = time() - t
print("Documents were split into tokens, took {0:,.2f} seconds ({1:,.2f} minutes)"
      .format(elapsed, elapsed / 60))
df['snowball'].head()

Documents were split into tokens, took 185.58 seconds (3.09 minutes)


0                         [switchfoot, awww, that, s, a, bummer, you, shoulda, got, david, carr, of, third, day, to, do, it, d;d]
1    [is, upset, that, he, can, t, updat, his, facebook, by, text, it, and, might, cri, as, a, result, school, today, also, blah]
2                             [kenichan, i, dive, mani, time, for, the, ball, manag, to, save, 50, the, rest, go, out, of, bound]
3                                                                        [my, whole, bodi, feel, itchi, and, like, its, on, fire]
4                                                                                               [kwesidei, not, the, whole, crew]
Name: snowball, dtype: object

In [22]:
df['snowball'].tail()

1309535               [just, woke, up, having, no, school, is, the, best, feel, ever]
1309536                     [thewdb, com, very, cool, to, hear, old, walt, interview]
1309537              [are, you, readi, for, your, mojo, makeov, ask, me, for, detail]
1309538    [happi, 38th, birthday, to, my, boo, of, alll, time, tupac, amaru, shakur]
1309539                  [happi, charitytuesday, thenspcc, sparkschar, speakinguph4h]
Name: snowball, dtype: object

### Snowball stemmer (ingore_stopwords=False)

In [23]:
snowball_sw = EnglishStemmer(ignore_stopwords=True)
def tokenizer_snowball_sw(text):
    return [snowball_sw.stem(word) for word in text.split()]
tokenizer_snowball_sw('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thus', 'they', 'run']

In [24]:
t = time()
df['snowball_sw'] = df['text'].apply(tokenizer_snowball_sw)
elapsed = time() - t
print("Documents were split into tokens, took {0:,.2f} seconds ({1:,.2f} minutes)"
      .format(elapsed, elapsed / 60))
df['snowball_sw'].head()

Documents were split into tokens, took 181.98 seconds (3.03 minutes)


0                         [switchfoot, awww, that, s, a, bummer, you, shoulda, got, david, carr, of, third, day, to, do, it, d;d]
1    [is, upset, that, he, can, t, updat, his, facebook, by, text, it, and, might, cri, as, a, result, school, today, also, blah]
2                             [kenichan, i, dive, mani, time, for, the, ball, manag, to, save, 50, the, rest, go, out, of, bound]
3                                                                        [my, whole, bodi, feel, itchi, and, like, its, on, fire]
4                                                                                               [kwesidei, not, the, whole, crew]
Name: snowball_sw, dtype: object

In [25]:
df['snowball_sw'].tail()

1309535               [just, woke, up, having, no, school, is, the, best, feel, ever]
1309536                     [thewdb, com, very, cool, to, hear, old, walt, interview]
1309537              [are, you, readi, for, your, mojo, makeov, ask, me, for, detail]
1309538    [happi, 38th, birthday, to, my, boo, of, alll, time, tupac, amaru, shakur]
1309539                  [happi, charitytuesday, thenspcc, sparkschar, speakinguph4h]
Name: snowball_sw, dtype: object

### Snowball stemmer with stop words removal

In [26]:
t = time()
df['snowball_sw'] = df['snowball'].apply(remove_sw)
elapsed = time() - t
print("Stop words removed, took {0:,.2f} seconds ({1:,.2f} minutes)"
      .format(elapsed, elapsed / 60))
df['snowball_sw'].head()

Stop words removed, took 51.34 seconds (0.86 minutes)


0           [switchfoot, awww, bummer, shoulda, got, david, carr, third, day, d;d]
1    [upset, updat, facebook, text, might, cri, result, school, today, also, blah]
2             [kenichan, dive, mani, time, ball, manag, save, 50, rest, go, bound]
3                                           [whole, bodi, feel, itchi, like, fire]
4                                                          [kwesidei, whole, crew]
Name: snowball_sw, dtype: object

In [27]:
df['snowball_sw'].tail()

1309535                                  [woke, school, best, feel, ever]
1309536                   [thewdb, com, cool, hear, old, walt, interview]
1309537                                [readi, mojo, makeov, ask, detail]
1309538    [happi, 38th, birthday, boo, alll, time, tupac, amaru, shakur]
1309539      [happi, charitytuesday, thenspcc, sparkschar, speakinguph4h]
Name: snowball_sw, dtype: object

### Lancaster stemmer

In [28]:
lancaster = LancasterStemmer()
def tokenizer_lancaster(text):
    return [lancaster.stem(word) for word in text.split()]
tokenizer_lancaster('runners like running and thus they run')

['run', 'lik', 'run', 'and', 'thu', 'they', 'run']

In [29]:
t = time()
df['lancaster'] = df['text'].apply(tokenizer_lancaster)
elapsed = time() - t
print("Documents were split into tokens, took {0:,.2f} seconds ({1:,.2f} minutes)"
      .format(elapsed, elapsed / 60))
df['lancaster'].head()

Documents were split into tokens, took 383.00 seconds (6.38 minutes)


0                            [switchfoot, awww, that, s, a, bum, you, should, got, david, car, of, third, day, to, do, it, d;d]
1    [is, upset, that, he, can, t, upd, his, facebook, by, text, it, and, might, cry, as, a, result, school, today, also, blah]
2                                   [kenich, i, div, many, tim, for, the, bal, man, to, sav, 50, the, rest, go, out, of, bound]
3                                                                          [my, whol, body, feel, itchy, and, lik, it, on, fir]
4                                                                                               [kweside, not, the, whol, crew]
Name: lancaster, dtype: object

In [30]:
df['lancaster'].tail()

1309535                 [just, wok, up, hav, no, school, is, the, best, feel, ev]
1309536                 [thewdb, com, very, cool, to, hear, old, walt, interview]
1309537             [ar, you, ready, for, yo, mojo, makeov, ask, me, for, detail]
1309538    [happy, 38th, birthday, to, my, boo, of, all, tim, tupac, amaru, shak]
1309539                [happy, charitytuesday, thenspcc, sparksch, speakinguph4h]
Name: lancaster, dtype: object

### Lancaster stemmer with stop words removal

In [31]:
t = time()
df['lancaster_sw'] = df['lancaster'].apply(remove_sw)
elapsed = time() - t
print("Stop words removed, took {0:,.2f} seconds ({1:,.2f} minutes)"
      .format(elapsed, elapsed / 60))
df['lancaster_sw'].head()

Stop words removed, took 48.42 seconds (0.81 minutes)


0                      [switchfoot, awww, bum, got, david, car, third, day, d;d]
1    [upset, upd, facebook, text, might, cry, result, school, today, also, blah]
2                   [kenich, div, many, tim, bal, man, sav, 50, rest, go, bound]
3                                            [whol, body, feel, itchy, lik, fir]
4                                                          [kweside, whol, crew]
Name: lancaster_sw, dtype: object

In [32]:
df['lancaster_sw'].tail()

1309535                            [wok, hav, school, best, feel, ev]
1309536               [thewdb, com, cool, hear, old, walt, interview]
1309537                    [ar, ready, yo, mojo, makeov, ask, detail]
1309538         [happy, 38th, birthday, boo, tim, tupac, amaru, shak]
1309539    [happy, charitytuesday, thenspcc, sparksch, speakinguph4h]
Name: lancaster_sw, dtype: object

### WordNet lemmatizer

In [33]:
wnl = WordNetLemmatizer()
def tokenizer_lemmatizer(text):
    return [wnl.lemmatize(word) for word in text.split()]
tokenizer_lemmatizer('runners like running and thus they run')

['runner', 'like', 'running', 'and', 'thus', 'they', 'run']

In [34]:
t = time()
df['wnl'] = df['text'].apply(tokenizer_lemmatizer)
elapsed = time() - t
print("Documents were split into tokens, took {0:,.2f} seconds ({1:,.2f} minutes)"
      .format(elapsed, elapsed / 60))
df['wnl'].head()

Documents were split into tokens, took 102.53 seconds (1.71 minutes)


0                            [switchfoot, awww, that, s, a, bummer, you, shoulda, got, david, carr, of, third, day, to, do, it, d;D]
1    [is, upset, that, he, can, t, update, his, facebook, by, texting, it, and, might, cry, a, a, result, school, today, also, blah]
2                             [kenichan, i, dived, many, time, for, the, ball, managed, to, save, 50, the, rest, go, out, of, bound]
3                                                                            [my, whole, body, feel, itchy, and, like, it, on, fire]
4                                                                                                  [kwesidei, not, the, whole, crew]
Name: wnl, dtype: object

In [35]:
df['wnl'].tail()

1309535            [just, woke, up, having, no, school, is, the, best, feeling, ever]
1309536                     [thewdb, com, very, cool, to, hear, old, walt, interview]
1309537            [are, you, ready, for, your, mojo, makeover, ask, me, for, detail]
1309538    [happy, 38th, birthday, to, my, boo, of, alll, time, tupac, amaru, shakur]
1309539               [happy, charitytuesday, thenspcc, sparkscharity, speakinguph4h]
Name: wnl, dtype: object

### WordNet lemmatizer with stop words removal

In [36]:
t = time()
df['wnl_sw'] = df['wnl'].apply(remove_sw)
elapsed = time() - t
print("Stop words removed, took {0:,.2f} seconds ({1:,.2f} minutes)"
      .format(elapsed, elapsed / 60))
df['wnl_sw'].head()

Stop words removed, took 46.74 seconds (0.78 minutes)


0               [switchfoot, awww, bummer, shoulda, got, david, carr, third, day, d;D]
1    [upset, update, facebook, texting, might, cry, result, school, today, also, blah]
2              [kenichan, dived, many, time, ball, managed, save, 50, rest, go, bound]
3                                               [whole, body, feel, itchy, like, fire]
4                                                              [kwesidei, whole, crew]
Name: wnl_sw, dtype: object

In [37]:
df['wnl_sw'].tail()

1309535                                [woke, school, best, feeling, ever]
1309536                    [thewdb, com, cool, hear, old, walt, interview]
1309537                               [ready, mojo, makeover, ask, detail]
1309538     [happy, 38th, birthday, boo, alll, time, tupac, amaru, shakur]
1309539    [happy, charitytuesday, thenspcc, sparkscharity, speakinguph4h]
Name: wnl_sw, dtype: object