# 2019 Canadian Election tweets
# OSEMN Step 3: Explore
# Sentiment analysis of Sentiment 140 dataset
# Comparison of different text vectorization techniques

This notebook covers part of Step 3 (Explore) of the OSEMN methodology: an exploration of different text vectorization techniques on the Sentiment 140 dataset.

## Import dependencies

In [15]:
import os
from functools import lru_cache
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

# Let displayed frames show up to 200 characters per cell, so long text
# values are not cut off after pandas' default column width.
pd.options.display.max_colwidth = 200
# Switch matplotlib figures to seaborn's default styling.
sns.set()

In [6]:
from src.proc_utils import string_concat, tfm_2class

In [7]:
# Relative path of the folder with the Sentiment 140 CSV files.
# Keep the trailing slash: file paths are built as `data_dir + <file name>`.
data_dir = '../../data/sentiment140/'
# Show which files are present in that folder.
os.listdir(data_dir)

['sentiment140_train_cleaned.csv',
 'sentiment140_train_nodup.csv',
 'testdata.manual.2009.06.14.csv',
 'training.1600000.processed.noemoticon.csv']

## Load cleaned Sentiment 140 dataset

In [10]:
# Time the CSV read so the cost of loading is visible in the cell output.
t = time()
# NOTE(review): the section heading says "cleaned", but the file read here is
# the '_nodup' one — confirm this is the intended input.
df = pd.read_csv(data_dir + 'sentiment140_train_nodup.csv')
elapsed = time() - t
# Single template, single format call; df.shape supplies (rows, columns).
print(
    ("----- DataFrame loaded"
     "\nin {0:.2f} seconds"
     "\nwith {1:,} rows\nand {2:,} columns"
     "\n-- Column names:\n").format(elapsed, *df.shape),
    df.columns,
)

----- DataFrame loaded
in 3.70 seconds
with 1,311,110 rows
and 8 columns
-- Column names:
 Index(['sentiment', 'ids', 'date', 'query', 'user', 'text', 'hashtags',
       'handles'],
      dtype='object')


## Split documents into tokens

### Tokenization

In [11]:
def tokenizer(text):
    """Split a document into tokens on runs of whitespace.

    Parameters
    ----------
    text : str
        Document to split.

    Returns
    -------
    list of str
        The whitespace-separated pieces of ``text``, in order. Punctuation
        stays attached to the neighbouring word; an empty or all-whitespace
        string yields an empty list.
    """
    tokens = text.split()
    return tokens


# Quick check on a sample sentence.
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [18]:
# Tokenize every document with the plain whitespace tokenizer, timing the pass.
t = time()
df['token'] = df['text'].apply(tokenizer)
elapsed = time() - t
print(
    "Documents were split into tokens, took {0:,.2f} seconds ({1:,.2f} minutes)".format(
        elapsed, elapsed / 60
    )
)
# Preview the token lists of the first rows.
df['token'].head()

Documents were split into tokens, took 3.55 seconds (0.06 minutes)


0                            [@switchfoot, -, Awww,, that's, a, bummer., You, shoulda, got, David, Carr, of, Third, Day, to, do, it., ;D]
1    [is, upset, that, he, can't, update, his, Facebook, by, texting, it..., and, might, cry, as, a, result, School, today, also., Blah!]
2                             [@Kenichan, I, dived, many, times, for, the, ball., Managed, to, save, 50%, The, rest, go, out, of, bounds]
3                                                                               [my, whole, body, feels, itchy, and, like, its, on, fire]
4                                                                                                      [@Kwesidei, not, the, whole, crew]
Name: token, dtype: object

### Porter stemmer

In [4]:
porter = PorterStemmer()

# Natural-language text repeats the same words over and over, so memoise the
# stemmer: each distinct word is stemmed once and every later occurrence is
# served from the cache. The cache is unbounded, so it grows with the number
# of distinct tokens seen.
_porter_stem = lru_cache(maxsize=None)(porter.stem)


def tokenizer_porter(text):
    """Split ``text`` on whitespace and reduce each token to its Porter stem.

    Parameters
    ----------
    text : str
        Document to tokenize.

    Returns
    -------
    list of str
        One stem per whitespace-separated token, in the original order.
    """
    return [_porter_stem(word) for word in text.split()]


# Quick check on a sample sentence.
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [19]:
# Tokenize and Porter-stem every document, timing the pass.
# Note: this overwrites any existing 'token' column.
t = time()
df['token'] = df['text'].apply(tokenizer_porter)
elapsed = time() - t
print(
    "Documents were split into tokens, took {0:,.2f} seconds ({1:,.2f} minutes)".format(
        elapsed, elapsed / 60
    )
)
# Preview the stemmed token lists of the first rows.
df['token'].head()

KeyboardInterrupt: 

### Snowball stemmer

### Lancaster stemmer

In [5]:
lancaster = LancasterStemmer()

# Memoise the stemmer so each distinct word is stemmed only once; repeated
# words are served from the cache. The cache is unbounded, so it grows with
# the number of distinct tokens seen.
_lancaster_stem = lru_cache(maxsize=None)(lancaster.stem)


def tokenizer_lancaster(text):
    """Split ``text`` on whitespace and reduce each token to its Lancaster stem.

    Parameters
    ----------
    text : str
        Document to tokenize.

    Returns
    -------
    list of str
        One stem per whitespace-separated token, in the original order.
    """
    return [_lancaster_stem(word) for word in text.split()]


# Quick check on a sample sentence.
tokenizer_lancaster('runners like running and thus they run')

['run', 'lik', 'run', 'and', 'thu', 'they', 'run']

## Bag of words (BoW) 
### Unigrams

In [10]:
# Toy corpus; its words are exactly the ones in the vocabulary displayed below.
docs = [
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
]

# Default settings: single-word (unigram) features.
count = CountVectorizer()
# Learn the vocabulary; without this step `vocabulary_` does not exist.
count.fit(docs)
# Mapping from each term to its column index in the document-term matrix.
count.vocabulary_

{'the': 6,
 'sun': 4,
 'is': 1,
 'shining': 3,
 'weather': 8,
 'sweet': 5,
 'and': 0,
 'one': 2,
 'two': 7}

### Bigrams

In [14]:
# Toy corpus; its word pairs are exactly the ones in the vocabulary displayed below.
docs = [
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
]

# ngram_range=(2, 2): features are pairs of consecutive words only.
count = CountVectorizer(ngram_range=(2,2))
# Learn the vocabulary; without this step `vocabulary_` does not exist.
count.fit(docs)
# Mapping from each bigram to its column index in the document-term matrix.
count.vocabulary_

{'the sun': 9,
 'sun is': 7,
 'is shining': 1,
 'the weather': 10,
 'weather is': 11,
 'is sweet': 2,
 'shining the': 6,
 'sweet and': 8,
 'and one': 0,
 'one and': 4,
 'one is': 5,
 'is two': 3}

## Term Frequency-Inverse Document Frequency (TF-IDF)


In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Transformer for re-weighting a term-count matrix into TF-IDF scores.
tfidf = TfidfTransformer(
    norm='l2',        # normalisation applied to the output
    use_idf=True,     # apply inverse-document-frequency weighting
    smooth_idf=True,  # use the smoothed IDF variant
)