# 2019 Canadian Election tweets
# OSEMN Step 4: Model
# Sentiment analysis of Sentiment 140 dataset

This notebook covers part of Step 4 (Model) of the OSEMN methodology: sentiment analysis of the Sentiment 140 dataset.

## Import dependencies

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.feature_extraction.text import CountVectorizer
from time import time

# Apply seaborn's default theme to all subsequent matplotlib figures.
sns.set()

In [None]:
from src.proc_utils import string_concat, tfm_2class

In [2]:
# Location of the Sentiment 140 CSV files, relative to this notebook.
# Kept as a plain string with a trailing slash because the loading cell
# builds file paths by concatenation (data_dir + filename).
data_dir = '../../data/sentiment140/'
# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is stable across runs and filesystems.
sorted(os.listdir(data_dir))

['sentiment140_train_cleaned.csv',
 'sentiment140_train_nodup.csv',
 'testdata.manual.2009.06.14.csv',
 'training.1600000.processed.noemoticon.csv']

## Load cleaned Sentiment 140 dataset

In [3]:
# Read the cleaned training set, timing how long the load takes.
t = time()
df = pd.read_csv(data_dir + 'sentiment140_train_cleaned.csv')
elapsed = time() - t

# Report load time, frame dimensions and column names. The text pieces are
# joined into a single first argument so that print() inserts its separator
# only before the column Index.
print(
    "".join((
        "----- DataFrame loaded",
        "\nin {0:.2f} seconds".format(elapsed),
        "\nwith {0:,} rows\nand {1:,} columns".format(*df.shape),
        "\n-- Column names:\n",
    )),
    df.columns,
)


----- DataFrame loaded
in 5.65 seconds
with 1,599,306 rows
and 8 columns
-- Column names:
 Index(['sentiment', 'ids', 'date', 'query', 'user', 'text', 'hashtags',
       'handles'],
      dtype='object')


## Bag of words (BoW) 
### Unigrams

In [10]:
# Toy corpus of three short documents used to illustrate bag-of-words.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, '
    'and one and one is two',
])

# Default settings: features are counts of single-word tokens (unigrams).
count = CountVectorizer()

# Learn the vocabulary from the corpus and build the document-term counts.
bag = count.fit_transform(docs)

# Display the learned mapping from each term to its column index in `bag`.
count.vocabulary_

{'the': 6,
 'sun': 4,
 'is': 1,
 'shining': 3,
 'weather': 8,
 'sweet': 5,
 'and': 0,
 'one': 2,
 'two': 7}

In [11]:
# Show the counts as a dense array: one row per document in `docs`, with
# columns ordered by the indices in `count.vocabulary_`.
bag.toarray()

array([[0, 1, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 1, 0, 1],
       [2, 3, 2, 1, 1, 1, 2, 1, 1]])

### Bigrams

In [14]:
# Same three-document toy corpus as in the unigram example.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, '
    'and one and one is two',
])

# ngram_range=(2, 2) restricts the features to two-word sequences (bigrams).
count = CountVectorizer(ngram_range=(2, 2))

# Learn the bigram vocabulary and build the document-term counts.
bag = count.fit_transform(docs)

# Display the learned mapping from each bigram to its column index in `bag`.
count.vocabulary_

{'the sun': 9,
 'sun is': 7,
 'is shining': 1,
 'the weather': 10,
 'weather is': 11,
 'is sweet': 2,
 'shining the': 6,
 'sweet and': 8,
 'and one': 0,
 'one and': 4,
 'one is': 5,
 'is two': 3}

## Term Frequency-Inverse Document Frequency (TF-IDF)