# DS1801 Project: Sentiment Analysis

## Part 1: Naive Bayes

In this notebook, you will learn how to use different Python packages to build a complete pipeline for solving a sentiment analysis problem. We will be using Multinomial Naive Bayes (Multinomial NB) in this class.

### Pipeline

<img src="pipeline.png" width="800px">

### Get Familiar with dataset

In [None]:
import pandas as pd
import nltk
import string

In [None]:
# Load the training CSV file into a DataFrame for interactive exploration
# (the relative path is resolved against the current working directory)
train = pd.read_csv('train.csv')

In [None]:
# Summarise the columns, dtypes and non-null counts of the training frame.
# This replaces the IPython-only `train?` introspection, which opens a pager
# and leaves nothing visible in the saved notebook.
train.info()

In [None]:
# Show the label and the text of the training example stored under index 0
for caption, column in (("sentiment  :", "sentiment"), ("reviewText :", "reviewText")):
    print(caption, train[column][0])

In [None]:
# (number of rows, number of columns) of the training frame
train.shape

In [None]:
# Get a sample of the data frame: head() shows its first rows
train.head()

In [None]:
# Statistics on the labels: number of reviews per sentiment value
train.sentiment.value_counts()

See [Pandas DataFrame](http://pandas.pydata.org/pandas-docs/stable/10min.html?highlight=data%20frame) for more details.

### Load Data

In [None]:
def load_data(path):
    """Read a review CSV and split it into texts and labels.

    Args:
        path: location of a CSV file that contains the columns
            ``reviewText`` and ``sentiment``.

    Returns:
        A pair ``(x, y)`` of plain Python lists: ``x`` holds the
        ``reviewText`` values and ``y`` the ``sentiment`` values,
        both in row order.
    """
    frame = pd.read_csv(path)
    texts, labels = (frame[column].tolist() for column in ('reviewText', 'sentiment'))
    return texts, labels

In [None]:
# Load the review texts (x) and sentiment labels (y) of both splits as Python lists
train_x, train_y = load_data('train.csv')
test_x, test_y = load_data('test.csv')

In [None]:
# Number of reviews in each split
print('training size:', len(train_x))
print('test size:', len(test_x))

### Preprocessing

In [None]:
# Shared resources read by preprocessing() as notebook-level globals.
# They rely on NLTK data packages being installed (see the nltk.download cell).
lemmatizer = nltk.WordNetLemmatizer()                 # WordNet-based lemmatizer
stopwords = nltk.corpus.stopwords.words('english')    # English stop-word list
# Translation table that maps every punctuation character to a space, so
# punctuation is replaced rather than deleted and neighbouring words stay apart.
transtbl = str.maketrans(string.punctuation, ' ' * len(string.punctuation)) # Punctuation -> spaces

In [None]:
# Inspect the stop words that preprocessing() filters out
print(stopwords)

In [None]:
# The punctuation characters that transtbl maps to spaces
string.punctuation

In [None]:
# str.translate demo: a -> d, b -> e, c -> f, so the result is 'dedef'
'ababc'.translate(str.maketrans('abc','def'))

In [None]:
def preprocessing(line):
    """Normalise one raw review into a space-separated string of lemmas.

    The HTML line break ``<br />`` and every punctuation character are
    replaced by spaces; the text is then tokenized, lower-cased, stripped
    of stop words, and each remaining token is lemmatized as a verb.

    Relies on the notebook-level ``transtbl``, ``stopwords`` and
    ``lemmatizer`` objects.

    Args:
        line: raw review text.

    Returns:
        The kept lemmas joined by single spaces.
    """
    # Replace the tag with a space instead of deleting it: deleting would
    # glue the words on either side of "<br />" into a single token.
    line = line.replace('<br />', ' ')
    line = line.translate(transtbl)     # Punctuation -> spaces

    # Get tokens
    tokens = []
    for t in nltk.word_tokenize(line):
        t = t.lower()
        if t not in stopwords:
            lemma = lemmatizer.lemmatize(t, 'v')
            tokens.append(lemma)

    return ' '.join(tokens)

In [None]:
# Yet a more compact way to write the code
def preprocessing(line: str) -> str:
    """Normalise one raw review into a space-separated string of lemmas.

    The HTML line break ``<br />`` and every punctuation character are
    replaced by spaces; the text is then tokenized, lower-cased, stripped
    of stop words, and each remaining token is lemmatized as a verb.

    Relies on the notebook-level ``transtbl``, ``stopwords`` and
    ``lemmatizer`` objects.

    Args:
        line: raw review text.

    Returns:
        The kept lemmas joined by single spaces.
    """
    # Replace the tag with a space instead of deleting it: deleting would
    # glue the words on either side of "<br />" into a single token.
    line = line.replace('<br />', ' ').translate(transtbl)

    # Lower-case each token once, lazily, before filtering and lemmatizing
    lowered = (t.lower() for t in nltk.word_tokenize(line))

    tokens = [lemmatizer.lemmatize(t, 'v')  # What to put in the list
              for t in lowered              # Where the items come from
              if t not in stopwords]        # Which items to keep

    return ' '.join(tokens)

In [None]:
# Quick check on a sentence that contains an HTML line break, stop words and punctuation
test_str = "I bought several books yesterday<br /> and I really love them!"
preprocessing(test_str)

In [None]:
# Fetch only the NLTK data this notebook uses: tokenizer models for
# nltk.word_tokenize, the stop-word corpus, and WordNet for the lemmatizer.
# Calling nltk.download() with no arguments opens the interactive downloader
# instead, which stalls an unattended "Run All".
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' and/or
# 'omw-1.4' -- confirm against the installed version.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
# Preprocess all data
# NOTE: this rebinds train_x / test_x to their preprocessed versions, so
# re-running the cell (or also running one of the equivalent cells below)
# feeds already-preprocessed text through preprocessing() again.
train_x = [preprocessing(x) for x in train_x]
test_x = [preprocessing(x) for x in test_x]

In [None]:
# The same step written with map() instead of a list comprehension
# NOTE: this also overwrites train_x / test_x in place; run only one of the
# alternative preprocessing cells on freshly loaded data.
train_x = list(map(preprocessing, train_x))
test_x = list(map(preprocessing, test_x))

In [None]:
# multi-task function
from nbmultitask import ThreadWithLogAndControls
from time import sleep

def preprocess_dataset(thread_print, datas, output):
    """
    Preprocess dataset in the background thread, so that
    it won't block the notebook from running other code snippets.
    Do not update output anywhere outside this background thread.

    Progress is reported every time another quarter of a dataset has been
    processed, for any dataset length (not only multiples of four). When
    one item completes several quarters at once, only the highest
    percentage reached is printed.

    Args:
        thread_print: for printing in nbmultitask, necessary
        datas: dict containing all datasets to be processed
               (name -> sized iterable of raw texts)
        output: shared variable for storing output; output[name] is
                replaced by the list of preprocessed texts
    """

    sleep(0.5) # For consistent output format

    for name, data in datas.items():
        output[name] = []
        thread_print("\nPreprocessing " + name)

        total = len(data)
        reported = 0  # number of 25% steps already announced
        for i, x in enumerate(data):
            output[name].append(preprocessing(x))
            # Integer arithmetic avoids the float modulo test, which only
            # fired when len(data) was an exact multiple of 4.
            quarters = (i + 1) * 4 // total
            if quarters > reported:
                reported = quarters
                thread_print("Processed: %d%%" % (quarters * 25))
        thread_print("Done.")

    thread_print("All done.")

In [None]:
# dict to store output: filled by the background thread, dataset name -> list of preprocessed texts
out = {}

In [None]:
def multitask_wrapper(thread_print):
    """Thread target: preprocess the training and test sets into ``out``.

    Args:
        thread_print: print function supplied by nbmultitask.
    """
    datasets = {"Training data": train_x, "Testing data": test_x}
    preprocess_dataset(thread_print, datasets, out)

In [None]:
# Wrap the preprocessing job in an nbmultitask thread and display its control panel
# NOTE(review): presumably the thread is started from the panel's controls -- confirm in the nbmultitask docs
task = ThreadWithLogAndControls(target=multitask_wrapper, name="Preprocessing Data")
task.control_panel()

In [None]:
# Collect the results produced by the background thread.
# NOTE: `out` is only complete once the thread has logged "All done.";
# running this cell earlier raises KeyError or picks up a partially filled list.
train_x = out['Training data']
test_x = out['Testing data']

### Some modern functions to introduce
- map
- reduce
- filter

They are very useful when running the project on a cluster or distributed compute system like Hadoop or Spark.

In [2]:
# Some useful modern functions
l = [0,1,2,3,4,5,6,7,8,9]

# Map
def square(x: int) -> int:
    """Return x multiplied by itself."""
    return x * x

# map() needs an iterable as its second argument. Passing the integer 1
# (easily mistyped for the list `l`) raises TypeError; catch it so the
# message is displayed without aborting a full notebook run.
try:
    print(list(map(square,1)))
except TypeError as err:
    print("TypeError:", err)

TypeError: 'int' object is not iterable

In [None]:
# Apply square to every element of the list l
print( list(map(square, l)) )

In [None]:
# Using lambda function: the same mapping with an anonymous function instead of square
print( list(map(lambda x: x * x, l)) )

In [None]:
# Reduce
# In Python 3 the reduce function lives in functools
def add(x: int, y: int) -> int:
    """Return the sum of x and y."""
    return x + y

import functools
# Fold the list from left to right with add, i.e. sum all of its elements
rst = functools.reduce(add, l)

print ("reduce", l, "by add:", rst)

In [None]:
# Using lambda function
# reduce is moved to functools in Python 3
# Same left-to-right sum, with the binary operation given inline
rst = functools.reduce(lambda x, y: x + y, l)
print ("reduce", l, "by add:", rst)

In [None]:
# Reduce with the built-in min as the binary operation: yields the smallest element
rst = functools.reduce(min, l)
print("reduce", l, "by min:", rst)

In [None]:
# Filter
# Keeps only the elements for which the predicate returns True;
# equivalent to the list comprehension [x for x in l if x < 5]
list(filter(lambda x: x < 5, l))

### Build Vocabulary

In [None]:
# Gather every token of the preprocessed training texts, then count how often each occurs
all_words = []
for line in train_x:
    all_words.extend(line.split())

voca = nltk.FreqDist(all_words)

In [None]:
# Yet another, more Pythonic style: a nested comprehension flattens
# the whitespace-separated tokens of all training texts into one list
all_words = [w for line in train_x for w in line.split()]
voca = nltk.FreqDist(all_words)

In [None]:
# Print the string form of the frequency distribution
print(voca)

In [None]:
# The 10 most frequent tokens together with their counts
voca.most_common(10)

In [None]:
# Keep only the words (dropping the counts) of the 10000 most frequent tokens
topwords = [word for word, _count in voca.most_common(10000)]

### Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Bag-of-words vectorizer with default settings; the bare name displays its configuration
cnt_vec = CountVectorizer()
cnt_vec

In [None]:
# Create our BAG of words (specify words we care about)
# Each entry of topwords is handed to fit() as its own one-word document,
# so the learned vocabulary is drawn from these top words.
cnt_vec.fit(topwords)

#### Tf–idf term weighting: 
We use a TF-IDF (term frequency–inverse document frequency) score on top of our bag-of-words model. TF-IDF weighs words by how rare they are in our dataset, discounting words that are too frequent and only add noise.

- Tf: term-frequency
- idf: inverse document-frequency
- Tf-idf = $tf(t,d) \times idf(t)$

$$
idf(t) = log{\frac{1 + n_d}{1 + df(d, t)}} + 1
$$

![](http://www.onemathematicalcat.org/Math/Algebra_II_obj/Graphics/log_base_gt1.gif)

> Sentence 1: The boy **love** the toy

> Sentence 2: The boy **hate** the toy

In [None]:
# smooth_idf=False switches off the add-one smoothing of the document frequencies
transformer = TfidfTransformer(smooth_idf=False)
transformer

In [None]:
# Toy term-count matrix: 6 documents (rows) x 3 terms (columns)
counts = [[3, 0, 1],
          [2, 0, 0],
          [3, 0, 0],
          [4, 0, 0],
          [3, 2, 0],
          [3, 0, 2]]
# Learn the idf weights from the counts and re-weight them in one step
tfidf = transformer.fit_transform(counts)
tfidf

In [None]:
# Convert the result to a dense array so the individual weights are visible
tfidf.toarray()

<span style="color:red">**Tips:**</span>

tf-idf is computed slightly differently in sklearn than in the standard textbook notation, which defines:

$$
idf(t) = log{\frac{n_d}{1 + df(d, t)}}
$$

With `smooth_idf=False` (as in the `TfidfTransformer` example above), the formula used by sklearn is:

$$
idf(t) = log{\frac{n_d}{df(d, t)}} + 1
$$

In [None]:
# Since CountVectorizer and TfidfTransformer are often used together,
# there is a class named TfidfVectorizer that combines these two steps
# NOTE: fitting on topwords treats every word as a one-word document
tf_vec = TfidfVectorizer()
tf_vec.fit(topwords)

In [None]:
# Two toy sentences that differ in a single word
t_corpus = ['the boy love the toy', 'the boy hate the toy'] # Voc = ['boy', 'hate', 'love', 'the', 'toy']
t_cnt_vec = CountVectorizer()
# Fit on the individual words of both sentences, then count them per sentence
t_cnt_vec.fit(' '.join(t_corpus).split())
t_cnt_vec.transform(t_corpus).toarray()

In [None]:
# Tfidf on the test corpus
# Same toy sentences as above, now vectorized with TfidfVectorizer instead of CountVectorizer
t_tfidf_vec = TfidfVectorizer()
t_tfidf_vec.fit(' '.join(t_corpus).split())
t_tfidf_vec.transform(t_corpus).toarray()

### Feature Extraction

In [None]:
# Extract features from training set
# Vocabulary is from topwords (tf_vec was fitted on topwords above)
train_features = tf_vec.transform(train_x)

In [None]:
# Shape of the feature matrix: (n_train_data, n_features)
train_features.shape

In [None]:
# Rebuild the vectorizer with the top words as a fixed vocabulary and
# fit it on the actual training texts while transforming them
tf_vec = TfidfVectorizer(vocabulary=topwords)
train_features = tf_vec.fit_transform(train_x)
train_features.shape

In [None]:
# Extract features from test set
# transform() only: the vectorizer fitted on the training data is reused unchanged
test_features = tf_vec.transform(test_x)

In [None]:
# Shape of the test feature matrix: (n_test_data, n_features)
test_features.shape

### [Multinomial NB](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html)

The multinomial Naive Bayes classifier is suitable for **classification with discrete features** (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Multinomial Naive Bayes classifier with default hyper-parameters
mnb_model = MultinomialNB()
mnb_model

In [None]:
# Train Model and measure how long fitting takes
import time

start = time.time()   # wall-clock timestamp before fitting
mnb_model.fit(train_features, train_y)
end = time.time()     # wall-clock timestamp after fitting

print("Multinomial NB model trained in %f seconds" % (end-start))

In [None]:
# Predict the sentiment label of every test review
pred = mnb_model.predict(test_features)
print(pred)

In [None]:
# Metrics
# accuracy_score is documented as accuracy_score(y_true, y_pred); passing both
# by keyword keeps the ground truth and the predictions from being swapped
from sklearn import metrics
accuracy = metrics.accuracy_score(y_true=test_y, y_pred=pred)
print(accuracy)

In [None]:
# Use keyword arguments to set arguments explicitly
# Prints the text report of per-class metrics for the test predictions
print(metrics.classification_report(y_true=test_y, y_pred=pred))

In [None]:
# Example from sklearn documentation

y_true = [0, 1, 2, 2, 2]   # ground-truth labels
y_pred = [0, 0, 2, 2, 1]   # predicted labels
# target_names supplies the display names used for the three classes in the report
target_names = ['class 0', 'class 1', 'class 2']
print(metrics.classification_report(y_true, y_pred, target_names=target_names))

### Predict new sentences

In [None]:
# Predict a new sentence
# The vectorizer needs to be pre-fitted and the model pre-trained.
# Both can be passed in explicitly; when omitted, the notebook-level
# tf_vec and mnb_model are used.

def predict_new(sentence: str, vec=None, model=None):
    """Predict the sentiment label of a single raw sentence.

    Args:
        sentence: raw, unprocessed text.
        vec: fitted vectorizer exposing ``transform``; defaults to the
            notebook-level ``tf_vec``.
        model: trained classifier exposing ``predict``; defaults to the
            notebook-level ``mnb_model``.

    Returns:
        The label predicted for the sentence (first element of the
        model's prediction for a one-item batch).
    """
    # Resolve the defaults at call time so the most recently fitted
    # notebook objects are used.
    if vec is None:
        vec = tf_vec
    if model is None:
        model = mnb_model

    sentence = preprocessing(sentence)
    features = vec.transform([sentence])   # one-item batch
    pred = model.predict(features)
    return pred[0]

In [None]:
# Try the predictor on a short sentence
predict_new('It looks nice')

### Select Top N

In [None]:
def train_with_n_topwords(n: int, tfidf=False) -> tuple:
    """
    Train and get the accuracy with different model settings.

    Reads the notebook-level ``voca``, ``train_x``, ``train_y``,
    ``test_x`` and ``test_y``; it does not modify them.

    Args:
        n: number of features (top frequent words in the vocabulary)
        tfidf: whether do tf-idf re-weighting or not
    Outputs:
        tuple: (accuracy score, classifier, vectorizer)
    """
    # The n most frequent training tokens form the fixed feature vocabulary.
    # Local names deliberately differ from the notebook-level topwords /
    # mnb_model / train_features so readers do not confuse the two.
    vocabulary = [word for word, _count in voca.most_common(n)]

    # Raw counts or tf-idf weights over that vocabulary
    vectorizer_cls = TfidfVectorizer if tfidf else CountVectorizer
    vec = vectorizer_cls(vocabulary=vocabulary)

    # Generate feature vectors (fit on the training texts only)
    features_train = vec.fit_transform(train_x)
    features_test = vec.transform(test_x)

    # NB
    model = MultinomialNB()
    model.fit(features_train, train_y)

    # Test predict
    pred = model.predict(features_test)

    # accuracy_score is documented as (y_true, y_pred); pass by keyword
    accuracy = metrics.accuracy_score(y_true=test_y, y_pred=pred)
    return accuracy, model, vec

In [None]:
# Example run: 500 features with tf-idf weighting; returns (accuracy, classifier, vectorizer)
train_with_n_topwords(500, tfidf=True)

In [None]:
from my_utils import print_progress

# Vocabulary sizes to evaluate: 500, 1000, ..., 9500
possible_n = [500 * i for i in range(1, 20)]

cnt_accuracies = []
tfidf_accuracies = []

# Two trainings per vocabulary size: raw counts first, then tf-idf
total_steps = 2 * len(possible_n)
step = 0
for n in possible_n:
    for use_tfidf, scores in ((False, cnt_accuracies), (True, tfidf_accuracies)):
        scores.append(train_with_n_topwords(n, tfidf=use_tfidf)[0])
        step += 1
        print_progress(bar_length=50, decimals=0, iteration=step, total=total_steps, prefix='Train and verify:')

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Accuracy of both feature types as the vocabulary grows.
# Axis labels are set so the figure can be read on its own.
plt.plot(possible_n, cnt_accuracies, label='Word Count')
plt.plot(possible_n, tfidf_accuracies, label='Tf-idf')
plt.xlabel('Number of top words used as features')
plt.ylabel('Accuracy on the test set')
plt.legend()

**Expected**:

<img src="plot.png" width="400">

### Save model

In [None]:
# Retrain with the configuration chosen for saving: 3000 top words, tf-idf weighting
best = train_with_n_topwords(3000, tfidf=True) # best = (acc, model, vec)

In [None]:
import pickle

# Save vectorizer (best[2]) so new text can be vectorized without refitting
with open('tf_vec.pkl', 'wb') as pkl_file:
    pickle.dump(best[2], pkl_file)

In [None]:
# Save the trained classifier (best[1]) next to the vectorizer
with open('mnb_model.pkl', 'wb') as pkl_file:
    pickle.dump(best[1], pkl_file)