<a href="https://colab.research.google.com/github/sayarghoshroy/Intro_to_DL_tutorial/blob/master/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import csv
import numpy as np
import re
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import tqdm as tqdm
import random
import joblib
import time
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold

In [None]:
%%capture .logs
# Getting Text Processing Tools
# The %%capture cell magic on the first line swallows this cell's output,
# so the download log is not displayed.
# NOTE(review): 'all' fetches every NLTK package. The code visible in this
# notebook only needs the tokenizer models behind word_tokenize and the
# 'stopwords' corpus - consider downloading just those (confirm the exact
# package names for the installed NLTK version).

nltk.download('all')

In [None]:
# Importing Tools
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer
from string import punctuation
from nltk.corpus import stopwords

# English stop words held in a set; clean() tests token membership against it.
stopword_set = set(stopwords.words('english'))

In [None]:
# Load the training records and the gold test records from JSON.
# Both files are only read, so they are opened read-only: 'r+' would also
# require write permission and fail on read-only storage. The encoding is
# given explicitly instead of relying on the platform default.
with open('train.json', 'r', encoding='utf-8') as f:
    records = json.load(f)

with open('test.json', 'r', encoding='utf-8') as f:
    gold_test_list = json.load(f)

In [None]:
# Collect the 'content' and 'label' fields of every record into parallel lists.
X_train = [entry['content'] for entry in records]
Y_train = [entry['label'] for entry in records]

X_test = [entry['content'] for entry in gold_test_list]
Y_test = [entry['label'] for entry in gold_test_list]

In [None]:
def clean(s):
    """Normalize a raw string before it is handed to the tf-idf vectorizer.

    Newlines are replaced by spaces, the text is split with
    ``word_tokenize``, every token is stripped and lower-cased, and tokens
    that are stop words (members of ``stopword_set``) or that consist
    solely of punctuation characters are dropped.

    Args:
        s: the input string.

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).
    """
    # str.replace returns a new string; the result must be kept, otherwise
    # the call has no effect.
    s = s.replace("\n", " ")

    kept = []
    for token in word_tokenize(s):
        unit = token.strip().lower()
        if unit in stopword_set:
            continue
        # Character-wise test: `unit in punctuation` is a substring check on
        # the punctuation string and lets multi-character punctuation tokens
        # such as "..." or "''" through. all() is also True for an empty
        # unit, so empty tokens are still skipped.
        if all(ch in punctuation for ch in unit):
            continue
        kept.append(unit)

    return " ".join(kept)

In [None]:
# Shared tf-idf feature extractor: 1- to 3-grams, capped at 512 features.
vectorizer = TfidfVectorizer(
    # decoding / text normalization
    encoding = 'utf-8',
    strip_accents = 'unicode',
    stop_words = 'english',
    # vocabulary
    ngram_range = (1, 3),
    max_features = 512,
    # term weighting
    sublinear_tf = True,
    smooth_idf = True,
    norm = "l2",
)

In [None]:
# Sanity check: fit the vectorizer on the training text and show the matrix shape
X_train_vec = vectorizer.fit_transform(X_train)
print(X_train_vec.shape)

(25000, 512)


In [None]:
# Report how many samples each split holds
print("Size of Train: {}".format(len(X_train)))
print("Size of Test: {}".format(len(X_test)))
max_feature_size = 10000

Size of Train: 25000
Size of Test: 25000


In [None]:
def train(X, y, active = 'identity', solve = 'sgd', approach = 'mlp'):
    """Fit a tf-idf vectorizer and a classifier on the given texts.

    Args:
        X: iterable of raw text documents.
        y: labels aligned with ``X``.
        active: hidden-layer activation passed to ``MLPClassifier``
            (not used for LDA).
        solve: solver passed to ``MLPClassifier`` (not used for LDA).
        approach: ``'mlp'`` or ``'lda'``.

    Returns:
        ``(vec, model)``: the fitted vectorizer and the fitted classifier.
        The elapsed training time is printed as HH:MM:SS.

    Raises:
        ValueError: if ``approach`` is neither ``'lda'`` nor ``'mlp'``.
    """
    # Local import; scikit-learn is already a dependency of this notebook.
    from sklearn.base import clone

    # Reject unknown approaches before doing any work; previously the
    # function fitted the vectorizer and then died with UnboundLocalError
    # on `model` at the return statement.
    if approach not in ('lda', 'mlp'):
        raise ValueError("approach must be 'lda' or 'mlp', got " + repr(approach))

    start = time.time()

    # Fit an unfitted copy of the shared vectorizer configuration. Refitting
    # the global object itself would silently change every vectorizer handed
    # out by earlier calls (and any Pipeline built around it).
    vec = clone(vectorizer)
    X_train_vec = vec.fit_transform(X)

    if approach == 'lda':
        model = LinearDiscriminantAnalysis()
        # the sparse tf-idf matrix is densified before fitting LDA
        model.fit(X_train_vec.toarray(), y)
    else:
        model = MLPClassifier(alpha = 0,
                              hidden_layer_sizes = (512, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 1),
                              random_state = 2020,
                              activation = active,
                              max_iter = int(1e3),
                              solver = solve,
                              learning_rate = 'adaptive',
                              early_stopping = True,
                              momentum = 0.9,
                              batch_size = 512)
        model.fit(X_train_vec, y)

    end = time.time()
    time_to_train = int(round(end - start))

    # Break the whole-second duration into hours / minutes / seconds.
    hours, remainder = divmod(time_to_train, 3600)
    minutes, seconds = divmod(remainder, 60)

    print()
    print('Time taken for training: ' + str(hours).zfill(2) + ':' +
          str(minutes).zfill(2) + ':' + str(seconds).zfill(2))
    return vec, model

In [None]:
def get_res(vec, clf):
    """Print test-set results for a fitted vectorizer/classifier pair.

    Transforms the module-level ``X_test`` with ``vec``, predicts with
    ``clf``, then prints the number of features followed by a
    classification report against the module-level ``Y_test``.
    """
    features = vec.transform(X_test)
    predictions = clf.predict(features)
    n_features = features.shape[1]
    print("Number of Features: " + str(n_features))
    print(classification_report(Y_test, predictions, digits = 6))

In [None]:
# Set try_all to True to sweep every activation/solver combination;
# otherwise only tanh + sgd is trained.
try_all = False

activations = ['identity', 'tanh', 'relu'] if try_all else ['tanh']
solvers = ['adam', 'sgd', 'lbfgs'] if try_all else ['sgd']

for active in activations:
    for solver in solvers:
        # the tanh + lbfgs pairing is left out of the sweep
        if (active, solver) == ('tanh', 'lbfgs'):
            continue
        vec, model = train(X_train, Y_train, active, solver)
        print("Hidden Layer Activation = {}, Solver = {}".format(active, solver))
        get_res(vec, model)


Time taken for training: 00:29:42
Hidden Layer Activation = tanh, Solver = sgd
Number of Features: 512
              precision    recall  f1-score   support

           0   0.845930  0.814800  0.830073     12500
           1   0.821373  0.851600  0.836214     12500

    accuracy                       0.833200     25000
   macro avg   0.833652  0.833200  0.833144     25000
weighted avg   0.833652  0.833200  0.833144     25000



In [None]:
# Chain the fitted vectorizer and classifier into one Pipeline and score it on the test set
pipe = Pipeline(steps = [('Feature Builder', vec), ('Classifier', model)])
pred_Y_test = pipe.predict(X_test)
print(classification_report(y_true = Y_test, y_pred = pred_Y_test, digits = 6))

              precision    recall  f1-score   support

           0   0.845930  0.814800  0.830073     12500
           1   0.821373  0.851600  0.836214     12500

    accuracy                       0.833200     25000
   macro avg   0.833652  0.833200  0.833144     25000
weighted avg   0.833652  0.833200  0.833144     25000



In [None]:
# K-fold Cross Validation
# X / Y alias the full training texts and labels; cross_val() reads them.

X, Y = X_train, Y_train

def cross_val(algo = 'mlp', splits = 5):
    """Run shuffled K-fold cross validation over the module-level X / Y.

    Args:
        algo: 'lda' trains LDA on each fold; any other value trains the MLP
            (tanh activation, sgd solver).
        splits: number of folds; values outside 3..9 fall back to 5.

    Prints the technique name and, for every fold, its index and a
    classification report. Returns None.
    """
    n_folds = int(splits)
    if not 3 <= n_folds <= 9:
        n_folds = 5
    print("Classification Technique: " + str(algo))
    folds = KFold(n_splits = n_folds, shuffle = True, random_state = 2020)

    for fold_no, (fit_ids, eval_ids) in enumerate(folds.split(X), start = 1):
        # materialize this fold's train / held-out samples from the index arrays
        fit_texts = [X[i] for i in fit_ids]
        fit_labels = [Y[i] for i in fit_ids]
        eval_texts = [X[i] for i in eval_ids]
        eval_labels = [Y[i] for i in eval_ids]

        if algo == 'lda':
            fitted = train(fit_texts, fit_labels, '', '', 'lda')
        else:
            fitted = train(fit_texts, fit_labels, 'tanh', 'sgd', 'mlp')
        vec, model = fitted

        pipe = Pipeline([('Feature Builder', vec), ('Classifier', model)])
        predicted = pipe.predict(eval_texts)

        print("Fold Index: " + str(fold_no))
        print(classification_report(eval_labels, predicted, digits = 6))

In [None]:
# Performing K-Fold Cross Validation using LDA
cross_val('lda', splits = 3)

Classification Technique: lda

Time taken for training: 00:00:39
Fold Index: 1
              precision    recall  f1-score   support

           0   0.838678  0.819029  0.828737      4183
           1   0.821840  0.841243  0.831429      4151

    accuracy                       0.830094      8334
   macro avg   0.830259  0.830136  0.830083      8334
weighted avg   0.830292  0.830094  0.830078      8334


Time taken for training: 00:00:38
Fold Index: 2
              precision    recall  f1-score   support

           0   0.840903  0.813250  0.826846      4166
           1   0.819238  0.846172  0.832487      4167

    accuracy                       0.829713      8333
   macro avg   0.830071  0.829711  0.829666      8333
weighted avg   0.830069  0.829713  0.829667      8333


Time taken for training: 00:00:39
Fold Index: 3
              precision    recall  f1-score   support

           0   0.848914  0.809444  0.828709      4151
           1   0.819200  0.857006  0.837677      4182

    a

In [None]:
# Performing K-Fold Cross Validation using MLP
cross_val('mlp', splits = 3)

Classification Technique: mlp

Time taken for training: 00:44:47
Fold Index: 1
              precision    recall  f1-score   support

           0   0.797916  0.787234  0.792539      4183
           1   0.788448  0.799085  0.793731      4151

    accuracy                       0.793137      8334
   macro avg   0.793182  0.793159  0.793135      8334
weighted avg   0.793200  0.793137  0.793133      8334


Time taken for training: 00:12:53
Fold Index: 2
              precision    recall  f1-score   support

           0   0.740989  0.345415  0.471185      4166
           1   0.573306  0.879290  0.694071      4167

    accuracy                       0.612384      8333
   macro avg   0.657147  0.612352  0.582628      8333
weighted avg   0.657137  0.612384  0.582641      8333


Time taken for training: 00:11:12
Fold Index: 3
              precision    recall  f1-score   support

           0   0.623016  0.529511  0.572470      4151
           1   0.593548  0.681970  0.634695      4182

    a

In [None]:
# Fit LDA on the complete training data, then persist
# vectorizer + classifier together as one pipeline file

vec, model = train(X, Y, active = '', solve = '', approach = 'lda')

pipe = Pipeline(steps = [('Feature Builder', vec), ('Classifier', model)])
joblib.dump(pipe, filename = "tf-idf_lda_model.pkl")


Time taken for training: 00:00:57


['tf-idf_lda_model.pkl']

In [None]:
# Fit the MLP on the complete training data, then persist
# vectorizer + classifier together as one pipeline file

vec, model = train(X, Y, active = 'tanh', solve = 'sgd', approach = 'mlp')

pipe = Pipeline(steps = [('Feature Builder', vec), ('Classifier', model)])
joblib.dump(pipe, filename = "tf-idf_mlp_model.pkl")


Time taken for training: 00:27:22


['tf-idf_mlp_model.pkl']

In [None]:
# Reload the persisted LDA pipeline and score it on the full training data
saved_pipe = joblib.load("tf-idf_lda_model.pkl")

pred_Y_all = saved_pipe.predict(X)
print(classification_report(y_true = Y, y_pred = pred_Y_all, digits = 6))

              precision    recall  f1-score   support

           0   0.854967  0.826240  0.840358     12500
           1   0.831889  0.859840  0.845633     12500

    accuracy                       0.843040     25000
   macro avg   0.843428  0.843040  0.842996     25000
weighted avg   0.843428  0.843040  0.842996     25000



In [None]:
# Reload the persisted LDA pipeline and evaluate it on the gold test records

saved_pipe = joblib.load("tf-idf_lda_model.pkl")

# split the gold records into texts and labels
X_gold_test = [unit['content'] for unit in gold_test_list]
Y_gold_test = [unit['label'] for unit in gold_test_list]

pred_Y_gold_test = saved_pipe.predict(X_gold_test)
print(classification_report(y_true = Y_gold_test, y_pred = pred_Y_gold_test, digits = 6))

              precision    recall  f1-score   support

           0   0.843693  0.816560  0.829905     12500
           1   0.822276  0.848720  0.835289     12500

    accuracy                       0.832640     25000
   macro avg   0.832984  0.832640  0.832597     25000
weighted avg   0.832984  0.832640  0.832597     25000



In [None]:
# Reload the persisted MLP pipeline and evaluate it on the gold test records

saved_pipe = joblib.load("tf-idf_mlp_model.pkl")

# split the gold records into texts and labels
X_gold_test = [unit['content'] for unit in gold_test_list]
Y_gold_test = [unit['label'] for unit in gold_test_list]

pred_Y_gold_test = saved_pipe.predict(X_gold_test)
print(classification_report(y_true = Y_gold_test, y_pred = pred_Y_gold_test, digits = 6))

              precision    recall  f1-score   support

           0   0.845930  0.814800  0.830073     12500
           1   0.821373  0.851600  0.836214     12500

    accuracy                       0.833200     25000
   macro avg   0.833652  0.833200  0.833144     25000
weighted avg   0.833652  0.833200  0.833144     25000



### Exercise

#### Replace TF-IDF with GloVe vectors and re-run the experiments.

In [None]:
# ^_^ Thank You