# Load files

In [1]:
import os
current_path = os.getcwd()
train = open(current_path + "/Data/amazon_reviews_train.csv","r+")
test = open(current_path + "/Data/amazon_reviews_test.csv","r+")
train = train.read()
test = test.read()


# Imports

In [2]:
import numpy as np
import pandas as pd
import nltk
import sklearn
import string
import textblob
from sklearn.externals import joblib
from nltk.stem.snowball import SnowballStemmer,PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.svm import SVC, LinearSVC

# Define necessary functions

In [3]:
def messy_text_to_df(text):
    documents = text.split("\n")
    df = pd.DataFrame()
    data = []
    labels = []
    for document in documents:
        labels.append(document.split("\t",1)[0])
        text = document.split('\t')[1]
        data.append(text)
    labels = np.array(labels)
    labels[np.where(labels=='__label__2')] = "Positive"
    labels[np.where(labels=='__label__1')] = "Negative"
    df["Data"] = data
    df["Label"] = labels
    
    return df

def remove_punctuation_and_numbers(text,replacements):
    for key,value in replacements.items():
        text = text.replace(key,value)
    text = text.translate(str.maketrans('','',';"#$%&\'()*+/<=>?@[\\]^_`{|}~0123456789')).translate(str.maketrans('!.-:,','     '))
    return text
def remove_non_words(data,replacements):
    res = data.apply(lambda x: remove_punctuation_and_numbers(x,replacements))
    return res


def remove_words_single(string,words_to_be_removed):
    words = nltk.word_tokenize(string)
    filtered_words = []
    for i in range(len(words)):
        if words[i] not in words_to_be_removed:
            filtered_words.append(words[i])
    return ' '.join(filtered_words)

def remove_words(data,words_to_be_removed):
    res = data.apply(lambda x : remove_words_single(x,words_to_be_removed))
    return res
    
    for text,label in documents:
        labels.append(document.split("\t",1)[0])
        text = document.split('\t')[1]
        for key,value in replacements.items():
            text = text.replace(key,value)
            text = text.translate(str.maketrans('','',';"#$%&\'()*+/<=>?@[\\]^_`{|}~0123456789')).translate(str.maketrans('!.-:,','     '))
        words = nltk.word_tokenize(text)
        filtered_words = []
        for i in range(len(words)):
            if words[i] not in words_to_be_removed:
                filtered_words.append(stemmer.stem(words[i]))
        
        res = ' '.join(filtered_words)
        data.append(res)
    labels = np.array(labels)
    labels[np.where(labels=='__label__2')] = "Positive"
    labels[np.where(labels=='__label__1')] = "Negative"
    return data,labels

def stem_single_string(string,nltkstemmer):
    words = nltk.word_tokenize(string)
    stemmed_list = []
    for word in words:
        stemmed_list.append(nltkstemmer.stem(word))
    return ' '.join(stemmed_list)
    

def stem(data):
    stemmer = SnowballStemmer("english")
    res = data.apply(lambda x : stem_single_string(x,stemmer))
    return res

def find_rare_words(data,max_frequency=4):
    
    vectoriser = get_vectorizer(data)
    
    
    temp = ' '.join(data)
    frequencies = (nltk.FreqDist(nltk.word_tokenize(temp)))
    
    fs = np.array(frequencies.most_common())
    fs = pd.DataFrame(fs)
    fs.columns = ["word","count"]
    fs["freq"] = fs["count"].astype(int)
    fs = fs.drop("count",axis=1)
    
    rare_words = list(fs[fs["freq"]<=max_frequency]["word"])
    
    return rare_words

def get_vectorizer(data,vectorizer="CountVectorizer"):
    
    if vectorizer == "Tfidf":
        tfidf = TfidfVectorizer()
        tfidf.fit(data)
        return tfidf
    cv = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
    cv.fit(data)
    
    return cv

def vectorize_data(data,vectorizer="CountVectorizer"):
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    
    if vectorizer == "Tfidf":
        tfidf = TfidfVectorizer()
        tfidf.fit(data)
        
        return tfidf.transform(data).toarray()
    cv = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
    cv.fit(data)
    
    return cv.transform(data).toarray()


In [4]:
def remove_symbols_stopwords_and_stem(data):
    data = messy_text_to_df(data)
    data["Data"] = remove_non_words(data["Data"],replacements)
    data["Data"] = remove_words(data["Data"],stopwords)
    data["Data"] = stem(data["Data"])
    
    return data


# Preprocessing 

In [5]:
replacements = {"can't" : 'can not',"shan't":'shall not',"won't":'will not',"'ve" : " have", "'d" : " would", "'m" : " am", "'ll" : " will", "'s" : "", "n't" : " not","'re" : "are"}
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords = nltk.corpus.stopwords.words('english')
stopwords.remove("not")
stemmer = SnowballStemmer("english")
stopwords = set(["can","could","would","have","go","went","zero","one","two","three","four","five","six","seven","eight","nine","ten"]) | set(stopwords)

train = remove_symbols_stopwords_and_stem(train)
test = remove_symbols_stopwords_and_stem(test)

rare_words = find_rare_words(train["Data"])

train["Data"] = remove_words(train["Data"],rare_words)
test["Data"] = remove_words(test["Data"],rare_words)

In [8]:
test.head()

Unnamed: 0,Data,Label
0,stune even non this sound track beauti it pain...,Positive
1,the best soundtrack ever anyth i read lot revi...,Positive
2,amaz this soundtrack favorit music time hand t...,Positive
3,excel soundtrack i truli like soundtrack i enj...,Positive
4,rememb pull your jaw off the floor after hear ...,Positive


# Vectorizer

In [9]:
count_vectorizer = get_vectorizer(train["Data"], vectorizer = "CountVectorizer")
tfidf_vectorizer = get_vectorizer(train["Data"], vectorizer = "Tfidf")

# Writing to readme.md

In [10]:
def insert_row_to_readme(list_of_values, file=current_path+'/readme.md'):
    vals = [ str(x) for x in list_of_values ]
    out_line = '|' + '|'.join(vals) + '|\n'
    writer = open(file, "a+")
    writer.write(out_line)
    pass

# insert_row_to_readme(['Model', 'Vectorizer', 'Accuracy', 'F1(Macro)', 'Hyperparameters'])
# insert_row_to_readme(['---','---','---','---','---'])

# Model Training

In [9]:
f1_score = sklearn.metrics.f1_score
accuracy_score = sklearn.metrics.accuracy_score

In [41]:
for nb_classifier in [MultinomialNB, GaussianNB]:
    for vectorizer in [count_vectorizer, tfidf_vectorizer]:        
        model = nb_classifier()
        model.fit(vectorizer.transform(train["Data"]).toarray(), train["Label"])
        preds = model.predict(vectorizer.transform(test["Data"]).toarray())
        f1_scores = f1_score(y_true = test["Label"], y_pred = preds, average = 'macro')
        accuracy = accuracy_score(y_true = test["Label"], y_pred = preds)
        list_of_values = [model.__class__.__name__, vectorizer.__class__.__name__, accuracy, f1_scores, None]
        insert_row_to_readme(list_of_values)
        print("\nVectorizer = ", vectorizer.__class__.__name__, "\nModel = ", model.__class__.__name__, "\nAccuracy : ",accuracy,"\nF1 Scores : ", f1_scores)


Vectorizer =  CountVectorizer 
Model =  MultinomialNB 
Accuracy :  0.821 
F1 Scores :  0.820999820999821

Vectorizer =  TfidfVectorizer 
Model =  MultinomialNB 
Accuracy :  0.818 
F1 Scores :  0.8179883512544802

Vectorizer =  CountVectorizer 
Model =  GaussianNB 
Accuracy :  0.692 
F1 Scores :  0.6917225502952657

Vectorizer =  TfidfVectorizer 
Model =  GaussianNB 
Accuracy :  0.695 
F1 Scores :  0.694706613055146


In [51]:
count_scores = []
tfidf_scores = []
for vectorizer in [count_vectorizer,tfidf_vectorizer]:
    
    for c in range(-10,11):
        model = LinearSVC(C=10**c, max_iter=10000)
        model.fit(vectorizer.transform(train["Data"]).toarray(), train["Label"])
        preds = model.predict(vectorizer.transform(test["Data"]).toarray())
        f1_scores = f1_score(y_true = test["Label"], y_pred = preds, average = 'macro')
        accuracy = accuracy_score(y_true = test["Label"], y_pred = preds)
        
        
        list_of_values = [model.__class__.__name__, vectorizer.__class__.__name__, accuracy, f1_scores, {'C' : 10**c}]
        insert_row_to_readme(list_of_values)
#         if vectorizer.__class__.__name__ == "CountVectorizer":
#             count_scores.append(("C = "+str(c), accuracy,f1_scores))
#             pass
#         if vectorizer.__class__.__name__ == "TfidfVectorizer":
#             tfidf_scores.append(("C = "+str(c), accuracy,f1_scores))
#             pass

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [None]:
# count_scores = []
# tfidf_scores = []
# model = SVC()
# for vectorizer in [count_vectorizer,tfidf_vectorizer]:
#     for c in range(-10,11):
#         model.C = 10**c
#         for kernel in ['linear', 'rbf']:
#             model.kernel = kernel
#             if kernel == 'linear':
#                 model.fit(vectorizer.transform(train["Data"]).toarray(), train["Label"])
#                 preds = model.predict(vectorizer.transform(test["Data"]).toarray())
#                 f1_scores = f1_score(y_true = test["Label"], y_pred = preds, average = 'macro')
#                 accuracy = accuracy_score(y_true = test["Label"], y_pred = preds)
#                 list_of_values = [model.__class__.__name__, vectorizer.__class__.__name__, accuracy, f1_scores, {'C' : 10**c, 'kernel' : kernel}]
#                 insert_row_to_readme(list_of_values)
#                 pass
#             if kernel == "rbf":
#                 for gamma in range(-10, 11):
#                     model.gamma = 10**gamma
#                     model.fit(vectorizer.transform(train["Data"]).toarray(), train["Label"])
#                     preds = model.predict(vectorizer.transform(test["Data"]).toarray())
#                     f1_scores = f1_score(y_true = test["Label"], y_pred = preds, average = 'macro')
#                     accuracy = accuracy_score(y_true = test["Label"], y_pred = preds)
#                     list_of_values = [model.__class__.__name__, vectorizer.__class__.__name__, accuracy, f1_scores, {'C' : 10**c, 'kernel' : kernel, "gamma" : 10**gamma}]
#                     insert_row_to_readme(list_of_values)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [45]:
import time
for c in [0.1, 1, 10, 100]:
    for gamma in [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]:
        
        before = time.time()
        svc = SVC(C=c, kernel = 'rbf', gamma = gamma)
        svc.fit(tfidf_vectorizer.transform(train["Data"]).toarray(), train["Label"])
        preds = svc.predict(tfidf_vectorizer.transform(test["Data"]).toarray())
        f1 = sklearn.metrics.f1_score(y_true = test["Label"], y_pred = preds, average = "macro")
        acc = sklearn.metrics.accuracy_score(y_true = test["Label"], y_pred = preds)
        print('C : ', c, 'gamma : ', gamma, 'acc : ', acc, 'f1 : ', f1)
        after = time.time()
        insert_row_to_readme(["SVC", "Tfidf", acc, f1, {"C" : c, "kernel" : "rbf", "gamma" : gamma}])
        print("Time taken : ", after - before)

  'precision', 'predicted', average, warn_for)


C :  0.1 gamma :  0.0001 acc :  0.5 f1 :  0.3333333333333333
Time taken :  675.2670350074768


  'precision', 'predicted', average, warn_for)


C :  0.1 gamma :  0.001 acc :  0.5 f1 :  0.3333333333333333
Time taken :  693.1976277828217


  'precision', 'predicted', average, warn_for)


C :  0.1 gamma :  0.01 acc :  0.5 f1 :  0.3333333333333333
Time taken :  694.112625837326
C :  0.1 gamma :  0.1 acc :  0.765 f1 :  0.7593670226328115
Time taken :  716.3489379882812
C :  0.1 gamma :  1 acc :  0.829 f1 :  0.828979306496086
Time taken :  663.3432328701019


  'precision', 'predicted', average, warn_for)


C :  0.1 gamma :  10 acc :  0.5 f1 :  0.3333333333333333
Time taken :  971.3392922878265


  'precision', 'predicted', average, warn_for)


C :  0.1 gamma :  100 acc :  0.5 f1 :  0.3333333333333333
Time taken :  1005.3757529258728


  'precision', 'predicted', average, warn_for)


C :  1 gamma :  0.0001 acc :  0.5 f1 :  0.3333333333333333
Time taken :  677.2333979606628


  'precision', 'predicted', average, warn_for)


C :  1 gamma :  0.001 acc :  0.5 f1 :  0.3333333333333333
Time taken :  675.7112131118774
C :  1 gamma :  0.01 acc :  0.798 f1 :  0.7974523110490768
Time taken :  630.7418649196625
C :  1 gamma :  0.1 acc :  0.861 f1 :  0.8609931886662446
Time taken :  449.7343330383301
C :  1 gamma :  1 acc :  0.876 f1 :  0.875987598759876
Time taken :  469.9936339855194
C :  1 gamma :  10 acc :  0.538 f1 :  0.4126292342821109
Time taken :  1180.2235629558563
C :  1 gamma :  100 acc :  0.538 f1 :  0.4126292342821109
Time taken :  1053.8554270267487


  'precision', 'predicted', average, warn_for)


C :  10 gamma :  0.0001 acc :  0.5 f1 :  0.3333333333333333
Time taken :  638.2842588424683
C :  10 gamma :  0.001 acc :  0.799 f1 :  0.7984758356485218
Time taken :  630.3827059268951
C :  10 gamma :  0.01 acc :  0.864 f1 :  0.8639804131794978
Time taken :  433.34253096580505
C :  10 gamma :  0.1 acc :  0.873 f1 :  0.8729785333721398
Time taken :  312.2916078567505
C :  10 gamma :  1 acc :  0.877 f1 :  0.8769851151989391
Time taken :  911.1613948345184
C :  10 gamma :  10 acc :  0.538 f1 :  0.4126292342821109
Time taken :  2256.328818321228
C :  10 gamma :  100 acc :  0.538 f1 :  0.4126292342821109
Time taken :  2156.4311220645905
C :  100 gamma :  0.0001 acc :  0.799 f1 :  0.7984758356485218
Time taken :  628.8431558609009
C :  100 gamma :  0.001 acc :  0.864 f1 :  0.8639804131794978
Time taken :  429.485271692276
C :  100 gamma :  0.01 acc :  0.871 f1 :  0.8709781953150082
Time taken :  296.7538843154907
C :  100 gamma :  0.1 acc :  0.849 f1 :  0.8489817267889415
Time taken :  292.6

In [44]:
sklearn.metrics.f1_score(y_true = test["Label"], y_pred = preds, average = "macro")
sklearn.metrics.accuracy_score(y_true = test["Label"], y_pred = preds)

0.876

In [40]:
#joblib.dump(model,"sentiment_analysis_naive_bayes_model.pkl")

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

# Evaluation

In [None]:
F1_Score = sklearn.metrics.f1_score(y_pred = preds,y_true = test["Label"], average = None)
Accuracy = sklearn.metrics.accuracy_score(y_true = test["Label"],y_pred = preds)
print("Model trained.\n""F1 Score : ",F1_Score,"\nAccuracy : ",Accuracy)

In [None]:
tfidf_vectorizer