In [336]:
import numpy as np
import pandas as pd
import timeit
import re

import string as str
from bs4 import BeautifulSoup

In [337]:
#clean data with below components removed or replaced
url = re.compile(r"(?:(http[s]?://\S+)|((//)?(\w+\.)?\w+\.\w+/\S+))")
user_mention = re.compile(r"(?:(?<!\w)@\w+\b)")
number = re.compile(r"(?:\b\d+\b)")
repeated_char = '([a-zA-Z])\\1+'
length_repeated_char = '\\1\\1'

def clean(raw):
  #convert HTML encoding to text
  new_row = BeautifulSoup(raw, 'html.parser').get_text()
  
  #Change all text to lower case
  new_row = new_row.lower()
  
  #Replaces any url with class URL
  new_row = re.sub(url, '', new_row)
  
  #replace any @username with class USERNAME
  new_row = re.sub(user_mention, '', new_row)
  
  #Strips repeated chars
  new_row = re.sub(repeated_char, length_repeated_char, new_row)
  
  #Replaces #hashtag with hashtag
  new_row = re.sub(r'#(\S+)', r' \1 ', new_row)
  
  #Remove numbers
  new_row = re.sub(number, '', new_row)
  
  #decode text with 'utf-8-sig'
  try:
    temp_row = new_row.decode("utf-8-sig").replace(u"\ufffd", "?")     
  except:
    temp_row = new_row
  
  #Removes emojis
#   new_row = handle_emojis(temp_row);
  
  return new_row

In [338]:
df_train_raw = pd.read_csv("SSTB_train.csv")
df_train_raw

Unnamed: 0.1,Unnamed: 0,sentence_index,sentence,splitset_label,id,values
0,0,1,The Rock is destined to be the 21st Century 's...,1,226166,0.69444
1,1,2,The gorgeously elaborate continuation of `` Th...,1,226300,0.83333
2,2,61,Singer\/composer Bryan Adams contributes a sle...,1,225801,0.62500
3,3,62,You 'd think by now America would have had eno...,1,14646,0.50000
4,4,63,Yet the act is still charming here .,1,14644,0.72222
...,...,...,...,...,...,...
8539,8539,11851,A real snooze .,1,222071,0.11111
8540,8540,11852,No surprises .,1,225165,0.22222
8541,8541,11853,We 've seen the hippie-turned-yuppie plot befo...,1,226985,0.75000
8542,8542,11854,Her fans walked out muttering words like `` ho...,1,223632,0.13889


In [339]:
df_test_raw = pd.read_csv("SSTB_test.csv")
df_test_raw

Unnamed: 0.1,Unnamed: 0,sentence_index,sentence,splitset_label,id,values
0,0,3,Effective but too-tepid biopic,2,13995,0.513890
1,1,4,If you sometimes like to go to the movies to h...,2,14123,0.736110
2,2,5,"Emerges as something rare , an issue movie tha...",2,13999,0.861110
3,3,6,The film provides some great insight into the ...,2,14498,0.597220
4,4,7,Offers that rare combination of entertainment ...,2,14351,0.833330
...,...,...,...,...,...,...
2205,2205,11621,An imaginative comedy\/thriller .,2,13851,0.777780
2206,2206,11623,"( A ) rare , beautiful film .",2,18182,0.916670
2207,2207,11626,( An ) hilarious romantic comedy .,2,23211,0.888890
2208,2208,11628,Never ( sinks ) into exploitation .,2,26177,0.625000


In [340]:
clean_train = []
clean_test = []

In [341]:
for i in range(0, 8544):
    clean_train.append(clean(df_train_raw['sentence'][i]))

for i in range(0, 2210):
    clean_test.append(clean(df_test_raw['sentence'][i]))

In [348]:
#save to csv file
clean_df_train = pd.DataFrame(clean_train, columns=['sentence'])
clean_df_test = pd.DataFrame(clean_test, columns = ['sentence'])

clean_df_train['values'] = df_train_raw["values"]
clean_df_test['values'] = df_test_raw["values"]

clean_df_train.to_csv('SSTB_clean_train.csv',encoding='utf-8')
clean_df_test.to_csv("SSTB_clean_test.csv", encoding = "utf-8")

In [349]:
# Using columns 1,2 because col 0 is the number of the row, could be changed if the row number is removed from the csv file.
df_train = pd.read_csv('SSTB_clean_train.csv', usecols = [1,2], encoding='latin-1')
df_test = pd.read_csv("SSTB_clean_test.csv", usecols = [1,2], encoding = "latin-1")

df_train

Unnamed: 0,sentence,values
0,the rock is destined to be the 21st century 's...,0.69444
1,the gorgeously elaborate continuation of `` th...,0.83333
2,singer\/composer bryan adams contributes a sle...,0.62500
3,you 'd think by now america would have had eno...,0.50000
4,yet the act is still charming here .,0.72222
...,...,...
8539,a real snooze .,0.11111
8540,no surprises .,0.22222
8541,we 've seen the hippie-turned-yuppie plot befo...,0.75000
8542,her fans walked out muttering words like `` ho...,0.13889


In [350]:
# function for converting the y values to positive or negative values

def convert_to_pos_neg(y):
    for i in range(len(y)):
        if y[i] >= 3:
            y[i] = 1
        elif y[i] <= 1:
            y[i] = 0
    return y

In [351]:
def convert_to_fine_grain(y):
    for i in range(len(y)):
        if y[i] >= 0 and y[i] <= 0.2:
            y[i] = 0
        elif y[i] > 0.2 and y[i] < 0.4:
            y[i] = 1
        elif y[i] > 0.4 and y[i] < 0.6:
            y[i] = 2
        elif y[i] > 0.6 and y[i] < 0.8:
            y[i] = 3
        elif y[i] > 0.8 and y[i] <= 1.0:
            y[i] = 4
    return y


In [352]:
# Convert the dataset labels into fine grain classification
df_train["values"] = convert_to_fine_grain(df_train["values"])
df_test["values"] = convert_to_fine_grain(df_test["values"])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the docume

In [353]:
# Convert the dataset into positive / negative binary classification
# In order to not to alter the original dataset, we make a copy of each
df_train_copy = df_train.copy()
df_test_copy = df_test.copy()

In [354]:
# Firstly, convert the target values into positive / negative, except for neutral ones
# which will be removed after
df_train_copy["values"] = convert_to_pos_neg(df_train_copy["values"])
df_test_copy["values"] = convert_to_pos_neg(df_test_copy["values"])
df_train_copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,sentence,values
0,the rock is destined to be the 21st century 's...,1.0
1,the gorgeously elaborate continuation of `` th...,1.0
2,singer\/composer bryan adams contributes a sle...,1.0
3,you 'd think by now america would have had eno...,2.0
4,yet the act is still charming here .,1.0
...,...,...
8539,a real snooze .,0.0
8540,no surprises .,0.0
8541,we 've seen the hippie-turned-yuppie plot befo...,1.0
8542,her fans walked out muttering words like `` ho...,0.0


In [355]:
# Drop the neutral values for positive / negative classification
df_train_copy.drop(df_train_copy[df_train_copy["values"] == 2].index, inplace=True)
df_test_copy.drop(df_test_copy[df_test_copy["values"] == 2].index, inplace=True)
df_test_copy

Unnamed: 0,sentence,values
1,if you sometimes like to go to the movies to h...,1.0
2,"emerges as something rare , an issue movie tha...",1.0
4,offers that rare combination of entertainment ...,1.0
5,perhaps no picture ever made has more literall...,1.0
6,steers turns in a snappy screenplay that curls...,1.0
...,...,...
2205,an imaginative comedy\/thriller .,1.0
2206,"( a ) rare , beautiful film .",1.0
2207,( an ) hilarious romantic comedy .,1.0
2208,never ( sinks ) into exploitation .,1.0


In [356]:
X_train_posneg = df_train_copy["sentence"]
y_train_posneg = df_train_copy["values"]

X_test_posneg = df_test_copy["sentence"]
y_test_posneg = df_test_copy["values"]

X_train_finegrain = df_train["sentence"]
y_train_finegrain = df_train["values"]

X_test_finegrain = df_test["sentence"]
y_test_finegrain = df_test["values"]

In [357]:
#===============================================================

In [358]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn import model_selection
from sklearn.metrics import accuracy_score

In [359]:
cvec = CountVectorizer()
tfidf = TfidfVectorizer()

X_train_posneg_cvec = cvec.fit_transform(X_train_posneg, y_train_posneg)
X_train_posneg_tf = tfidf.fit_transform(X_train_posneg, y_train_posneg)

X_test_posneg_cvec = cvec.transform(X_test_posneg)
X_test_posneg_tf = tfidf.transform(X_test_posneg)

X_train_finegrain_cvec = cvec.fit_transform(X_train_finegrain, y_train_finegrain)
X_train_finegrain_tf = tfidf.fit_transform(X_train_finegrain, y_train_finegrain)

X_test_finegrain_cvec = cvec.transform(X_test_finegrain)
X_test_finegrain_tf = tfidf.transform(X_test_finegrain)

In [360]:
#=====================Generic 5-fold Cross Validation Score On Classifying Positive/Negative==================
# This cross validation will only operate on the training set
# Start the Timer
start = timeit.default_timer()

mnb = MultinomialNB()
results_cv = model_selection.cross_val_score(mnb, X_train_posneg_cvec, y_train_posneg, cv = 5)
print("Cross validation with countvectorizer: ", results_cv.mean())

results_tf = model_selection.cross_val_score(mnb, X_train_posneg_tf, y_train_posneg, cv = 5)
print("Cross validation with TfidfVectorizer: ", results_tf.mean())
# mnb.fit(X_train_tf, y_train)
# y_pred = mnb.predict(X_test_tf)

# print(accuracy_score(y_pred, y_test))

# Timer stops
stop = timeit.default_timer()
print("Time Execution: {}".format(stop - start))

Cross validation with countvectorizer:  0.7929190751445085
Cross validation with TfidfVectorizer:  0.7880057803468208
Time Execution: 0.08872743800020544


In [361]:
#=====================Generic 5-fold Cross Validation Score on Classifying Fine Grain==================
# This cross validation will only operate on the training set
# Start the Timer
start = timeit.default_timer()

mnb = MultinomialNB()
results_cv = model_selection.cross_val_score(mnb, X_train_finegrain_cvec, y_train_finegrain, cv = 5)
print("Cross validation with countvectorizer: ", results_cv.mean())

results_tf = model_selection.cross_val_score(mnb, X_train_finegrain_tf, y_train_finegrain, cv = 5)
print("Cross validation with TfidfVectorizer: ", results_tf.mean())
# mnb.fit(X_train_tf, y_train)
# y_pred = mnb.predict(X_test_tf)

# print(accuracy_score(y_pred, y_test))

# Timer stops
stop = timeit.default_timer()
print("Time Execution: {}".format(stop - start))

Cross validation with countvectorizer:  0.3957121777381964
Cross validation with TfidfVectorizer:  0.3957132081460396
Time Execution: 0.14975032799702603


In [362]:
#===========================Positive/Negative Classification=========================

In [363]:
#============================Finding Optimal N_gram=============================

In [405]:
# function for evaluating the accuracy of a given classifier and vectorizer
def nfeature_accuracy_checker(X_train, y_train, X_test, y_test, vectorizer=None, n_features=None, stop_words=None, ngram_range=(1, 1), classifier=None):
    result = []
    print (classifier)
    print ("\n")
    for n in n_features:
        vectorizer.set_params(stop_words=stop_words, max_features=n, ngram_range=ngram_range)
        checker_pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', classifier)
        ])
        t0 = time()
        sentiment_fit = checker_pipeline.fit(X_train, y_train)
        y_pred = sentiment_fit.predict(X_test)
        train_test_time = time() - t0
        accuracy = accuracy_score(y_pred, y_test)
        print("accuracy with", n, "features:", accuracy, "time:", train_test_time)
    print("\n")

In [406]:
from sklearn.pipeline import Pipeline
from time import time

In [407]:
cvec = CountVectorizer()
tfidf = TfidfVectorizer()
mnb = MultinomialNB()

In [408]:
stop_word_extractor = [None, "english"]
vectorizer = [cvec, tfidf]

In [409]:
n_features = np.arange(10000,50001,10000)

# unigram case 
for i in range(len(stop_word_extractor)):
    if i == 0:
        print("==============================INCLUDING STOP WORDS============================")
    else:
        print("==============================NOT INCLUDING STOP WORDS============================")
    for j in range(len(vectorizer)):
        if j == 0:
            print("\t=================COUNTVECTORIZER===============")
        else:
            print("\t=================TFIDF=============")
        feature_result_unigram = nfeature_accuracy_checker(X_train_posneg, y_train_posneg, X_test_posneg, y_test_posneg,
                                                                                   vectorizer=vectorizer[j], n_features=n_features, stop_words=stop_word_extractor[i], classifier=mnb)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 10000 features: 0.814936847885777 time: 0.24288511276245117
accuracy with 20000 features: 0.8160351455244371 time: 0.2191309928894043
accuracy with 30000 features: 0.8160351455244371 time: 0.2085590362548828
accuracy with 40000 features: 0.8160351455244371 time: 0.2638392448425293
accuracy with 50000 features: 0.8160351455244371 time: 0.2991969585418701


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 10000 features: 0.8160351455244371 time: 0.2213451862335205
accuracy with 20000 features: 0.8088962108731467 time: 0.23039674758911133
accuracy with 30000 features: 0.8088962108731467 time: 0.22635698318481445
accuracy with 40000 features: 0.8088962108731467 time: 0.22919416427612305
accuracy with 50000 features: 0.8088962108731467 time: 0.25876474380493164


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 10000 features: 0.8034047226798462 time: 0.210703849792

In [410]:
# unigram + bigram case 
# in here, we modify the number of features from the range 30000 to 130000, since 
# much more features will be created

n_features = np.arange(30000,130001,10000)

for i in range(len(stop_word_extractor)):
    if i == 0:
        print("==============================INCLUDING STOP WORDS============================")
    else:
        print("==============================NOT INCLUDING STOP WORDS============================")
    for j in range(len(vectorizer)):
        if j == 0:
            print("\t=================COUNTVECTORIZER===============")
        else:
            print("\t=================TFIDF=============")
        feature_result_unigram = nfeature_accuracy_checker(X_train_posneg, y_train_posneg, X_test_posneg, y_test_posneg,
                                                                vectorizer=vectorizer[j], n_features=n_features, stop_words=stop_word_extractor[i], ngram_range = (1, 2), classifier=mnb)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 30000 features: 0.8165842943437671 time: 0.7560849189758301
accuracy with 40000 features: 0.8193300384404174 time: 0.7224512100219727
accuracy with 50000 features: 0.8198791872597474 time: 0.6592221260070801
accuracy with 60000 features: 0.8182317408017573 time: 0.6419909000396729
accuracy with 70000 features: 0.8182317408017573 time: 0.6413350105285645
accuracy with 80000 features: 0.8226249313563976 time: 0.640186071395874
accuracy with 90000 features: 0.8220757825370676 time: 0.638437032699585
accuracy with 100000 features: 0.8220757825370676 time: 0.6400349140167236
accuracy with 110000 features: 0.8220757825370676 time: 0.6420121192932129
accuracy with 120000 features: 0.8220757825370676 time: 0.6386301517486572
accuracy with 130000 features: 0.8220757825370676 time: 0.7007782459259033


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 30000 features: 0.8160351455244371 time: 0.727

In [411]:
# unigram + bigram + trigram case
# We furthur extent the number of features to the range of 120000

n_features = np.arange(60000,160001,10000)

for i in range(len(stop_word_extractor)):
    if i == 0:
        print("==============================INCLUDING STOP WORDS============================")
    else:
        print("==============================NOT INCLUDING STOP WORDS============================")
    for j in range(len(vectorizer)):
        if j == 0:
            print("\t=================COUNTVECTORIZER===============")
        else:
            print("\t=================TFIDF=============")
        feature_result_unigram = nfeature_accuracy_checker(X_train_posneg, y_train_posneg, X_test_posneg, y_test_posneg,
                                                                vectorizer=vectorizer[j], n_features=n_features, stop_words=stop_word_extractor[i], ngram_range = (1, 3), classifier=mnb)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 60000 features: 0.814387699066447 time: 1.2241559028625488
accuracy with 70000 features: 0.8121911037891268 time: 1.385418176651001
accuracy with 80000 features: 0.814387699066447 time: 1.1598639488220215
accuracy with 90000 features: 0.8154859967051071 time: 1.284280776977539
accuracy with 100000 features: 0.814936847885777 time: 1.4914159774780273
accuracy with 110000 features: 0.8171334431630972 time: 1.2772462368011475
accuracy with 120000 features: 0.8165842943437671 time: 1.210932970046997
accuracy with 130000 features: 0.8176825919824272 time: 1.3571701049804688
accuracy with 140000 features: 0.8154859967051071 time: 1.2905611991882324
accuracy with 150000 features: 0.8116419549697969 time: 1.1901488304138184
accuracy with 160000 features: 0.8132894014277869 time: 1.2022037506103516


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 60000 features: 0.8066996155958265 time: 1.1755

In [371]:
# So far, the bigram case with CountVectorizer 80000 features has the best result

In [481]:
mnbGV = MultinomialNB()


cvecGV = CountVectorizer(ngram_range = (1, 2), max_features = 80000)
X_cvec = cvecGV.fit_transform(X_train_posneg)

In [412]:
def hyper_parameter_tuning(alphas):
    best_alpha = None
    best_result = 0
    

    for a in alphas:
        mnb_model = MultinomialNB(alpha = a)
        t0 = time()
        X_final = cvecGV.transform(X_test_posneg)
        mnb_model.fit(X_cvec, y_train_posneg)
        y_pred = mnb_model.predict(X_final)
        result = accuracy_score(y_pred, y_test_posneg)
        train_test_time = time() - t0
        print("accuracy for alpha =", a, ":", result, "time:", train_test_time)
        if result > best_result:
            best_result = result
            best_alpha = a
        
    print("The best result is",best_result,"with alpha of", best_alpha)

In [413]:
alphas = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2]
hyper_parameter_tuning(alphas)

accuracy for alpha = 0.7 : 0.8209774848984075 time: 0.11713171005249023
accuracy for alpha = 0.8 : 0.8193300384404174 time: 0.08710002899169922
accuracy for alpha = 0.9 : 0.8215266337177375 time: 0.08791804313659668
accuracy for alpha = 1.0 : 0.8226249313563976 time: 0.09646797180175781
accuracy for alpha = 1.1 : 0.8198791872597474 time: 0.08778977394104004
accuracy for alpha = 1.2 : 0.8193300384404174 time: 0.08960604667663574
The best result is 0.8226249313563976 with alpha of 1.0


In [414]:
# The best hyperparameter value so far is 1.0

In [415]:
#====================Bagging Classifier with NB Model================

In [416]:
from sklearn.ensemble import BaggingClassifier
from sklearn import model_selection

# Timer begins
# start = timeit.default_timer()


In [482]:
mnb = MultinomialNB(alpha = 1.0)
X_final = cvecGV.transform(X_test_posneg)

In [418]:
def bagging_max_sample_optimization(samples):
    best_sample = 0
    best_result = 0
    
    for s in samples:
        bg = BaggingClassifier(mnb, max_samples = s, max_features = 0.5, n_estimators = 200)
        t0 = time()
        bg.fit(X_cvec, y_train_posneg)
        y_pred = bg.predict(X_final)
        train_test_time = time() - t0
        result = accuracy_score(y_pred, y_test_posneg)
        print("Bagging Classifier with {} ratio of sample is {} with the time of {}".format(s, result, train_test_time))
        if result > best_result:
            best_result = result
            best_sample = s
    print("The best result is {} ratio of sample with accuracy of {}".format(best_sample, best_result))

In [419]:
samples = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
bagging_max_sample_optimization(samples)

Bagging Classifier with 0.5 ratio of sample is 0.8127402526084568 with the time of 4.0490710735321045
Bagging Classifier with 0.6 ratio of sample is 0.8099945085118067 with the time of 4.096158027648926
Bagging Classifier with 0.7 ratio of sample is 0.8176825919824272 with the time of 3.7774651050567627
Bagging Classifier with 0.8 ratio of sample is 0.8165842943437671 with the time of 4.001101970672607
Bagging Classifier with 0.9 ratio of sample is 0.8154859967051071 with the time of 3.8665168285369873
Bagging Classifier with 1.0 ratio of sample is 0.8187808896210873 with the time of 4.048224210739136
The best result is 1.0 ratio of sample with accuracy of 0.8187808896210873


In [420]:
# The best ratio is so far is 0.9, so we set the best sample ratio as a variable
bestSampleRate = 1.0

In [421]:
def bagging_max_feature_optimization(features):
    best_feature = 0
    best_result = 0
    
    for f in features:
        bg = BaggingClassifier(mnb, max_samples = bestSampleRate, max_features = f, n_estimators = 200)
        t0 = time()
        bg.fit(X_cvec, y_train_posneg)
        y_pred = bg.predict(X_final)
        train_test_time = time() - t0
        result = accuracy_score(y_pred, y_test_posneg)
        print("Bagging Classifier with {} ratio of sample is {} with the time of {}".format(f, result, train_test_time))
        if result > best_result:
            best_result = result
            best_feature = f
    print("The best result is {} ratio of sample with accuracy of {}".format(best_feature, best_result))

In [422]:
features = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
bagging_max_feature_optimization(features)

# # Timer stops
# stop = timeit.default_timer()
# print("Time Execution bagging classifier: {}".format(stop - start))

Bagging Classifier with 0.4 ratio of sample is 0.8160351455244371 with the time of 3.7128469944000244
Bagging Classifier with 0.5 ratio of sample is 0.8209774848984075 with the time of 3.822340726852417
Bagging Classifier with 0.6 ratio of sample is 0.8187808896210873 with the time of 4.365931987762451
Bagging Classifier with 0.7 ratio of sample is 0.8187808896210873 with the time of 5.061239004135132
Bagging Classifier with 0.8 ratio of sample is 0.8215266337177375 with the time of 5.336562156677246
Bagging Classifier with 0.9 ratio of sample is 0.8193300384404174 with the time of 5.809247970581055
The best result is 0.8 ratio of sample with accuracy of 0.8215266337177375


In [423]:
bestFeatureRatio = 0.8

In [424]:
def bagging_max_estimator_optimization(estimators):
    best_estim = 0
    best_result = 0
    
    for e in estimators:
        bg = BaggingClassifier(mnb, max_samples = bestSampleRate, max_features = bestFeatureRatio, n_estimators = e)
        t0 = time()
        bg.fit(X_cvec, y_train_posneg)
        y_pred = bg.predict(X_final)
        train_test_time = time() - t0
        result = accuracy_score(y_pred, y_test_posneg)
        print("Bagging Classifier with {} ratio of sample is {} with the time of {}".format(e, result, train_test_time))
        if result > best_result:
            best_result = result
            best_estim = e
    print("The best result is {} estimators with accuracy of {}".format(best_estim, best_result))

In [425]:
estimators = [200, 400, 600, 800, 1000]
bagging_max_estimator_optimization(estimators)

# This is the final result of mnb with all the techniques above

Bagging Classifier with 200 ratio of sample is 0.8171334431630972 with the time of 5.558647155761719
Bagging Classifier with 400 ratio of sample is 0.8154859967051071 with the time of 11.622493982315063
Bagging Classifier with 600 ratio of sample is 0.8171334431630972 with the time of 17.5408718585968
Bagging Classifier with 800 ratio of sample is 0.8171334431630972 with the time of 22.187037229537964
Bagging Classifier with 1000 ratio of sample is 0.8193300384404174 with the time of 37.463013887405396
The best result is 1000 estimators with accuracy of 0.8193300384404174


In [426]:
#==============================================================

In [472]:
# =================== AdaBoost Optimization===================
from sklearn.ensemble import AdaBoostClassifier

In [483]:
mnb = MultinomialNB(alpha = 0.9)
X_final = cvecGV.transform(X_test_posneg)

In [490]:
def boosting_max_estimator_optimization(estimators):
    best_estim = 0
    best_result = 0
    
    for e in estimators:
        adb = AdaBoostClassifier(mnb, n_estimators = e, learning_rate = 1.2)
        t0 = time()
        adb.fit(X_cvec, y_train_posneg)
        y_pred = adb.predict(X_final)
        train_test_time = time() - t0
        result = accuracy_score(y_pred, y_test_posneg)
        print("Boosting Classifier with {} ratio of sample is {} with the time of {}".format(e, result, train_test_time))
        if result > best_result:
            best_result = result
            best_estim = e
    print("The best result is {} estimators with accuracy of {}".format(best_estim, best_result))

In [491]:
estimators = [200, 400, 600, 800, 1000]
boosting_max_estimator_optimization(estimators)

Boosting Classifier with 200 ratio of sample is 0.6996155958264689 with the time of 3.0156829357147217
Boosting Classifier with 400 ratio of sample is 0.7841845140032949 with the time of 5.851296901702881
Boosting Classifier with 600 ratio of sample is 0.800658978583196 with the time of 9.773756265640259
Boosting Classifier with 800 ratio of sample is 0.8017572762218561 with the time of 12.311228275299072
Boosting Classifier with 1000 ratio of sample is 0.8017572762218561 with the time of 11.153487920761108
The best result is 800 estimators with accuracy of 0.8017572762218561


In [488]:
bestEstimators = 600

def boosting_best_lr_optimization(learningrates):
    best_lr = 0
    best_result = 0
    
    for lr in learningrates:
        adb = AdaBoostClassifier(mnb, n_estimators = bestEstimators, learning_rate = lr)
        t0 = time()
        adb.fit(X_cvec, y_train_posneg)
        y_pred = adb.predict(X_final)
        train_test_time = time() - t0
        result = accuracy_score(y_pred, y_test_posneg)
        print("Boosting Classifier with {} ratio of sample is {} with the time of {}".format(lr, result, train_test_time))
        if result > best_result:
            best_result = result
            best_lr= lr
    print("The best result is {} estimators with accuracy of {}".format(best_lr, best_result))

In [489]:
learningrates = [0.8, 1, 1.1, 1.2, 1.3]
boosting_best_lr_optimization(learningrates)

Boosting Classifier with 0.8 ratio of sample is 0.7869302580999451 with the time of 6.500154733657837
Boosting Classifier with 1 ratio of sample is 0.786381109280615 with the time of 5.996299982070923
Boosting Classifier with 1.1 ratio of sample is 0.7913234486545854 with the time of 6.469648122787476
Boosting Classifier with 1.2 ratio of sample is 0.800658978583196 with the time of 9.436261892318726
Boosting Classifier with 1.3 ratio of sample is 0.7896760021965953 with the time of 9.451599836349487
The best result is 1.2 estimators with accuracy of 0.800658978583196


In [435]:
#===================================End of Positive / Negative Section==================================

In [436]:
#==================================Fine Grain Section===============================================

In [437]:
cvec = CountVectorizer()
tfidf = TfidfVectorizer()
mnb = MultinomialNB()

In [438]:
stop_word_extractor = [None, "english"]
vectorizer = [cvec, tfidf]

In [439]:
n_features = np.arange(2500,20001,2500)

# unigram case 
for i in range(len(stop_word_extractor)):
    if i == 0:
        print("==============================INCLUDING STOP WORDS============================")
    else:
        print("==============================NOT INCLUDING STOP WORDS============================")
    for j in range(len(vectorizer)):
        if j == 0:
            print("\t=================COUNTVECTORIZER===============")
        else:
            print("\t=================TFIDF=============")
        feature_result_unigram = nfeature_accuracy_checker(X_train_finegrain, y_train_finegrain, X_test_finegrain, y_test_finegrain,
                                                                                   vectorizer=vectorizer[j], n_features=n_features, stop_words=stop_word_extractor[i], classifier=mnb)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 2500 features: 0.41357466063348414 time: 0.31301403045654297
accuracy with 5000 features: 0.4176470588235294 time: 0.27707886695861816
accuracy with 7500 features: 0.4149321266968326 time: 0.2858729362487793
accuracy with 10000 features: 0.4085972850678733 time: 0.28822875022888184
accuracy with 12500 features: 0.4108597285067873 time: 0.3307499885559082
accuracy with 15000 features: 0.4117647058823529 time: 0.31243109703063965
accuracy with 17500 features: 0.4113122171945701 time: 0.27562594413757324
accuracy with 20000 features: 0.4113122171945701 time: 0.27642393112182617


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 2500 features: 0.40090497737556563 time: 0.28070712089538574
accuracy with 5000 features: 0.40316742081447965 time: 0.27507996559143066
accuracy with 7500 features: 0.4 time: 0.2819690704345703
accuracy with 10000 features: 0.40316742081447965 time: 0.28792476654052

In [440]:
# unigram + bigram case 
# in here, we modify the number of features from the range 30000 to 130000, since 
# much more features will be created

n_features = np.arange(2500,25001,2500)

for i in range(len(stop_word_extractor)):
    if i == 0:
        print("==============================INCLUDING STOP WORDS============================")
    else:
        print("==============================NOT INCLUDING STOP WORDS============================")
    for j in range(len(vectorizer)):
        if j == 0:
            print("\t=================COUNTVECTORIZER===============")
        else:
            print("\t=================TFIDF=============")
        feature_result_unigram = nfeature_accuracy_checker(X_train_finegrain, y_train_finegrain, X_test_finegrain, y_test_finegrain,
                                                                vectorizer=vectorizer[j], n_features=n_features, stop_words=stop_word_extractor[i], ngram_range = (1, 2), classifier=mnb)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 2500 features: 0.3909502262443439 time: 0.8842999935150146
accuracy with 5000 features: 0.4 time: 0.9687440395355225
accuracy with 7500 features: 0.40542986425339367 time: 1.0097119808197021
accuracy with 10000 features: 0.4149321266968326 time: 0.8717038631439209
accuracy with 12500 features: 0.4108597285067873 time: 0.8585162162780762
accuracy with 15000 features: 0.4108597285067873 time: 0.8899269104003906
accuracy with 17500 features: 0.40588235294117647 time: 0.9269199371337891
accuracy with 20000 features: 0.4076923076923077 time: 0.8048450946807861
accuracy with 22500 features: 0.4072398190045249 time: 0.805927038192749
accuracy with 25000 features: 0.4108597285067873 time: 0.812014102935791


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 2500 features: 0.3832579185520362 time: 0.786909818649292
accuracy with 5000 features: 0.38868778280542987 time: 0.807304859161377
accuracy 

In [441]:
# unigram + bigram + trigram case
# We furthur extent the number of features to the range of 120000

n_features = np.arange(20000,70001,5000)

for i in range(len(stop_word_extractor)):
    if i == 0:
        print("==============================INCLUDING STOP WORDS============================")
    else:
        print("==============================NOT INCLUDING STOP WORDS============================")
    for j in range(len(vectorizer)):
        if j == 0:
            print("\t=================COUNTVECTORIZER===============")
        else:
            print("\t=================TFIDF=============")
        feature_result_unigram = nfeature_accuracy_checker(X_train_finegrain, y_train_finegrain, X_test_finegrain, y_test_finegrain,
                                                                vectorizer=vectorizer[j], n_features=n_features, stop_words=stop_word_extractor[i], ngram_range = (1, 3), classifier=mnb)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 20000 features: 0.41402714932126694 time: 1.535559892654419
accuracy with 25000 features: 0.4090497737556561 time: 1.5398168563842773
accuracy with 30000 features: 0.4144796380090498 time: 1.611750841140747
accuracy with 35000 features: 0.4081447963800905 time: 1.5501508712768555
accuracy with 40000 features: 0.40588235294117647 time: 1.7565877437591553
accuracy with 45000 features: 0.4004524886877828 time: 1.4938406944274902
accuracy with 50000 features: 0.3990950226244344 time: 1.5747942924499512
accuracy with 55000 features: 0.39773755656108595 time: 1.5381629467010498
accuracy with 60000 features: 0.39592760180995473 time: 1.8488011360168457
accuracy with 65000 features: 0.39773755656108595 time: 1.469877004623413
accuracy with 70000 features: 0.40180995475113124 time: 1.617008924484253


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 20000 features: 0.3918552036199095 time: 1.720

In [442]:
# So far, the unigram case with CountVectorizer 5000 features has the best result

In [465]:
mnbGV = MultinomialNB()


cvecGV = CountVectorizer(ngram_range = (1, 1), max_features = 5000)
X_cvec = cvecGV.fit_transform(X_train_finegrain)

In [466]:
def hyper_parameter_tuning_fg(alphas):
    best_alpha = None
    best_result = 0
    
    for a in alphas:
        mnb_model = MultinomialNB(alpha = a)
        X_final = cvecGV.transform(X_test)
        t0 = time()
        mnb_model.fit(X_cvec, y_train_finegrain)
        y_pred = mnb_model.predict(X_final)
        train_test_time = time() - t0
        result = accuracy_score(y_pred, y_test_finegrain)
        print("accuracy for alpha =", a, ":", result, "time:", train_test_time)
        if result > best_result:
            best_result = result
            best_alpha = a
        
    print("The best result is",best_result,"with alpha of", best_alpha)

In [470]:
alphas = [0.6, 0.7, 0.8, 0.9, 1.0]
hyper_parameter_tuning_fg(alphas)

accuracy for alpha = 0.6 : 0.4176470588235294 time: 0.012890100479125977
accuracy for alpha = 0.7 : 0.41990950226244345 time: 0.006804943084716797
accuracy for alpha = 0.8 : 0.4167420814479638 time: 0.006578922271728516
accuracy for alpha = 0.9 : 0.4153846153846154 time: 0.009605884552001953
accuracy for alpha = 1.0 : 0.4176470588235294 time: 0.007088899612426758
The best result is 0.41990950226244345 with alpha of 0.7


In [279]:
# So far, the best alpha is 0.5

In [448]:
mnb = MultinomialNB(alpha = 0.5)
X_final = cvecGV.transform(X_test_finegrain)

In [449]:
def bagging_max_sample_optimization_fg(samples):
    best_sample = 0
    best_result = 0
    
    for s in samples:
        bg = BaggingClassifier(mnb, max_samples = s, max_features = 0.5, n_estimators = 200)
        t0 = time()
        bg.fit(X_cvec, y_train_finegrain)
        y_pred = bg.predict(X_final)
        train_test_time = time() - t0
        result = accuracy_score(y_pred, y_test_finegrain)
        print("Bagging Classifier with {} ratio of sample is {} with the time of {}".format(s, result, train_test_time))
        if result > best_result:
            best_result = result
            best_sample = s
    print("The best result is {} ratio of sample with accuracy of {}".format(best_sample, best_result))

In [450]:
features = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
bagging_max_sample_optimization_fg(features)

Bagging Classifier with 0.4 ratio of sample is 0.4095022624434389 with the time of 1.7723388671875
Bagging Classifier with 0.5 ratio of sample is 0.4171945701357466 with the time of 1.7434899806976318
Bagging Classifier with 0.6 ratio of sample is 0.41990950226244345 with the time of 1.5583748817443848
Bagging Classifier with 0.7 ratio of sample is 0.41809954751131223 with the time of 1.516690969467163
Bagging Classifier with 0.8 ratio of sample is 0.41357466063348414 with the time of 1.6426348686218262
Bagging Classifier with 0.9 ratio of sample is 0.42036199095022625 with the time of 1.5503368377685547
Bagging Classifier with 1.0 ratio of sample is 0.4104072398190045 with the time of 1.5544908046722412
The best result is 0.9 ratio of sample with accuracy of 0.42036199095022625


In [451]:
# The best sample ratio was 0.9
bestSampleRatioFG = 0.9

In [452]:
def bagging_max_feature_optimization_fg(features):
    best_feature = 0
    best_result = 0
    
    for f in features:
        bg = BaggingClassifier(mnb, max_samples = bestSampleRatioFG, max_features = f, n_estimators = 200)
        t0 = time()
        bg.fit(X_cvec, y_train_finegrain)
        y_pred = bg.predict(X_final)
        train_test_time = time() - t0
        result = accuracy_score(y_pred, y_test_finegrain)
        print("Bagging Classifier with {} ratio of sample is {} with the time of {}".format(f, result, train_Test_time))
        if result > best_result:
            best_result = result
            best_feature = f
    print("The best result is {} ratio of sample with accuracy of {}".format(best_feature, best_result))

In [456]:
samples = [0.6, 0.7, 0.8, 0.9, 1.0]
bagging_max_sample_optimization_fg(samples)

Bagging Classifier with 0.6 ratio of sample is 0.41402714932126694 with the time of 1.4933462142944336
Bagging Classifier with 0.7 ratio of sample is 0.4095022624434389 with the time of 1.4521987438201904
Bagging Classifier with 0.8 ratio of sample is 0.4176470588235294 with the time of 1.477698802947998
Bagging Classifier with 0.9 ratio of sample is 0.40588235294117647 with the time of 1.486508846282959
Bagging Classifier with 1.0 ratio of sample is 0.4158371040723982 with the time of 1.4859158992767334
The best result is 0.8 ratio of sample with accuracy of 0.4176470588235294


In [457]:
# The best feature ratio so far is 0.8
bestFeatureRatioFG = 0.8

In [458]:
def bagging_max_estimator_optimization(estimators):
    best_estim = 0
    best_result = 0
    
    for e in estimators:
        bg = BaggingClassifier(mnb, max_samples = bestSampleRatioFG, max_features = bestFeatureRatioFG, n_estimators = e)
        t0 = time()
        bg.fit(X_cvec, y_train_finegrain)
        y_pred = bg.predict(X_final)
        train_test_time = time() - t0
        result = accuracy_score(y_pred, y_test_finegrain)
        print("Bagging Classifier with {} ratio of sample is {} with the time of {}".format(e, result, train_test_time))
        if result > best_result:
            best_result = result
            best_estim = e
    print("The best result is {} estimators with accuracy of {}".format(best_estim, best_result))

In [459]:
estimators = [400, 600, 800, 1000]
bagging_max_estimator_optimization(estimators)

Bagging Classifier with 400 ratio of sample is 0.4153846153846154 with the time of 3.294645071029663
Bagging Classifier with 600 ratio of sample is 0.41855203619909503 with the time of 4.982913017272949
Bagging Classifier with 800 ratio of sample is 0.4149321266968326 with the time of 6.9745306968688965
Bagging Classifier with 1000 ratio of sample is 0.4167420814479638 with the time of 8.740811109542847
The best result is 600 estimators with accuracy of 0.41855203619909503


In [264]:
#=======================AdaBoost Classifier==================

In [460]:
mnb = MultinomialNB(alpha = 0.9)
X_final = cvecGV.transform(X_test_finegrain)

In [461]:
def boosting_max_estimator_optimization_fg(estimators):
    best_estim = 0
    best_result = 0
    
    for e in estimators:
        adb = AdaBoostClassifier(mnb, n_estimators = e, learning_rate = 0.5)
        t0  = time()
        adb.fit(X_cvec, y_train_finegrain)
        y_pred = adb.predict(X_final)
        train_test_time = time() - t0
        result = accuracy_score(y_pred, y_test_finegrain)
        print("Boosting Classifier with {} ratio of sample is {} with the time of {}".format(e, result, train_test_time))
        if result > best_result:
            best_result = result
            best_estim = e
    print("The best result is {} estimators with accuracy of {}".format(best_estim, best_result))

In [462]:
estimators = [600, 800, 1000, 1200, 1400]
boosting_max_estimator_optimization_fg(estimators)

Boosting Classifier with 600 ratio of sample is 0.38144796380090495 with the time of 6.964943885803223
Boosting Classifier with 800 ratio of sample is 0.38823529411764707 with the time of 10.121622085571289
Boosting Classifier with 1000 ratio of sample is 0.39773755656108595 with the time of 12.930877923965454
Boosting Classifier with 1200 ratio of sample is 0.39547511312217193 with the time of 14.2639639377594
Boosting Classifier with 1400 ratio of sample is 0.3995475113122172 with the time of 15.808995962142944
The best result is 1400 estimators with accuracy of 0.3995475113122172


In [463]:
bestEstimators = 1400

def boosting_best_lr_optimization_fg(learningrates):
    best_lr = 0
    best_result = 0
    
    for lr in learningrates:
        adb = AdaBoostClassifier(mnb, n_estimators = bestEstimators, learning_rate = lr)
        t0 = time()
        adb.fit(X_cvec, y_train_finegrain)
        y_pred = adb.predict(X_final)
        train_test_time = time() - t0
        result = accuracy_score(y_pred, y_test_finegrain)
        print("Bagging Classifier with {} ratio of sample is {} with the time of {}".format(lr, result, train_test_time))
        if result > best_result:
            best_result = result
            best_lr= lr
    print("The best result is {} estimators with accuracy of {}".format(best_lr, best_result))

In [464]:
learningrates = [0.8, 0.9, 1.0, 1.1, 1.2]
boosting_best_lr_optimization_fg(learningrates)

Bagging Classifier with 0.8 ratio of sample is 0.40180995475113124 with the time of 18.995368003845215
Bagging Classifier with 0.9 ratio of sample is 0.3990950226244344 with the time of 18.766314029693604
Bagging Classifier with 1.0 ratio of sample is 0.39592760180995473 with the time of 17.18435502052307
Bagging Classifier with 1.1 ratio of sample is 0.39592760180995473 with the time of 17.47844386100769
Bagging Classifier with 1.2 ratio of sample is 0.39592760180995473 with the time of 17.57903003692627
The best result is 0.8 estimators with accuracy of 0.40180995475113124
