In [35]:
%matplotlib inline

import sqlite3
import numpy as np
import pandas as pd
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

from tqdm import tqdm
import os

In [2]:
# Load the review table from the local SQLite file; rows whose Score is 3
# are excluded by the query itself.
if os.path.isfile('final.sqlite'):
    conn = sqlite3.connect('final.sqlite')
    try:
        final = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score != 3 """, conn)
    finally:
        # Release the database handle even if the query raises
        # (for example when the Reviews table is missing).
        conn.close()
else:
    # `final` stays undefined on this path, so the cells below cannot run.
    print("final.sqlite not found")

In [3]:
# Time-based splitting: order the reviews by their 'Time' column and keep
# only the first 10000 rows of that ordering.
final_df = final.sort_values(by='Time').iloc[:10000]

## BOW

In [4]:
# Bag of Words: learn the vocabulary from the first 7000 rows only and
# encode those rows as a term-count matrix.
count_vect = CountVectorizer()  # scikit-learn
final_counts = count_vect.fit_transform(final_df['CleanedText'].iloc[:7000].values)

print("the type of count vectorizer ", type(final_counts))
print("the shape of out text BOW vectorizer ", final_counts.shape)

the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text BOW vectorizer  (7000, 12679)


In [5]:
# First 7000 rows of final_df are the training split; the remaining rows are
# encoded with the already-fitted vectorizer and used as the test split.
X_train, y_train = final_counts, final_df['Score'].iloc[:7000]
X_test = count_vect.transform(final_df['CleanedText'].iloc[7000:].values)
y_test = final_df['Score'].iloc[7000:]

# Print the shape of each split, features before labels.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(7000, 12679)
(7000,)
(3000, 12679)
(3000,)


In [6]:
# 10-fold cross-validation over the odd K values 1..29 to choose the number
# of neighbours. The loop iterates K_values itself so the candidate list and
# the scores can never drift apart.
K_values = list(range(1, 30, 2))
accuracy = []  # mean CV accuracy, one entry per value in K_values
for k in tqdm(K_values):
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
    accuracy.append(scores.mean())

# np.argmax returns the first maximum, the same position that
# accuracy.index(max(accuracy)) would give.
best_idx = int(np.argmax(accuracy))
print('best accuracy:', accuracy[best_idx])
print('corresponding K_value:', K_values[best_idx])

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:28<00:00,  1.95s/it]


best accuracy: 0.886142857142857
corresponding K_value: 7


In [7]:
# Refit KNN on the full training split with K=7 and score it on the test split.
model = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
pred = model.predict(X_test)
accuracy_BOW = accuracy_score(y_true=y_test, y_pred=pred)
print('The Test accuracy for Bag Of Words is:', accuracy_BOW * 100.0)

The Test accuracy for Bag Of Words is: 89.33333333333333


## TF-IDF

In [8]:
# TF-IDF: fit the vectorizer on the first 7000 rows only and encode them.
tf_idf_vect = TfidfVectorizer()
final_tf_idf = tf_idf_vect.fit_transform(final_df['CleanedText'].iloc[:7000].values)
print(final_tf_idf.shape)

(7000, 12679)


In [9]:
# First 7000 rows of final_df are the training split; the remaining rows are
# encoded with the already-fitted TF-IDF vectorizer and used as the test split.
X_train, y_train = final_tf_idf, final_df['Score'].iloc[:7000]
X_test = tf_idf_vect.transform(final_df['CleanedText'].iloc[7000:].values)
y_test = final_df['Score'].iloc[7000:]

# Print the shape of each split, features before labels.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(7000, 12679)
(7000,)
(3000, 12679)
(3000,)


In [10]:
# 10-fold cross-validation over the odd K values 1..29 to choose the number
# of neighbours. The loop iterates K_values itself so the candidate list and
# the scores can never drift apart.
K_values = list(range(1, 30, 2))
accuracy = []  # mean CV accuracy, one entry per value in K_values
for k in tqdm(K_values):
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
    accuracy.append(scores.mean())

# np.argmax returns the first maximum, the same position that
# accuracy.index(max(accuracy)) would give.
best_idx = int(np.argmax(accuracy))
print('best accuracy:', accuracy[best_idx])
print('corresponding K_value:', K_values[best_idx])

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:25<00:00,  1.77s/it]


best accuracy: 0.8947142857142858
corresponding K_value: 7


In [11]:
# Refit KNN on the full training split with K=7 and score it on the test split.
model = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
pred = model.predict(X_test)
accuracy_TFIDF = accuracy_score(y_true=y_test, y_pred=pred)
print('The Test accuracy for TF-IDF is:', accuracy_TFIDF * 100.0)

The Test accuracy for TF-IDF is: 90.0


## W2V

In [12]:
# Split every cleaned review on whitespace; each review becomes a list of words.
list_of_sent = [review.split() for review in final_df['CleanedText'].values]
print(len(list_of_sent))

10000


In [13]:
# Show the first cleaned review next to its tokenised form.
print(final_df['CleanedText'].iloc[0])
print("*****************************************************************")
print(list_of_sent[0])

witti littl book make son laugh loud recit car drive along alway sing refrain hes learn whale india droop love new word book introduc silli classic book will bet son still abl recit memori colleg
*****************************************************************
['witti', 'littl', 'book', 'make', 'son', 'laugh', 'loud', 'recit', 'car', 'drive', 'along', 'alway', 'sing', 'refrain', 'hes', 'learn', 'whale', 'india', 'droop', 'love', 'new', 'word', 'book', 'introduc', 'silli', 'classic', 'book', 'will', 'bet', 'son', 'still', 'abl', 'recit', 'memori', 'colleg']


In [14]:
# Train 50-dimensional Word2Vec embeddings on the first 7000 tokenised reviews
# only (the training split), with a minimum word count of 5.
w2v_model = Word2Vec(sentences=list_of_sent[:7000], size=50, min_count=5)

In [15]:
# Sanity check: display the shape of the embedding for one stemmed token.
# The model was built with size=50, so this should be (50,).
w2v_model.wv['tasti'].shape

(50,)

In [16]:
# Collect every word that is present in the trained model's vocabulary.
w2v_words = [word for word in w2v_model.wv.vocab]
print(len(w2v_words))

4282


## Avg-W2V

In [17]:
# Average Word2Vec: represent each review by the mean of the vectors of its
# in-vocabulary words. A review with no in-vocabulary word stays all-zero.
w2v_vocab = set(w2v_words)  # set lookup instead of scanning the list for every token
vec_dim = w2v_model.vector_size  # keeps the accumulator in sync with the trained model
sent_vectors = []  # one averaged vector per review, in list_of_sent order
for sent in tqdm(list_of_sent):
    sent_vec = np.zeros(vec_dim)
    cnt_words = 0  # number of words in this review that have a vector
    for word in sent:
        if word in w2v_vocab:
            sent_vec += w2v_model.wv[word]
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

100%|██████████████████████████████████████████████████████████████████████████| 10000/10000 [00:08<00:00, 1136.30it/s]


10000
50


In [18]:
# Turn the list of per-review vectors into a single 2-D array.
sent_vectors = np.asarray(sent_vectors)
print(sent_vectors.shape)

(10000, 50)


In [19]:
# First 7000 rows are the training split; the remaining rows are the test split.
X_train, X_test = sent_vectors[:7000], sent_vectors[7000:]
y_train, y_test = final_df['Score'].iloc[:7000], final_df['Score'].iloc[7000:]

# Print the shape of each split, train before test.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(7000, 50)
(7000,)
(3000, 50)
(3000,)


In [20]:
# 10-fold cross-validation over the odd K values 1..69 to choose the number
# of neighbours. The loop iterates K_values itself so the candidate list and
# the scores can never drift apart.
K_values = list(range(1, 70, 2))
accuracy = []  # mean CV accuracy, one entry per value in K_values
for k in tqdm(K_values):
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
    accuracy.append(scores.mean())

# np.argmax returns the first maximum, the same position that
# accuracy.index(max(accuracy)) would give.
best_idx = int(np.argmax(accuracy))
print('best accuracy:', accuracy[best_idx])
print('corresponding K_value:', K_values[best_idx])

100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [01:37<00:00,  3.23s/it]


best accuracy: 0.8855714285714287
corresponding K_value: 11


In [21]:
# Refit KNN on the full training split with K=11 and score it on the test split.
model = KNeighborsClassifier(n_neighbors=11).fit(X_train, y_train)
pred = model.predict(X_test)
accuracy_Avg_W2V = accuracy_score(y_true=y_test, y_pred=pred)
print('The Test accuracy for Avg-W2V is:', accuracy_Avg_W2V * 100.0)

The Test accuracy for Avg-W2V is: 89.03333333333333


## TFIDF-W2V

In [22]:
# TF-IDF fitted on the first 7000 rows only.
model = TfidfVectorizer()
tf_idf_matrix = model.fit_transform(final_df['CleanedText'].iloc[:7000].values)

# Lookup table: vocabulary word -> its IDF weight.
dictionary = {word: idf for word, idf in zip(model.get_feature_names(), model.idf_)}

In [23]:
# TF-IDF weighted Word2Vec: each review vector is the weighted mean of its
# word vectors, where a word's weight is
#     idf(word) * (occurrences of word in the review / review length).
# A review with no usable word stays all-zero.
tfidf_feat = model.get_feature_names()  # tfidf words/col-names (not used in this cell)
w2v_vocab = set(w2v_words)  # set lookup instead of scanning the list for every token
vec_dim = w2v_model.vector_size  # keeps the accumulator in sync with the trained model
tfidf_sent_vectors = []  # one weighted vector per review, in list_of_sent order
for sent in tqdm(list_of_sent):
    sent_vec = np.zeros(vec_dim)
    weight_sum = 0
    for word in sent:
        # Word2Vec tokens come from str.split() while the IDF table comes from
        # TfidfVectorizer's own tokenizer, so a word can have a vector but no
        # IDF entry. Such words are skipped instead of raising KeyError.
        if word in w2v_vocab and word in dictionary:
            vec = w2v_model.wv[word]
            tf_idf = dictionary[word] * (sent.count(word) / len(sent))
            sent_vec += vec * tf_idf
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)

100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:10<00:00, 947.80it/s]


In [24]:
# Turn the list of per-review vectors into a single 2-D array.
tfidf_sent_vectors = np.asarray(tfidf_sent_vectors)
print(tfidf_sent_vectors.shape)

(10000, 50)


In [25]:
# First 7000 rows are the training split; the remaining rows are the test split.
X_train, X_test = tfidf_sent_vectors[:7000], tfidf_sent_vectors[7000:]
y_train, y_test = final_df['Score'].iloc[:7000], final_df['Score'].iloc[7000:]

# Print the shape of each split, train before test.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(7000, 50)
(7000,)
(3000, 50)
(3000,)


In [32]:
# 10-fold cross-validation over the odd K values 1..69 to choose the number
# of neighbours. The loop iterates K_values itself so the candidate list and
# the scores can never drift apart.
K_values = list(range(1, 70, 2))
accuracy = []  # mean CV accuracy, one entry per value in K_values
for k in tqdm(K_values):
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
    accuracy.append(scores.mean())

# np.argmax returns the first maximum, the same position that
# accuracy.index(max(accuracy)) would give.
best_idx = int(np.argmax(accuracy))
print('best accuracy:', accuracy[best_idx])
print('corresponding K_value:', K_values[best_idx])

100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [01:30<00:00,  2.81s/it]


best accuracy: 0.8852857142857143
corresponding K_value: 17


In [33]:
# Refit KNN on the full training split with K=17 and score it on the test split.
model = KNeighborsClassifier(n_neighbors=17).fit(X_train, y_train)
pred = model.predict(X_test)
accuracy_TFIDF_W2v = accuracy_score(y_true=y_test, y_pred=pred)
print('The Test accuracy for TFIDF-W2V is:', accuracy_TFIDF_W2v * 100.0)

The Test accuracy for TFIDF-W2V is: 89.03333333333333


## TEST Accuracy for all four models

In [36]:
# Side-by-side test accuracy, in percent, of the four feature representations.
pd.DataFrame(
    {'Accuracy(%)': np.array([accuracy_BOW, accuracy_TFIDF, accuracy_Avg_W2V, accuracy_TFIDF_W2v]) * 100.0},
    index=['Bag_of_words', 'TF-IDF', 'Avg-W2V', 'TFIDF-W2V'],
)

Unnamed: 0,Accuracy(%)
Bag_of_words,89.333333
TF-IDF,90.0
Avg-W2V,89.033333
TFIDF-W2V,89.033333
