In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
import pickle
from collections import Counter

from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.test.utils import common_texts, get_tmpfile
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
# replacement (sklearn.model_selection) and only fall back to the legacy
# module on installs that predate it. Both names stay bound either way.
try:
    from sklearn.model_selection import cross_val_score
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn.cross_validation import cross_val_score
    from sklearn import cross_validation



In [3]:
# Open a connection to the SQLite file that stores the review table.
con = sqlite3.connect(database='/mydata/final.sqlite')

In [4]:
# Load every row of the Reviews table into a DataFrame.
final = pd.read_sql_query(sql=""" SELECT * FROM Reviews""", con=con)

In [None]:
# Order the reviews by their Time column.
final = final.sort_values('Time')

In [5]:
# Work on a random subset of 10000 reviews. The draw is seeded so that the
# same rows (and therefore the same features, split and scores) are obtained
# on every run; 0 matches the random_state used for the train/test split.
final = final.sample(n=10000, random_state=0)
final.shape

(10000, 12)

In [7]:
# Keep the Score column as the label vector and remove it from the features.
l = final.loc[:, 'Score']
final = final.drop(labels="Score", axis='columns')

In [None]:
# Hold out 30% of the rows as a test set; random_state=0 fixes the shuffle.
# NOTE(review): `standardized_data` is only assigned further down, in the
# StandardScaler cell, so on a fresh top-to-bottom run this line raises
# NameError. As written, this cell only works when executed after that cell.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(standardized_data, l, test_size=0.3, random_state=0)

In [8]:
# Tokenise each cleaned review on whitespace: one list of words per review.
list_clean_text = [review.split() for review in final['CleanedText'].values]

In [9]:
# Train 50-dimensional word vectors on the tokenised reviews, keeping only
# words that meet the minimum occurrence count of 5.
w2v_model = Word2Vec(
    sentences=list_clean_text,
    size=50,
    min_count=5,
    workers=2,
)

In [10]:
# Words that ended up in the trained Word2Vec vocabulary.
w2v_words = [word for word in w2v_model.wv.vocab]
print("number of words occuring more than 5 times", len(w2v_words))
print("sample words", w2v_words[:50])

number of words occuring more than 5 times 4479
sample words ['recent', 'purchas', 'corp', 'gopher', 'trap', 'within', 'minut', 'lay', 'catch', 'product', 'best', 'ever', 'use', 'easi', 'set', 'work', 'great', 'success', 'also', 'rememb', 'wire', 'attach', 'tie', 'steak', 'prevent', 'drag', 'hole', 'caught', 'hope', 'find', 'good', 'luck', 'michael', 'bring', 'distinguish', 'characterist', 'beetlejuic', 'mere', 'act', 'bizarr', 'often', 'stun', 'movi', 'come', 'focus', 'like', 'one', 'snack', 'popcorn', 'vacat']


In [11]:
# TF-IDF over unigrams and bigrams of the cleaned review text.
tf_idf_vect = TfidfVectorizer(ngram_range=(1, 2))
final_tf_idf = tf_idf_vect.fit_transform(
    raw_documents=final['CleanedText'].values
)


In [12]:
# TF-IDF weighted Word2Vec
#
# Each review is represented by the average of its word vectors, where every
# word occurrence is weighted by that word's TF-IDF value in the review.
# final_tf_idf is the sparse matrix with row = review, col = term and
# cell value = tfidf.
tfidf_feat = tf_idf_vect.get_feature_names()  # tfidf words / column names

# Constant-time lookups. Scanning the feature-name list (`in` / `.index`) for
# every token is linear in the unigram+bigram vocabulary size and dominates
# the run time; the fitted vectorizer already stores term -> column index.
tfidf_col_of = tf_idf_vect.vocabulary_
w2v_vocab = set(w2v_words)

# Take the dimensionality from the model instead of repeating the literal 50.
vec_dim = w2v_model.vector_size

tfidf_sent_vectors = []  # one tfidf-weighted vector per review, in row order
for row, sent in enumerate(list_clean_text):
    sent_vec = np.zeros(vec_dim)  # running weighted sum of word vectors
    weight_sum = 0  # sum of the tfidf weights that were applied
    for word in sent:
        col = tfidf_col_of.get(word)
        # Skip words without a TF-IDF column or without a word vector.
        if col is None or word not in w2v_vocab:
            continue
        vec = w2v_model.wv[word]
        tf_idf = final_tf_idf[row, col]  # tfidf of this word in this review
        sent_vec += vec * tf_idf
        weight_sum += tf_idf

    # Reviews with no usable word keep the all-zero vector (avoids 0/0).
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)

In [13]:
# Collect the per-review vectors into a single NumPy array.
sent_vectors_final = np.array(tfidf_sent_vectors)

In [14]:
# Compare the feature-matrix shape with the label-vector shape.
print(np.shape(sent_vectors_final))
l.shape

(10000, 50)


(10000,)

In [15]:
# Train/test split followed by standardisation.
#
# The split is done here, once the review vectors exist, and the scaler is
# fitted on the training rows only so the held-out rows never influence the
# mean/variance used for scaling (fitting on all rows leaks test information
# into the model-selection step).
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 70/30 split; random_state=0 makes the partition repeatable.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    sent_vectors_final, l, test_size=0.3, random_state=0
)

scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the same training-set statistics, kept under the
# original name for any code that still reads `standardized_data`.
standardized_data = scaler.transform(sent_vectors_final)
standardized_data.shape

(10000, 50)

In [None]:
# Candidate neighbourhood sizes: the odd values below 50.
myList = list(range(0, 50))
neighbors = [k for k in myList if k % 2 == 1]

# Mean 10-fold cross-validated accuracy on the training split for each k.
cv_scores = []
for k in neighbors:
    knn = KNeighborsClassifier(
        n_neighbors=k,
        algorithm='kd_tree',
        n_jobs=-1,
    )
    scores = cross_val_score(
        knn, X_train, y_train, cv=10, scoring='accuracy'
    )
    cv_scores.append(scores.mean())

# Despite its name, MSE holds the misclassification error (1 - accuracy).
MSE = [1 - mean_accuracy for mean_accuracy in cv_scores]

# Pick the first k that attains the lowest error.
lowest_error = min(MSE)
optimal_k = neighbors[MSE.index(lowest_error)]
print('\nThe optimal number of neighbors is %d.' % optimal_k)

In [None]:
# Misclassification error as a function of the number of neighbours.
fig, ax = plt.subplots()
ax.plot(neighbors, MSE)
ax.set_xlabel('Number of Neighbors K')
ax.set_ylabel('Misclassification Error')
plt.show()

# Same values in text form, rounded to three decimals.
print("the misclassification error for each k value is : ", np.round(MSE,3))

In [None]:
# Refit k-NN with the selected k on the training split and score it on the
# held-out test split.
knn_optimal = KNeighborsClassifier(
    n_neighbors=optimal_k,
    algorithm='kd_tree',
    n_jobs=-1,
).fit(X_train, y_train)

pred = knn_optimal.predict(X_test)
acc = 100 * accuracy_score(y_test, pred)  # accuracy as a percentage
print('\nThe accuracy of the knn classifier for k = %d is %f%%' % (optimal_k, acc))