In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
import warnings 
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import time
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from tqdm import tqdm

In [2]:
data=pd.read_csv("preprocessed_file.csv")
data.head(2)

Unnamed: 0,Time,clean text,Score
0,939340800,witty little book makes son laugh loud recite ...,1
1,1194739200,grew reading sendak books watching really rosi...,1


In [4]:
data.isnull().any().sum()

1

In [5]:
final_data=data.dropna()

In [6]:
final_data.isnull().any().sum()

0

In [7]:
sample_data=final_data.sample(100000,random_state=42)
print(sample_data.shape)
print(sample_data.columns)

(100000, 3)
Index(['Time', 'clean text', 'Score'], dtype='object')


In [9]:
text=sample_data["clean text"]

In [10]:
i=0
list_of_sentance=[]
for sentance in text:
    list_of_sentance.append(sentance.split())

In [11]:
is_your_ram_gt_16g=False
want_to_use_google_w2v = False
want_to_train_w2v = True

if want_to_train_w2v:
    # min_count = 5 considers only words that occured atleast 5 times
    w2v_model=Word2Vec(list_of_sentance,min_count=5, workers=4)
    print(w2v_model.wv.most_similar('great'))
    print('='*50)
    print(w2v_model.wv.most_similar('worst'))
    
elif want_to_use_google_w2v and is_your_ram_gt_16g:
    if os.path.isfile('GoogleNews-vectors-negative300.bin'):
        w2v_model=KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
        print(w2v_model.wv.most_similar('great'))
        print(w2v_model.wv.most_similar('worst'))
    else:
        print("you don't have gogole's word2vec file, keep want_to_train_w2v = True, to train your own w2v ")

[('fantastic', 0.8017511367797852), ('terrific', 0.7966592907905579), ('awesome', 0.7864436507225037), ('excellent', 0.7852773666381836), ('good', 0.7578009963035583), ('wonderful', 0.7510077357292175), ('fabulous', 0.6521478891372681), ('incredible', 0.6284111738204956), ('amazing', 0.6250677704811096), ('phenomenal', 0.6208261251449585)]
[('nastiest', 0.7425679564476013), ('greatest', 0.7229869365692139), ('best', 0.692347526550293), ('tastiest', 0.6490675210952759), ('weakest', 0.6404266953468323), ('horrible', 0.5994212031364441), ('grossest', 0.5959147810935974), ('smoothest', 0.5945000052452087), ('disgusting', 0.5857833027839661), ('sweetest', 0.5827450156211853)]


In [12]:
w2v_words = list(w2v_model.wv.key_to_index)
print("number of words that occured minimum 5 times ",len(w2v_words))
print("sample words ", w2v_words[0:50])

number of words that occured minimum 5 times  19000
sample words  ['not', 'like', 'good', 'great', 'one', 'taste', 'product', 'would', 'coffee', 'flavor', 'tea', 'love', 'no', 'get', 'food', 'really', 'amazon', 'use', 'much', 'also', 'time', 'little', 'find', 'best', 'price', 'buy', 'make', 'well', 'tried', 'even', 'better', 'try', 'chocolate', 'eat', 'sugar', 'first', 'water', 'used', 'could', 'found', 'sweet', 'made', 'bag', 'bought', 'free', 'drink', 'cup', 'dog', 'box', 'way']


In [13]:
len(w2v_words)

19000

In [14]:
sent_vectors = []; # the avg-w2v for each sentence/review is stored in this list
for sent in tqdm(list_of_sentance): # for each review/sentence
    sent_vec = np.zeros(100) # as word vectors are of zero length 50, you might need to change this to 300 if you use google's w2v
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

100%|█████████████████████████████████████████████████████████████████████████| 100000/100000 [08:06<00:00, 205.57it/s]

100000
100





In [15]:
df = pd.DataFrame(sent_vectors)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.20329,0.157019,0.081251,0.145412,-0.184297,-0.140728,-0.551824,0.427484,-0.212026,-0.454808,...,-0.199672,0.04968,0.652167,-0.276553,0.454451,0.358438,0.519319,-0.615428,-0.161831,0.17753
1,-0.522816,0.55619,0.287653,-0.790863,0.257616,0.050956,-0.659295,0.367224,0.44807,0.065556,...,-0.194174,0.590518,0.574717,-0.693842,0.785193,0.104926,-0.235397,-0.835852,0.758061,-0.464559
2,-0.891051,-0.378911,-0.588745,-0.008627,-0.129587,0.107037,-0.356439,0.37711,0.001494,-0.287171,...,0.606864,-0.019074,-0.229705,-0.06432,0.610827,-0.058572,-0.262616,-0.62296,0.05298,0.608803
3,-0.662576,0.008383,-0.402024,-0.281394,0.563727,-0.272911,-0.51593,0.202691,-0.148611,-0.43418,...,-0.47883,0.526026,0.548745,-0.054628,0.012225,0.455824,-0.649239,-0.913782,0.39207,-0.445657
4,0.124069,-0.00211,-0.092945,0.129959,0.616278,-0.062054,-0.324667,-0.191032,-0.470558,0.249849,...,-0.050736,0.14671,-0.362903,0.294361,-0.404842,-0.925336,-0.245542,-0.478,0.219709,-0.146917


In [16]:
X=df
y=sample_data["Score"]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, shuffle = False)

In [18]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((70000, 100), (70000,), (30000, 100), (30000,))

In [19]:
s = StandardScaler() # Since we are passing sparse matrix
# Call the fit_transform method on training data
X_train = s.fit_transform(X_train)
# Call the transform method on the test dataset
X_test = s.transform(X_test)

In [20]:
start = time.time()
# creating list of C
C_values = np.linspace(0.1,1,10)

cv_scores = [] # empty list that will hold cv scores

# Try each value of alpha in the below loop
for c in C_values:
    # Create an object of the class Logistic Regression with balanced class weights
    clf = LogisticRegression(C = c, class_weight = 'balanced',max_iter=5,solver='saga')
    # perform 5-fold cross validation
    # It returns the cv accuracy for each fold in a list
    scores = cross_val_score(clf,X_train,y_train, cv=5, scoring='accuracy')
    # Store the mean of the accuracies from all the 5 folds
    cv_scores.append(scores.mean())

# calculate misclassification error from accuracy (error = 1 - accuracy)
cv_error = [1 - x for x in cv_scores]

# optimal (best) C is the one for which error is minimum (or accuracy is maximum)
optimal_C = C_values[cv_error.index(min(cv_error))]
print('\nThe optimal alpha is', optimal_C)

end = time.time()
print("Total time in minutes = ", (end-start)/60)


The optimal alpha is 0.1
Total time in minutes =  0.6452457586924235


In [23]:
knn_optimal =LogisticRegression(C=optimal_C)
# fitting the model
knn_optimal.fit(X_train,y_train)
# predict the response
pred = knn_optimal.predict(X_test)

# evaluate accuracy
acc = accuracy_score(y_test, pred) * 100
print('\nThe accuracy of the classifier for k = %d is %f%%' % (optimal_C, acc))


The accuracy of the classifier for k = 0 is 90.333333%


In [24]:
confusion_matrix(y_test,pred)

array([[ 2636,  2105],
       [  795, 24464]], dtype=int64)