### Objective: 
To classify a review(amazon fine food reviews) into negative or positive class using Naive Bayes algorithm.

In [41]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,train_test_split,TimeSeriesSplit,RandomizedSearchCV
from sklearn.utils.class_weight import compute_class_weight,compute_sample_weight
from sklearn.metrics import accuracy_score
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_distances,euclidean_distances,cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [2]:
#reading cleaned amazon review csv
df=pd.read_csv('cleaned_amazon_reviews.csv')
df=df.dropna()
df_sample=df #here we can modify our sample size

In [3]:
df_sample=df_sample.sort_values(by=['Time'],axis=0,ascending=True)  # sorting of dataframe for Time Based splitting

In [4]:
# converting reviews to vector
list_of_sentence=[] #list of sentence to be used for countvectorizer
for sentence in df_sample['CleanText'].values:
    li=sentence
    list_of_sentence.append(li)

tfidf_model=TfidfVectorizer(ngram_range=(1, 2)) #using count-BOW and 1 & 2 grams 
tfidf_bow_review_matrix=tfidf_model.fit_transform(list_of_sentence) #training our model and converting text to vector

In [5]:
# standardization of feature matrix
X=StandardScaler(with_mean=False).fit_transform(tfidf_bow_review_matrix)
# In LR we know for negative class we take -1 and for positive class +1.
y=y=df_sample['class'].apply(lambda string: 1 if string=='positive' else -1)

In [6]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,shuffle=False) # 70-30 split without shuffling(used for TBS)

### Using Grid search 

#### 1) using L2 Regularization

In [42]:
lr_model=LogisticRegression(random_state=0,n_jobs=-1)
tscv=TimeSeriesSplit(n_splits=10) # timeseries split for using timeseries based cross-validation
c=[0.0001,0.00001,0.000001,0.0000001,0.00000001,0.000000001,0.0000000001] #list of c to passed to the LR model
y_train_sample_weight=compute_sample_weight(class_weight='balanced',y=y_train)
gs=GridSearchCV(lr_model,param_grid={'C':c},scoring='accuracy',cv=tscv.split(X_train))
gs.fit(X_train,y_train,sample_weight=y_train_sample_weight)


GridSearchCV(cv=<generator object TimeSeriesSplit.split at 0x3ffe484ceb48>,
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.0001, 1e-05, 1e-06, 1e-07, 1e-08, 1e-09, 1e-10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [29]:
y_predict=gs.predict(X_test)
y_test_sample_weight=compute_sample_weight(class_weight='balanced',y=y_test)
accuracy_score(y_test,y_predict,sample_weight=y_test_sample_weight)

0.88938416829172595

In [31]:
gs.best_params_

{'C': 1e-07}

#### 2) using L1 Regularization

In [129]:
lr_model=LogisticRegression(penalty='l1',random_state=0,n_jobs=-1)
tscv=TimeSeriesSplit(n_splits=10) # timeseries split for using timeseries based cross-validation
c=[100,10,1,0.1,0.01,0.001] #list of c to passed to the LR model
y_train_sample_weight=compute_sample_weight(class_weight='balanced',y=y_train)
gs=GridSearchCV(lr_model,param_grid={'C':c},scoring='accuracy',cv=tscv.split(X_train))
gs.fit(X_train,y_train,sample_weight=y_train_sample_weight)


GridSearchCV(cv=<generator object TimeSeriesSplit.split at 0x3fff0083c570>,
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l1', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [100, 10, 1, 0.1, 0.01, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [130]:
y_predict=gs.predict(X_test)
y_test_sample_weight=compute_sample_weight(class_weight='balanced',y=y_test)
accuracy_score(y_test,y_predict,sample_weight=y_test_sample_weight)

0.8491609411664528

In [131]:
gs.best_params_

{'C': 0.01}

### Using Random Search 

#### 1) using L2 Regularization

In [132]:
lr_model=LogisticRegression(random_state=0,n_jobs=-1)
tscv=TimeSeriesSplit(n_splits=10) # timeseries split for using timeseries based cross-validation
param_grid=np.random.uniform(0.00000001,0.000001,20)
y_train_sample_weight=compute_sample_weight(class_weight='balanced',y=y_train)
rs=RandomizedSearchCV(lr_model,param_distributions={'C':param_grid},scoring='accuracy',cv=tscv.split(X_train))
rs.fit(X_train,y_train,sample_weight=y_train_sample_weight)


RandomizedSearchCV(cv=<generator object TimeSeriesSplit.split at 0x3fff0b876728>,
          error_score='raise',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'C': array([  4.96788e-07,   8.81614e-07,   2.06406e-07,   6.43269e-07,
         8.39432e-07,   5.33538e-07,   4.41028e-07,   2.38218e-08,
         4.05763e-07,   7.12149e-07,   2.13623e-07,   6.39345e-07,
         6.08885e-07,   3.15056e-07,   8.01074e-07,   7.61407e-07,
         1.15046e-07,   1.39763e-07,   3.42908e-07,   5.00267e-07])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring='accuracy', verbose=0)

In [133]:
y_predict=rs.predict(X_test)
y_test_sample_weight=compute_sample_weight(class_weight='balanced',y=y_test)
accuracy_score(y_test,y_predict,sample_weight=y_test_sample_weight)

0.88284830761180488

In [134]:
rs.best_params_

{'C': 1.3976274849389749e-07}

#### 2) using L1 Regularization

In [144]:
lr_model=LogisticRegression(penalty='l1',random_state=0,n_jobs=-1)
tscv=TimeSeriesSplit(n_splits=10) # timeseries split for using timeseries based cross-validation
param_grid=np.random.uniform(0.001,1,20)
y_train_sample_weight=compute_sample_weight(class_weight='balanced',y=y_train)
rs=RandomizedSearchCV(lr_model,param_distributions={'C':param_grid},scoring='accuracy',cv=tscv.split(X_train))
rs.fit(X_train,y_train,sample_weight=y_train_sample_weight)


RandomizedSearchCV(cv=<generator object TimeSeriesSplit.split at 0x3ffef6264fc0>,
          error_score='raise',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l1', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'C': array([ 0.88585,  0.17497,  0.93794,  0.49264,  0.2191 ,  0.14032,
        0.24433,  0.99066,  0.45169,  0.69966,  0.79476,  0.2439 ,
        0.96409,  0.09913,  0.49571,  0.94768,  0.30067,  0.34672,
        0.45416,  0.32356])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring='accuracy', verbose=0)

In [145]:
y_predict=rs.predict(X_test)
y_test_sample_weight=compute_sample_weight(class_weight='balanced',y=y_test)
accuracy_score(y_test,y_predict,sample_weight=y_test_sample_weight)

0.82299552894908301

In [146]:
rs.best_params_

{'C': 0.94768333445684139}

### Increase in sparsity(decrease in non zero element) with increase in Lambda(decrese in C parameter) in L1 regularizatation

In [143]:
c_list=[100,10,1,0.1,0.01,0.001,0.0001]
y_train_sample_weight=compute_sample_weight(class_weight='balanced',y=y_train)
y_test_sample_weight=compute_sample_weight(class_weight='balanced',y=y_test)
for c in c_list:
    lr_model=LogisticRegression(penalty='l1',C=c,random_state=0,n_jobs=-1)
    lr_model.fit(X_train,y_train,y_train_sample_weight)
    y_predict=lr_model.predict(X_test)
    error=1-accuracy_score(y_test,y_predict,sample_weight=y_test_sample_weight)
    print('lambda= ',1/c,' no of non-zero in weight vector',np.count_nonzero(lr_model.coef_) , 'error= ',error)


lambda=  0.01  no of non-zero in weight vector 773201 error=  0.204399925328
lambda=  0.1  no of non-zero in weight vector 458474 error=  0.218262914994
lambda=  1.0  no of non-zero in weight vector 200697 error=  0.177229013404
lambda=  10.0  no of non-zero in weight vector 158700 error=  0.170447921502
lambda=  100.0  no of non-zero in weight vector 125392 error=  0.150839058834
lambda=  1000.0  no of non-zero in weight vector 21333 error=  0.111753077968
lambda=  10000.0  no of non-zero in weight vector 34 error=  0.248966762935


### feature collinearity test-perbutation test

In [2]:
df=pd.read_csv('cleaned_amazon_reviews.csv')
df=df.dropna()

In [4]:
#for balanced dataset where n=#for each class
n=2500
df_positive=df[df['class']=='positive'].sample(n)
df_negative=df[df['class']=='negative'].sample(n)
df_sample=pd.concat([df_positive,df_negative])
df_sample=df_sample.sort_values(by=['Time'],axis=0,ascending=True)  #dataframe being sorted based by time 

list_of_sentence=[] # list of sentences 
for sentence in df_sample['CleanText'].values:
    list_of_sentence.append(sentence)
tfidf_model=TfidfVectorizer(ngram_range=(1,2))    
tfidf_review_matrix=tfidf_model.fit_transform(list_of_sentence) #training our tf-idf model

X=StandardScaler(with_mean=False).fit_transform(tfidf_review_matrix)
# In LR we know for negative class we take -1 and for positive class +1.
y=y=df_sample['class'].apply(lambda string: 1 if string=='positive' else -1)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,shuffle=False) # 70-30 split without shuffling(used for TBS)

In [5]:
lr_model=LogisticRegression(random_state=0,n_jobs=-1)
tscv=TimeSeriesSplit(n_splits=10) # timeseries split for using timeseries based cross-validation
c=[0.0001,0.00001,0.000001,0.0000001,0.00000001,0.000000001,0.0000000001] #list of c to passed to the LR model
y_train_sample_weight=compute_sample_weight(class_weight='balanced',y=y_train)
gs=GridSearchCV(lr_model,param_grid={'C':c},scoring='accuracy',cv=tscv.split(X_train))
gs.fit(X_train,y_train,sample_weight=y_train_sample_weight)


GridSearchCV(cv=<generator object TimeSeriesSplit.split at 0x3ffefeea8308>,
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.0001, 1e-05, 1e-06, 1e-07, 1e-08, 1e-09, 1e-10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [6]:
y_predict=gs.predict(X_test)
y_test_sample_weight=compute_sample_weight(class_weight='balanced',y=y_test)
accuracy_score(y_test,y_predict,sample_weight=y_test_sample_weight)

0.83675213675213689

In [7]:
gs.best_params_

{'C': 1e-07}

In [8]:
w1=gs.best_estimator_.coef_ # storing w as w1 before adding noise to the data

In [10]:
X_noise=X_train+csr_matrix(np.random.normal(0,0.10,X_train.shape)) # addding noise to the data

In [11]:
lr_model=LogisticRegression(random_state=0,n_jobs=-1)
tscv=TimeSeriesSplit(n_splits=10) # timeseries split for using timeseries based cross-validation
c=[0.0001,0.00001,0.000001,0.0000001,0.00000001,0.000000001,0.0000000001] #list of c to passed to the LR model
y_train_sample_weight=compute_sample_weight(class_weight='balanced',y=y_train)
gs=GridSearchCV(lr_model,param_grid={'C':c},scoring='accuracy',cv=tscv.split(X_train))
gs.fit(X_noise,y_train,sample_weight=y_train_sample_weight)


GridSearchCV(cv=<generator object TimeSeriesSplit.split at 0x3fff01cb6eb8>,
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.0001, 1e-05, 1e-06, 1e-07, 1e-08, 1e-09, 1e-10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [12]:
y_predict=gs.predict(X_test)
y_test_sample_weight=compute_sample_weight(class_weight='balanced',y=y_test)
accuracy_score(y_test,y_predict,sample_weight=y_test_sample_weight)

0.83643162393162407

In [13]:
gs.best_params_

{'C': 1e-07}

In [14]:
w2=gs.best_estimator_.coef_ # storing w as w2 before adding noise to the data

In [18]:
dist=DistanceMetric.get_metric('euclidean').pairwise(w1,w2) #

In [19]:
dist

array([[ 0.00011359]])

In [27]:
cosine_similarity(w1,w2)

array([[ 0.99538119]])

### Result:
    As we can see  that cosine_similarity(w1,w2)=0.99538119 which is approx equal to 1 we can say that mullticollineariry is not present in the data and if it is there it is very very small and hence we can use weight vector as feature importance.

### Feature Imporantance

In [34]:
#reading cleaned amazon review csv
df=pd.read_csv('cleaned_amazon_reviews.csv')
df=df.dropna()
df_sample=df #here we can modify our sample size
df_sample=df_sample.sort_values(by=['Time'],axis=0,ascending=True)  # sorting of dataframe for Time Based splitting
# converting reviews to vector
list_of_sentence=[] #list of sentence to be used for countvectorizer
for sentence in df_sample['CleanText'].values:
    li=sentence
    list_of_sentence.append(li)

tfidf_model=TfidfVectorizer(ngram_range=(1, 2)) #using count-BOW and 1 & 2 grams 
tfidf_bow_review_matrix=tfidf_model.fit_transform(list_of_sentence) #training our model and converting text to vector

# standardization of feature matrix
X=StandardScaler(with_mean=False).fit_transform(tfidf_bow_review_matrix)
# In LR we know for negative class we take -1 and for positive class +1.
y=y=df_sample['class'].apply(lambda string: 1 if string=='positive' else -1)

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,shuffle=False) # 70-30 split without shuffling(used for TBS)

In [35]:
X_train.shape

(230100, 3178748)

In [38]:
n=100 #top n features 
y_train_sample_weight=compute_sample_weight(class_weight='balanced',y=y_train)
lr_model=LogisticRegression(C=1e-07,random_state=0,n_jobs=-1) # training the model for Weight vector
lr_model.fit(X_train,y_train,sample_weight=y_train_sample_weight)

di_positive={}#dict for storing key as +ve w components  and value as index corrosponding to that component
di_negative={}#dict for storing key as -ve w components  and value as index corrosponding to that component

li=lr_model.coef_[0,:]
feature_name=tfidf_model.get_feature_names() # features name 
for value,key in enumerate(li):
    if key>0:
        di_positive[key]=value
    else:
        di_negative[abs(key)]=value
top_positive_features=[]
top_negative_features=[]
for key in sorted(di_positive,reverse=True)[:n]: #sorted the dict in descending order to get  n imp +ve weights components
    index=di_positive[key] #fetching the
    top_positive_features.append(feature_name[index])
    
for key in sorted(di_negative,reverse=True)[:n]: #sorted the dict in descending order to get  n imp -ve weights components
    index=di_negative[key] #fetching the
    top_negative_features.append(feature_name[index])

In [39]:
top_positive_features # positive features

['great',
 'love',
 'best',
 'delici',
 'good',
 'find',
 'perfect',
 'favorit',
 'make',
 'use',
 'easi',
 'high recommend',
 'excel',
 'nice',
 'keep',
 'enjoy',
 'wonder',
 'this',
 'snack',
 'year',
 'store',
 'littl',
 'alway',
 'add',
 'high',
 'tasti',
 'thank',
 'quick',
 'recommend',
 'also',
 'well',
 'price',
 'without',
 'carri',
 'happi',
 'day',
 'everi',
 'it',
 'tast great',
 'found',
 'these',
 'smooth',
 'ive',
 'amaz',
 'fresh',
 'healthi',
 'morn',
 'famili',
 'fast',
 'tea',
 'glad',
 'satisfi',
 'treat',
 'friend',
 'this best',
 'need',
 'right',
 'this great',
 'addict',
 'bit',
 'time',
 'rich',
 'great tast',
 'mix',
 'hard find',
 'long',
 'great product',
 'cook',
 'yummi',
 'definit',
 'flavor',
 'free',
 'awesom',
 'husband',
 'avail',
 'fantast',
 'work',
 'beat',
 'breakfast',
 'home',
 'great price',
 'meal',
 'hook',
 'sweet',
 'they',
 'kid',
 'help',
 'one best',
 'local',
 'make great',
 'calori',
 'cold',
 'especi',
 'abl',
 'realli good',
 'conven

In [40]:
top_negative_features

['disappoint',
 'not',
 'return',
 'wast',
 'wast money',
 'worst',
 'not buy',
 'aw',
 'horribl',
 'would not',
 'terribl',
 'bad',
 'money',
 'threw',
 'not recommend',
 'not worth',
 'refund',
 'stale',
 'disgust',
 'veri disappoint',
 'wont buy',
 'not good',
 'dont wast',
 'thought',
 'descript',
 'not order',
 'not purchas',
 'tast like',
 'away',
 'unfortun',
 'wors',
 'poor',
 'mayb',
 'pictur',
 'never buy',
 'receiv',
 'didnt',
 'nasti',
 'bewar',
 'gross',
 'not even',
 'throw',
 'expir',
 'bland',
 'yuck',
 'tasteless',
 'trash',
 'rip',
 'contact',
 'stuck',
 'mistak',
 'threw away',
 'would',
 'sorri',
 'review',
 'noth like',
 'weak',
 'never order',
 'ined',
 'two star',
 'throw away',
 'bad batch',
 'tast aw',
 'end throw',
 'buyer bewar',
 'mislead',
 'worst tast',
 'tast bad',
 'hope',
 'unpleas',
 'cancel',
 'item',
 'list',
 'label',
 'disappoint product',
 'china',
 'product not',
 'tast horribl',
 'not tast',
 'will not',
 'want like',
 'definit not',
 'tast noth