In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

reviews = pd.read_csv('../input/Reviews.csv')
reviews.head(1)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...


In [2]:
reviews['smry_txt'] = reviews['Summary'].astype(str) + ' ' + reviews['Text']
dataset = reviews.loc[reviews['Score'] != 3][['smry_txt','Score']]

dataset.loc[dataset['Score']==1, 'Score'] = 'negative'
dataset.loc[dataset['Score']==2, 'Score'] = 'negative'
dataset.loc[dataset['Score']==4, 'Score'] = 'positive'
dataset.loc[dataset['Score']==5, 'Score'] = 'positive'

dataset.head()

Unnamed: 0,smry_txt,Score
0,Good Quality Dog Food I have bought several of...,positive
1,Not as Advertised Product arrived labeled as J...,negative
2,"""Delight"" says it all This is a confection tha...",positive
3,Cough Medicine If you are looking for the secr...,negative
4,Great taffy Great taffy at a great price. The...,positive


# Logistic Regression

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# 1.  GridSearchCV with l2 regularizer

In [4]:
count_vec_total = CountVectorizer()

x_data = count_vec_total.fit_transform(dataset.smry_txt)
y_data = dataset['Score']

y_data.head()

print(dataset.iloc[0]['smry_txt'], end='\n\n')
print(count_vec_total.inverse_transform(x_data[0]), end='\n\n')
print(x_data[0])

Good Quality Dog Food I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.

[array(['most', 'this', 'appreciates', 'she', 'finicky', 'is', 'labrador',
       'my', 'better', 'smells', 'it', 'meat', 'processed', 'than', 'stew',
       'like', 'more', 'looks', 'product', 'be', 'to', 'all', 'them',
       'found', 'and', 'products', 'canned', 'vitality', 'the', 'of',
       'several', 'bought', 'have', 'food', 'dog', 'quality', 'good'],
      dtype='<U124')]

  (0, 74176)	1
  (0, 108779)	1
  (0, 9541)	1
  (0, 97878)	1
  (0, 47541)	1
  (0, 61725)	1
  (0, 65358)	1
  (0, 75122)	1
  (0, 20970)	2
  (0, 100122)	1
  (0, 61875)	1
  (0, 71077)	1
  (0, 87084)	1
  (0, 108307)	2
  (0, 103477)	1
  (0, 66977)	1
  (0, 73996)	1
  (0, 67944)	1
  (0, 87176)	2
  (0, 19907)	1
  (0, 109606)	1
  (0, 749

In [20]:
#   GridsearchCV will build classifier specified (logistic regression),
# with the optimal C value, with 3-fold cross validation by defalut. 
# So, we don't need to build a classifier again with this model..
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
log_reg_l2 = LogisticRegression(penalty='l2')
log_reg_l2_grid_clf = GridSearchCV(log_reg_l2, param_grid=param_grid,n_jobs=-1, verbose=40)
log_reg_l2_grid_clf

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=40)

In [5]:
x_data[:121]

<121x121022 sparse matrix of type '<class 'numpy.int64'>'
	with 6125 stored elements in Compressed Sparse Row format>

In [6]:
y_data.shape

(525814,)

# lets split the data into train and test data

In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data)

y_train.shape

(394360,)

In [21]:
from datetime import datetime
startTime = datetime.now()

#lets run GridsearchCV on our data
log_reg_l2_grid_clf.fit(x_train, y_train)

print(datetime.now() - startTime)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] C=0.001 .........................................................
[CV] C=0.001 .........................................................
[CV] C=0.001 .........................................................
[CV] C=0.01 ..........................................................
[CV] C=0.01 ..........................................................
[CV] C=0.01 ..........................................................
[CV] C=0.1 ...........................................................
[CV] C=0.1 ...........................................................
[CV] C=0.1 ...........................................................
[CV] C=1 .............................................................
[CV] C=1 .............................................................
[CV] C=1 .............................................................
[CV] C=10 ............................................................
[CV] C=10 .......

[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.7min


[CV] ................ C=0.001, score=0.9176435684236952, total= 2.1min


[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:  2.3min remaining: 14.8min


[CV] ................ C=0.001, score=0.9193399922405727, total= 3.0min


[Parallel(n_jobs=-1)]: Done   3 out of  15 | elapsed:  3.1min remaining: 12.5min


[CV] ................. C=0.01, score=0.9407620974797076, total= 5.3min


[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:  5.5min remaining: 15.1min


[CV] ................. C=0.01, score=0.9427247761557362, total= 6.1min


[Parallel(n_jobs=-1)]: Done   5 out of  15 | elapsed:  6.3min remaining: 12.6min


[CV] ................. C=0.01, score=0.9406408325345749, total= 6.7min


[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed:  6.8min remaining: 10.2min


[CV] .................. C=0.1, score=0.9520969471978578, total=13.4min


[Parallel(n_jobs=-1)]: Done   7 out of  15 | elapsed: 13.6min remaining: 15.5min


[CV] .................. C=0.1, score=0.9513362190288545, total=14.9min


[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed: 15.1min remaining: 13.3min


[CV] .................. C=0.1, score=0.9517930226543125, total=15.1min


[Parallel(n_jobs=-1)]: Done   9 out of  15 | elapsed: 15.3min remaining: 10.2min


[CV] ..................... C=10, score=0.95484317588796, total=26.0min


[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed: 26.6min remaining: 13.3min


[CV] .................... C=1, score=0.9549040341414802, total=27.5min


[Parallel(n_jobs=-1)]: Done  11 out of  15 | elapsed: 27.8min remaining: 10.1min


[CV] .................... C=1, score=0.9550485724935909, total=27.7min


[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed: 28.0min remaining:  7.0min


[CV] ................... C=10, score=0.9552159326907717, total=29.1min


[Parallel(n_jobs=-1)]: Done  13 out of  15 | elapsed: 29.5min remaining:  4.5min


[CV] ................... C=10, score=0.9542349415004489, total=29.6min
[CV] .................... C=1, score=0.9552467022684742, total=30.1min


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 30.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 30.3min finished


0:38:36.342776


In [22]:
log_reg_l2_grid_clf.best_params_ ,log_reg_l2_grid_clf.best_estimator_, log_reg_l2_grid_clf.best_score_

({'C': 1},
 LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 0.95506643675829195)

In [23]:
y_true, y_pred = y_test, log_reg_l2_grid_clf.predict(x_test)

In [24]:
from sklearn.metrics import precision_recall_fscore_support, classification_report

print(classification_report(y_test, y_pred))
print(precision_recall_fscore_support(y_test, y_pred, average='micro'))

             precision    recall  f1-score   support

   negative       0.88      0.83      0.86     20629
   positive       0.97      0.98      0.97    110825

avg / total       0.96      0.96      0.96    131454

(0.95639539306525478, 0.95639539306525478, 0.95639539306525478, None)


In [25]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

y_pred = np.where(y_pred == 'positive', 1, 0)
y_true = np.where(y_true == 'positive', 1, 0)

fpr, tpr, threshold = roc_curve(y_true, y_pred)
print("FPR: {}, \nTPR: {}, \nAUC : {}".format(fpr[1], tpr[1], auc(fpr,tpr)))

FPR: 0.16820980173542102, 
TPR: 0.9795894428152493, 
AUC : 0.9056898205399141


# 2. GridSearchCV with l1 regularizer 

In [9]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
log_reg_l1 = LogisticRegression(penalty='l1')
log_reg_l1_grid_clf = GridSearchCV(log_reg_l1, param_grid=param_grid,n_jobs=-1, verbose=40)
log_reg_l1_grid_clf

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=40)

In [10]:
y_train.shape

(394360,)

In [11]:
from datetime import datetime
startTime = datetime.now()

#lets run GridsearchCV on our data
log_reg_l1_grid_clf.fit(x_train, y_train)

print(datetime.now() - startTime)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] C=0.001 .........................................................
[CV] C=0.001 .........................................................
[CV] C=0.001 .........................................................
[CV] C=0.01 ..........................................................
[CV] C=0.01 ..........................................................
[CV] C=0.01 ..........................................................
[CV] C=0.1 ...........................................................
[CV] C=0.1 ...........................................................
[CV] C=0.1 ...........................................................
[CV] C=1 .............................................................
[CV] C=1 .............................................................
[CV] C=1 .............................................................
[CV] C=10 ............................................................
[CV] C=10 .......

[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   21.9s


[CV] ................ C=0.001, score=0.8720921386949047, total=  22.4s


[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:   27.2s remaining:  2.9min


[CV] ................ C=0.001, score=0.8727910355792564, total=  24.9s


[Parallel(n_jobs=-1)]: Done   3 out of  15 | elapsed:   30.0s remaining:  2.0min


[CV] ................. C=0.01, score=0.9254796354618345, total=  30.4s


[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:   36.2s remaining:  1.7min


[CV] ................. C=0.01, score=0.9259887564376621, total=  32.2s


[Parallel(n_jobs=-1)]: Done   5 out of  15 | elapsed:   39.9s remaining:  1.3min


[CV] ................. C=0.01, score=0.9248780933109172, total=  35.4s


[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed:   43.5s remaining:  1.1min


[CV] .................. C=0.1, score=0.9463306276768122, total=  51.4s


[Parallel(n_jobs=-1)]: Done   7 out of  15 | elapsed:  1.0min remaining:  1.2min


[CV] .................. C=0.1, score=0.9453949320289381, total=  59.0s


[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:  1.2min remaining:  1.0min


[CV] .................. C=0.1, score=0.9456920291508817, total= 1.1min


[Parallel(n_jobs=-1)]: Done   9 out of  15 | elapsed:  1.2min remaining:   47.6s


[CV] .................... C=1, score=0.9543106661696577, total= 1.2min


[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:  1.4min remaining:   42.9s


[CV] .................... C=1, score=0.9543794787530239, total= 1.3min


[Parallel(n_jobs=-1)]: Done  11 out of  15 | elapsed:  1.4min remaining:   31.6s


[CV] .................... C=1, score=0.9539150875217759, total= 1.2min


[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed:  1.5min remaining:   22.2s


[CV] ................... C=10, score=0.9506135272683012, total= 1.3min


[Parallel(n_jobs=-1)]: Done  13 out of  15 | elapsed:  1.6min remaining:   15.1s


[CV] ................... C=10, score=0.9507200292119616, total= 1.3min
[CV] ................... C=10, score=0.9500053250566738, total= 1.4min


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.7min finished


0:02:24.135731


In [12]:
log_reg_l1_grid_clf.best_params_ ,log_reg_l1_grid_clf.best_estimator_, log_reg_l1_grid_clf.best_score_

({'C': 1},
 LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 0.95420174459884366)

In [13]:
y_true, y_pred = y_test, log_reg_l1_grid_clf.predict(x_test)

In [16]:
print(classification_report(y_test, y_pred))
print(precision_recall_fscore_support(y_test, y_pred, average='micro'))

             precision    recall  f1-score   support

   negative       0.88      0.83      0.86     20629
   positive       0.97      0.98      0.97    110825

avg / total       0.96      0.96      0.96    131454

(0.95599221020280856, 0.95599221020280856, 0.95599221020280856, None)


In [17]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

y_pred = np.where(y_pred == 'positive', 1, 0)
y_true = np.where(y_true == 'positive', 1, 0)

fpr, tpr, threshold = roc_curve(y_true, y_pred)
print("FPR: {}, \nTPR: {}, \nAUC : {}".format(fpr[1], tpr[1], auc(fpr,tpr)))

FPR: 0.167967424499491, 
TPR: 0.9790660951951274, 
AUC : 0.9055493353478182


# Feature Importance

In [27]:
best_estimator = log_reg_l1_grid_clf.best_estimator_

In [29]:
weights_vectors = pd.DataFrame()
weights_vectors['feature'] = count_vec_total.get_feature_names()
weights_vectors['weight'] = np.transpose(best_estimator.coef_)
# convert positive and negative weights to absolute values
weights_vectors['weight'] = weights_vectors['weight'].map(lambda x: abs(x))

In [30]:
sorted_weights = weights_vectors.sort_values(by='weight', ascending=False)
sum(sorted_weights['weight'] == 0)

108451

> We have 108451 features that has weight almost 0

## Important features are : 

In [35]:
imp_features = sorted_weights[sorted_weights['weight'] >= 0.0001]['feature'].values

In [36]:
imp_features, len(imp_features)

(array(['excellently', 'emeraldforest', 'abottle', ..., 'calls', 'chun',
        'remover'], dtype=object), 12569)

# top 100 most important features are..

In [None]:
imp_features[:100]

array(['excellently', 'emeraldforest', 'abottle', 'yirgacheffe',
       'b000sqn3og', 'ramada', 'hestitate', 'lifespans', 'myer',
       'manipulating', 'looming', 'blehhhh', 'afro', 'chedder', 'vagan',
       'b000et93n2', 'unwearable', 'incurred', 'leap', 'weakest', 'yuch',
       'prepped', 'b003crknpk', 'slag', 'glumpy', 'seperatly',
       'replentishment', 'artificials', 'cues', 'juiciest', 'maxes',
       'inspector', 'storebrand', 'fragmentatized', 'decar', 'bonsais',
       'upgrading', 'coincident', 'lyles', 'holle', 'unlemoned', 'chimed',
       'brûl', 'meshed', 'bumpy', 'wrinkly', 'alginate', 'ovaries',
       'shrapnel', 'originates', '157', 'solving', 'snapdragon',
       'devastated', '280mg', 'heroes', 'abominable', 'dissapointing',
       'wolfing', 'todd', 'reinstate', 'flovoured', 'gush', 'humdrum',
       'pitt', 'ehhhh', 'becausei', 'subjectively', 'hoffman',
       'unimpressive', 'ingrediantes', 'iprozon', 'dizzied', 'shippments',
       'miracles', 'botch', 'no