In [29]:
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import scipy
import re
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from pickle import dump
from pickle import load
from nltk.corpus import stopwords
from math import sqrt

# Import Data

In [30]:
# Import data 

# Read the CSV file into a DataFrame: df
df = pd.read_csv("Amazon_Unlocked_Mobile.csv")
# Sample 10% of dataset
df = df.sample(frac=0.1, random_state=10)

In [31]:
df.shape

(41384, 6)

In [32]:
df = df[pd.notnull(df['Reviews'])]
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
394349,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,Very good one! Better than Samsung S and iphon...,0.0
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0


In [33]:
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
    text = text.lower() # lowercase text
    text = re.sub(r'[/(){}\[\]\|@,;.#+_]',' ', text) 
    text = re.sub(r'[^0-9a-z ]','', text) 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 
    return text

In [34]:
df['Reviews'] = df['Reviews'].apply(clean_text)

In [35]:
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
394349,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,good one better samsung iphones quality camera...,0.0
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,phone needed sim card would nice know,1.0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,3 months away upgrade stratosphere kept crappi...,3.0
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,experience want forget,0.0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,great phone work according expectations,1.0


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41374 entries, 394349 to 109303
Data columns (total 6 columns):
Product Name    41374 non-null object
Brand Name      34837 non-null object
Price           40753 non-null float64
Rating          41374 non-null int64
Reviews         41374 non-null object
Review Votes    40184 non-null float64
dtypes: float64(2), int64(1), object(3)
memory usage: 2.2+ MB


In [37]:
df.describe()

Unnamed: 0,Price,Rating,Review Votes
count,40753.0,41374.0,40184.0
mean,227.631428,3.815319,1.49674
std,277.011277,1.551319,8.451689
min,1.73,1.0,0.0
25%,79.95,3.0,0.0
50%,140.0,5.0,0.0
75%,269.99,5.0,1.0
max,2408.73,5.0,524.0


In [38]:
df['Brand Name'].value_counts().head()

Samsung       6539
BLU           6295
Apple         5623
LG            2272
BlackBerry    1699
Name: Brand Name, dtype: int64

In [39]:
df.dropna(inplace=True) # drop any rows with missing values

In [40]:
# assuming rating with 3 are neutral reviews
# so drop rows with rating = 3 (by chosing all the rows with rating!=3)

df = df[df['Rating']!=3]

In [41]:
# assuming rating with greater than 3 are rated as postive
# so we assign 1 to Positively rated and 0 to those are not
# if Rating > 3, then 'Positively Rated' = 1, else 'Positively Rated' = 0

df['Positively Rated'] = np.where(df['Rating']>3, 1, 0) 

In [42]:
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,phone needed sim card would nice know,1.0,0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,3 months away upgrade stratosphere kept crappi...,3.0,1
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,experience want forget,0.0,0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,great phone work according expectations,1.0,1
277158,Nokia N8 Unlocked GSM Touch Screen Phone Featu...,Nokia,95.0,5,fell love phone everything suppose 3g network ...,0.0,1


In [43]:
df['Positively Rated'].mean()

0.7471776686078667

In [44]:
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'],
                                                    df['Positively Rated'],
                                                test_size=0.2,random_state=0)

In [48]:
X_train.shape

(24589,)

# CountVectorizer

In [19]:
# We'll need to convert text into a numeric so that scikit-learn can use
# The bag-of-words approach ignores structure and only counts how often each word occurs
# CountVectorizer use the bag-of-words by converting text into a matrix of token counts.
# First, we instantiate the CountVectorizer and fit it to our training data.

# Fitting the CountVectorizer consists of the 
#     tokenization of the trained data and 
#     building of the vocabulary

# Fitting the CountVectorizer 
#     tokenizes each document by finding 
#         all sequences of characters of 
#             at least two letters or 
#             numbers separated by word boundaries. 
# Converts everything to 
#     lowercase and 
#     builds a vocabulary using these tokens.

In [49]:
vect = CountVectorizer().fit(X_train)
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [50]:
len(vect.get_feature_names())

21907

In [51]:
vect.get_feature_names()[0:10]

['00',
 '000',
 '0000',
 '000000',
 '000mah',
 '002order',
 '00k',
 '00us',
 '01',
 '0100']

In [52]:
vect.get_feature_names()[::4000]

['00', 'cdblackberry', 'fineoverall', 'mayor', 'reen', 'trim']

In [24]:
# We use transform method to transform X_train to a document term matrix
# giving us the bag-of-word representation of X_train

# This representation is stored in a SciPy sparse matrix where 
#     each row corresponds to a document and 
#     each column a word from our training vocabulary.

# The entries in this matrix are the number of times each word appears in each document.

# Because the number of words in the vocabulary is so much larger 
# than the number of words that might appear in a single review, 
# most entries of this matrix are zero.

# and the shape will be 
#     number of document/rows(here in dataframe)/reviews(in this case) *
#     number of words in the vocabulary/tokens

# Here's a trivial example ... Let's suppose we have 3 documents:

#     Doc1: Hello, World, the sun is shining
#     Doc2: Hello world, the weather is nice
#     Doc3: Hello world, the wind is cold


# Then, our vocabulary would look like this (using 1-grams without stop word removal):

#     Vocabulary: [hello, world, the, wind, weather, sun, is, shining, nice, cold]


# The corresponding, binary feature vectors are:

#     Doc1: [1, 1, 1, 0, 0, 0, 1, 1, 0, 0]
#     Doc2: [1, 1, 1, 0, 0, 1, 0, 1, 1, 0]
#     Doc3: [1, 1, 1, 1, 0, 0, 1, 0, 0, 1]


# Which we use to construct the dense matrix / document term matrix:

#     [[1, 1, 1, 0, 0, 0, 1, 1, 0, 0]
#      [1, 1, 1, 0, 0, 1, 0, 1, 1, 0]
#      [1, 1, 1, 1, 0, 0, 1, 0, 0, 1]]

In [53]:
X_train_vectorized = vect.transform(X_train)
X_train_vectorized

<24589x21907 sparse matrix of type '<class 'numpy.int64'>'
	with 431916 stored elements in Compressed Sparse Row format>

In [54]:
X_train_vectorized.shape

(24589, 21907)

In [27]:
#model = LogisticRegression()
#model.fit(X_train_vectorized, y_train)

# Model Tuning

# logistic regression

In [55]:
# LogisticRegression Algorithm tuning
kfold = KFold(n_splits=5, random_state=7)
model = LogisticRegression()
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space, 'penalty': ['l1', 'l2']}
scoring = 'roc_auc'
logreg_cv = RandomizedSearchCV(estimator=model, param_distributions=param_grid, scoring=scoring, n_jobs=-1, cv=kfold)
logresult=logreg_cv.fit(X_train_vectorized, y_train)
print("Best: %f using %s" % (logresult.best_score_, logresult.best_params_))
means = logresult.cv_results_['mean_test_score']
stds = logresult.cv_results_['std_test_score']
params = logresult.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.959506 using {'penalty': 'l2', 'C': 0.4393970560760795}
0.898060 (0.010759) with: {'penalty': 'l2', 'C': 163789.3706954068}
0.872224 (0.012931) with: {'penalty': 'l1', 'C': 1389495.494373136}
0.884193 (0.010541) with: {'penalty': 'l1', 'C': 19306.977288832535}
0.500000 (0.000000) with: {'penalty': 'l1', 'C': 8.483428982440725e-05}
0.954771 (0.002731) with: {'penalty': 'l1', 'C': 0.4393970560760795}
0.959506 (0.002424) with: {'penalty': 'l2', 'C': 0.4393970560760795}
0.926465 (0.004603) with: {'penalty': 'l2', 'C': 268.2695795279727}
0.931290 (0.005036) with: {'penalty': 'l1', 'C': 31.622776601683793}
0.869153 (0.011385) with: {'penalty': 'l1', 'C': 100000000.0}
0.954474 (0.003170) with: {'penalty': 'l2', 'C': 0.05179474679231213}


In [61]:
print("Accuracy: {}".format(logreg_cv.score(vect.transform(X_test), y_test)))

Accuracy: 0.9603940947754046


In [57]:
# Predict the labels of the test set: y_pred using Logistic Regression
y_pred5 = logreg_cv.predict(vect.transform(X_test))
# Compute and print metrics

print(classification_report(y_test, y_pred5))
print("Tuned Model Parameters: {}".format(logreg_cv.best_params_))

             precision    recall  f1-score   support

          0       0.88      0.80      0.84      1536
          1       0.94      0.96      0.95      4612

avg / total       0.92      0.92      0.92      6148

Tuned Model Parameters: {'penalty': 'l2', 'C': 0.4393970560760795}


In [59]:
from sklearn.metrics import roc_auc_score

# Predict the transformed test documents

print('AUC: ', roc_auc_score(y_test, y_pred5))

AUC:  0.8819604428664354


# Nomial NaÏve Bayes

In [63]:
from sklearn.naive_bayes import MultinomialNB
# MultinomialNB Algorithm tuning
kfold = KFold(n_splits=5, random_state=7)
model = MultinomialNB()
# Create the list of alphas: alphas
alphas = np.arange(0,1,0.1)
param_grid = {'alpha': alphas}
scoring = 'roc_auc'
nb_cv = RandomizedSearchCV(estimator=model, param_distributions=param_grid, scoring=scoring, n_jobs=-1, cv=kfold)
nbresult=nb_cv.fit(X_train_vectorized, y_train)
print("Best: %f using %s" % (nbresult.best_score_, nbresult.best_params_))
means = nbresult.cv_results_['mean_test_score']
stds = nbresult.cv_results_['std_test_score']
params = nbresult.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Best: 0.946520 using {'alpha': 0.4}
0.847892 (0.002268) with: {'alpha': 0.0}
0.942690 (0.002308) with: {'alpha': 0.1}
0.945221 (0.002068) with: {'alpha': 0.2}
0.946190 (0.002043) with: {'alpha': 0.30000000000000004}
0.946520 (0.002114) with: {'alpha': 0.4}
0.946520 (0.002180) with: {'alpha': 0.5}
0.946335 (0.002338) with: {'alpha': 0.6000000000000001}
0.946028 (0.002462) with: {'alpha': 0.7000000000000001}
0.945647 (0.002532) with: {'alpha': 0.8}
0.945098 (0.002626) with: {'alpha': 0.9}


In [64]:
# Predict the labels of the test set: y_pred using Logistic Regression
y_pred5 = nb_cv.predict(vect.transform(X_test))
# Compute and print metrics
print("Accuracy: {}".format(nb_cv.score(vect.transform(X_test), y_test)))
print(classification_report(y_test, y_pred5))
print("Tuned Model Parameters: {}".format(nb_cv.best_params_))

Accuracy: 0.9450336333884432
             precision    recall  f1-score   support

          0       0.86      0.80      0.83      1536
          1       0.93      0.96      0.94      4612

avg / total       0.91      0.92      0.91      6148

Tuned Model Parameters: {'alpha': 0.4}


In [65]:
from sklearn.metrics import roc_auc_score

# Predict the transformed test documents

print('AUC: ', roc_auc_score(y_test, y_pred5))

AUC:  0.8761044557675629


# Tf–idf

In [66]:
# Tf–idf, or Term frequency-inverse document frequency
# allows us to weight terms based on how important they are to a document.
# high weight is given to terms that appear often in a particular document, 
# but don't appear often in the corpus. 

# Features with low tf–idf are either commonly used across all documents 
# or rarely used and only occur in long documents.

# Features with high tf–idf are frequently used within specific documents, 
# but rarely used across all documents.
# Similar to how we used CountVectorizer, 
# we'll instantiate the tf–idf vectorizer and fit it to our training data.

# mindf, which allows us to specify a minimum number of documents 
# in which a token needs to appear to become part of the vocabulary

In [67]:
vect = TfidfVectorizer(min_df=5).fit(X_train)
len(vect.get_feature_names())

5577

In [68]:
X_train_vectorized = vect.transform(X_train)

In [69]:
# LogisticRegression Algorithm tuning
kfold = KFold(n_splits=5, random_state=7)
model = LogisticRegression()
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space, 'penalty': ['l1', 'l2']}
scoring = 'roc_auc'
logreg_cv = RandomizedSearchCV(estimator=model, param_distributions=param_grid, scoring=scoring, n_jobs=-1, cv=kfold)
logresult=logreg_cv.fit(X_train_vectorized, y_train)
print("Best: %f using %s" % (logresult.best_score_, logresult.best_params_))
means = logresult.cv_results_['mean_test_score']
stds = logresult.cv_results_['std_test_score']
params = logresult.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.956385 using {'penalty': 'l2', 'C': 0.05179474679231213}
0.933282 (0.003799) with: {'penalty': 'l2', 'C': 2275.845926074791}
0.887157 (0.008853) with: {'penalty': 'l1', 'C': 2275.845926074791}
0.886106 (0.018924) with: {'penalty': 'l2', 'C': 11787686.347935867}
0.875220 (0.011438) with: {'penalty': 'l2', 'C': 100000000.0}
0.500000 (0.000000) with: {'penalty': 'l1', 'C': 0.0007196856730011522}
0.858738 (0.002169) with: {'penalty': 'l2', 'C': 8.483428982440725e-05}
0.500000 (0.000000) with: {'penalty': 'l1', 'C': 8.483428982440725e-05}
0.956385 (0.002599) with: {'penalty': 'l2', 'C': 0.05179474679231213}
0.866194 (0.008350) with: {'penalty': 'l1', 'C': 19306.977288832535}
0.912671 (0.007466) with: {'penalty': 'l2', 'C': 19306.977288832535}


In [70]:
print("Accuracy: {}".format(logreg_cv.score(vect.transform(X_test), y_test)))

Accuracy: 0.9553116784339766


In [71]:
# Predict the labels of the test set: y_pred using Logistic Regression
y_pred5 = logreg_cv.predict(vect.transform(X_test))
# Compute and print metrics

print(classification_report(y_test, y_pred5))
print("Tuned Model Parameters: {}".format(logreg_cv.best_params_))

             precision    recall  f1-score   support

          0       0.94      0.46      0.61      1536
          1       0.85      0.99      0.91      4612

avg / total       0.87      0.86      0.84      6148

Tuned Model Parameters: {'penalty': 'l2', 'C': 0.05179474679231213}


In [72]:
from sklearn.metrics import roc_auc_score

# Predict the transformed test documents

print('AUC: ', roc_auc_score(y_test, y_pred5))

AUC:  0.7230941362207286


In [73]:
# MultinomialNB Algorithm tuning
kfold = KFold(n_splits=5, random_state=7)
model = MultinomialNB()
# Create the list of alphas: alphas
alphas = np.arange(0,1,0.1)
param_grid = {'alpha': alphas}
scoring = 'roc_auc'
nb_cv = RandomizedSearchCV(estimator=model, param_distributions=param_grid, scoring=scoring, n_jobs=-1, cv=kfold)
nbresult=nb_cv.fit(X_train_vectorized, y_train)
print("Best: %f using %s" % (nbresult.best_score_, nbresult.best_params_))
means = nbresult.cv_results_['mean_test_score']
stds = nbresult.cv_results_['std_test_score']
params = nbresult.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Best: 0.964037 using {'alpha': 0.4}
0.911889 (0.007580) with: {'alpha': 0.0}
0.963279 (0.001009) with: {'alpha': 0.1}
0.963804 (0.000965) with: {'alpha': 0.2}
0.964006 (0.000928) with: {'alpha': 0.30000000000000004}
0.964037 (0.000936) with: {'alpha': 0.4}
0.964028 (0.000924) with: {'alpha': 0.5}
0.963994 (0.000890) with: {'alpha': 0.6000000000000001}
0.963994 (0.000878) with: {'alpha': 0.7000000000000001}
0.963989 (0.000834) with: {'alpha': 0.8}
0.963993 (0.000813) with: {'alpha': 0.9}


In [74]:
# Predict the labels of the test set: y_pred using Logistic Regression
y_pred5 = nb_cv.predict(vect.transform(X_test))
# Compute and print metrics
print("Accuracy: {}".format(nb_cv.score(vect.transform(X_test), y_test)))
print(classification_report(y_test, y_pred5))
print("Tuned Model Parameters: {}".format(nb_cv.best_params_))

Accuracy: 0.9629978379544305
             precision    recall  f1-score   support

          0       0.89      0.72      0.80      1536
          1       0.91      0.97      0.94      4612

avg / total       0.91      0.91      0.90      6148

Tuned Model Parameters: {'alpha': 0.4}


In [75]:
from sklearn.metrics import roc_auc_score

# Predict the transformed test documents

print('AUC: ', roc_auc_score(y_test, y_pred5))

AUC:  0.8470184776127494


# TfidfVectorizer+n-gram

In [76]:
# One way we can add some context is by adding sequences of word features known as n-grams. 

# For example, bigrams, which count pairs of adjacent words, 
# could give us features such as is working versus not working. 
# And trigrams, which give us triplets of adjacent words, 
# could give us features such as not an issue.

# To create these n-gram features, 
# we'll pass in a tuple to the parameter ngram_range, 
# where the values correspond to the minimum length and maximum lengths of sequences.

# For example, if I pass in the tuple, 1, 2, 
# CountVectorizer will create features using the individual words, 
# as well as the bigrams.

In [77]:
vect = TfidfVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
len(vect.get_feature_names())

17594

In [78]:
# LogisticRegression Algorithm tuning
kfold = KFold(n_splits=5, random_state=7)
model = LogisticRegression()
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space, 'penalty': ['l1', 'l2']}
scoring = 'roc_auc'
logreg_cv = RandomizedSearchCV(estimator=model, param_distributions=param_grid, scoring=scoring, n_jobs=-1, cv=kfold)
logresult=logreg_cv.fit(X_train_vectorized, y_train)
print("Best: %f using %s" % (logresult.best_score_, logresult.best_params_))
means = logresult.cv_results_['mean_test_score']
stds = logresult.cv_results_['std_test_score']
params = logresult.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.965729 using {'penalty': 'l2', 'C': 268.2695795279727}
0.941318 (0.005657) with: {'penalty': 'l1', 'C': 163789.3706954068}
0.954273 (0.003934) with: {'penalty': 'l1', 'C': 268.2695795279727}
0.929916 (0.006720) with: {'penalty': 'l1', 'C': 11787686.347935867}
0.955284 (0.002589) with: {'penalty': 'l2', 'C': 100000000.0}
0.965729 (0.002511) with: {'penalty': 'l2', 'C': 268.2695795279727}
0.879088 (0.002774) with: {'penalty': 'l2', 'C': 8.483428982440725e-05}
0.935931 (0.007165) with: {'penalty': 'l1', 'C': 1389495.494373136}
0.500000 (0.000000) with: {'penalty': 'l1', 'C': 0.0007196856730011522}
0.960535 (0.003025) with: {'penalty': 'l2', 'C': 2275.845926074791}
0.500000 (0.000000) with: {'penalty': 'l1', 'C': 1e-05}


In [79]:
print("Accuracy: {}".format(logreg_cv.score(vect.transform(X_test), y_test)))

Accuracy: 0.966251493499747


In [80]:
# Predict the labels of the test set: y_pred using Logistic Regression
y_pred5 = logreg_cv.predict(vect.transform(X_test))
# Compute and print metrics

print(classification_report(y_test, y_pred5))
print("Tuned Model Parameters: {}".format(logreg_cv.best_params_))

             precision    recall  f1-score   support

          0       0.86      0.84      0.85      1536
          1       0.95      0.95      0.95      4612

avg / total       0.92      0.93      0.92      6148

Tuned Model Parameters: {'penalty': 'l2', 'C': 268.2695795279727}


In [82]:
from sklearn.metrics import roc_auc_score

# Predict the transformed test documents

print('AUC: ', roc_auc_score(y_test, y_pred5))

AUC:  0.896070204087164


In [83]:
# MultinomialNB Algorithm tuning
kfold = KFold(n_splits=5, random_state=7)
model = MultinomialNB()
# Create the list of alphas: alphas
alphas = np.arange(0,1,0.1)
param_grid = {'alpha': alphas}
scoring = 'roc_auc'
nb_cv = RandomizedSearchCV(estimator=model, param_distributions=param_grid, scoring=scoring, n_jobs=-1, cv=kfold)
nbresult=nb_cv.fit(X_train_vectorized, y_train)
print("Best: %f using %s" % (nbresult.best_score_, nbresult.best_params_))
means = nbresult.cv_results_['mean_test_score']
stds = nbresult.cv_results_['std_test_score']
params = nbresult.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Best: 0.970141 using {'alpha': 0.1}
0.903113 (0.003296) with: {'alpha': 0.0}
0.970141 (0.000443) with: {'alpha': 0.1}
0.970069 (0.000578) with: {'alpha': 0.2}
0.969918 (0.000617) with: {'alpha': 0.30000000000000004}
0.969764 (0.000666) with: {'alpha': 0.4}
0.969693 (0.000708) with: {'alpha': 0.5}
0.969630 (0.000723) with: {'alpha': 0.6000000000000001}
0.969615 (0.000765) with: {'alpha': 0.7000000000000001}
0.969634 (0.000793) with: {'alpha': 0.8}
0.969644 (0.000818) with: {'alpha': 0.9}


In [84]:
# Predict the labels of the test set: y_pred using Logistic Regression
y_pred5 = nb_cv.predict(vect.transform(X_test))
# Compute and print metrics
print("Accuracy: {}".format(nb_cv.score(vect.transform(X_test), y_test)))
print(classification_report(y_test, y_pred5))
print("Tuned Model Parameters: {}".format(nb_cv.best_params_))

Accuracy: 0.9694993613806374
             precision    recall  f1-score   support

          0       0.88      0.80      0.84      1536
          1       0.93      0.97      0.95      4612

avg / total       0.92      0.92      0.92      6148

Tuned Model Parameters: {'alpha': 0.1}


In [85]:
from sklearn.metrics import roc_auc_score

# Predict the transformed test documents

print('AUC: ', roc_auc_score(y_test, y_pred5))

AUC:  0.8817424878939
