In [8]:
import pandas as pd
import numpy as np
import json

df = pd.read_json('/Users/rickroma/PycharmProjects/BIA_660_D/Assignment_03/reviews.json')

In [9]:
df.dropna(inplace=True)
df[df['stars'] != 3]
df['good'] = np.where(df['stars'] > 3, 1, 0)
df.head()

Unnamed: 0,date,headline,name,stars,text,good
0,2017-03-04,Quality made and a fantastic deal! Let me sho...,GardenGuy,5,These Rockbirds LED Flashlights I bought came...,1
1,2018-02-10,"Miniature in size & price, but big on light ou...",NSB,4,"Given my many prior, poor experiences with sev...",1
10,2017-12-10,Not happy with modes!!!!,BillyBobBooooooJohnson,2,I ordered similar lights to this in the past.....,0
100,2017-07-13,Perfect size for dusk dog walking,Crazy Cattle Dog Lady,5,I bought these in lightening deal not entirely...,1
101,2013-05-16,"The best thing I've ever purchased for $5, han...",BiPolar Bear,5,It was a love at first sight. Perfect pocket s...,1


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['good'], random_state = 0)
print(X_train.head(10))
print(X_train.shape)

326    Slow ship from Digital center. Ordered Nov 13t...
7      Don't be fooled by their small appearance they...
175    I loved these Rockbird flashlights. I already ...
971                                   NICE FOR THE PRICE
166    Tell you the truth... i bought them for my mot...
806    The model I recieved was a simple on/off versi...
691                                               I like
517    One out of 4 light was bad, other than that wa...
927    For the price, these flashlights are great.  T...
132    I got one a couple of weeks ago and used it a ...
Name: text, dtype: object
(750,)


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer().fit(X_train)
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [12]:
vect.get_feature_names()[::2000]

['00', 'promised']

In [13]:
len(vect.get_feature_names())

2960

In [14]:
X_train_vectorized = vect.transform(X_train)
X_train_vectorized

<750x2960 sparse matrix of type '<class 'numpy.int64'>'
	with 23574 stored elements in Compressed Sparse Row format>

In [15]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
from sklearn.metrics import roc_auc_score
predictions = model.predict(vect.transform(X_test))
print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.71369047619


In [17]:
feature_names = np.array(vect.get_feature_names())
sorted_coef_index = model.coef_[0].argsort()
print('Smallest Coefs: \n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}\n'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs: 
['not' 'junk' 'weak' 'after' 'working' 'tho' 'doesn' 'don' 'but' 'turn']

Largest Coefs: 
['great' 'little' 'bright' 'excellent' 'aa' 'price' 'clip' 'easy' 'bought'
 'each']



In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(min_df = 5).fit(X_train)
len(vect.get_feature_names())

685

In [19]:
X_train_vectorized = vect.transform(X_train)
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)
predictions = model.predict(vect.transform(X_test))
print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.560119047619


In [23]:
feature_names = np.array(vect.get_feature_names())
sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()
print('Smallest Tfidf: \n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest Tfidf: \n{}\n'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest Tfidf: 
['rain' 'resistant' 'stays' 'note' 'body' 'bulb' 'nimh' 'backup' 'showed'
 'edc']

Largest Tfidf: 
['perfect' 'good' 'like' 'ok' 'weak' 'great' 'junk' 'thank' 'gifts' 'value']



In [27]:
print(model.predict(vect.transform(['Took a while since it shipped from China. It was worth the wait. Great little flashlight at a great price. Runs on 1 AA battery and puts out a lot of light.',"""Stopped working just after return period. Save your money.""","""I received the product 2 weeks after ordering. There wasn't any documentation for it, but it's a put the battery in and push the button product. The zoom feature is quite bright. It makes a square with grid lines caused by the magnification of the led. Get past that, and it's functional. I tested the lumens against my 300lm streamlight, and it's very close, 225-275lm. I will be purchasing more as the flood focus is just as functional. Great product.""","""Terrible. Did not light up consistently while pushing the button. Both flashlights had the same issue as they would eventually turn on after pushing the button repeatedly (5 or 6 times). These are junk."""])))

[1 1 1 0]


In [28]:
vect = CountVectorizer(min_df = 5, ngram_range = (1,2)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
len(vect.get_feature_names())

1525

In [29]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)
predictions = model.predict(vect.transform(X_test))
print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.69880952381


In [30]:
feature_names = np.array(vect.get_feature_names())
sorted_coef_index = model.coef_[0].argsort()
print('Smallest Coef: \n{}\n'.format(feature_names[sorted_coef_index][:10]))
print('Largest Coef: \n{}\n'.format(feature_names[sorted_coef_index][:-11:-1]))

Smallest Coef: 
['not' 'after' 'junk' 'cheap and' 'weak' 'working' 'no' 'bright as' 'doesn'
 'but']

Largest Coef: 
['great' 'bright' 'little' 'for the' 'excellent' 'much' 'aa' 'to be' 'clip'
 'works']



In [32]:
print(model.predict(vect.transform(['Took a while since it shipped from China. It was worth the wait. Great little flashlight at a great price. Runs on 1 AA battery and puts out a lot of light.',"""Stopped working just after return period. Save your money.""","""I received the product 2 weeks after ordering. There wasn't any documentation for it, but it's a put the battery in and push the button product. The zoom feature is quite bright. It makes a square with grid lines caused by the magnification of the led. Get past that, and it's functional. I tested the lumens against my 300lm streamlight, and it's very close, 225-275lm. I will be purchasing more as the flood focus is just as functional. Great product.""","""Terrible. Did not light up consistently while pushing the button. Both flashlights had the same issue as they would eventually turn on after pushing the button repeatedly (5 or 6 times). These are junk."""])))

[1 0 1 0]
