In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [3]:
df.isnull().sum().sum()

83462

In [4]:
len(df)

413840

In [5]:
df.dropna(inplace = True)
len(df)

334335

In [6]:
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [7]:
# Remove neutral ratings (=3)
df = df[df["Rating"] != 3]

# 4 & 5 -> Positive(1)
# 1 & 2 - > Negative(0)
df['Positively Rated'] = np.where(df["Rating"] > 3, 1, 0)
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0,1
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0,1
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0,1
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0,1
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0,1


In [8]:
df['Positively Rated'].mean()

0.7482686025879323

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df["Reviews"], df["Positively Rated"], random_state = 0)

In [10]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((231207,), (77070,), (231207,), (77070,))

In [11]:
print("X_train first entry:\n\n", X_train[0])
print("\n\nX_train shape: ", X_train.shape)

X_train first entry:

 I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!


X_train shape:  (231207,)


# CountVectorizer

In [12]:
vect = CountVectorizer().fit(X_train)

In [13]:
vect.get_feature_names()[::2000]

['00',
 '4less',
 'adr6275',
 'assignment',
 'blazingly',
 'cassettes',
 'condishion',
 'debi',
 'dollarsshipping',
 'esteem',
 'flashy',
 'gorila',
 'human',
 'irullu',
 'like',
 'microsaudered',
 'nightmarish',
 'p770',
 'poori',
 'quirky',
 'responseive',
 'send',
 'sos',
 'synch',
 'trace',
 'utiles',
 'withstanding']

In [14]:
len(vect.get_feature_names())

53216

In [15]:
vect.get_feature_names()

['00',
 '000',
 '0000',
 '00000',
 '000000',
 '0000000',
 '0000from',
 '0001',
 '0004',
 '000ma',
 '000mah',
 '000mh',
 '000restricted',
 '001',
 '002',
 '0051',
 '006',
 '007',
 '00am',
 '00bucks',
 '00emotional',
 '00for',
 '00it',
 '00k',
 '00now',
 '00pm',
 '00so',
 '00time',
 '00us',
 '01',
 '01008',
 '011',
 '012',
 '013287002557427',
 '013435003182980',
 '014',
 '0155379',
 '016',
 '016g',
 '016s',
 '018633051660f',
 '019s',
 '02',
 '02may13',
 '02mbps',
 '03',
 '032g',
 '032port',
 '0330',
 '03pm',
 '04',
 '0400',
 '0412',
 '04pm',
 '04th',
 '05',
 '050',
 '0500tkx',
 '050mms',
 '050prot',
 '051',
 '056',
 '0572013',
 '05th',
 '05the',
 '05using',
 '06',
 '061',
 '062',
 '0630',
 '066',
 '06pm',
 '07',
 '0700',
 '07am',
 '07nov2015',
 '08',
 '0804245',
 '0808',
 '0825',
 '0829',
 '087',
 '087581287',
 '08in',
 '08mms',
 '08this',
 '09',
 '0909853',
 '09on',
 '0_1439_7',
 '0_1507_7',
 '0_print_120716',
 '0_user_manual',
 '0a',
 '0also',
 '0an',
 '0b3tbzlidhq7dce1bv05qdefaota',
 

In [16]:
X_train_vectorized = vect.transform(X_train)
X_train_vectorized

<231207x53216 sparse matrix of type '<class 'numpy.int64'>'
	with 6117776 stored elements in Compressed Sparse Row format>

In [17]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
predictions = model.predict(vect.transform(X_test))
print("AUC: ", roc_auc_score(y_test, predictions))

AUC:  0.9206361352734463


In [19]:
feature_names = np.array(vect.get_feature_names())

In [20]:
sorted_coef_index = model.coef_[0].argsort()

In [21]:
model.coef_[0]

array([-0.25641072,  0.19678614,  0.02262264, ...,  0.00115638,
        0.15394112,  0.01221644])

In [22]:
sorted_coef_index

array([52310, 21272, 26705, ..., 18547, 18377, 18376])

In [23]:
print("Smallest Coefs:\n{}\n".format(feature_names[sorted_coef_index[:10]]))
print("Largest Coefs: \n{}\n".format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['worst' 'garbage' 'junk' 'unusable' 'false' 'worthless' 'useless'
 'crashing' 'disappointing' 'awful']

Largest Coefs: 
['excelent' 'excelente' 'exelente' 'loving' 'loves' 'perfecto' 'excellent'
 'complaints' 'awesome' 'buen']



# Tfidf

In [24]:
vect = TfidfVectorizer(min_df = 5).fit(X_train)

In [25]:
vect.get_feature_names()

['00',
 '000',
 '0000',
 '000000',
 '000mah',
 '007',
 '00pm',
 '01',
 '02',
 '03',
 '032g',
 '04',
 '04th',
 '05',
 '051',
 '06',
 '07',
 '08',
 '09',
 '0a',
 '0c',
 '0ghz',
 '0hd',
 '0ii',
 '0k',
 '0l',
 '0mp',
 '0s',
 '0stars',
 '10',
 '100',
 '1000',
 '10000',
 '1001multi',
 '100gb',
 '100hours',
 '100mb',
 '100s',
 '100x',
 '101',
 '102',
 '1020',
 '103',
 '104',
 '105',
 '106',
 '107',
 '1080',
 '1080i',
 '1080p',
 '1080x1920',
 '109',
 '10am',
 '10gb',
 '10mbps',
 '10min',
 '10mins',
 '10pm',
 '10screen',
 '10th',
 '10x',
 '10year',
 '10yo',
 '11',
 '110',
 '11059mem',
 '1109miami',
 '110v',
 '111',
 '112',
 '1122',
 '113',
 '114',
 '115',
 '118',
 '119',
 '119gb',
 '11gb',
 '11pm',
 '11th',
 '11yr',
 '12',
 '120',
 '1200',
 '120fps',
 '120mb',
 '1217asus',
 '123',
 '124',
 '124gb',
 '125',
 '126',
 '128',
 '1280',
 '1280x720',
 '128g',
 '128gb',
 '128mb',
 '129',
 '12gb',
 '12hrs',
 '12mm',
 '12mp',
 '12pm',
 '12th',
 '13',
 '130',
 '1300',
 '1320',
 '133',
 '1334',
 '135',
 '1

In [26]:
len(vect.get_feature_names())

17951

In [27]:
X_train_vectorized = vect.transform(X_train)
X_train_vectorized

<231207x17951 sparse matrix of type '<class 'numpy.float64'>'
	with 6056695 stored elements in Compressed Sparse Row format>

In [28]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
predictions = model.predict(vect.transform(X_test))

In [30]:
print("AUC: ", roc_auc_score(y_test, predictions))

AUC:  0.9265848398605042


In [32]:
feature_names = np.array(vect.get_feature_names())
sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()
print("Smallest tfidf:\n{}\n".format(feature_names[sorted_tfidf_index[:10]]))
print("Largest tfidf: \n{}\n".format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf:
['commenter' 'pthalo' 'warmness' 'storageso' 'aggregration' '1300'
 '625nits' 'a10' 'submarket' 'brawns']

Largest tfidf: 
['defective' 'batteries' 'gooood' 'epic' 'luis' 'goood' 'basico'
 'aceptable' 'problems' 'excellant']



In [33]:
sorted_coef_index = model.coef_[0].argsort()
print("Smallest Coefs: \n{}\n".format(feature_names[sorted_coef_index[:10]]))
print("Largest Coefs: \n{}\n".format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs: 
['not' 'worst' 'useless' 'disappointed' 'terrible' 'return' 'waste' 'poor'
 'horrible' 'doesn']

Largest Coefs: 
['love' 'great' 'excellent' 'perfect' 'amazing' 'awesome' 'perfectly'
 'easy' 'best' 'loves']



In [34]:
print(model.predict(vect.transform(["not an issue, phone is working",
                                    "an issue, phone is not working"])))

[0 0]


# n-grams

In [35]:
vect = CountVectorizer(min_df = 5, ngram_range = (1, 2)).fit(X_train)
X_train_vectorized = vect.transform(X_train)

In [36]:
vect.get_feature_names()

['00',
 '00 activation',
 '00 also',
 '00 am',
 '00 and',
 '00 as',
 '00 bucks',
 '00 but',
 '00 cheaper',
 '00 compared',
 '00 dlls',
 '00 dollars',
 '00 for',
 '00 however',
 '00 if',
 '00 in',
 '00 is',
 '00 it',
 '00 less',
 '00 mo',
 '00 month',
 '00 more',
 '00 no',
 '00 not',
 '00 on',
 '00 or',
 '00 per',
 '00 phone',
 '00 phones',
 '00 plus',
 '00 pm',
 '00 price',
 '00 smartwatch',
 '00 so',
 '00 that',
 '00 the',
 '00 this',
 '00 to',
 '00 too',
 '00 total',
 '00 unlocked',
 '00 usd',
 '00 with',
 '00 you',
 '000',
 '000 200',
 '000 but',
 '000 colors',
 '000 feet',
 '000 for',
 '000 mah',
 '000 models',
 '000 on',
 '000 would',
 '0000',
 '000000',
 '000mah',
 '000mah battery',
 '007',
 '007 james',
 '00pm',
 '01',
 '01 16',
 '01 24',
 '01 and',
 '01 day',
 '01 is',
 '01 un',
 '02',
 '02 and',
 '02 lolipop',
 '03',
 '032g',
 '032g gn6ma',
 '04',
 '04 12',
 '04 2016',
 '04th',
 '04th of',
 '05',
 '05 18',
 '051',
 '051 that',
 '06',
 '06 and',
 '07',
 '07 10',
 '07 16',
 '07 

In [37]:
len(vect.get_feature_names())

198917

In [38]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [39]:
predictions = model.predict(vect.transform(X_test))

In [40]:
print("AUC: ", roc_auc_score(y_test, predictions))

AUC:  0.9597157039590599


In [41]:
feature_names = np.array(vect.get_feature_names())
sorted_coef_index = model.coef_[0].argsort()
print("Smallest Coefs: \n{}\n".format(feature_names[sorted_coef_index[:10]]))
print("Largest Coefs: \n{}\n".format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs: 
['no good' 'not happy' 'not worth' 'junk' 'worst' 'not satisfied'
 'garbage' 'not good' 'defective' 'terrible']

Largest Coefs: 
['excelent' 'excelente' 'excellent' 'not bad' 'exelente' 'perfect'
 'no problems' 'awesome' 'no issues' 'perfecto']



In [42]:
print(model.predict(vect.transform(["not an issue, phone is working",
                                    "an issue, phone is not working"])))

[1 0]
