In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
import string

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv("data.csv")

In [3]:
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [4]:
df.isnull().sum().sum()

83462

In [5]:
len(df)

413840

In [6]:
df.dropna(inplace = True)
len(df)

334335

In [7]:
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [8]:
# Remove neutral ratings (=3)
df = df[df["Rating"] != 3]

# 4 & 5 -> Positive(1)
# 1 & 2 - > Negative(0)
df['Positively Rated'] = np.where(df["Rating"] > 3, 1, 0)
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0,1
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0,1
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0,1
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0,1
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0,1


In [9]:
df["Positively Rated"].mean()

0.7482686025879323

In [10]:
# Lower Case
df["Reviews"] = df["Reviews"].str.lower()
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,i feel so lucky to have found this used (phone...,1.0,1
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0,1
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,very pleased,0.0,1
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,it works good but it goes slow sometimes but i...,0.0,1
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,great phone to replace my lost phone. the only...,0.0,1


In [11]:
df.drop(["Product Name", "Brand Name", "Price", "Review Votes", "Rating"], axis = 1, inplace = True)
df.head()

Unnamed: 0,Reviews,Positively Rated
0,i feel so lucky to have found this used (phone...,1
1,"nice phone, nice up grade from my pantach revu...",1
2,very pleased,1
3,it works good but it goes slow sometimes but i...,1
4,great phone to replace my lost phone. the only...,1


In [12]:
# removing punctuations
def remove_punctuation(x):
    return x.translate(str.maketrans('', '', string.punctuation))
df["Reviews"] = df["Reviews"].apply(remove_punctuation)
df.head()

Unnamed: 0,Reviews,Positively Rated
0,i feel so lucky to have found this used phone ...,1
1,nice phone nice up grade from my pantach revue...,1
2,very pleased,1
3,it works good but it goes slow sometimes but i...,1
4,great phone to replace my lost phone the only ...,1


In [13]:
# Removing Stopwords
STOPWORDS = set(stopwords.words("english"))
STOPWORDS

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [14]:
def remove_stopwords(x):
    return " ".join([word for word in str(x).split() if word not in STOPWORDS])
df["Reviews"] = df["Reviews"].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,Reviews,Positively Rated
0,feel lucky found used phone us used hard phone...,1
1,nice phone nice grade pantach revue clean set ...,1
2,pleased,1
3,works good goes slow sometimes good phone love,1
4,great phone replace lost phone thing volume bu...,1


In [15]:
lemmatizer = WordNetLemmatizer()
def lemmatize_words(x):
    return " ".join([lemmatizer.lemmatize(word) for word in x.split()])
df["Reviews"] = df["Reviews"].apply(lambda x: lemmatize_words(x))
df.head()

Unnamed: 0,Reviews,Positively Rated
0,feel lucky found used phone u used hard phone ...,1
1,nice phone nice grade pantach revue clean set ...,1
2,pleased,1
3,work good go slow sometimes good phone love,1
4,great phone replace lost phone thing volume bu...,1


In [16]:
X_train, X_test, y_train, y_test = train_test_split(df["Reviews"], df["Positively Rated"], random_state = 0)

In [17]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((231207,), (77070,), (231207,), (77070,))

In [18]:
print("X_train first entry:\n\n", X_train[0])
print("\n\nX_train shape: ", X_train.shape)

X_train first entry:

 feel lucky found used phone u used hard phone line someone upgraded sold one son liked old one finally fell apart 25 year didnt want upgrade thank seller really appreciate honesty said used phonei recommend seller highly would


X_train shape:  (231207,)


# Count Vectorizer

In [19]:
vect = CountVectorizer().fit(X_train)

In [20]:
vect.get_feature_names()[::2000]

['00',
 '24284',
 '5band',
 'accesswhat',
 'amaterials',
 'aroundd',
 'barlessmy',
 'bluvalue',
 'cali',
 'charger4no',
 'compatiblehowever',
 'coveredmake',
 'defaultthe',
 'disconnectssort',
 'eaten',
 'eurasia',
 'fascinate',
 'fornow',
 'glassesother',
 'handeye',
 'housing',
 'innovating',
 'itprosperformance',
 'launchmicrosdup',
 'lovee',
 'messages6',
 'movie',
 'nirvana',
 'on4',
 'p9060',
 'phoneii',
 'poolcamera',
 'programsits',
 'rceived',
 'reporte',
 's5511',
 'sensitizing',
 'skinoverall',
 'specifiedclick',
 'summed',
 'thanksupdate',
 'tooeasy',
 'unedited',
 'v20',
 'wazegoogle',
 'worksince']

In [21]:
len(vect.get_feature_names())

91596

In [22]:
vect.get_feature_names()

['00',
 '0000',
 '00000',
 '000000',
 '0000000i',
 '00001',
 '00004102101008022010rm357nokia',
 '0000dns2',
 '0000from',
 '0000then',
 '0001',
 '0008mms',
 '000restricted',
 '001',
 '002',
 '00217825652089853',
 '002mbps',
 '005',
 '007',
 '01',
 '010',
 '0100',
 '0102mm',
 '01062014',
 '01082014both',
 '0109',
 '010min',
 '010minute',
 '010ovi',
 '011616mate',
 '012012',
 '01242016',
 '0128',
 '013287002557427',
 '013435003182980',
 '0142009r',
 '015gbroot',
 '016',
 '018633051660f',
 '01gbday',
 '01mp',
 '02',
 '020',
 '02042013update',
 '02042013well',
 '0205',
 '0206',
 '02062014',
 '020912',
 '02152013',
 '02152016to',
 '02172016',
 '02182016the',
 '02202014if',
 '02212016',
 '022212',
 '02222013',
 '02345',
 '024',
 '025',
 '026mm25d',
 '028',
 '02may13',
 '03',
 '03112016after',
 '031ah',
 '032016',
 '032415',
 '032415so',
 '032513',
 '03252016',
 '0328',
 '033',
 '0330',
 '035mm',
 '037',
 '039',
 '03mm',
 '03mp',
 '04',
 '0400',
 '040212the',
 '04052009',
 '041016well',
 '0411

In [23]:
X_train_vectorized = vect.transform(X_train)
X_train_vectorized

<231207x91596 sparse matrix of type '<class 'numpy.int64'>'
	with 3971125 stored elements in Compressed Sparse Row format>

In [24]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [25]:
predictions = model.predict(vect.transform(X_test))
print("AUC: ", roc_auc_score(y_test, predictions))

AUC:  0.9135687390601409


In [26]:
feature_names = np.array(vect.get_feature_names())

In [27]:
sorted_coef_index = model.coef_[0].argsort()

In [28]:
model.coef_[0]

array([-0.00307734,  0.08068761,  0.06906712, ...,  0.49364468,
        0.49364468,  0.04117734])

In [29]:
sorted_coef_index

array([90173, 13568, 90154, ..., 30427, 30688, 30425], dtype=int64)

In [30]:
print("Smallest Coefs:\n{}\n".format(feature_names[sorted_coef_index[:10]]))
print("Largest Coefs: \n{}\n".format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['worthless' 'blacklist' 'worst' 'false' 'mony' 'nope' 'junk'
 'unsatisfied' 'po' 'messing']

Largest Coefs: 
['excelent' 'exelente' 'excelente' 'lovely' 'amazed' 'loving' 'perfecto'
 'excellent' 'wonderfully' 'accident']



# Tfidf

In [31]:
vect = TfidfVectorizer(min_df = 5).fit(X_train)

In [32]:
vect.get_feature_names()

['0000',
 '000000',
 '007',
 '01',
 '0100',
 '02',
 '020',
 '03',
 '031ah',
 '03mm',
 '04',
 '04th',
 '05',
 '06',
 '07',
 '08',
 '09',
 '099',
 '0stars',
 '10',
 '100',
 '1000',
 '10000',
 '100000',
 '100150',
 '1001multi',
 '10051',
 '100august',
 '100gb',
 '100hours',
 '100i',
 '100mb',
 '100saludos',
 '100thank',
 '100x',
 '101',
 '1010',
 '1011',
 '1012',
 '1015',
 '102',
 '1020',
 '102016',
 '1020i',
 '1021',
 '103',
 '1030',
 '1031',
 '104',
 '105',
 '106',
 '107',
 '1080',
 '1080i',
 '1080p',
 '1080x1920',
 '109',
 '10am',
 '10connectivity',
 '10gb',
 '10ghz',
 '10i',
 '10if',
 '10lol',
 '10min',
 '10mins',
 '10mo',
 '10month',
 '10pm',
 '10th',
 '10x',
 '10year',
 '10yo',
 '11',
 '110',
 '1100',
 '11000',
 '110000',
 '11059mem',
 '1109miami',
 '110th',
 '110v',
 '111',
 '112',
 '1122',
 '112813damage',
 '1130',
 '114',
 '115',
 '116',
 '118',
 '119',
 '119gbbluetooth',
 '11gb',
 '11pm',
 '11th',
 '11yr',
 '12',
 '120',
 '1200',
 '12000',
 '120fps',
 '120mb',
 '121',
 '1214',
 

In [33]:
len(vect.get_feature_names())

20330

In [34]:
X_train_vectorized = vect.transform(X_train)
X_train_vectorized

<231207x20330 sparse matrix of type '<class 'numpy.float64'>'
	with 3852416 stored elements in Compressed Sparse Row format>

In [35]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [36]:
predictions = model.predict(vect.transform(X_test))

In [37]:
print("AUC: ", roc_auc_score(y_test, predictions))

AUC:  0.9127167328068838


In [38]:
feature_names = np.array(vect.get_feature_names())
sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()
print("Smallest tfidf:\n{}\n".format(feature_names[sorted_tfidf_index[:10]]))
print("Largest tfidf: \n{}\n".format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf:
['itdesignthe' 'testingcomparison' 'factorwith' 'nobrainernote'
 'belowwith' 'aggregration' 'driverupdate' 'usabilitypricebang'
 'highpowered' 'commenter']

Largest tfidf: 
['gostei' 'stated' 'got' 'complants' 'complaint' 'complains' 'oldschool'
 'star' 'complain' 'complacido']



In [39]:
sorted_coef_index = model.coef_[0].argsort()
print("Smallest Coefs: \n{}\n".format(feature_names[sorted_coef_index[:10]]))
print("Largest Coefs: \n{}\n".format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs: 
['worst' 'waste' 'disappointed' 'return' 'useless' 'terrible' 'poor'
 'returning' 'worthless' 'horrible']

Largest Coefs: 
['love' 'great' 'excellent' 'perfect' 'amazing' 'perfectly' 'awesome'
 'far' 'best' 'easy']



In [40]:
print(model.predict(vect.transform(["not an issue, phone is working",
                                    "an issue, phone is not working"])))

[0 0]


# n-grams

In [41]:
vect = CountVectorizer(min_df = 5, ngram_range = (1, 2)).fit(X_train)
X_train_vectorized = vect.transform(X_train)

In [42]:
vect.get_feature_names()

['0000',
 '000000',
 '007',
 '007 james',
 '01',
 '01 day',
 '01 un',
 '0100',
 '02',
 '020',
 '020 battery',
 '03',
 '031ah',
 '031ah 310',
 '03mm',
 '03mm ultra',
 '04',
 '04th',
 '04th march',
 '05',
 '06',
 '07',
 '08',
 '09',
 '099',
 '0stars',
 '10',
 '10 10',
 '10 12',
 '10 15',
 '10 20',
 '10 2015',
 '10 2015it',
 '10 30',
 '10 8085',
 '10 amazon',
 '10 another',
 '10 apps',
 '10 battery',
 '10 buck',
 '10 card',
 '10 cent',
 '10 charge',
 '10 charging',
 '10 computer',
 '10 couldnt',
 '10 day',
 '10 delete',
 '10 depressing',
 '10 device',
 '10 different',
 '10 dollar',
 '10 downloaded',
 '10 euro',
 '10 every',
 '10 everything',
 '10 foot',
 '10 ft',
 '10 gb',
 '10 ghz',
 '10 gig',
 '10 good',
 '10 great',
 '10 happy',
 '10 hour',
 '10 hr',
 '10 im',
 '10 inch',
 '10 insider',
 '10 installed',
 '10 iphone',
 '10 item',
 '10 le',
 '10 like',
 '10 lo',
 '10 love',
 '10 make',
 '10 mbps',
 '10 min',
 '10 minute',
 '10 minutesvery',
 '10 mobile',
 '10 month',
 '10 mp',
 '10 ounce

In [43]:
len(vect.get_feature_names())

178544

In [44]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [45]:
predictions = model.predict(vect.transform(X_test))

In [46]:
print("AUC: ", roc_auc_score(y_test, predictions))

AUC:  0.9472744084125012


In [47]:
feature_names = np.array(vect.get_feature_names())
sorted_coef_index = model.coef_[0].argsort()
print("Smallest Coefs: \n{}\n".format(feature_names[sorted_coef_index[:10]]))
print("Largest Coefs: \n{}\n".format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs: 
['ok best' 'worst' 'junk' 'garbage' 'look ok' 'horrible' 'ok could' 'nope'
 'worthless' 'cam ok']

Largest Coefs: 
['excelente' 'excelent' 'excellent' 'perfect' 'love' 'exelente' 'awesome'
 'amazing' 'perfectly' 'great']



In [69]:
print(model.predict(vect.transform(["not an issue, phone is working",
                                    "an issue, phone is not working"])))

[0 0]
