In [2]:
import pandas as pd
import numpy as np
df=pd.read_csv('Amazon_Unlocked_Mobile.csv')
#Taking a small fraction of the dataset
df = df.sample(frac=0.2, random_state=10)

In [3]:
#print some values 
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
394349,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,Very good one! Better than Samsung S and iphon...,0.0
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0


In [4]:
#dropping the missing values
df.dropna(inplace=True)
# Remove any Neutral ratings equal to 3
df = df[df['Rating'] != 3]
#Assigning positive and the negative rating accordingly
df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)
df.head(10)


Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0,0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0,1
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0,0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0,1
277158,Nokia N8 Unlocked GSM Touch Screen Phone Featu...,Nokia,95.0,5,I fell in love with this phone because it did ...,0.0,1
100311,Blackberry Torch 2 9810 Unlocked Phone with 1....,BlackBerry,77.49,5,I am pleased with this Blackberry phone! The p...,0.0,1
251669,Motorola Moto E (1st Generation) - Black - 4 G...,Motorola,89.99,5,"Great product, best value for money smartphone...",0.0,1
279878,OtterBox 77-29864 Defender Series Hybrid Case ...,OtterBox,9.99,5,I've bought 3 no problems. Fast delivery.,0.0,1
406017,Verizon HTC Rezound 4G Android Smarphone - 8MP...,HTC,74.99,4,Great phone for the price...,0.0,1
302567,"RCA M1 Unlocked Cell Phone, Dual Sim, 5Mp Came...",RCA,159.99,5,My mom is not good with new technoloy but this...,4.0,1


In [5]:
df['Positively Rated'].mean()
#hence most of the ratings are the positive

0.748061952511742

In [6]:
from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], 
                                                    df['Positively Rated'], 
                                                    random_state=0)

In [7]:
print('X_train last entry:\n\n', X_train.iloc[-1])
print('\n\nX_train shape: ', X_train.shape)

X_train last entry:

 Within 2weeks of owning the phone it shown a update that literally fried the phone. When it worked it was enjoyable fast and responsive.


X_train shape:  (46148,)


In [8]:
#Extraction of features using countvectorizer()

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the CountVectorizer to the training data
vect = CountVectorizer().fit(X_train)

In [10]:
vect.get_feature_names()[::2000]

['00',
 'adverts',
 'blitz',
 'condition',
 'distributers',
 'fiio',
 'hillside',
 'launge',
 'namely',
 'pixles',
 'remembers',
 'slooow',
 'thorn',
 'wasent']

In [11]:
len(vect.get_feature_names())

27015

In [12]:
# transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)

X_train_vectorized

<46148x27015 sparse matrix of type '<class 'numpy.int64'>'
	with 1220684 stored elements in Compressed Sparse Row format>

In [13]:
from sklearn.linear_model import LogisticRegression

# Train the model
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
from sklearn.metrics import roc_auc_score

# Predict the transformed test documents
predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.9043073762434041


In [15]:
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()


print('lowest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Highset Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

lowest Coefs:
['worst' 'junk' 'garbage' 'upset' 'terrible' 'useless' 'poor' 'sucked'
 'slow' 'waste']

Highset Coefs: 
['excelent' 'excelente' 'excellent' 'loves' 'perfect' 'love' 'awesome'
 'amazing' 'best' 'exactly']


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TfidfVectorizer to the training data specifiying a minimum document frequency of 5 or ignoring the features which has the frequenct less than 5

vect = TfidfVectorizer(min_df=5).fit(X_train)
len(vect.get_feature_names())

7754

In [17]:
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


AUC:  0.9037885795467111


In [18]:
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


AUC:  0.9037885795467111


In [19]:
feature_names = np.array(vect.get_feature_names())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf:
['prohibit' 'exposing' 'accompanying' 'bokeh' 'printer' 'stealing'
 'gibberish' 'pgb' 'centigrade' 'barometer']

Largest tfidf: 
['indestructible' 'recommended' 'flimsy' 'classy' 'tough' 'supper' 'fraud'
 'sin' 'adequate' 'superb']


In [20]:
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['not' 'worst' 'disappointed' 'slow' 'return' 'doesn' 'poor' 'terrible'
 'horrible' 'waste']

Largest Coefs: 
['great' 'love' 'excellent' 'best' 'perfect' 'amazing' 'good' 'far'
 'awesome' 'perfectly']


In [21]:
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[0 0]


In [22]:
#tfidf and countvectorizer() must be combined to predict the write results 



In [23]:
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

X_train_vectorized = vect.transform(X_train)

len(vect.get_feature_names())

51707

In [24]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


AUC:  0.9262738667002678


In [25]:
#most accurate till now

In [26]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['no good' 'junk' 'worst' 'poor' 'terrible' 'horrible' 'not good' 'broken'
 'not happy' 'not satisfied']

Largest Coefs: 
['excellent' 'excelente' 'excelent' 'perfect' 'not bad' 'great'
 'no problems' 'awesome' 'amazing' 'love']


In [27]:
print(model.predict(vect.transform(['netwrok problem unable to fix it',
                                    'very amazing quality of camera'])))

[0 1]


In [28]:
#precting the right output