In [34]:
# Import necessary modules
import logging
import pandas as pd
import numpy as np
from numpy import random
import nltk
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re

In [35]:
# Load the raw review data from disk.

# Read the CSV file into a DataFrame: df
# NOTE(review): latin-1 can decode any byte sequence, so a file that is really
# UTF-8 would load without error but with garbled non-ASCII characters —
# confirm the file's actual encoding.
df = pd.read_csv('Amazon_Unlocked_Mobile.csv',encoding="latin-1")

In [36]:
# Keep only the rows that actually have review text, then preview ten of them.
df = df.loc[df['Reviews'].notna()]
df.head(10)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0
5,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,1,I already had a phone with problems... I know ...,1.0
6,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,The charging port was loose. I got that solder...,0.0
7,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,"Phone looks good but wouldn't stay charged, ha...",0.0
8,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I originally was using the Samsung S2 Galaxy f...,0.0
9,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,3,It's battery life is great. It's very responsi...,0.0


In [37]:
STOPWORDS = set(stopwords.words('english'))

# Characters treated as word separators: the listed punctuation plus ANY
# whitespace. Whitespace must be included here because tabs and newlines are
# not in the allowed set below; otherwise they would be deleted outright and
# the words on either side would be fused into one token.
_SEPARATOR_RE = re.compile(r'[/(){}\[\]\|@,;.\s]')
# Everything except lowercase letters, digits, space, '#', '+' and '_' is dropped.
_DISALLOWED_RE = re.compile(r'[^0-9a-z #+_]')


def clean_text(text):
    """Normalize one review string for bag-of-words tokenization.

    Steps, in order:
      1. lowercase the text;
      2. replace separator punctuation and all whitespace with a space;
      3. delete every remaining character outside ``[0-9a-z #+_]``;
      4. drop tokens that appear in ``STOPWORDS``.

    Note that apostrophes are deleted in step 3, before the stop-word check,
    so a contraction such as "don't" is compared as "dont" and is only
    removed if that apostrophe-free form is itself in ``STOPWORDS``.

    Args:
        text: the raw review as a ``str``.

    Returns:
        The surviving tokens joined by single spaces (possibly an empty string).
    """
    text = text.lower()  # lowercase text
    text = _SEPARATOR_RE.sub(' ', text)
    text = _DISALLOWED_RE.sub('', text)
    return ' '.join(word for word in text.split() if word not in STOPWORDS)

In [38]:
# Overwrite every review with its cleaned form (clean_text is applied per element).
df['Reviews'] = df['Reviews'].map(clean_text)

In [39]:
# Preview the first rows to spot-check the current contents of the Reviews column.
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,feel lucky found used phone us used hard phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,nice phone nice grade pantach revue clean set ...,0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,works good goes slow sometimes good phone love,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,great phone replace lost phone thing volume bu...,0.0


In [40]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Price,Rating,Review Votes
count,407848.0,413778.0,401482.0
mean,226.867148,3.819609,1.506155
std,273.019444,1.548212,9.162444
min,1.73,1.0,0.0
25%,79.99,3.0,0.0
50%,144.71,5.0,0.0
75%,269.99,5.0,1.0
max,2598.0,5.0,645.0


In [41]:
# Largest number of whitespace-separated tokens found in any single review.
# Word counts are compared directly: the review with the most *characters*
# is not guaranteed to be the one with the most *words*.
maxLen = max(len(review.split()) for review in df['Reviews'])
maxLen

2780

In [42]:
# Number of rows currently in the review DataFrame.
len(df)

413778

In [43]:
# Hold out 30% of the rows as the test set; the fixed random_state makes the
# split repeatable across runs.
# NOTE(review): the split is not stratified on Rating, and the describe() output
# shows ratings skewed toward 5 — consider whether stratification is wanted.
train, test = train_test_split(df, test_size=0.3, random_state = 42)

In [44]:
# Model inputs: the review-text column of each split.
train_X, test_X = train['Reviews'], test['Reviews']

In [45]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

In [46]:
# Vocabulary cap handed to the tokenizer as num_words; the same value is used
# as the input width of the model built further down.
max_words = 1000
# Word-level tokenizer (char_level=False); it is fitted on the training text in a later cell.
tokenize = text.Tokenizer(num_words=max_words, char_level=False)

In [47]:
# Build the vocabulary from the training reviews only, so that no test-set
# text influences the word index.
tokenize.fit_on_texts(train_X) # only fit on train
# Encode every review as one fixed-width row with max_words columns
# (the shapes printed below confirm 1000 columns per split).
x_train = tokenize.texts_to_matrix(train_X)
x_test = tokenize.texts_to_matrix(test_X)

In [51]:
import warnings
# Silence all warnings from this point on in the session.
warnings.filterwarnings("ignore")
import keras
from sklearn.preprocessing import LabelEncoder

# Learn the rating -> integer-index mapping from the training labels only.
encoder = LabelEncoder().fit(train['Rating'])

# Encode both splits with that mapping, then expand each label into a
# 5-column one-hot row (i.e. dummy variables).
y_train = keras.utils.to_categorical(encoder.transform(train['Rating']), num_classes=5)
y_test = keras.utils.to_categorical(encoder.transform(test['Rating']), num_classes=5)

In [52]:
# Sanity check: within each split, the feature matrix and the one-hot label
# matrix should have the same number of rows.
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

x_train shape: (289644, 1000)
x_test shape: (124134, 1000)
y_train shape: (289644, 5)
y_test shape: (124134, 5)


In [53]:
# Training hyperparameters, reused by the fit and evaluate cells.
batch_size = 32
epochs = 30

# Bag-of-words classifier: a single 512-unit ReLU hidden layer with 50%
# dropout, followed by a 5-way softmax output.
model = Sequential([
    Dense(512, input_shape=(max_words,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(5),
    Activation('softmax'),
])

# Multi-class setup: one-hot targets with categorical cross-entropy.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [54]:
# Train the classifier. validation_split=0.1 reserves 10% of the training rows
# for validation (28,965 of 289,644 according to the logged output).
# The returned History object is not stored, so its repr is this cell's output.
model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Train on 260679 samples, validate on 28965 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x1a44ba85f8>

In [56]:
# Score the trained model on the held-out test split.
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
# score[1] is reported as accuracy, the only metric passed to compile().
print('Test accuracy:', score[1])

Test accuracy: 0.8396652005100875
