In [1]:
# Import necessary modules
import logging
import pandas as pd
import numpy as np
from numpy import random
import nltk
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re

In [2]:
# Import data 

# Read the CSV file into a DataFrame: df
df = pd.read_csv('Amazon_Unlocked_Mobile.csv')
# Comment out this line to match with lecture
df = df.sample(frac=0.1, random_state=10)

In [3]:
df = df[pd.notnull(df['Reviews'])]
df.head(10)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
394349,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,Very good one! Better than Samsung S and iphon...,0.0
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0
277158,Nokia N8 Unlocked GSM Touch Screen Phone Featu...,Nokia,95.0,5,I fell in love with this phone because it did ...,0.0
100311,Blackberry Torch 2 9810 Unlocked Phone with 1....,BlackBerry,77.49,5,I am pleased with this Blackberry phone! The p...,0.0
251669,Motorola Moto E (1st Generation) - Black - 4 G...,Motorola,89.99,5,"Great product, best value for money smartphone...",0.0
374058,Samsung Galaxy S7 Edge SM-G935F 32GB Factory U...,,593.5,4,except samsung pay everything is good,0.0
279878,OtterBox 77-29864 Defender Series Hybrid Case ...,OtterBox,9.99,5,I've bought 3 no problems. Fast delivery.,0.0


In [4]:
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
    text = text.lower() # lowercase text
    text = re.sub(r'[/(){}\[\]\|@,;.#+_]',' ', text) 
    text = re.sub(r'[^0-9a-z ]','', text) 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 
    return text

In [5]:
df['Reviews'] = df['Reviews'].apply(clean_text)

In [6]:
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
394349,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,good one better samsung iphones quality camera...,0.0
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,phone needed sim card would nice know,1.0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,3 months away upgrade stratosphere kept crappi...,3.0
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,experience want forget,0.0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,great phone work according expectations,1.0


In [7]:
df.describe()

Unnamed: 0,Price,Rating,Review Votes
count,40753.0,41374.0,40184.0
mean,227.631428,3.815319,1.49674
std,277.011277,1.551319,8.451689
min,1.73,1.0,0.0
25%,79.95,3.0,0.0
50%,140.0,5.0,0.0
75%,269.99,5.0,1.0
max,2408.73,5.0,524.0


In [8]:
maxLen = len(max(df['Reviews'], key=len).split())
maxLen

1408

In [9]:
df.shape[0] 

41374

In [10]:
train, test = train_test_split(df, test_size=0.3, random_state = 42)

In [11]:
train_X = train['Reviews'] 
test_X = test['Reviews']  

In [12]:
import warnings
warnings.filterwarnings("ignore")
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

Using TensorFlow backend.


In [13]:
max_words = 500
tokenize = text.Tokenizer(num_words=max_words, char_level=False)

In [14]:
tokenize.fit_on_texts(train_X) # only fit on train

In [15]:
vocab_size = len(tokenize.word_index) + 1

In [16]:
x_train = tokenize.texts_to_sequences(train_X)
x_test = tokenize.texts_to_sequences(test_X)

In [17]:
print(x_train[2])

[38, 62, 231, 1, 198, 61, 14, 10, 248, 68, 71, 294]


In [18]:
import warnings
warnings.filterwarnings("ignore")
import keras
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(train['Rating'])
y_train = encoder.transform(train['Rating'])
y_test = encoder.transform(test['Rating'])

# convert integers to dummy variables (i.e. one hot encoded)
y_train  = keras.utils.to_categorical(y_train, num_classes = 5)
# convert integers to dummy variables (i.e. one hot encoded)
y_test  = keras.utils.to_categorical(y_test, num_classes = 5)

In [19]:
from keras.preprocessing import sequence

max_words = 500
X_train = sequence.pad_sequences(x_train, maxlen=max_words, padding='post')
X_test = sequence.pad_sequences(x_test, maxlen=max_words, padding='post')

print(X_train[0])

[155   6  40  40   1  11  40 451  57   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   

In [20]:
from numpy import array
from numpy import asarray
from numpy import zeros
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.6B/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 100))
for word, i in tokenize.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Loaded 400000 word vectors.


In [21]:
# define model
model = Sequential()
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=500, trainable=False)
model.add(e)
model.add(LSTM(200))
model.add(Dense(5, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 100)          2450400   
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               240800    
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 1005      
Total params: 2,692,205
Trainable params: 241,805
Non-trainable params: 2,450,400
_________________________________________________________________
None


In [22]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [23]:
batch_size = 32
epochs = 3
model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Train on 26064 samples, validate on 2897 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1a18f82c18>

In [24]:
score = model.evaluate(X_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test accuracy:', score[1])

Test accuracy: 0.5414484814595688
