In [1]:
# Import necessary modules
import logging
import pandas as pd
import numpy as np
from numpy import random
import nltk
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re

In [2]:
# Import data 

# Read the CSV file into a DataFrame: df
df = pd.read_csv('Amazon_Unlocked_Mobile.csv')
# Comment out this line to match with lecture
df = df.sample(frac=0.1, random_state=10)

In [5]:
#recode y variable
df.dropna(inplace=True)
df[df['Rating'] != 3]
df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0,0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0,1
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0,0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0,1
277158,Nokia N8 Unlocked GSM Touch Screen Phone Featu...,Nokia,95.0,5,I fell in love with this phone because it did ...,0.0,1


In [6]:
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
    text = text.lower() # lowercase text
    text = re.sub(r'[/(){}\[\]\|@,;.#+_]',' ', text) 
    text = re.sub(r'[^0-9a-z ]','', text) 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 
    return text

In [7]:
df['Reviews'] = df['Reviews'].apply(clean_text)

In [8]:
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,phone needed sim card would nice know,1.0,0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,3 months away upgrade stratosphere kept crappi...,3.0,1
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,experience want forget,0.0,0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,great phone work according expectations,1.0,1
277158,Nokia N8 Unlocked GSM Touch Screen Phone Featu...,Nokia,95.0,5,fell love phone everything suppose 3g network ...,0.0,1


In [9]:
df.describe()

Unnamed: 0,Price,Rating,Review Votes,Positively Rated
count,33408.0,33408.0,33408.0,33408.0
mean,223.830373,3.82148,1.452526,0.68744
std,288.103094,1.541702,8.258734,0.463544
min,1.73,1.0,0.0,0.0
25%,74.5,3.0,0.0,0.0
50%,138.99,5.0,0.0,1.0
75%,264.1,5.0,1.0,1.0
max,2408.73,5.0,524.0,1.0


In [10]:
maxLen = len(max(df['Reviews'], key=len).split())
maxLen

1408

In [11]:
train, test = train_test_split(df, test_size=0.3, random_state = 42)

In [12]:
train_X = train['Reviews'] 
test_X= test['Reviews']
y_train=train['Positively Rated']
y_test=test['Positively Rated']

In [13]:
import warnings
warnings.filterwarnings("ignore")
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

Using TensorFlow backend.


In [14]:
max_words = 500
tokenize = text.Tokenizer(num_words=max_words, char_level=False)

In [15]:
tokenize.fit_on_texts(train_X) # only fit on train

In [16]:
vocab_size = len(tokenize.word_index) + 1

In [17]:
x_train = tokenize.texts_to_sequences(train_X)
x_test = tokenize.texts_to_sequences(test_X)

In [18]:
from keras.preprocessing import sequence

max_words = 500
X_train = sequence.pad_sequences(x_train, maxlen=max_words, padding='post')
X_test = sequence.pad_sequences(x_test, maxlen=max_words, padding='post')

In [19]:
from numpy import array
from numpy import asarray
from numpy import zeros
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.6B/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 100))
for word, i in tokenize.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Loaded 400000 word vectors.


In [20]:
# define model
model = Sequential()
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=500, trainable=False)
model.add(e)
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 100)          2183900   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 2,264,401
Trainable params: 80,501
Non-trainable params: 2,183,900
_________________________________________________________________
None


In [21]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [22]:
batch_size = 32
epochs = 3
model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Train on 21046 samples, validate on 2339 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1a3b270438>

In [23]:
score = model.evaluate(X_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test accuracy:', score[1])

Test accuracy: 0.6853237553805063
