In [103]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
import string
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [104]:
df = pd.read_csv("data.csv")
df = df.sample(frac = 0.05, replace = True, random_state = 0)
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
305711,Samsung Focus I917 Unlocked Phone with Windows...,Samsung,29.99,5,the only down side is windows is moving away f...,0.0
117952,BLU Energy X 2 - With 4000 mAh Super Battery -...,BLU,109.99,4,good thank,0.0
152315,BLU Studio C Super Camera -Unlocked Smartphone...,BLU,99.0,5,Great phone.,0.0
358083,Samsung Galaxy S5 SM-G900H Factory Unlocked Ce...,Samsung,339.99,1,Cellphone stop working,0.0
359783,Samsung Galaxy S6 32GB SM-G920i - Unlocked Whi...,Samsung,459.99,5,I've only used it for about three days but I t...,0.0


In [105]:
df.isnull().sum().sum()

4167

In [106]:
len(df)

20692

In [107]:
df.dropna(inplace = True)
len(df)

16710

In [108]:
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
305711,Samsung Focus I917 Unlocked Phone with Windows...,Samsung,29.99,5,the only down side is windows is moving away f...,0.0
117952,BLU Energy X 2 - With 4000 mAh Super Battery -...,BLU,109.99,4,good thank,0.0
152315,BLU Studio C Super Camera -Unlocked Smartphone...,BLU,99.0,5,Great phone.,0.0
358083,Samsung Galaxy S5 SM-G900H Factory Unlocked Ce...,Samsung,339.99,1,Cellphone stop working,0.0
359783,Samsung Galaxy S6 32GB SM-G920i - Unlocked Whi...,Samsung,459.99,5,I've only used it for about three days but I t...,0.0


In [109]:
# Remove neutral ratings (=3)
df = df[df["Rating"] != 3]

# 4 & 5 -> Positive(1)
# 1 & 2 -> Negative(0)
df["Positively Rated"] = np.where(df["Rating"] > 3, 1, 0)
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
305711,Samsung Focus I917 Unlocked Phone with Windows...,Samsung,29.99,5,the only down side is windows is moving away f...,0.0,1
117952,BLU Energy X 2 - With 4000 mAh Super Battery -...,BLU,109.99,4,good thank,0.0,1
152315,BLU Studio C Super Camera -Unlocked Smartphone...,BLU,99.0,5,Great phone.,0.0,1
358083,Samsung Galaxy S5 SM-G900H Factory Unlocked Ce...,Samsung,339.99,1,Cellphone stop working,0.0,0
359783,Samsung Galaxy S6 32GB SM-G920i - Unlocked Whi...,Samsung,459.99,5,I've only used it for about three days but I t...,0.0,1


In [110]:
df["Positively Rated"].mean()

0.7463900091062833

In [111]:
# Lower Case
df["Reviews"] = df["Reviews"].str.lower()
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
305711,Samsung Focus I917 Unlocked Phone with Windows...,Samsung,29.99,5,the only down side is windows is moving away f...,0.0,1
117952,BLU Energy X 2 - With 4000 mAh Super Battery -...,BLU,109.99,4,good thank,0.0,1
152315,BLU Studio C Super Camera -Unlocked Smartphone...,BLU,99.0,5,great phone.,0.0,1
358083,Samsung Galaxy S5 SM-G900H Factory Unlocked Ce...,Samsung,339.99,1,cellphone stop working,0.0,0
359783,Samsung Galaxy S6 32GB SM-G920i - Unlocked Whi...,Samsung,459.99,5,i've only used it for about three days but i t...,0.0,1


In [112]:
df.drop(["Product Name", "Brand Name", "Price", "Review Votes", "Rating"], axis = 1, inplace = True)
df.head()

Unnamed: 0,Reviews,Positively Rated
305711,the only down side is windows is moving away f...,1
117952,good thank,1
152315,great phone.,1
358083,cellphone stop working,0
359783,i've only used it for about three days but i t...,1


In [113]:
# removing punctuation
def remove_punctuation(x):
  return x.translate(str.maketrans('', '', string.punctuation))
df["Reviews"] = df["Reviews"].apply(remove_punctuation)
df.head()

Unnamed: 0,Reviews,Positively Rated
305711,the only down side is windows is moving away f...,1
117952,good thank,1
152315,great phone,1
358083,cellphone stop working,0
359783,ive only used it for about three days but i th...,1


In [114]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [115]:
# removing stopwords
STOPWORDS = set(stopwords.words("english"))
STOPWORDS

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [116]:
def remove_stopwords(x):
  return " ".join([word for word in str(x).split() if word not in STOPWORDS])
df["Reviews"] = df["Reviews"].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,Reviews,Positively Rated
305711,side windows moving away platform zune pass lo...,1
117952,good thank,1
152315,great phone,1
358083,cellphone stop working,0
359783,ive used three days think get six stars really...,1


In [117]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [118]:
lemmatizer = WordNetLemmatizer()

In [119]:
def lemmatize_words(x):
  return " ".join([lemmatizer.lemmatize(word) for word in x.split()])
df["Reviews"] = df["Reviews"].apply(lambda x: lemmatize_words(x))
df.head()

Unnamed: 0,Reviews,Positively Rated
305711,side window moving away platform zune pas long...,1
117952,good thank,1
152315,great phone,1
358083,cellphone stop working,0
359783,ive used three day think get six star really l...,1


In [120]:
df["Positively Rated"].nunique()

2

In [121]:
df["Positively Rated"].unique()

array([1, 0])

In [132]:
max_features = 500
tokenizer = Tokenizer(num_words = max_features, split = ' ')
tokenizer.fit_on_texts(df["Reviews"].values)
X = tokenizer.texts_to_sequences(df["Reviews"].values)
X = pad_sequences(X)

In [133]:
X

array([[  0,   0,   0, ..., 125,   5,  63],
       [  0,   0,   0, ...,   0,   3, 209],
       [  0,   0,   0, ...,   0,   2,   1],
       ...,
       [  0,   0,   0, ..., 131,   5,  32],
       [  0,   0,   0, ..., 164,  52,  90],
       [  0,   0,   0, ...,   2,  75, 210]], dtype=int32)

In [134]:
df["Positively Rated"]

305711    1
117952    1
152315    1
358083    0
359783    1
         ..
361206    0
381962    0
80804     0
123221    0
133927    1
Name: Positively Rated, Length: 15374, dtype: int64

In [135]:
X_train, X_test, y_train, y_test = train_test_split(X, df["Positively Rated"], test_size = 0.33, random_state = 0)

In [136]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((10300, 1483), (5074, 1483), (10300,), (5074,))

In [137]:
batch_size = 16
embedding_dim = 64
lstm_out = 64

In [142]:
model = Sequential()
model.add(Embedding(max_features, embedding_dim, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout = 0.2))
model.add(Dense(1, activation = "sigmoid"))

In [143]:
model.compile(
    loss = "binary_crossentropy",
    optimizer = "adam",
    metrics = ["accuracy"]
)

In [144]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 1483, 64)          32000     
_________________________________________________________________
spatial_dropout1d_6 (Spatial (None, 1483, 64)          0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 65        
Total params: 65,089
Trainable params: 65,089
Non-trainable params: 0
_________________________________________________________________


In [145]:
history = model.fit(
    X_train,
    y_train,
    epochs = 5,
    batch_size = batch_size,
    validation_data = (X_test, y_test),
    verbose = 1
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [146]:
score, acc = model.evaluate(X_test, y_test, verbose = 1, batch_size = batch_size)



In [147]:
print("Score: %.2f" % (score))
print("Acc: %.2f" % (acc))

Score: 0.27
Acc: 0.89
