In [None]:
###########################   Part 1 : Data Preprocessing   ############################

In [None]:
# Importing the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Loading the data
# Read the raw reviews CSV into a DataFrame and preview its first rows.
file_name = "reviews.csv"
reviews = pd.read_csv(filepath_or_buffer=file_name)
reviews.head(n=5)

In [None]:
# Visualizing the score
# One bar per score value. `sns.distplot` is deprecated in current seaborn,
# and a count plot avoids arbitrary histogram binning of the score column.
ax = sns.countplot(x="Score", data=reviews)
ax.set(title="Distribution of review scores", xlabel="Score", ylabel="Number of reviews");

In [None]:
# Removing rows with score of 3
# Keep only rows whose Score is 1, 2, 4 or 5.
# .copy() makes the result an independent frame: the Score column is overwritten
# further down, and writing to a bare slice of `reviews` triggers pandas'
# SettingWithCopyWarning.
filtered_reviews = reviews.loc[reviews["Score"].isin([1, 2, 4, 5])].copy()

def category(x):
    """Map a score to a binary label: 1 when it is greater than 3, otherwise 0."""
    return 1 if x > 3 else 0
    
# Apply `category` to every Score value, producing 1 for scores above 3 and 0 otherwise.
ratings = filtered_reviews["Score"].map(category)
# Overwrite the Score column with the binary labels.
# NOTE(review): `filtered_reviews` is produced by a `.loc` selection on `reviews`;
# pandas may emit SettingWithCopyWarning on this assignment — confirm.
filtered_reviews["Score"] = ratings

# Preview the frame with the relabelled Score column.
filtered_reviews.head()

In [None]:
# Data cleaning
# Order rows by ProductId, then keep only the first row for each
# (UserId, ProfileName, Time, Text) combination.
sorted_reviews = filtered_reviews.sort_values(by="ProductId")
duplicate_key = ["UserId", "ProfileName", "Time", "Text"]
final_reviews = sorted_reviews.drop_duplicates(subset=duplicate_key, keep="first")
final_reviews.shape

In [None]:
# % of data left
# Compare row counts. DataFrame.size is rows x columns, so a size ratio is only
# a row ratio while both frames have exactly the same columns; len() stays
# correct even after columns are added to `final_reviews`.
len(final_reviews) / len(filtered_reviews) * 100

In [None]:
# Removing incorrect entries
# Rows whose HelpfulnessNumerator exceeds HelpfulnessDenominator are treated as
# invalid and dropped.
valid_helpfulness = final_reviews["HelpfulnessNumerator"] <= final_reviews["HelpfulnessDenominator"]
# .copy() detaches the result from the frame it was filtered from, so adding a
# column to `final_reviews` later does not raise SettingWithCopyWarning.
final_reviews = final_reviews.loc[valid_helpfulness].copy()
final_reviews.shape

In [None]:
# Value counts
# Number of rows for each distinct value in the Score column.
score_counts = final_reviews["Score"].value_counts()
score_counts

In [None]:
# Making a list of stopwords
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Download the NLTK "stopwords" resource before reading from it.
nltk.download("stopwords")

# English stop-word list and English Snowball stemmer used by the text cleaning step.
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

print(stop_words)

In [None]:
# Text preprocessing
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
import contractions

# Set membership is O(1); the stop-word list would otherwise be scanned once per token.
stop_word_set = set(stop_words)


def clean_review(text):
    """Return `text` as lower-case, stop-word-free, stemmed tokens joined by single spaces.

    Steps, in order: strip URLs, strip HTML markup, expand contractions, drop any
    whitespace-delimited token containing a digit, replace every run of
    non-letters with a space, lower-case, remove stop words, stem each token.

    Args:
        text: Raw review text (a string).

    Returns:
        The cleaned review as a single string.
    """
    text = re.sub(r"http\S+", "", text)
    text = BeautifulSoup(text, 'lxml').get_text()
    text = contractions.fix(text)
    # Raw string: "\S" and "\d" are not valid escapes in a normal string literal.
    text = re.sub(r"\S*\d\S*", "", text).strip()
    text = re.sub('[^A-Za-z]+', ' ', text)
    tokens = text.lower().split()
    # The stemmer operates on a single word, so it is applied per token. Stop
    # words are filtered first because the stop-word list holds unstemmed forms.
    return " ".join(stemmer.stem(token) for token in tokens if token not in stop_word_set)


# One cleaned string per review, in the same order as final_reviews['Text'].
preprocessed_reviews = [clean_review(sentence) for sentence in tqdm(final_reviews['Text'].values)]

In [None]:
# Saving the cleaned text into a csv
# Attach the cleaned strings as a new column, then write the whole frame
# (without the index) to disk.
final_reviews["Cleaned Text"] = preprocessed_reviews
final_reviews.to_csv("cleaned_reviews.csv", index=False)

In [None]:
# Creating vocabulary
# Flatten every cleaned review into one list of words (repeats are kept).
corpus = [
    word.strip()
    for text in tqdm(final_reviews["Cleaned Text"])
    for word in text.strip().split()
]

print(len(corpus))

In [None]:
# Word count
from collections import Counter

# Tally how many times each word occurs in the corpus.
word_count = Counter(corpus)
unique_words = len(word_count)
print("Unique words =", unique_words)
word_count.most_common(5)

In [None]:
# Creating word count dataframe
# One row per word: its 1-based position in the most-common ordering, the word
# itself, and its number of occurrences.
word_count_rows = [
    [rank, word, count]
    for rank, (word, count) in enumerate(word_count.most_common(len(word_count)), start=1)
]

word_count_df = pd.DataFrame(data=word_count_rows, columns=["Index", "Word", "Count"])
word_count_df.head()

In [None]:
# Creating word count dictionary
# Maps each word to [index, count]. The dictionary is built straight from the
# three columns instead of DataFrame.iterrows(), which constructs a Series for
# every row and is far slower on a large vocabulary.
word_count_dict = {
    word: [index, count]
    for index, word, count in zip(
        word_count_df["Index"].tolist(),
        word_count_df["Word"].tolist(),
        word_count_df["Count"].tolist(),
    )
}

# Preview a handful of entries; printing the full vocabulary floods the output.
print(list(word_count_dict.items())[:5])

In [None]:
# Data preprocessing
# Replace every word of every cleaned review with the index stored for it in
# `word_count_dict`, giving one list of integers per review.
indexed_X = [
    [word_count_dict[token][0] for token in review.strip().split()]
    for review in final_reviews["Cleaned Text"]
]

# Labels are the Score column of the same frame.
indexed_y = final_reviews["Score"]

In [None]:
# Data loader
from sklearn.model_selection import train_test_split

def load_data(num_words, test_size=0.33, random_state=None):
    """Cap the vocabulary of the indexed reviews and split them into train and test sets.

    Reads the module-level `indexed_X` (lists of word indices) and `indexed_y`
    (labels) without modifying them.

    Args:
        num_words: Largest word index to keep; every index greater than this is
            replaced by 0.
        test_size: Forwarded to `train_test_split`. Defaults to 0.33.
        random_state: Forwarded to `train_test_split`. The default None leaves
            the split unseeded; pass an integer for a reproducible split.

    Returns:
        A pair of pairs: `(X_train, y_train), (X_test, y_test)`.
    """
    # Build new lists rather than editing `indexed_X` in place. In-place edits
    # would permanently erase indices above `num_words`, so a later call with a
    # larger `num_words` would silently return data capped by the earlier call.
    # NOTE(review): 0 is also the default fill value of Keras `pad_sequences`,
    # so out-of-vocabulary words and padding share an index — confirm intended.
    X = [[index if index <= num_words else 0 for index in review] for review in indexed_X]
    y = indexed_y

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return (X_train, y_train), (X_test, y_test)

In [None]:
###########################   Part 2 : Model Building   ############################

In [None]:
# Importing libraries
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [None]:
# Keep word indices up to `top_words`, then unpack the train and test splits.
top_words = 5000
train_split, test_split = load_data(num_words=top_words)
X_train, y_train = train_split
X_test, y_test = test_split

In [None]:
# Truncate and/or pad input sequences
# Bring every review to exactly `max_review_length` entries, for both splits.
max_review_length = 600

X_train, X_test = (
    sequence.pad_sequences(split, maxlen=max_review_length)
    for split in (X_train, X_test)
)

print(X_train.shape)
print(X_train[1])

In [None]:
# Build the model
# Embedding -> LSTM(100) -> single sigmoid unit, trained with binary cross-entropy.
embedding_vecor_length = 32
model = Sequential()
# top_words + 1 embedding rows: the inputs contain index 0 plus 1..top_words.
model.add(Embedding(top_words+1, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print(), which would add a stray "None" line to the output.
model.summary()

In [None]:
###########################   Part 3 : Model Evaluation   ############################

In [None]:
# Run the model
# `epochs` is the keyword Keras has used since 2.0; the Keras 1 spelling
# `nb_epoch` is rejected by later releases.
model.fit(X_train, y_train, epochs=10, batch_size=64)
# The model is compiled with metrics=['accuracy'], so evaluate() returns
# [loss, accuracy]; index 1 is the accuracy.
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))