In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import wordcloud
import nltk
import re
import string         
import keras
import tensorflow as tf

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
from textblob import Word
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, SpatialDropout2D, LSTM
from keras.callbacks import EarlyStopping

In [None]:
# Load the Amazon Fine Food Reviews CSV and print its (rows, columns) shape
reviews_csv = '../input/amazon-fine-food-reviews/Reviews.csv'
df = pd.read_csv(reviews_csv)
print(df.shape)

In [None]:
df.head()

## Exploratory Data Analysis

In [None]:
df.describe()

In [None]:
df.info()

Based on the information above:
- there are no null values, so we do not have to handle missing data
- there are two types of columns, `int64` and `object` (in other words, strings)
- we will focus on the `Score`, `Summary` and `Text` columns, so we can drop the rest

In [None]:
# Keep only the columns used for sentiment modelling; every other column is dropped in place
all_cols = df.columns
keep_cols = ['Score', 'Summary', 'Text']
df.drop(columns=all_cols.difference(keep_cols), inplace=True)
df.head()

## Score distribution before aggregation

In [None]:
# Histogram of the raw star ratings (discrete=True centres one bar on each score value)
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['Score'], discrete=True, ax=ax)
ax.set_title('Score distribution');

## Creating a new column ‘sentiment’ based on ‘Score’


In [None]:
def sentiments(df):
    """Label one review as 'Positive' or 'Negative' from its star rating.

    Parameters
    ----------
    df : mapping-like
        Despite the name this is a single row: anything for which
        ``df['Score']`` yields one scalar rating, e.g. the row passed in by
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str
        'Positive' when the score is strictly greater than 3, otherwise
        'Negative' (so a 3-star review is labelled negative).
    """
    if df['Score'] > 3:
        return 'Positive'
    return 'Negative'
# Vectorised equivalent of applying `sentiments` to every row: scores above 3
# become 'Positive', everything else 'Negative'. One array operation replaces a
# Python-level call per row, which matters on a dataset of this size.
df['sentiment'] = np.where(df['Score'] > 3, 'Positive', 'Negative')
df.head()

In [None]:
# The raw rating is no longer needed once `sentiment` has been derived from it
df.drop(columns=['Score'], inplace=True)

## Plot sentiment distribution after aggregation

In [None]:
# Bar-style histogram of the two sentiment classes to make the class imbalance visible
fig, ax = plt.subplots(figsize=(6, 6))
sns.histplot(df['sentiment'], ax=ax)
ax.set_title('Sentiment distribution');

We can see that the data is highly imbalanced toward positive reviews so we need to be careful when splitting the dataset into training and testing datasets.

## Combine columns Summary with Text into full_text

In [None]:
# Join the short summary and the review body into a single text field.
# Missing values are replaced by '' first: a NaN on either side would otherwise
# turn the whole concatenation into NaN and the remaining text would be lost.
df['full_text'] = df['Summary'].fillna('') + '. ' + df['Text'].fillna('')
df.head()

In [None]:
# Both source columns now live inside `full_text`, so remove them in place
df.drop(columns=['Summary', 'Text'], inplace=True)

## Clean the text

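Data cleaning involves removing special characters, digits, and irrelevant symbols; removing stop words and reducing terms to their root form are also common steps for easier interpretation. In this notebook we lowercase the text, expand contractions, strip HTML tags, punctuation and digits, and truncate long reviews; stop words are not removed and words are not reduced to their root form.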

In [None]:
def replace_contractions(s):
    """Expand common English contraction endings in ``s``.

    Each ending is replaced wherever it occurs in the string, in the fixed
    order listed below ("'s" first, "'re" last). Matching is case-sensitive,
    so the text is expected to be lowercase already.

    Parameters
    ----------
    s : str
        Text to expand.

    Returns
    -------
    str
        ``s`` with every listed ending replaced by its spelled-out form.
    """
    # (contraction ending, replacement) pairs, applied top to bottom
    expansions = (
        ("'s", " is"),
        ("n't", " not"),
        ("'m", " am"),
        ("'ll", " will"),
        ("'d", " would"),
        ("'ve", " have"),
        ("'re", " are"),
    )
    for ending, spelled_out in expansions:
        s = s.replace(ending, spelled_out)
    return s

In [None]:
def remove_punctuation(s, punct_list):
    """Replace every occurrence of each item of ``punct_list`` in ``s`` with a space.

    Parameters
    ----------
    s : str
        Text to clean.
    punct_list : iterable of str
        Substrings (typically single punctuation characters) to blank out.

    Returns
    -------
    str
        The cleaned text with leading and trailing whitespace stripped.
        Interior runs of spaces are left as they are.
    """
    cleaned = s
    for mark in punct_list:
        cleaned = cleaned.replace(mark, ' ')
    return cleaned.strip()


In [None]:
def truncate_large_review(s, seq_length):
    """Cut ``s`` down to at most ``seq_length`` characters.

    Note that the limit is measured in characters (``len(s)``), not in words
    or tokens.

    Parameters
    ----------
    s : str
        Review text.
    seq_length : int
        Maximum number of characters to keep.

    Returns
    -------
    str
        ``s`` unchanged when it is short enough, otherwise its first
        ``seq_length`` characters.
    """
    return s if len(s) <= seq_length else s[:seq_length]

In [None]:
def cleaning(df):
    """Normalise the ``full_text`` column of ``df`` in place and return ``df``.

    Steps, in order (a progress message is printed after each one):

    1. split run-together capitalised words ("GreatProduct" -> "Great Product");
       this has to happen before lowercasing because the pattern relies on
       capital letters and would never match lowercase text
    2. cast every value to ``str`` and lowercase it
    3. expand contractions with ``replace_contractions``
    4. replace HTML tags with a space, so that the words on either side of a
       tag such as ``<br />`` are not glued together
    5. replace punctuation with spaces via ``remove_punctuation``
    6. delete digits
    7. cut each review to at most 300 characters with ``truncate_large_review``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``full_text`` column; the column is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient re-assignment.
    """
    # Compile the patterns once instead of once per row
    attached_words = re.compile(r"([A-Z][a-z]+[^A-Z]*)")
    html_tag = re.compile(r'<[^>]+>')
    regular_punct = list(string.punctuation)

    # split attached words (needs the original capitalisation)
    df['full_text'] = df['full_text'].apply(
        lambda s: " ".join(x for x in attached_words.split(str(s)) if x))
    print('Splitting attached words is done')

    # make text lowercase
    df['full_text'] = df['full_text'].apply(lambda s: str(s).lower())
    print('To lowercase is done')

    # replace contractions (the lookup table is lowercase, hence after lowercasing)
    df['full_text'] = df['full_text'].apply(replace_contractions)
    print('Contractions replacement is done')

    # remove html tags; a space keeps "good<br />bad" from becoming "goodbad"
    df['full_text'] = df['full_text'].apply(lambda s: html_tag.sub(' ', s))
    print('HTML tags removal is done')

    # remove punctuation
    df['full_text'] = df['full_text'].apply(lambda s: remove_punctuation(s, regular_punct))
    print('Punctuation removal is done')

    # Replacing the digits/numbers
    df['full_text'] = df['full_text'].apply(lambda s: re.sub(r'\d+', '', s))
    print('Numbers replacement is done')

    # truncate large review
    df['full_text'] = df['full_text'].apply(lambda s: truncate_large_review(s, 300))
    print('Truncation of large reviews is done')

    return df

df = cleaning(df)

In [None]:
# Build one whitespace-normalised corpus string from all reviews.
# A single str.join avoids growing a string with `+=` once per review.
common_words = " ".join(" ".join(str(review).split()) for review in df.full_text)

# Bind the result to a new name: assigning it to `wordcloud` would shadow the
# imported module, and re-running this cell would then fail with AttributeError.
cloud = wordcloud.WordCloud().generate(common_words)
plt.imshow(cloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
df.head()

We can see in the word cloud that most reviews are positive, since one of the biggest words is "love", and we see words like "good", "great", "delicious", etc. We can also see that the dataset is related to food, since we see words like "taste", "coffee", "eat", etc.

## Splitting the dataset into training and testing set

- Since the majority of reviews are positive, we do a stratified split on the sentiment label so that the training and testing sets keep the same class proportions as the full dataset
- We are going to use sklearn’s `StratifiedShuffleSplit` class

In [None]:
# Get the split indexes
# Get the split indexes (a single 70/30 split that preserves the sentiment ratio)
strat_shuf_split = StratifiedShuffleSplit(n_splits=1,
                                          test_size=0.3,
                                          random_state=42)
train_idx, test_idx = next(strat_shuf_split.split(df.full_text, df.sentiment))

# Create the train/test series.
# split() yields positional row numbers, so select with .iloc; label-based .loc
# only gives the same rows while the index happens to be 0..n-1.
X_train = df['full_text'].iloc[train_idx]
y_train = df['sentiment'].iloc[train_idx]

X_test = df['full_text'].iloc[test_idx]
y_test = df['sentiment'].iloc[test_idx]

In [None]:
y_train.value_counts(normalize=True)

In [None]:
y_test.value_counts(normalize=True)

## Tokenizing the features and encoding the target

In [None]:
# Hyper-parameters shared by the tokenisation and training cells below
max_features = 20000  # passed to Tokenizer(num_words=...) to cap the vocabulary
seq_length = 300  # number of word ids per sequence (pad_sequences maxlen and Embedding input_length)
batch_size = 1000  # batch size passed to model.fit

In [None]:
# Fit the vocabulary on the training texts only, so the test set does not influence it
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

In [None]:
# Turn each review into a list of integer word ids using the fitted tokenizer
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

In [None]:
# word -> integer id mapping learned by the tokenizer
# NOTE(review): presumably this holds every word seen while fitting, not just the
# num_words most frequent ones — confirm against the Keras Tokenizer docs
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Bring every id sequence to exactly seq_length entries (pad_sequences pads or truncates as needed)
x_train = pad_sequences(sequences_train, maxlen=seq_length)
x_test = pad_sequences(sequences_test, maxlen=seq_length)

In [None]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y_train)

y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

## Use Glove pre-trained word vectors

In [None]:
# Parse the GloVe text file into {word: vector}. Each line is expected to hold a
# word followed by its coefficients, separated by whitespace.
embeddings_index = {}
# `with` closes the file even if parsing fails; the explicit encoding keeps the
# read independent of the platform default.
with open('../input/glovedata/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # skip blank lines instead of failing on values[0]
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

In [None]:
## Build the lookup table for the Embedding layer: row $i$ holds the GloVe vector
## of the word whose tokenizer id is $i$.
## These rows are used as fixed ("frozen") weights instead of being learned.

embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # no GloVe vector for this word: its row stays all-zeros
        continue
    embedding_matrix[i] = embedding_vector

## LSTM construction

In [None]:
# Quieten TensorFlow's native logging and configure GPU memory allocation.
# NOTE(review): tensorflow is already imported in the first cell, so setting
# TF_CPP_MIN_LOG_LEVEL here may come too late to take effect — confirm, and
# consider setting it before the tensorflow import.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

# Enable memory growth on every visible GPU
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

In [None]:
word_dimension = 100 # dimension of the GloVe word vectors in use

# Frozen GloVe embeddings -> LSTM -> small dense head -> sigmoid output
model = Sequential([
    Embedding(len(word_index) + 1,
              word_dimension,
              weights=[embedding_matrix],  # initialise with the GloVe vectors
              input_length=seq_length,
              trainable=False),            # keep the embeddings fixed during training
    LSTM(60, dropout=0.2, recurrent_dropout=0.2),
    Dense(30, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

# Early stopping monitors a validation set carved out of the training data.
# Using the test set for this would let it drive model selection and leave no
# unbiased estimate of performance. restore_best_weights rolls the model back to
# the epoch with the lowest validation loss rather than keeping the last one.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3,
                   restore_best_weights=True)

# validation_split holds out the last 10% of x_train/y_train; the training rows
# come from a shuffled split, so that tail is a random sample.
history = model.fit(x_train, y_train, batch_size=batch_size, epochs=20,
                    validation_split=0.1, callbacks=[es])

# The test set is touched only once, for the final score
test_loss, test_accuracy = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=0)
print('Test loss: %.4f - test accuracy: %.4f' % (test_loss, test_accuracy))

In [None]:
def plot_loss_accuracy(history, title):
    """Draw training vs. validation loss and accuracy curves side by side.

    Parameters
    ----------
    history : object
        Must expose a ``history`` dict with the keys "loss", "val_loss",
        "accuracy" and "val_accuracy", each holding one value per epoch
        (as returned by ``model.fit``).
    title : str
        Title placed above both panels.
    """
    fig = plt.figure(figsize=(12, 6))
    fig.suptitle(title)

    # (train key, validation key, train label, validation label, panel title)
    panels = (
        ("loss", "val_loss", "Train Loss", "Validation Loss", 'cross_entropy loss'),
        ("accuracy", "val_accuracy", "Train Accuracy", "Validation Accuracy", 'accuracy'),
    )
    for position, (train_key, val_key, train_label, val_label, panel_title) in enumerate(panels, start=1):
        ax = fig.add_subplot(1, 2, position)
        ax.plot(history.history[train_key], 'r-x', label=train_label)
        ax.plot(history.history[val_key], 'b-x', label=val_label)
        ax.legend()
        ax.set_title(panel_title)
        ax.grid(True)

plot_loss_accuracy(history, "LSTM Model")