In [24]:
# Library Load Model
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Library Pre-Processing
from nltk.stem import WordNetLemmatizer
import nltk
import re
import string
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import pickle

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
# loading token
with open('t.pickle', 'rb') as file_1:
    t = pickle.load(file_1)

In [26]:
# Load Model
  
model_gruimp = load_model('best_model.h5')

In [27]:
model_gruimp.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 102, 128)          6469120   
                                                                 
 bidirectional (Bidirectiona  (None, 102, 64)          31104     
 l)                                                              
                                                                 
 gru_1 (GRU)                 (None, 8)                 1776      
                                                                 
 dropout (Dropout)           (None, 8)                 0         
                                                                 
 dense (Dense)               (None, 1)                 9         
                                                                 
Total params: 6,502,009
Trainable params: 0
Non-trainable params: 6,502,009
______________________________________________

In [28]:
# Adding spanish to stopwords
spanish = stopwords.words('spanish')
additional_stopwords = []
for i in spanish:
    additional_stopwords.append(i)

# Adding french to stopwords
french = stopwords.words('french')
for i in french:
    additional_stopwords.append(i)

# Adding german to stopwords
german = stopwords.words('german')
for i in german:
    additional_stopwords.append(i)

In [29]:
# Setting stopwords with english as default language
stopwords = list(set(stopwords.words('english')))
for i in additional_stopwords:
    stopwords.append(i)

In [30]:
lemmatizer = WordNetLemmatizer()
def text_processing(text):

    # Converting all text to Lowercase
    text = text.lower()

    # Removing Unicode Characters
    text = re.sub("&#[A-Za-z0-9_]+", " ", text)

    # Removing punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
  
    # Removing Whitespace
    text = text.strip()

    # Removing emoji
    text = re.sub("[^A-Za-z\s']", " ", text)

    # Removing double space
    text = re.sub("\s\s+" , " ", text)
        
    # Tokenizing words
    tokens = word_tokenize(text)

    # Removing Stopwords
    text = ' '.join([word for word in tokens if word not in stopwords])

    # Lemmatizer
    text = lemmatizer.lemmatize(text)

    return text

In [31]:
df_inf = pd.read_csv('model_inference_sentiment_analysis.csv')
df_inf_copy = df_inf.copy()
df_inf_copy = df_inf.drop(['Unnamed: 0'],axis=1)
df_inf_copy.class_index.replace({1:'Negative Review',2:'Positive Review'}, inplace=True)

In [32]:
df_inf_copy

Unnamed: 0,class_index,review_title,review_text
0,Positive Review,"This Or ""Dixie Chicken"" Presents Them At A Peak",Though lyrically the overall feel of this reco...
1,Positive Review,"It's not perfect, but very good!",I have been looking for an anthology of outsta...
2,Positive Review,Well done,While the characters are not the same as in th...
3,Negative Review,"Not so great, but pretty colors",I tried/own both the pink and the orangish col...
4,Negative Review,Ruined My Valentine's Day!,I read each Amazon review on this product befo...
5,Positive Review,Concise Text of Neuroscience,I bought this book for my brother as he is tak...
6,Positive Review,GREAT!!!!!,I got this toy a couple of days ago and I ABSO...
7,Positive Review,Hot Coffee,Good coffee pot. Hot coffee. 6-8 cups makes a ...
8,Positive Review,Mystery at Walt Disney World,"Book arrived in good condition, but took quite..."
9,Negative Review,Take with a dose of skepticism...,"All readers should keep in mind that, as anoth..."


In [33]:
# Applying all preprocessing in one document

df_inf_copy['review_processed'] = df_inf_copy['review_text'].apply(lambda x: text_processing(x))
df_inf_copy

Unnamed: 0,class_index,review_title,review_text,review_processed
0,Positive Review,"This Or ""Dixie Chicken"" Presents Them At A Peak",Though lyrically the overall feel of this reco...,though lyrically overall feel record slightly ...
1,Positive Review,"It's not perfect, but very good!",I have been looking for an anthology of outsta...,looking anthology outstanding literary materia...
2,Positive Review,Well done,While the characters are not the same as in th...,characters original casting well done players ...
3,Negative Review,"Not so great, but pretty colors",I tried/own both the pink and the orangish col...,triedown pink orangish colors work way true go...
4,Negative Review,Ruined My Valentine's Day!,I read each Amazon review on this product befo...,read amazon review product made purchase full ...
5,Positive Review,Concise Text of Neuroscience,I bought this book for my brother as he is tak...,bought book brother taking class using book lo...
6,Positive Review,GREAT!!!!!,I got this toy a couple of days ago and I ABSO...,got toy couple days ago absolutely love much r...
7,Positive Review,Hot Coffee,Good coffee pot. Hot coffee. 6-8 cups makes a ...,good coffee pot hot coffee cups makes strong c...
8,Positive Review,Mystery at Walt Disney World,"Book arrived in good condition, but took quite...",book arrived good condition took quite bit lon...
9,Negative Review,Take with a dose of skepticism...,"All readers should keep in mind that, as anoth...",readers keep mind another reviewer noted arno ...


In [34]:
# Transform Inference-Set 
df_inf_transform = df_inf_copy.review_processed

In [35]:
word_index = t.word_index

In [36]:
word_index

{'<UNK>': 1,
 'time': 2,
 'movie': 3,
 'even': 4,
 'well': 5,
 'love': 6,
 'use': 7,
 'better': 8,
 'product': 9,
 'bought': 10,
 'work': 11,
 'cd': 12,
 'little': 13,
 'album': 14,
 'best': 15,
 'think': 16,
 'story': 17,
 'make': 18,
 'got': 19,
 'two': 20,
 'know': 21,
 'still': 22,
 'music': 23,
 'years': 24,
 'never': 25,
 'want': 26,
 'ive': 27,
 'see': 28,
 'people': 29,
 'money': 30,
 'books': 31,
 'made': 32,
 'find': 33,
 'didnt': 34,
 'recommend': 35,
 'bad': 36,
 'ever': 37,
 'used': 38,
 'found': 39,
 'go': 40,
 'put': 41,
 'another': 42,
 'cant': 43,
 'reading': 44,
 'say': 45,
 'songs': 46,
 'quality': 47,
 'every': 48,
 'something': 49,
 'thing': 50,
 'doesnt': 51,
 'life': 52,
 'easy': 53,
 'thought': 54,
 'game': 55,
 'lot': 56,
 'dvd': 57,
 'however': 58,
 'worth': 59,
 'long': 60,
 'give': 61,
 'hard': 62,
 'since': 63,
 'need': 64,
 'sound': 65,
 'looking': 66,
 'going': 67,
 'characters': 68,
 'film': 69,
 'real': 70,
 'makes': 71,
 'anyone': 72,
 'enough': 73,
 '

In [37]:
df_inf_transform = t.texts_to_sequences(df_inf_transform)

In [38]:
# Padding the dataset to a maximum review length in words

df_inf_transform = pad_sequences(df_inf_transform, maxlen=102) # max len = 102 words

In [39]:
df_inf_transform

array([[    0,     0,     0, ...,    79,   654,  2166],
       [    0,     0,     0, ...,  5548,   628,   149],
       [    0,     0,     0, ...,     1,  1814,  3296],
       ...,
       [    0,     0,     0, ...,     1, 30390,   405],
       [    0,     0,     0, ...,   896,   303,  1367],
       [    0,     0,     0, ...,     1,  8037,   661]], dtype=int32)

In [40]:
# Predict using selected model
y_pred_inf = model_gruimp.predict(df_inf_transform)
y_pred_inf = np.where(y_pred_inf >= 0.5, 1, 0)
y_pred_inf



array([[1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0]])

In [41]:
# Creating dataframe prediction
y_pred_inf = pd.DataFrame(y_pred_inf, columns=['Prediction'],index=df_inf_copy.index)

# Comparing the real target and prediction
model_inf = pd.concat([df_inf_copy['class_index'], y_pred_inf['Prediction']], axis=1).sort_index()
model_inf

Unnamed: 0,class_index,Prediction
0,Positive Review,1
1,Positive Review,0
2,Positive Review,1
3,Negative Review,1
4,Negative Review,0
5,Positive Review,1
6,Positive Review,0
7,Positive Review,0
8,Positive Review,1
9,Negative Review,0


In [42]:
model_inf.Prediction.replace({0:'Negative Review',1:'Positive Review'}, inplace=True)

In [43]:
model_inf

Unnamed: 0,class_index,Prediction
0,Positive Review,Positive Review
1,Positive Review,Negative Review
2,Positive Review,Positive Review
3,Negative Review,Positive Review
4,Negative Review,Negative Review
5,Positive Review,Positive Review
6,Positive Review,Negative Review
7,Positive Review,Negative Review
8,Positive Review,Positive Review
9,Negative Review,Negative Review


The model we created has 70% accuracy for this data inference, this might be due to the presence of new words that are not in the vocabulary.