In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D

import pickle

from helper_functions import text_cleanup, reverse_encode, add_one_argmax_score, conf_matrix

2022-01-03 11:42:51.083750: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-03 11:42:51.083766: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
sns.set_context('poster')

In [4]:
path = "data/reviews_tgtg_v0.pkl" 

with open(path, 'rb') as f:
    df = pickle.load(f)

In [5]:
df.to_pickle('review_proto4.pkl', protocol=4)

In [6]:
df.to_csv('data/review_proto4.csv',index=False)

### One hot encoding rating column
For multi-clss classification using neural network model,the target variable needed to be one hot encoded, as the outpu layer would have 5 nodes, 1 for each rating.

In [5]:
enc = OneHotEncoder(handle_unknown='ignore')

enc_df = pd.DataFrame(enc.fit_transform(df[['rating']]).toarray())

df = df.join(enc_df)
df.head()

Unnamed: 0,review_content,rating,source,date,0,1,2,3,4
0,Great value for money. Food was still fresh an...,5,Google,2021-12-29,0.0,0.0,0.0,0.0,1.0
0,Not that great to go more like I have had this...,5,Apple,2021-02-27,0.0,0.0,0.0,0.0,1.0
0,Never been let down yet! Never been let down y...,5,Trustpilot,2021-12-29,0.0,0.0,0.0,0.0,1.0
1,occasionally a surprise bag is offered after t...,4,Google,2021-12-29,0.0,0.0,0.0,1.0,0.0
1,Barley Farm - Eccles This was the first time u...,5,Apple,2020-09-28,0.0,0.0,0.0,1.0,0.0


In [6]:
df = df.rename(columns={0: "score_1", 1:'score_2',2:'score_3',3:'score_4',4:'score_5'})
df.head()

Unnamed: 0,review_content,rating,source,date,score_1,score_2,score_3,score_4,score_5
0,Great value for money. Food was still fresh an...,5,Google,2021-12-29,0.0,0.0,0.0,0.0,1.0
0,Not that great to go more like I have had this...,5,Apple,2021-02-27,0.0,0.0,0.0,0.0,1.0
0,Never been let down yet! Never been let down y...,5,Trustpilot,2021-12-29,0.0,0.0,0.0,0.0,1.0
1,occasionally a surprise bag is offered after t...,4,Google,2021-12-29,0.0,0.0,0.0,1.0,0.0
1,Barley Farm - Eccles This was the first time u...,5,Apple,2020-09-28,0.0,0.0,0.0,1.0,0.0


### Train test split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df[['review_content']], 
                                                    df[['score_1','score_2','score_3','score_4','score_5']], 
                                                    test_size=.2, 
                                                    random_state=0)

In [8]:
y_test.to_pickle('data/y_test_1hotenc.pkl')

### Removing punctuation and tokenizing review_content column 

In [11]:
X_train['review_cleaned'] = X_train['review_content'].apply(text_cleanup)
X_test['review_cleaned'] = X_test['review_content'].apply(text_cleanup)

In [12]:
X_train.head()

Unnamed: 0,review_content,review_cleaned
7490,Good choices,good choices
17049,very helpful staff,very helpful staff
1919,"This is a nice app, and fighting food waste is...",this is a nice app and fighting food waste is ...
12674,Really good,really good
10483,Great,great


### Creating a dictionary with words that appear in reviews and an index
Creation of an index for each word in the training review dataset so when passed through the embedding layer of the neural network there is a indexed associated with each word

In [14]:
flat_review = " ".join(X_train['review_cleaned'].values)
flat_review = flat_review.lower().split()

In [15]:
len(flat_review)

434840

In [16]:
unique_list = []
for word in flat_review:
    if word not in unique_list:
        unique_list.append(word)

In [17]:
len(unique_list)

14702

In [18]:
word_index_dict = {}
for i in range(0,len(unique_list)):
    word_index_dict[unique_list[i]] = i

I added 4 other items to the dictionary that represented padding, the start of the review, unknown words, and unused words.

In [19]:
word_index_dict = {k:(v+4) for k,v in word_index_dict.items()}
word_index_dict['<PAD>'] = 0
word_index_dict['<START>'] = 1
word_index_dict['<UNK>'] = 2
word_index_dict['<UNUSED>'] = 3

### Indexing words in reviews using dictionary

In [20]:
def index_review_words(text):
    review_word_list = []
    for word in text.lower().split():
        if word in word_index_dict.keys():
            review_word_list.append(word_index_dict[word])
        else:
            review_word_list.append(word_index_dict['<UNK>'])

    return review_word_list 

In [21]:
X_train['preprocessed_review'] = X_train['review_cleaned'].apply(index_review_words)
X_test['preprocessed_review'] = X_test['review_cleaned'].apply(index_review_words)

In [22]:
X_train.head()

Unnamed: 0,review_content,review_cleaned,preprocessed_review
7490,Good choices,good choices,"[4, 5]"
17049,very helpful staff,very helpful staff,"[6, 7, 8]"
1919,"This is a nice app, and fighting food waste is...",this is a nice app and fighting food waste is ...,"[9, 10, 11, 12, 13, 14, 15, 16, 17, 10, 11, 4,..."
12674,Really good,really good,"[40, 4]"
10483,Great,great,[41]


### Modelling

Add Padding to Reviews and Capping Reviews to Length 250 Words

In [23]:
X_train = sequence.pad_sequences(X_train['preprocessed_review'],value=word_index_dict['<PAD>'],padding='post',maxlen=250)
X_test = sequence.pad_sequences(X_test['preprocessed_review'],value=word_index_dict['<PAD>'],padding='post',maxlen=250)

In [27]:
np.save('data/X_test',X_test)

### Creating and Training Neural Network Model
Here I created a neural network model with 1 embedding layer for the 17317 unique words (including the extra 4 from padding etc.), 1 dense layer with 16 nodes, and one output layer with 5 nodes for each score.

In [28]:
model = Sequential()
model.add(Embedding(14702,16))
model.add(GlobalAveragePooling1D())
model.add(Dense(16,activation='relu'))
model.add(Dense(5,activation='sigmoid'))

2022-01-02 23:59:06.543153: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-01-02 23:59:06.543245: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-01-02 23:59:06.543311: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (roxane-Katana-GF66-11UC): /proc/driver/nvidia/version does not exist
2022-01-02 23:59:06.547609: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train,test_size=.2,random_state=42)

In [None]:
fitModel = model.fit(X_train,y_train,epochs=60,batch_size=250,validation_data=(X_val,y_val),verbose=0)

### Testing model
After looking at accuracy scores for the validation data (0.5568) and the test data (0.5710), we can see the neural network model performs well.

In [None]:
model.evaluate(X_val,y_val)

In [None]:
model.evaluate(X_test,y_test)

### Test Confusion Matrix
In order to create a confusion matrix, I have to reverse one hot encode the scores column and adjust the index for 0-4 back to 1-5.

#### Reverse Encode Target Into One Column for Confusion Matrix

In [None]:
reverse_encode(y_test)

In [None]:
reverse_encode(y_val)

In [None]:
y_test_predicted = add_one_argmax_score(np.argmax(model.predict(X_test),axis=1))

#### Saving Neural Network Preprocessed y Data

In [None]:
y_test.to_csv('data/y_test_nn_df.csv',index=False)
np.save('data/y_test_predicted_array',y_test_predicted)

#### Creating Confusion Matrix for Test Data

In [None]:
test_cm = confusion_matrix(list(y_test['score']), y_test_predicted)
conf_matrix(test_cm)

### Saving Model

In [None]:
model.save('models/Neural_Network.h5')

In [None]:
pickle.dump(word_index_dict,open('data/word_index_dict.pkl','wb'))