Check if GPU is connected

In [1]:
import tensorflow as tf
gpus = tf.config.list_physical_devices("GPU")
if gpus:
    for gpu in gpus:
        print("Found a GPU with the name:", gpu)
else:
    print("Failed to detect a GPU.")

Found a GPU with the name: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


Import Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer,text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint,TensorBoard
from tensorflow.keras.models import Sequential
from tensorflow.keras import models,layers,optimizers
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re

Load the Data

In [3]:
def load_data(file_path):
    txt = open(file_path)
    label = []
    review = []
    for line in txt:
        label.append(int(line[9])-1)
        review.append(line[10:].strip())
    return np.array(label),review

label,review = load_data("C:/Projects/Sentiment Analysis Project/data/train.ft.txt")

Split the data into train and test set

In [4]:
xtrain,xtest,ytrain,ytest = train_test_split(review,label,random_state=10,test_size=0.3)

Data Cleaning

In [5]:
not_alphanumeric = re.compile(r'[\W]')
not_ascii = re.compile(r'[^a-z0-1\s]')


In [7]:
def data_cleaner(data):
    processed_data =[]
    for text in data:
        lower_text = str.lower(text)
        cleaned_punctuations = re.sub(not_alphanumeric,r" ",lower_text)
        cleaned_ascii = re.sub(not_ascii,r"",cleaned_punctuations)
        processed_data.append(cleaned_ascii)
    return processed_data

In [10]:
train_reviews = data_cleaner(xtrain)
del(xtrain)

In [11]:
val_reviews = data_cleaner(xtest)
del(xtest)
del(review)

Create and fit the Tockenzier 

In [14]:
import pickle
max_features = 10000
tockenizer = Tokenizer(num_words=max_features)
tockenizer.fit_on_texts(train_reviews)
with open("C:/Projects/Sentiment Analysis Project/saved_obj/tokenizer.pkl","wb") as file_path:
    pickle.dump(tockenizer,file_path)
train_vectors = tockenizer.texts_to_sequences(train_reviews)
del(train_reviews)
val_vectors = tockenizer.texts_to_sequences(val_reviews)
del(val_reviews)

Calulate max length of tokens

In [23]:
max_len = max(len(train_vec) for train_vec in train_vectors)
max_len

254

Save the Max lenth for future inference

In [None]:

with open("C:/Projects/Sentiment Analysis Project/saved_obj/maxlen.pkl","wb") as file_path:
    pickle.dump(max_len,file_path)
padded_train_vectors = pad_sequences(train_vectors,maxlen=max_len)
padded_val_vectors = pad_sequences(val_vectors,maxlen=max_len)


Collect garbage values to free RAM

In [16]:
import gc
gc.collect()

92

In [17]:
import pickle
def save_data(file_path,obj):
    with open(file_path,'wb') as file:
        pickle.dump(obj,file)

Save test and validation data

In [18]:
save_data("./saved_obj/train.pkl",padded_train_vectors)
save_data("./saved_obj/train_label.pkl",ytrain)

In [19]:
save_data("./saved_obj/val.pkl",padded_val_vectors)
save_data("./saved_obj/test_label.pkl",ytest)

Load the saved test and validation data

In [None]:
def load_data(file_path):
    with open(file_path,'rb') as file:
        return pickle.load(file)
padded_train_vectors = load_data("./saved_obj/train.pkl")
ytrain = load_data("./saved_obj/train_label.pkl")
padded_val_vectors = load_data("./saved_obj/val.pkl")
ytest = load_data("./saved_obj/test_label.pkl")

In [21]:
max_features = 10000
max_len = len(padded_train_vectors[0])
max_len

254

Define the layers for RNN modeling

In [26]:
model = Sequential()
model.add(layers.Embedding(max_features,64,input_shape = (max_len,)))
model.add(layers.GRU(128,return_sequences=True))
model.add(layers.GRU(128))
model.add(layers.Dense(32,activation="relu"))
model.add(layers.Dense(100,activation="relu"))
model.add(layers.Dense(1,activation="sigmoid"))
check_point = ModelCheckpoint("./saved_obj/gru1.keras",monitor="accuracy",save_best_only = True,verbose=1)

Get model summary

In [27]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 254, 64)           640000    
                                                                 
 gru_2 (GRU)                 (None, 254, 128)          74496     
                                                                 
 gru_3 (GRU)                 (None, 128)               99072     
                                                                 
 dense_3 (Dense)             (None, 32)                4128      
                                                                 
 dense_4 (Dense)             (None, 100)               3300      
                                                                 
 dense_5 (Dense)             (None, 1)                 101       
                                                                 
Total params: 821,097
Trainable params: 821,097
Non-tr

Complie the RNN model

In [28]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics = ['accuracy'])
logdir="logs"
tensor_board_visualizations= TensorBoard(log_dir=logdir)

Fit the model

In [29]:
history = model.fit(padded_train_vectors,ytrain,batch_size=128,epochs=3,validation_data=(padded_val_vectors,ytest),callbacks=[check_point,tensor_board_visualizations])

Epoch 1/3
Epoch 1: accuracy improved from -inf to 0.93714, saving model to ./saved_obj\gru1.keras
Epoch 2/3
Epoch 2: accuracy improved from 0.93714 to 0.95362, saving model to ./saved_obj\gru1.keras
Epoch 3/3
Epoch 3: accuracy improved from 0.95362 to 0.95939, saving model to ./saved_obj\gru1.keras


In [36]:
import tensorflow as tf
tf.keras.models.load_model("C:/Projects/Sentiment Analysis Project/saved_obj/gru1.keras")

<keras.engine.sequential.Sequential at 0x1ece8411f40>

Load and clean the test data

In [30]:
label,review = load_data("C:/Projects/Sentiment Analysis Project/data/test.ft.txt")

In [31]:
xtest_cleaned = data_cleaner(review)

In [32]:
xtest_tokenize = tockenizer.texts_to_sequences(xtest_cleaned)
xtest_padded = pad_sequences(xtest_tokenize,maxlen=max_len)

Save the processed test data for future inference

In [35]:
save_data("./test.pkl",xtest_padded)
save_data("./test_label.pkl",label)

Determine the predictions for test data

In [34]:
pred = model.predict(xtest_padded)



Calculate performance metrics

In [35]:
accuracy_rnn = accuracy_score(label, 1 * (pred > 0.5))
f1_rnn = f1_score(label, 1 * (pred > 0.5))
rocauc_rnn = roc_auc_score(label, pred)

print('Accuracy score of the RNN Model: {:0.3}'.format(accuracy_rnn))
print('F1 score of the RNN Model: {:0.3}'.format(f1_rnn))
print('ROC AUC score of the RNN Model: {:0.3}'.format(rocauc_rnn))

Accuracy score of the RNN Model: 0.954
F1 score of the RNN Model: 0.953
ROC AUC score of the RNN Model: 0.99


RNN model has shown strong results for sentiment analysis tasks