Group 13: group_1_11_95

Akash Tike (AU1741001)\
Smit Mandavia (AU1741011)\
Shaunak Vyas (AU1741095)

Here is our implementation of a Recurrent Neural Network for text profanity detection, developed as a joint project for the courses Artificial Intelligence and Cloud Computing.

# Data loading and pre-processing

In [None]:
# importing required libraries

# numerical processing libraries
import pandas as pd
import numpy as np
from numpy.random import seed

# language processing libraries
import string
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
from gensim.models import Word2Vec

# machine learning libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, auc, mean_absolute_error, mean_squared_error

# timer to keep track of process durations
import time

In [None]:
# Fetch the NLTK resources needed on a fresh runtime:
# the stopword corpus, the punkt tokenizer data and WordNet.
for resource_name in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource_name)

In [None]:
# Pull the dataset directly from Google Drive instead of uploading it,
# for faster access to the file.
import io

from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload

file_id = '1va-lNyLHRjBaUDIz8Bw2aKIInvNGOAf1'

# Authenticate the Colab user, then build the Drive v3 API client.
auth.authenticate_user()
drive_service = build('drive', 'v3')

# Stream the file into an in-memory buffer, one chunk per iteration,
# until the downloader reports completion.
request = drive_service.files().get_media(fileId=file_id)
downloaded = io.BytesIO()
downloader = MediaIoBaseDownload(downloaded, request)
done = False
while not done:
    _, done = downloader.next_chunk()

# Rewind the buffer so pandas reads the CSV from its beginning.
downloaded.seek(0)
data_file = pd.read_csv(downloaded)

In [None]:
# Summary statistics of the loaded data. A bare `data_file.describe()` in the
# middle of a cell is evaluated and thrown away (only a cell's last expression
# is displayed), so print it explicitly.
print(data_file.describe())

# label columns the model has to predict
target_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# train test splitting
# independent feature (input) is the comment text
X = data_file['comment_text']
# 6 dependent features (output), described by target_classes
y = data_file[target_classes]
# a fixed random_state keeps the split identical from run to run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)

In [None]:
# language preprocessing
# Start from NLTK's English stopword list; stop words should be handled up front.
stopwords_list = stopwords.words('english')

# Every punctuation character counts as a stop word as well.
stopwords_list.extend(string.punctuation)

# Extra tokens to ignore, added on top of the standard list.
stopwords_list.extend((
    "''", "``", "'s", "\\n\\n", '...', 'i\\', '\\n', '•', "i", 'the',
    "'m", 'i\\', "'ve", "don\\'t", "'re", "\\n\\ni", "it\\", "'ll",
    'you\\', "'d", "n't", '’', 'article', 'page', 'wikipedia',
))

In [None]:
# tokenize the words of the training dataset
# Join the comments with spaces rather than calling str() on the list:
# str(list) produces the list's repr, which injects the surrounding brackets,
# the quote characters and separating commas, and turns real newlines into a
# literal backslash-n inside the text that gets tokenized.
# str(comment) keeps any non-string entry (e.g. a missing value) from raising.
train_text = " ".join(str(comment) for comment in X_train).lower()
tokens = word_tokenize(train_text)

In [None]:
# Run every token through the WordNet lemmatizer.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, tokens))

In [None]:
# importing keras library functions 
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [None]:
# Create the Keras tokenizer, limited to max_features words,
# and fit its vocabulary on the training comments.
max_features = 2000
tokenizer = Tokenizer(num_words=max_features)
training_comments = list(X_train)
tokenizer.fit_on_texts(training_comments)

In [None]:
import pickle

# Persist the fitted tokenizer as a pickle file; it is used later on the server.
with open('tokenizer.pickle', 'wb') as pickle_file:
    pickle_file.write(pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL))

In [None]:
# Convert the train and test comments into sequences with the fitted tokenizer.
maxlen = 400
list_tokenized_train = tokenizer.texts_to_sequences(X_train)
list_tokenized_test = tokenizer.texts_to_sequences(X_test)

# Pad the sequences with maxlen=400 so that they all have equal length.
X_t, X_te = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (list_tokenized_train, list_tokenized_test)
)

In [None]:
import os
import random
import tensorflow as tf

# One seed value (any value works) shared by all random number generators.
seed_val = 3

# Seed Python's, NumPy's and TensorFlow's generators with it.
random.seed(seed_val)
np.random.seed(seed_val)
tf.random.set_seed(seed_val)

# Export the same value as the hash seed environment variable.
os.environ['PYTHONHASHSEED'] = str(seed_val)

In [None]:
!pip install focal-loss
from focal_loss import BinaryFocalLoss 

# Callbacks handed to fit():
#  - EarlyStopping ends training once val_loss has not improved for 2 epochs.
#    restore_best_weights=True asks Keras to put back the weights of the best
#    epoch when it stops; without it the model that is saved and evaluated
#    afterwards keeps the weights of the last (worse) epoch.
#  - ModelCheckpoint writes the model with the best val_loss to 'best_model.h5'.
early_stopping = [
    EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True),
    ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True),
]

In [None]:
embedding_size = 128

# Layer stack, in order:
#   Embedding          - maps the max_features input ids to embedding_size vectors
#   LSTM(60)           - retains memory over longer spans; returns the whole sequence
#   GlobalMaxPool1D    - dimensionality reduction by taking the max value
#   Dropout(0.1)       - guards against overfitting
#   Dense(50, relu)    - hidden layer with L2 kernel regularisation
#   Dropout(0.01)
#   Dense(10, relu)    - hidden layer with L2 kernel regularisation
#   Dense(6, sigmoid)  - output layer, one unit per target class
rnn_with_lstm = Sequential([
    Embedding(max_features, embedding_size),
    LSTM(60, return_sequences=True, name='lstm_layer'),
    GlobalMaxPool1D(),
    Dropout(0.1),
    Dense(50, kernel_regularizer=regularizers.l2(.00001), activation='relu'),
    Dropout(.01),
    Dense(10, kernel_regularizer=regularizers.l2(.00001), activation='relu'),
    Dense(6, activation='sigmoid'),
])

# Compile the network with binary focal loss and the Adam optimizer.
rnn_with_lstm.compile(
    loss=BinaryFocalLoss(gamma=2),
    optimizer='adam',
    metrics=['accuracy'],
)

# Print the name and output dimensions of each layer.
for layer in rnn_with_lstm.layers:
    print(layer.name, " ", layer.output_shape)

In [None]:
# Training settings: 5 epochs, batches of 400, the callbacks defined earlier,
# and 30% of the training data held out for validation.
fit_options = dict(
    epochs=5,
    batch_size=400,
    callbacks=early_stopping,
    validation_split=0.3,
)

# Time the whole training run.
start = time.time()
fit_rnn_model = rnn_with_lstm.fit(X_t, y_train, **fit_options)
end = time.time()
print(f"Total time taken to fit model: {end-start}")

In [None]:
# Write the trained model to disk for the prediction server:
# the weights go to model.h5 and the architecture, as JSON, to model.json.
from keras.models import model_from_json

rnn_with_lstm.save_weights("model.h5")
model_json = rnn_with_lstm.to_json()
with open("model.json", "w") as architecture_file:
    architecture_file.write(model_json)
print("Saved model to disk")

In [None]:
# Test the model: predict on the padded test sequences, then round each
# output to the nearest integer and label the columns with the target classes.
y_prediction = rnn_with_lstm.predict(X_te)
rounded_prediction = y_prediction.round()
y_prediction_binary = pd.DataFrame(rounded_prediction, columns=target_classes)

In [None]:
# Compare the rounded predictions with the true test labels
# and report both error metrics.
mae = mean_absolute_error(y_test, y_prediction_binary)
mse = mean_squared_error(y_test, y_prediction_binary)
print('Mean absolute error : ' , mae)
print('Mean squared error : ' , mse)
