In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re

In [None]:
# Read data from Google Drive (the paths below assume it is mounted at /content/drive).

original_data = '/content/drive/MyDrive/CS3244 Dataset/train-balanced-sarcasm.csv'

# One CSV per pre-processing variant of the dataset.
# Every entry needs its own trailing comma: adjacent string literals are
# concatenated by Python, which previously merged two paths into one bogus entry.
cleaned_dataset = [
    '/content/drive/MyDrive/CS3244 Dataset/pre-processed/no_pre_processing.csv',
    '/content/drive/MyDrive/CS3244 Dataset/pre-processed/lowercased.csv',
    '/content/drive/MyDrive/CS3244 Dataset/pre-processed/lowercase_no_stopwords_refined.csv',
    '/content/drive/MyDrive/CS3244 Dataset/pre-processed/lowercase_no_stopwords.csv',
    '/content/drive/MyDrive/CS3244 Dataset/pre-processed/lowercase_lemmatized.csv',
    '/content/drive/MyDrive/CS3244 Dataset/pre-processed/lowercase_default_lemmatized.csv',
    # NOTE(review): "puncutation" is presumably the spelling of the file on disk - confirm before renaming.
    '/content/drive/MyDrive/CS3244 Dataset/pre-processed/no_puncutation.csv',
    '/content/drive/MyDrive/CS3244 Dataset/pre-processed/no_punctuation_numeric_empty.csv',
    '/content/drive/MyDrive/CS3244 Dataset/pre-processed/everything.csv',
    '/content/drive/MyDrive/CS3244 Dataset/pre-processed/everything_except_punctuation.csv',
]

# Variant used for this run; change the index to train on another one.
dataset_path = cleaned_dataset[0]

datastore = pd.read_csv(dataset_path)
# Drop every row that has a missing value in any column.
datastore = datastore.dropna()
total_count = len(datastore)
# Print the file name of the variant in use (the original referenced an undefined `versions`).
print(dataset_path.split("/")[-1])
print("Length of dataset: ", total_count)

# 
# Characters that should become stand-alone tokens. The Tokenizer further down
# is created with filters='' and therefore keeps punctuation, so each of these
# characters is padded with spaces to separate it from neighbouring words.
to_tokenize = '!"#$%&()*+-/:;<=>@[\\]^_`{|}~\t\n.,:;!?'
# re.escape turns every character into a literal inside the character class.
# Without it "+-/" was parsed as a range and "\]" as an escaped bracket, so the
# backslash listed in to_tokenize was never actually matched.
punctuation_pattern = re.compile('([' + re.escape(to_tokenize) + '])')
datastore['comment'] = datastore['comment'].apply(lambda x: punctuation_pattern.sub(r' \1 ', x))

label = datastore['label'].tolist()
comment = datastore['comment'].tolist()
# Each 'meta_features' cell is parsed as space-separated numbers, one row per comment.
meta_data = np.array([[float(num) for num in stats.split(" ")] for stats in datastore['meta_features']])
# Positional row ids; these are what gets split so text and meta features stay aligned.
index = list(range(len(comment)))

# Prepare the data by splitting into test and train.
# Row positions are split rather than the comments themselves, so the same
# partition can be applied to both the comment text and the meta features.
from sklearn.model_selection import train_test_split
X_train_ref, X_test_ref, y_train, y_test = train_test_split(
    index, label, train_size=0.8, random_state=42, shuffle=True
)

# Look up text and meta features by the sampled row positions.
# The loop variable is `i` so it does not shadow the module-level `index` list.
X_train = [comment[i] for i in X_train_ref]
X_test = [comment[i] for i in X_test_ref]
X_train_meta = [meta_data[i] for i in X_train_ref]
X_test_meta = [meta_data[i] for i in X_test_ref]

#tokenize text
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, Concatenate, Input, concatenate, Conv1D, GlobalMaxPooling1D
from keras import Input, Model
import pickle

# Sizes shared by the tokenizer, the padding step and the embedding layer.
vocab_size = 10000
embedding_dim = 16
max_length = 120

# Prepare tokenizer and save it to drive.
# The vocabulary is fitted on the training comments only; filters='' and
# lower=False leave punctuation and casing exactly as they are in the text.
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token="<OOV>",
    filters='',
    lower=False,
)
tokenizer.fit_on_texts(X_train)

# with open('/content/drive/MyDrive/CS3244 Dataset/tokenizer.pickle', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Text -> integer id sequences.
training_sequences = tokenizer.texts_to_sequences(X_train)
testing_sequences = tokenizer.texts_to_sequences(X_test)

# Sequences are brought to max_length; longer ones lose their tail (truncating='post').
training_padded = np.array(
    pad_sequences(training_sequences, maxlen=max_length, truncating='post')
)
testing_padded = np.array(
    pad_sequences(testing_sequences, maxlen=max_length, truncating='post')
)

# Labels and meta features as numpy arrays for model.fit / model.predict.
training_labels = np.array(y_train)
testing_labels = np.array(y_test)
training_meta = np.array(X_train_meta)
testing_meta = np.array(X_test_meta)

# CNN model definition: a text branch (embedding -> 1D convolution -> global
# max pooling) is concatenated with the raw meta features and fed through a
# small dense head that ends in a single sigmoid unit.
nlp_input = Input(shape=(max_length,))
# Width of the meta branch follows the data instead of a hard-coded 5.
meta_input = Input(shape=(training_meta.shape[1],))
# `input_length` is omitted: it is deprecated in current Keras and the sequence
# length is already fixed by nlp_input.
embed = Embedding(vocab_size, embedding_dim)(nlp_input)
nlp_out = Conv1D(128, 5, activation='relu')(embed)
max_pool = GlobalMaxPooling1D()(nlp_out)
concat = concatenate([max_pool, meta_input], axis=1)
middle = Dense(3, activation='relu')(concat)
output = Dense(1, activation='sigmoid')(middle)
model = Model(inputs=[nlp_input, meta_input], outputs=[output])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Stop once val_accuracy has not improved for 3 epochs and roll back to the best weights.
# NOTE(review): the test split doubles as validation data here, so early stopping
# is tuned on the same data the final metrics are reported on - consider a
# separate validation split.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
)
history = model.fit(
    [training_padded, training_meta],
    training_labels,
    epochs=10,
    batch_size=32,
    validation_data=([testing_padded, testing_meta], testing_labels),
    verbose=1,
    callbacks=[early_stopping],
)

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# Sigmoid outputs of the model on the test inputs.
predictions = model.predict([testing_padded, testing_meta])
# Threshold at 0.5 to obtain hard 0/1 labels
# (the original line had an unbalanced parenthesis and did not parse).
prediction_result = (predictions > 0.5).astype('int32')

# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(testing_labels, prediction_result)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(testing_labels, prediction_result)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(testing_labels, prediction_result)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(testing_labels, prediction_result)
print('F1 score: %f' % f1)
# auc: computed from the raw scores rather than the thresholded labels
auc = roc_auc_score(testing_labels, predictions)
print('AUC : %f\n\n' % auc)

# Run only if you want to replace the saved model with current fit
# model.save('/content/drive/MyDrive/CS3244 Dataset')

#Evaluating Accuracy and Loss of the model
%matplotlib inline
acc=history.history['accuracy']
val_acc=history.history['val_accuracy']
loss=history.history['loss']
val_loss=history.history['val_loss']

epochs=range(len(acc)) #No. of epochs

#Plot training and validation accuracy per epoch
import matplotlib.pyplot as plt
plt.plot(epochs,acc,'r',label='Training Accuracy')
plt.plot(epochs,val_acc,'g',label='Testing Accuracy')
plt.legend()
plt.figure()

#Plot training and validation loss per epoch
plt.plot(epochs,loss,'r',label='Training Loss')
plt.plot(epochs,val_loss,'g',label='Testing Loss')
plt.legend()
plt.show()