In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
### Extracting the Train-Val-Test Data
import pickle
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Jar of Pickles/train_val_test_data.pkl"
with open(fp,"rb") as f:
    X_train, y_train, X_val, y_val, X_test, y_test = pickle.load(file=f)

print(len(X_train))
print(y_train.shape)
print(len(X_val))
print(y_val.shape)
print(len(X_test))
print(y_test.shape)

17843
(17843, 2)
4461
(4461, 2)
2479
(2479, 2)


## 5. Solution 2: Predicting

Prediction Models Used in this File:
<br>5.1 Convolution NN (CNN)
<br>5.2 GRU Network
<br>5.3 Long Short Term Memory (LSTM)

### Custom Functions

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import GRU
from tensorflow.keras.initializers import HeNormal
from tensorflow.keras.regularizers import L2
from tensorflow.keras.callbacks import ReduceLROnPlateau 
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.metrics import roc_auc_score

# custom callback for performance metric: mean column wise AUC
class CustomMetrics(tf.keras.callbacks.Callback):

    def __init__(self,train_data,train_labels,val_data,val_labels):
        '''
        This function initializes callback object to 
        compute custom metric
        '''
        
        self.train_data = train_data
        self.train_labels = train_labels
        self.val_data = val_data
        self.val_labels = val_labels
    
    def on_epoch_end(self,epoch,logs={}):
        '''
        This function computes the mean wise column AUC at 
        the end of each epoch
        '''
        
        # predicting probabilities for training datapoints
        train_proba = self.model.predict(self.train_data)
        
        # mean column wise auc for train set
        train_auc = roc_auc_score(y_true=self.train_labels,
                                  y_score=train_proba,
                                  average="macro")
        
        # predicting probabilities for val datapoints
        val_proba = self.model.predict(self.val_data)
        
        # mean column wise auc for val set
        val_auc = roc_auc_score(y_true=self.val_labels,
                                  y_score=val_proba,
                                  average="macro")
        
        print(f"train_auc: {round(train_auc,4)} val_auc: {round(val_auc,4)}")


# custom callback to save model after each epoch
class SaveModel(tf.keras.callbacks.Callback):

    def __init__(self,file_path):
        '''
        This function initializes callback object to 
        save model
        '''
        self.file_path = file_path
    
    def on_epoch_end(self,epoch,logs={}):
        '''
        Function saves model architecture, weights and optimizer state for current epoch
        '''

        # saving the model to specified file location
        self.model.save(self.file_path)

In [None]:
def load_embeddings(embedding_type):
    '''
    Function to load embeddings based on input type specified
    '''

    # creating file path
    fp = f"/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Jar of Pickles/{embedding_type}_embedding_matrix.pkl"

    # loading embedding matrix
    with open(fp,mode="rb") as f:
      embedding_matrix = pickle.load(file=f)

    return embedding_matrix


In [None]:
def get_test_predictions(test_data,model_type):
    '''
    Function returns predictions of test data set by using input model specified
    '''

    # model file path
    fp = f"models/{model_type}.hdf5"

    # loading the model
    model = tf.keras.models.load_model(fp)

    # predicting class probabilities
    pred_proba = model.predict(test_data)

    # dataframe to store results
    pred_df = pd.DataFrame()

    

    # adding predicted probability for each class
    class_labels = ["neutral","hate"]
    for i,label in enumerate(class_labels):
      pred_df[label] = pred_proba[:,i]

    
    # filepath to save predictions
    fp = f"/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Predictions/{model_type}.csv"

    # saving to disk
    pred_df.to_csv(fp)

    print("Predictions saved to disk")

### Data Representation

#### Tokenizer

In [None]:
### Tokenizer
# initially we will tokenize the corpus without passing any parameter for num_words 
from tensorflow.keras.preprocessing.text import Tokenizer
word_tokenizer = Tokenizer(num_words=None,
                           lower=True,
                           split=" ",
                           char_level=False) 

word_tokenizer.fit_on_texts(X_train)
total_words = len(word_tokenizer.word_index)
print(f'Number of unique words in vocabulary: {total_words}')

Number of unique words in vocabulary: 17004


In [None]:
# finding the number of words which occur only once
num_words = 0
for word,count in word_tokenizer.word_counts.items():
    if count < 2:
        num_words += 1

print(f'Number of words in our vocabulary which occur only once: {num_words}')

# finding max number of words we need to consider
max_words = total_words - num_words
print(f'Number of words which occur at least two times: {max_words}')

Number of words in our vocabulary which occur only once: 9428
Number of words which occur at least two times: 7576


In [None]:
# storing all document lengths in a list
import numpy as np
document_lengths = []

for doc in X_train:
    length = len(doc.split())
    document_lengths.append(length)

# finding the percentile values from 0 to 100 incrementing by 10 each iteration
for i in range(0,110,10):
    per_val = np.percentile(document_lengths,i)
    print(f'{i} percentile value is {per_val}')

0 percentile value is 0.0
10 percentile value is 3.0
20 percentile value is 4.0
30 percentile value is 4.0
40 percentile value is 5.0
50 percentile value is 6.0
60 percentile value is 7.0
70 percentile value is 8.0
80 percentile value is 10.0
90 percentile value is 12.0
100 percentile value is 27.0


In [None]:
# finding the percentile values between 90 and 100 incrementing by 1 each iteration
for i in range(90,101):
    per_val = np.percentile(document_lengths,i)
    print(f'{i} percentile value is {per_val}')

90 percentile value is 12.0
91 percentile value is 12.0
92 percentile value is 12.0
93 percentile value is 12.0
94 percentile value is 13.0
95 percentile value is 13.0
96 percentile value is 14.0
97 percentile value is 14.0
98 percentile value is 15.0
99 percentile value is 16.0
100 percentile value is 27.0


In [None]:
word_tokenizer = Tokenizer(num_words=max_words+1,
                           lower=True,
                           split=" ",
                           char_level=False) 

word_tokenizer.fit_on_texts(X_train)

# converting train,val and test into integere sequences
X_train = word_tokenizer.texts_to_sequences(X_train)
X_val = word_tokenizer.texts_to_sequences(X_val)
X_test = word_tokenizer.texts_to_sequences(X_test)

In [None]:
# saving the tokenizer to disk
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Jar of Pickles/tokenizer.pkl"
with open(fp,mode="wb") as f:
    pickle.dump(word_tokenizer,f)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# padding of all sequences to max length of 15
max_length = 15

X_train = pad_sequences(sequences=X_train, maxlen=max_length, padding='post', truncating='post')
X_val = pad_sequences(sequences=X_val, maxlen=max_length, padding='post', truncating='post')
X_test = pad_sequences(sequences=X_test, maxlen=max_length, padding='post', truncating='post')

print(f'Shape of Training Data: {X_train.shape}')
print(f'Shape of Training Data: {X_val.shape}')
print(f'Shape of Test Data: {X_test.shape}')

Shape of Training Data: (17843, 15)
Shape of Training Data: (4461, 15)
Shape of Test Data: (2479, 15)


In [None]:
# saving the training, validation and test tokenized data
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Jar of Pickles/tokenized_data.pkl"
with open(fp,mode="wb") as f:
    pickle.dump(obj=(X_train,
                     y_train,
                     X_val,
                     y_val,
                     X_test,
                     y_test),
                file=f)

### Word-Embedding
3 Types of embedding used:
<br> GloVe
<br> Fasttext
<br> Word2Vec

#### GloVe Embedding

In [None]:
# loading pre-defined word embeddings and storing them as a dict
word_embeddings = dict()
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Word Embedding Vectors/glove.twitter.27B.50d.txt"
with open(fp,encoding='utf-8') as f:
    for line in f:
        # extracting the token and corresponsing embedding
        token = line.split()[0]  
        vector = np.asarray(line.split()[1:], dtype='float32')
        word_embeddings[token] = vector
        
print(f'Loaded 300d vector representations corresponding to {len(word_embeddings)} words') 

Loaded 300d vector representations corresponding to 1193514 words


In [None]:
# creating a weight matrix for words in our training vocab
vocab_size = max_words + 1
embedding_matrix = np.zeros((vocab_size,50))

for word,index in word_tokenizer.word_index.items(): # word_tokenizer was already fitted on X_train
    # only considering max words
    if index < vocab_size:
        glove_vector = word_embeddings.get(word)
        # updating our weight matrix if glove vector corresponding to word exists
        if glove_vector is not None:
            embedding_matrix[index] = glove_vector
            
print(f'Shape of Embedding Matrix {embedding_matrix.shape}') 

Shape of Embedding Matrix (7577, 50)


In [None]:
# saving our glove embedding matrix on disk
import pickle
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Jar of Pickles/glove_embedding_matrix.pkl"
with open(fp,mode="wb") as f:
    pickle.dump(obj=embedding_matrix,
                file=f)

#### Fasttext

In [None]:
# loading pre-defined word embeddings and storing them as a dict
word_embeddings = dict()
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Word Embedding Vectors/wiki-news-300d-1M.vec"
with open(fp,encoding='utf-8') as f:
    for line in f:
        # extracting the token and corresponsing embedding
        tokens = line.rstrip().split(" ")  
        vector = np.asarray(tokens[1:], dtype='float32')
        word_embeddings[tokens[0]] = vector
        
print(f'Loaded 300d vector representations corresponding to {len(word_embeddings)} words')  

Loaded 300d vector representations corresponding to 999995 words


In [None]:
# creating a weight matrix for words in our training vocab
vocab_size = max_words + 1
embedding_matrix = np.zeros((vocab_size,300))

for word,index in word_tokenizer.word_index.items():
    # only considering max words
    if index < vocab_size:
        fasttext_vector = word_embeddings.get(word)
        # updating our weight matrix if fasttext vector corresponding to word exists
        if fasttext_vector is not None:
            embedding_matrix[index] = fasttext_vector
            
print(f'Shape of Embedding Matrix {embedding_matrix.shape}')  

Shape of Embedding Matrix (7577, 300)


In [None]:
# saving our fasttext embedding matrix on disk
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Jar of Pickles/fasttext_embedding_matrix.pkl"
with open(fp,mode="wb") as f:
    pickle.dump(obj=embedding_matrix,
                file=f)

#### Word2Vec

In [None]:
import gensim
from gensim.models import Word2Vec

word2vec_path = '/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Word Embedding Vectors/GoogleNews-vectors-negative300.bin'
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
# word2vec.init_sims(replace=True)

In [None]:
# loading pre-defined word embeddings and storing them as a dict
word_embeddings = dict()

for key,vector in word2vec.vocab.items():
    # extracting the token and corresponsing embedding
    vector = np.asarray(word2vec.get_vector(key), dtype='float32')
    word_embeddings[key] = vector
        
print(f'Loaded 300d vector representations corresponding to {len(word_embeddings)} words') 

Loaded 300d vector representations corresponding to 3000000 words


In [None]:
# creating a weight matrix for words in our training vocab
vocab_size = max_words + 1
embedding_matrix = np.zeros((vocab_size,300))

for word,index in word_tokenizer.word_index.items():
    # only considering max words
    if index < vocab_size:
        word2vec_vector = word_embeddings.get(word)
        # updating our weight matrix if fasttext vector corresponding to word exists
        if word2vec_vector is not None:
            embedding_matrix[index] = word2vec_vector
            
print(f'Shape of Embedding Matrix {embedding_matrix.shape}')  

Shape of Embedding Matrix (7577, 300)


In [None]:
# saving our word2vec embedding matrix on disk
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Jar of Pickles/word2vec_embedding_matrix.pkl"
with open(fp,mode="wb") as f:
    pickle.dump(obj=embedding_matrix,
                file=f)

### NN Models

#### Load Tokenized Data 

In [None]:
# loading tokenized data 
import pickle
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Jar of Pickles/tokenized_data.pkl"
with open(fp,"rb") as f:
    X_train, y_train, X_val, y_val, X_test, y_test = pickle.load(file=f)

#### Load Embeddings

In [None]:
# loading the glove embeddings
word_embedding_matrix_glove = load_embeddings(embedding_type="glove")

# loading the fasttext embeddings
word_embedding_matrix_fasttext = load_embeddings(embedding_type="fasttext")

# loading the fasttext embeddings
word_embedding_matrix_word2vec = load_embeddings(embedding_type="word2vec")

#### CNN

In [None]:
from tensorflow.keras.initializers import HeNormal
def get_cnn_architecture(max_length,vocab_size,embedding_matrix,output_dim):
    '''
    Function creates CNN architecture with 1d conv layers
    '''

    # clearing backend session
    tf.keras.backend.clear_session()

    # defining kernel initializer and regularizer
    initializer = HeNormal()
    regularizer = L2(l2=0.01)

    # defining input and embedding layers
    input_layer = Input(shape=(max_length,))
    embedding = Embedding(input_dim=vocab_size,output_dim=output_dim,input_length=max_length,weights=[embedding_matrix],trainable=False)(input_layer)

    # defining the first set of conv1d layers
    conv_a_1 = Conv1D(50,3,1,activation='relu',kernel_initializer=initializer,padding='same')(embedding)
    conv_a_2 = Conv1D(50,4,1,activation='relu',kernel_initializer=initializer,padding='same')(embedding)
    conv_a_3 = Conv1D(50,5,1,activation='relu',kernel_initializer=initializer,padding='same')(embedding)

    # concatenating and max pool first set of conv1d layers
    concat_a = concatenate([conv_a_1,conv_a_2,conv_a_3])
    maxpool_a = MaxPooling1D(pool_size=2,strides=1)(concat_a)

    # defining the second set of conv1d layers
    conv_b_1 = Conv1D(50,3,1,activation='relu',kernel_initializer=initializer,padding='same')(maxpool_a)
    conv_b_2 = Conv1D(50,4,1,activation='relu',kernel_initializer=initializer,padding='same')(maxpool_a)
    conv_b_3 = Conv1D(50,5,1,activation='relu',kernel_initializer=initializer,padding='same')(maxpool_a)

    # concatenating and max pool second set of conv1d layers
    concat_b = concatenate([conv_b_1,conv_b_2,conv_b_3])
    maxpool_b = MaxPooling1D(pool_size=2,strides=1)(concat_b)

    # final conv1d layer and dense layers
    conv_c = Conv1D(50,5,1,activation='relu',kernel_initializer=initializer)(maxpool_b)
    flatten = Flatten()(conv_c)
    drop_1 = Dropout(rate=0.5)(flatten)
    dense_1 = Dense(units=32,activation='relu',kernel_initializer=initializer,kernel_regularizer=regularizer)(drop_1)
    output_layer = Dense(units=2,activation='sigmoid',kernel_initializer=initializer)(dense_1)

    # creating the model
    model = Model(inputs=input_layer,outputs=output_layer)

    return model

##### CNN: GloVe

In [None]:
model = get_cnn_architecture(max_length=max_length,
                             vocab_size=vocab_size,
                             embedding_matrix=word_embedding_matrix_glove,
                             output_dim = 50)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 15)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 15, 50)       378850      ['input_1[0][0]']                
                                                                                                  
 conv1d (Conv1D)                (None, 15, 50)       7550        ['embedding[0][0]']              
                                                                                                  
 conv1d_1 (Conv1D)              (None, 15, 50)       10050       ['embedding[0][0]']              
                                                                                              

In [None]:
# defining callbacks

# filepath to save model
filepath = "models/cnn-glove.hdf5"

custom_metric = CustomMetrics(train_data=X_train,
                              train_labels=y_train,
                              val_data=X_val,
                              val_labels=y_val)
early_stop = EarlyStopping(monitor="val_loss",
                           patience=2,
                           verbose=1)
save_model = SaveModel(file_path=filepath)

# adding callbacks to single list
callbacks = [custom_metric,early_stop,save_model]

In [None]:
# compiling the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy')

In [None]:
# training the model
model.fit(X_train,y_train,validation_data=(X_val,y_val),batch_size=64,epochs=20,callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 6: early stopping


<keras.callbacks.History at 0x7fc0ed693490>

In [None]:
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/cnn-glove"
model.save(fp)

INFO:tensorflow:Assets written to: /content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/cnn-glove/assets


In [None]:
cnn_glove_auc = roc_auc_score(y_true=y_test,
                                  y_score=model.predict(X_test),
                                  average="macro")
print(cnn_glove_auc)

0.9774757401469107


In [None]:
cnn_glove_auc_train = roc_auc_score(y_true=y_train,
                                  y_score=model.predict(X_train),
                                  average="macro")
print(cnn_glove_auc_train)

cnn_glove_auc_val = roc_auc_score(y_true=y_val,
                                  y_score=model.predict(X_val),
                                  average="macro")
print(cnn_glove_auc_val)

0.9971973737515817
0.9793549265399843


In [None]:
# predicting on test data
get_test_predictions(test_data=X_test,
                     model_type="cnn-glove")

Predictions saved to disk


##### CNN: Fasttext

In [None]:
model2 = get_cnn_architecture(max_length=max_length,
                             vocab_size=vocab_size,
                             embedding_matrix=word_embedding_matrix_fasttext,
                             output_dim = 300)

model2.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 15)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 15, 300)      2273100     ['input_1[0][0]']                
                                                                                                  
 conv1d (Conv1D)                (None, 15, 50)       45050       ['embedding[0][0]']              
                                                                                                  
 conv1d_1 (Conv1D)              (None, 15, 50)       60050       ['embedding[0][0]']              
                                                                                              

In [None]:
# defining callbacks

# filepath to save model
filepath = "models/cnn-fasttext.hdf5"

custom_metric = CustomMetrics(train_data=X_train,
                              train_labels=y_train,
                              val_data=X_val,
                              val_labels=y_val)
early_stop = EarlyStopping(monitor="val_loss",
                           patience=2,
                           verbose=1)
save_model = SaveModel(file_path=filepath)

# adding callbacks to single list
callbacks = [custom_metric,early_stop,save_model]

In [None]:
# compiling the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model2.compile(optimizer=optimizer,
              loss='binary_crossentropy')

In [None]:
model2.fit(X_train,y_train,validation_data=(X_val,y_val),batch_size=64,epochs=20,callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 4: early stopping


<keras.callbacks.History at 0x7f71e8031850>

In [None]:
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/cnn-fasttext"
model2.save(fp)

INFO:tensorflow:Assets written to: /content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/cnn-fasttext/assets


In [None]:
cnn_fasttext_auc = roc_auc_score(y_true=y_test,
                                  y_score=model2.predict(X_test),
                                  average="macro")
print(cnn_fasttext_auc)

0.9811522964129908


In [None]:
cnn_fasttext_auc_train = roc_auc_score(y_true=y_train,
                                  y_score=model2.predict(X_train),
                                  average="macro")
print(cnn_fasttext_auc_train)

cnn_fasttext_auc_val = roc_auc_score(y_true=y_val,
                                  y_score=model2.predict(X_val),
                                  average="macro")
print(cnn_fasttext_auc_val)

0.9989300490211168
0.9782803975703236


In [None]:
get_test_predictions(test_data = X_test, model_type = "cnn-fasttext")

Predictions saved to disk


##### CNN: Word2Vec

In [None]:
model3 = get_cnn_architecture(max_length=max_length,
                             vocab_size=vocab_size,
                             embedding_matrix=word_embedding_matrix_word2vec,
                             output_dim = 300)

model3.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 15)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 15, 300)      2273100     ['input_1[0][0]']                
                                                                                                  
 conv1d (Conv1D)                (None, 15, 50)       45050       ['embedding[0][0]']              
                                                                                                  
 conv1d_1 (Conv1D)              (None, 15, 50)       60050       ['embedding[0][0]']              
                                                                                              

In [None]:
# defining callbacks

# filepath to save model
filepath = "models/cnn-word2vec.hdf5"

custom_metric = CustomMetrics(train_data=X_train,
                              train_labels=y_train,
                              val_data=X_val,
                              val_labels=y_val)
early_stop = EarlyStopping(monitor="val_loss",
                           patience=2,
                           verbose=1)
save_model = SaveModel(file_path=filepath)

# adding callbacks to single list
callbacks = [custom_metric,early_stop,save_model]

In [None]:
# compiling the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model3.compile(optimizer=optimizer,
              loss='binary_crossentropy')

In [None]:
model3.fit(X_train,y_train,validation_data=(X_val,y_val),batch_size=64,epochs=20,callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 5: early stopping


<keras.callbacks.History at 0x7f72116e1fd0>

In [None]:
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/cnn-word2vec"
model3.save(fp)

INFO:tensorflow:Assets written to: /content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/cnn-word2vec/assets


In [None]:
cnn_word2vec_auc = roc_auc_score(y_true=y_test,
                                  y_score=model3.predict(X_test),
                                  average="macro")
print(cnn_word2vec_auc)

0.977303870390395


In [None]:
cnn_word2vec_auc_train = roc_auc_score(y_true=y_train,
                                  y_score=model3.predict(X_train),
                                  average="macro")
print(cnn_word2vec_auc_train)

cnn_word2vec_auc_val = roc_auc_score(y_true=y_val,
                                  y_score=model3.predict(X_val),
                                  average="macro")
print(cnn_word2vec_auc_val)

0.9995001439154115
0.9733056431563925


In [None]:
get_test_predictions(test_data = X_test, model_type = "cnn-word2vec")

Predictions saved to disk


#### LSTM

In [None]:
def get_lstm_architecture(max_length,vocab_size,embedding_matrix,output_dim):
    '''
    Function creates LSTM architecture with the input embedding matrix specified 
    '''

    # clearing backend session
    tf.keras.backend.clear_session()

    # defining input and embedding layers
    input_layer = Input(shape=(max_length,))
    embedding = Embedding(input_dim=vocab_size,output_dim=output_dim,input_length=max_length,weights=[embedding_matrix],trainable=False)(input_layer) 

    # bi-directional lstm layers
    lstm_output_1 = Bidirectional(LSTM(units=64,return_sequences=True))(embedding)
    drop = Dropout(rate=0.5)(lstm_output_1)
    lstm_output_2 = Bidirectional(LSTM(units=64,return_sequences=False))(drop)

    # output layer
    output_layer = Dense(units=2,activation='sigmoid')(lstm_output_2)

    # creating the model
    model = Model(inputs=input_layer,outputs=output_layer)

    return model

##### LSTM: GloVe

In [None]:
# getting model architecture
model4 = get_lstm_architecture(max_length=max_length,
                             vocab_size=vocab_size,
                             embedding_matrix=word_embedding_matrix_glove,
                              output_dim=50)


model4.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 15)]              0         
                                                                 
 embedding (Embedding)       (None, 15, 50)            378850    
                                                                 
 bidirectional (Bidirectiona  (None, 15, 128)          58880     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 15, 128)           0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 2)                 258   

In [None]:
# defining callbacks

# filepath to save model
filepath = "models/lstm-glove.hdf5"

custom_metric = CustomMetrics(train_data=X_train,
                              train_labels=y_train,
                              val_data=X_val,
                              val_labels=y_val)
reduced_lr = ReduceLROnPlateau(monitor="val_loss",
                               patience=1,
                               verbose=1)
early_stop = EarlyStopping(monitor="val_loss",
                           patience=2,
                           verbose=1)
save_model = SaveModel(file_path=filepath)

# adding callbacks to single list
callbacks = [custom_metric,early_stop,save_model,reduced_lr]

In [None]:
# compiling the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model4.compile(optimizer=optimizer,
              loss='binary_crossentropy')

In [None]:
model4.fit(X_train,y_train,validation_data=(X_val,y_val),batch_size=128,epochs=20,callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20

Epoch 3: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

Epoch 7: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20

Epoch 11: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 12/20

Epoch 12: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.
Epoch 13/20

Epoch 13: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-08.
Epoch 14/20

Epoch 14: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-09.
Epoch 15/20

Epoch 15: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-10.
Epoch 15: early stopping


<keras.callbacks.History at 0x7f7210e522d0>

In [None]:
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/lstm-glove"
model4.save(fp)



INFO:tensorflow:Assets written to: /content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/lstm-glove/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/lstm-glove/assets


In [None]:
lstm_glove_auc = roc_auc_score(y_true=y_test,
                                  y_score=model4.predict(X_test),
                                  average="macro")
print(lstm_glove_auc)

0.9809422657444349


In [None]:
lstm_glove_auc_train = roc_auc_score(y_true=y_train,
                                  y_score=model4.predict(X_train),
                                  average="macro")
print(lstm_glove_auc_train)

lstm_glove_auc_val = roc_auc_score(y_true=y_val,
                                  y_score=model4.predict(X_val),
                                  average="macro")
print(lstm_glove_auc_val)

0.9876593264938498
0.9812165142603931


In [None]:
# predicting on test data
get_test_predictions(test_data=X_test,model_type="lstm-glove")

Predictions saved to disk


##### LSTM: Fasttext

In [None]:
# getting model architecture
model5 = get_lstm_architecture(max_length=max_length,
                             vocab_size=vocab_size,
                             embedding_matrix=word_embedding_matrix_fasttext,
                              output_dim=300)


model5.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 15)]              0         
                                                                 
 embedding (Embedding)       (None, 15, 300)           2273100   
                                                                 
 bidirectional (Bidirectiona  (None, 15, 128)          186880    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 15, 128)           0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 2)                 258   

In [None]:
# defining callbacks

# filepath to save model
filepath = "models/lstm-fasttext.hdf5"

custom_metric = CustomMetrics(train_data=X_train,
                              train_labels=y_train,
                              val_data=X_val,
                              val_labels=y_val)
reduced_lr = ReduceLROnPlateau(monitor="val_loss",
                               patience=1,
                               verbose=1)
early_stop = EarlyStopping(monitor="val_loss",
                           patience=2,
                           verbose=1)
model_checkpoint = ModelCheckpoint(filepath=filepath,
                                   monitor="val_loss",
                                   save_best_only=True)

# adding callbacks to single list
callbacks = [custom_metric,early_stop,model_checkpoint,reduced_lr]

In [None]:
# compiling the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model5.compile(optimizer=optimizer,
              loss='binary_crossentropy')

In [None]:
model5.fit(X_train,y_train,validation_data=(X_val,y_val),batch_size=128,epochs=20,callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

Epoch 7: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 8/20

Epoch 8: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 8: early stopping


<keras.callbacks.History at 0x7f71df47d610>

In [None]:
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/lstm-fasttext"
model5.save(fp)



INFO:tensorflow:Assets written to: /content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/lstm-fasttext/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/lstm-fasttext/assets


In [None]:
lstm_fasttext_auc = roc_auc_score(y_true=y_test,
                                  y_score=model5.predict(X_test),
                                  average="macro")
print(lstm_fasttext_auc)

0.98336009452254


In [None]:
lstm_fasttext_auc_train = roc_auc_score(y_true=y_train,
                                  y_score=model5.predict(X_train),
                                  average="macro")
print(lstm_fasttext_auc_train)

lstm_fasttext_auc_val = roc_auc_score(y_true=y_val,
                                  y_score=model5.predict(X_val),
                                  average="macro")
print(lstm_fasttext_auc_val)

0.9929616769324829
0.9796502196894711


In [None]:
# predicting on test data
get_test_predictions(test_data=X_test,model_type="lstm-fasttext")

Predictions saved to disk


##### LSTM: Word2Vec

In [None]:
# getting model architecture
model6 = get_lstm_architecture(max_length=max_length,
                             vocab_size=vocab_size,
                             embedding_matrix=word_embedding_matrix_word2vec,
                              output_dim=300)


model6.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 15)]              0         
                                                                 
 embedding (Embedding)       (None, 15, 300)           2273100   
                                                                 
 bidirectional (Bidirectiona  (None, 15, 128)          186880    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 15, 128)           0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 2)                 258   

In [None]:
# defining callbacks

# filepath to save model
filepath = "models/lstm-word2vec.hdf5"

custom_metric = CustomMetrics(train_data=X_train,
                              train_labels=y_train,
                              val_data=X_val,
                              val_labels=y_val)
reduced_lr = ReduceLROnPlateau(monitor="val_loss",
                               patience=1,
                               verbose=1)
early_stop = EarlyStopping(monitor="val_loss",
                           patience=2,
                           verbose=1)
model_checkpoint = ModelCheckpoint(filepath=filepath,
                                   monitor="val_loss",
                                   save_best_only=True)

# adding callbacks to single list
callbacks = [custom_metric,early_stop,model_checkpoint,reduced_lr]

In [None]:
# compiling the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model6.compile(optimizer=optimizer,
              loss='binary_crossentropy')

In [None]:
model6.fit(X_train,y_train,validation_data=(X_val,y_val),batch_size=128,epochs=20,callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

Epoch 4: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 5/20
Epoch 6/20

Epoch 6: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 7/20

Epoch 7: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 7: early stopping


<keras.callbacks.History at 0x7f720d58f650>

In [None]:
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/lstm-word2vec"
model6.save(fp)



INFO:tensorflow:Assets written to: /content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/lstm-word2vec/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/lstm-word2vec/assets


In [None]:
lstm_word2vec_auc = roc_auc_score(y_true=y_test,
                                  y_score=model6.predict(X_test),
                                  average="macro")
print(lstm_word2vec_auc)

0.9793377596107238


In [None]:
lstm_word2vec_auc_train = roc_auc_score(y_true=y_train,
                                  y_score=model6.predict(X_train),
                                  average="macro")
print(lstm_word2vec_auc_train)

lstm_word2vec_auc_val = roc_auc_score(y_true=y_val,
                                  y_score=model6.predict(X_val),
                                  average="macro")
print(lstm_word2vec_auc_val)

0.9893692572863777
0.9773577413562912


In [None]:
# predicting on test data
get_test_predictions(test_data=X_test,model_type="lstm-word2vec")

Predictions saved to disk


#### GRU

In [None]:
def get_gru_architecture(max_length,vocab_size,embedding_matrix, output_dim):
    '''
    Function creates GRU architecture with the input embedding matrix specified 
    '''

    # clearing backend session
    tf.keras.backend.clear_session()

    # defining input and embedding layers
    input_layer = Input(shape=(max_length,))
    embedding = Embedding(input_dim=vocab_size,output_dim=output_dim,input_length=max_length,weights=[embedding_matrix],trainable=False)(input_layer) 

    # bi-directional GRU layers with MaxPooling1D
    gru_output_1 = Bidirectional(GRU(units=64,return_sequences=True))(embedding)
    max_pool = MaxPooling1D()(gru_output_1)
    drop = Dropout(rate=0.5)(max_pool)
    gru_output_2 = Bidirectional(GRU(units=64,return_sequences=False))(drop)

    # output layer
    output_layer = Dense(units=2,activation='sigmoid')(gru_output_2)

    # creating the model
    model = Model(inputs=input_layer,outputs=output_layer)

    return model

##### GRU: GloVe

In [None]:
# getting model architecture
model7 = get_gru_architecture(max_length=max_length,
                             vocab_size=vocab_size,
                             embedding_matrix=word_embedding_matrix_glove,
                              output_dim = 50)


model7.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 15)]              0         
                                                                 
 embedding (Embedding)       (None, 15, 50)            378850    
                                                                 
 bidirectional (Bidirectiona  (None, 15, 128)          44544     
 l)                                                              
                                                                 
 max_pooling1d (MaxPooling1D  (None, 7, 128)           0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 7, 128)            0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              74496 

In [None]:
# defining callbacks

# filepath to save model
filepath = "models/gru-glove.hdf5"

custom_metric = CustomMetrics(train_data=X_train,
                              train_labels=y_train,
                              val_data=X_val,
                              val_labels=y_val)
reduced_lr = ReduceLROnPlateau(monitor="val_loss",
                               patience=1,
                               verbose=1)
early_stop = EarlyStopping(monitor="val_loss",
                           patience=2,
                           verbose=1)
save_model = SaveModel(file_path=filepath)

# adding callbacks to single list
callbacks = [custom_metric,early_stop,save_model,reduced_lr]

In [None]:
# compiling the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model7.compile(optimizer=optimizer,
              loss='binary_crossentropy')

In [None]:
model7.fit(X_train,y_train,validation_data=(X_val,y_val),batch_size=128,epochs=20,callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

Epoch 4: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20

Epoch 8: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 9/20

Epoch 9: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 9: early stopping


<keras.callbacks.History at 0x7f72086f0550>

In [None]:
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/gru-glove"
model7.save(fp)



INFO:tensorflow:Assets written to: /content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/gru-glove/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/gru-glove/assets


In [None]:
gru_glove_auc = roc_auc_score(y_true=y_test,
                                  y_score=model7.predict(X_test),
                                  average="macro")
print(gru_glove_auc)

0.9799390124538573


In [None]:
gru_glove_auc_train = roc_auc_score(y_true=y_train,
                                  y_score=model7.predict(X_train),
                                  average="macro")
print(gru_glove_auc_train)

gru_glove_auc_val = roc_auc_score(y_true=y_val,
                                  y_score=model7.predict(X_val),
                                  average="macro")
print(gru_glove_auc_val)

0.9880151941420023
0.9819410794852907


In [None]:
# predicting on test data
get_test_predictions(test_data=X_test,model_type="gru-glove")

Predictions saved to disk


##### GRU: Fasttext

In [None]:
# getting model architecture
model8 = get_gru_architecture(max_length=max_length,
                             vocab_size=vocab_size,
                             embedding_matrix=word_embedding_matrix_fasttext,
                              output_dim = 300)


model8.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 15)]              0         
                                                                 
 embedding (Embedding)       (None, 15, 300)           2273100   
                                                                 
 bidirectional (Bidirectiona  (None, 15, 128)          140544    
 l)                                                              
                                                                 
 max_pooling1d (MaxPooling1D  (None, 7, 128)           0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 7, 128)            0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              74496 

In [None]:
# defining callbacks

# filepath to save model
filepath = "models/gru-fasttext.hdf5"

custom_metric = CustomMetrics(train_data=X_train,
                              train_labels=y_train,
                              val_data=X_val,
                              val_labels=y_val)
reduced_lr = ReduceLROnPlateau(monitor="val_loss",
                               patience=1,
                               verbose=1)
early_stop = EarlyStopping(monitor="val_loss",
                           patience=2,
                           verbose=1)
save_model = SaveModel(file_path=filepath)

# adding callbacks to single list
callbacks = [custom_metric,early_stop,save_model,reduced_lr]

In [None]:
# compiling the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model8.compile(optimizer=optimizer,
              loss='binary_crossentropy')

In [None]:
model8.fit(X_train,y_train,validation_data=(X_val,y_val),batch_size=128,epochs=20,callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

Epoch 5: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 6/20

Epoch 6: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 6: early stopping


<keras.callbacks.History at 0x7f720be18490>

In [None]:
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/gru-fasttext"
model8.save(fp)



INFO:tensorflow:Assets written to: /content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/gru-fasttext/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/gru-fasttext/assets


In [None]:
gru_fasttext_auc = roc_auc_score(y_true=y_test,
                                  y_score=model8.predict(X_test),
                                  average="macro")
print(gru_fasttext_auc)

0.9826877633394235


In [None]:
gru_fasttext_auc_train = roc_auc_score(y_true=y_train,
                                  y_score=model8.predict(X_train),
                                  average="macro")
print(gru_fasttext_auc_train)

gru_fasttext_auc_val = roc_auc_score(y_true=y_val,
                                  y_score=model8.predict(X_val),
                                  average="macro")
print(gru_fasttext_auc_val)

0.9900768413926337
0.9803287465183463


In [None]:
# predicting on test data
get_test_predictions(test_data=X_test,model_type="gru-fasttext")

Predictions saved to disk


##### GRU: Word2vec

In [None]:
# getting model architecture
model9 = get_gru_architecture(max_length=max_length,
                             vocab_size=vocab_size,
                             embedding_matrix=word_embedding_matrix_word2vec,
                              output_dim = 300)


model9.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 15)]              0         
                                                                 
 embedding (Embedding)       (None, 15, 300)           2273100   
                                                                 
 bidirectional (Bidirectiona  (None, 15, 128)          140544    
 l)                                                              
                                                                 
 max_pooling1d (MaxPooling1D  (None, 7, 128)           0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 7, 128)            0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              74496 

In [None]:
# defining callbacks

# filepath to save model
filepath = "models/gru-word2vec.hdf5"

custom_metric = CustomMetrics(train_data=X_train,
                              train_labels=y_train,
                              val_data=X_val,
                              val_labels=y_val)
reduced_lr = ReduceLROnPlateau(monitor="val_loss",
                               patience=1,
                               verbose=1)
early_stop = EarlyStopping(monitor="val_loss",
                           patience=2,
                           verbose=1)
save_model = SaveModel(file_path=filepath)

# adding callbacks to single list
callbacks = [custom_metric,early_stop,save_model,reduced_lr]

In [None]:
# compiling the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model9.compile(optimizer=optimizer,
              loss='binary_crossentropy')

In [None]:
model9.fit(X_train,y_train,validation_data=(X_val,y_val),batch_size=128,epochs=20,callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20

Epoch 3: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 4/20
Epoch 5/20

Epoch 5: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 6/20

Epoch 6: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 6: early stopping


<keras.callbacks.History at 0x7f7207a10e50>

In [None]:
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/gru-word2vec"
model9.save(fp)



INFO:tensorflow:Assets written to: /content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/gru-word2vec/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Models/gru-word2vec/assets


In [None]:
gru_word2vec_auc = roc_auc_score(y_true=y_test,
                                  y_score=model9.predict(X_test),
                                  average="macro")
print(gru_word2vec_auc)

0.9798452123494537


In [None]:
gru_word2vec_auc_train = roc_auc_score(y_true=y_train,
                                  y_score=model9.predict(X_train),
                                  average="macro")
print(gru_word2vec_auc_train)

gru_word2vec_auc_val = roc_auc_score(y_true=y_val,
                                  y_score=model9.predict(X_val),
                                  average="macro")
print(gru_word2vec_auc_val)

0.9860869298517413
0.9779608263604346


In [None]:
# predicting on test data
get_test_predictions(test_data=X_test,model_type="gru-word2vec")

Predictions saved to disk


#### Ensemble

In [10]:
# Taking simple average of predictions on test data from all previously trained models
import pandas as pd
# predictions folder file path
filepath = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Predictions/"

# class label columns
cols = list(pd.read_csv(filepath + "cnn-glove.csv", nrows =1))
required_cols = [col for col in cols if col != "Unnamed"]

# loading predictions
cnn_glove = pd.read_csv(filepath + "cnn-glove.csv",usecols=required_cols)
cnn_fasttext = pd.read_csv(filepath + "cnn-fasttext.csv",usecols=required_cols)
cnn_word2vec = pd.read_csv(filepath + "cnn-word2vec.csv",usecols=required_cols)
lstm_glove = pd.read_csv(filepath + "lstm-glove.csv",usecols=required_cols)
lstm_fasttext = pd.read_csv(filepath + "lstm-fasttext.csv",usecols=required_cols)
lstm_word2vec = pd.read_csv(filepath + "lstm-word2vec.csv",usecols=required_cols)
gru_glove = pd.read_csv(filepath + "gru-glove.csv",usecols=required_cols)
gru_fasttext = pd.read_csv(filepath + "gru-fasttext.csv",usecols=required_cols)
gru_word2vec = pd.read_csv(filepath + "gru-word2vec.csv",usecols=required_cols)

# taking average of all model predictions
ensemble_predictions = (cnn_glove + cnn_fasttext + cnn_word2vec + lstm_glove + lstm_fasttext + lstm_word2vec + gru_glove + gru_fasttext + gru_word2vec) / 9

# saving predictions to disk
ensemble_predictions.to_csv(filepath + "ensemble_predictions.csv")
print("Predictions saved to disk")

Predictions saved to disk


In [13]:
import numpy as np
ensemble_predictions[["neutral","hate"]].to_numpy()
np.argmax(ensemble_predictions[["neutral","hate"]].to_numpy(), axis = 1)

array([0, 1, 1, ..., 1, 1, 1])

In [14]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [17]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
y_test_new = []
for i in y_test:
  if i[0] == 1:
    y_test_new.append(0)
  elif i[1] == 1:
    y_test_new.append(1)

In [18]:
accuracy_score(y_test_new,np.argmax(ensemble_predictions[["neutral","hate"]].to_numpy(), axis = 1))

0.9532069382815651

#### Comparison Table

In [19]:
from prettytable import PrettyTable
# creating table object
table = PrettyTable(field_names=["Model","Train Score","Val Score","Test Score"])

# adding rows
table.add_row(["CNN-GloVe",0.9920, 0.9801, 0.9777])
table.add_row(["CNN-Fasttext",0.9989, 0.9783, 0.9812])
table.add_row(["CNN-word2vec",0.9995, 0.9733, 0.9773])
table.add_row(["LSTM-GloVe",0.9877, 0.9812, 0.9809])
table.add_row(["LSTM-Fasttext",0.9930, 0.9797,0.9834])
table.add_row(["LSTM-word2vec",0.9894, 0.9774, 0.9793])
table.add_row(["GRU-GloVe",0.9880,0.9819,0.9799])
table.add_row(["GRU-Fasttext",0.9901, 0.9803, 0.9827])
table.add_row(["GRU-word2vec",0.9861, 0.9780, 0.9798])
table.add_row(["Ensemble (Simple Average)","--","--",0.9532])

print(table)

+---------------------------+-------------+-----------+------------+
|           Model           | Train Score | Val Score | Test Score |
+---------------------------+-------------+-----------+------------+
|         CNN-GloVe         |    0.992    |   0.9801  |   0.9777   |
|        CNN-Fasttext       |    0.9989   |   0.9783  |   0.9812   |
|        CNN-word2vec       |    0.9995   |   0.9733  |   0.9773   |
|         LSTM-GloVe        |    0.9877   |   0.9812  |   0.9809   |
|       LSTM-Fasttext       |    0.993    |   0.9797  |   0.9834   |
|       LSTM-word2vec       |    0.9894   |   0.9774  |   0.9793   |
|         GRU-GloVe         |    0.988    |   0.9819  |   0.9799   |
|        GRU-Fasttext       |    0.9901   |   0.9803  |   0.9827   |
|        GRU-word2vec       |    0.9861   |   0.978   |   0.9798   |
| Ensemble (Simple Average) |      --     |     --    |   0.9532   |
+---------------------------+-------------+-----------+------------+
