In [1]:
# Mount Google Drive at /content/drive so the following cells can work inside it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Change the working directory to the Drive folder; the relative CSV paths used later are resolved from here.
%cd /content/drive/MyDrive/Github/

/content/drive/MyDrive/Github


In [3]:
# Pull the latest state of the msgmt repository into the current working directory.
!git pull https://github.com/rohit-khoiwal-30/msgmt.git

From https://github.com/rohit-khoiwal-30/msgmt
 * branch            HEAD       -> FETCH_HEAD
Already up to date.


In [4]:
import pandas as pd

In [5]:
# Load the training and evaluation tables (paths are relative to the current working directory).
# Later cells read the 'clean_text', 'reason' and 'label' columns from both frames.
df = pd.read_csv("augment_train.csv")
df_test = pd.read_csv("evaluation.csv")

In [6]:
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as tfl
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import Input
from tensorflow.keras.regularizers import l2

try:
    import tensorflow_hub as hub
except ModuleNotFoundError:
    %pip install tensorflow_hub
    import tensorflow_hub as hub

try:
    import tensorflow_text as text
except ModuleNotFoundError:
    %pip install -q -U "tensorflow-text==2.8.*"
    %pip install -q tf-models-official==2.7.0
    import tensorflow_text as text

try:
    from keras_self_attention import SeqSelfAttention
except ModuleNotFoundError:
    %pip install keras-self-attention
    from keras_self_attention import SeqSelfAttention

import numpy as np
import math as m

In [8]:
# TF Hub handles: the uncased-English BERT preprocessing model (used to build `preprocessing_layer`)
# and the small BERT encoder (L-8, H-256, A-4) loaded inside encoder1/encoder2.
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
tfhub_handle_encoder = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/2"

In [47]:
def gen_random_batch(in_groups, batch_halfsize = 8):
    """Sample one balanced batch of matching and non-matching (text, reason) pairs.

    The first half of the batch pairs each sampled text with the reason stored
    at the same position (label 1). The second half samples fresh texts and
    pairs each with the reason from a *different* position (label 0). Only the
    positions are guaranteed to differ; two rows that happen to carry the same
    reason text are not detected.

    Args:
        in_groups: Two-element sequence ``[texts, reasons]`` aligned by
            position. Both are read positionally, so a pandas Series with a
            non-default index behaves exactly like a list or NumPy array.
        batch_halfsize: Number of pairs drawn for each of the two labels.

    Returns:
        Tuple ``(text_batch, reason_batch, labels)`` of NumPy arrays, each with
        ``2 * batch_halfsize`` entries along the first axis.

    Raises:
        ValueError: If fewer than two rows are available, because a
            non-matching partner cannot be drawn.
    """
    # np.asarray gives positional indexing regardless of any pandas index labels.
    texts = np.asarray(in_groups[0])
    reasons = np.asarray(in_groups[1])
    n_rows = texts.shape[0]
    if n_rows < 2:
        raise ValueError("gen_random_batch needs at least two rows to draw non-matching pairs")

    text_batch, reason_batch, labels = [], [], []
    for match_group in (True, False):
        text_idx = np.random.randint(0, n_rows, size=batch_halfsize)
        if match_group:
            reason_idx = text_idx
        else:
            # Adding a non-zero offset modulo n_rows is uniform over every
            # *other* position, without building an n-element candidate list
            # for each sample.
            offsets = np.random.randint(1, n_rows, size=batch_halfsize)
            reason_idx = (text_idx + offsets) % n_rows

        text_batch.extend(texts[i] for i in text_idx)
        reason_batch.extend(reasons[i] for i in reason_idx)
        labels.extend([int(match_group)] * batch_halfsize)

    return np.stack(text_batch, 0), np.stack(reason_batch, 0), np.stack(labels, 0)

def siam_gen(in_groups, batch_size = 32):
    """Endlessly yield preprocessed batches of (text, reason) pairs.

    Each yielded item is ``([text_features, reason_features], labels)``, where
    both feature entries are whatever ``preprocessing_layer`` returns for the
    raw batch. ``batch_size // 2`` pairs are drawn per label on every step.

    Args:
        in_groups: Passed through unchanged to ``gen_random_batch``.
        batch_size: Requested batch size; halved with floor division.
    """
    pairs_per_label = batch_size // 2
    while True:
        raw_text, raw_reason, labels = gen_random_batch(in_groups, pairs_per_label)
        text_features = preprocessing_layer(raw_text)
        reason_features = preprocessing_layer(raw_reason)
        yield [text_features, reason_features], labels

In [10]:
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that match the labels at a 0.5 threshold.

    Both inputs are flattened before comparison, so a column-shaped ``(n, 1)``
    label array is compared element by element instead of being broadcast
    against the predictions into an ``(n, n)`` matrix.

    Args:
        y_true: Ground-truth 0/1 labels (list, NumPy array or pandas Series).
        y_pred: Predicted probabilities with the same number of elements;
            values ``>= 0.5`` count as class 1.

    Returns:
        Mean of the element-wise equality between labels and thresholded
        predictions.

    Raises:
        ValueError: If the two inputs do not contain the same number of elements.
    """
    labels = np.asarray(y_true).ravel()
    predicted = (np.asarray(y_pred).ravel() >= 0.5).astype(int)
    if labels.shape != predicted.shape:
        raise ValueError(
            "y_true and y_pred must have the same number of elements, got %d and %d"
            % (labels.size, predicted.size)
        )
    return np.mean(labels == predicted)

In [99]:
# Shared TF Hub preprocessing layer; siam_gen calls it on raw text batches before they reach the model.
preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
def encoder1(name):
    """Build the first encoder: a trainable BERT layer plus a 128-64-32 dense head.

    Args:
        name: Layer name given to the TF Hub BERT encoder.

    Returns:
        A Keras ``Model`` whose input is a dict of three int32 tensors of shape
        ``(128,)`` (``input_word_ids``, ``input_mask``, ``input_type_ids``) and
        whose output is the final 32-unit ReLU activation.
    """
    # Input layers are named "<key>1" to keep them distinct from encoder2's.
    bert_inputs = {
        key: tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name=key + "1")
        for key in ('input_word_ids', 'input_mask', 'input_type_ids')
    }

    bert_layer = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name=name)
    hidden = bert_layer(dict(bert_inputs))['pooled_output']
    hidden = tfl.Dropout(0.3)(hidden)

    # Each stage: L2-regularised linear projection -> batch norm -> ReLU.
    for units in (128, 64, 32):
        hidden = tfl.Dense(units, activation="linear", kernel_regularizer=l2(1e-3))(hidden)
        hidden = tfl.BatchNormalization()(hidden)
        hidden = tfl.Activation('relu')(hidden)

    return Model(inputs=bert_inputs, outputs=hidden)

def encoder2(name):
    """Build the second encoder: a trainable BERT layer plus a 64-32 dense head.

    Args:
        name: Layer name given to the TF Hub BERT encoder.

    Returns:
        A Keras ``Model`` whose input is a dict of three int32 tensors of shape
        ``(128,)`` (``input_word_ids``, ``input_mask``, ``input_type_ids``) and
        whose output is the final 32-unit ReLU activation.
    """
    # Input layers are named "<key>2" to keep them distinct from encoder1's.
    bert_inputs = {
        key: tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name=key + "2")
        for key in ('input_word_ids', 'input_mask', 'input_type_ids')
    }

    bert_layer = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name=name)
    hidden = bert_layer(dict(bert_inputs))["pooled_output"]
    hidden = tfl.Dropout(0.3)(hidden)

    # Each stage: L2-regularised linear projection -> batch norm -> ReLU.
    for units in (64, 32):
        hidden = tfl.Dense(units, activation="linear", kernel_regularizer=l2(1e-3))(hidden)
        hidden = tfl.BatchNormalization()(hidden)
        hidden = tfl.Activation('relu')(hidden)

    return Model(inputs=bert_inputs, outputs=hidden)

In [102]:
def get_model():
    """Assemble the two-encoder "BertModel" classifier.

    One set of three int32 inputs of shape ``(128,)`` feeds ``encoder1`` (text)
    and a second set feeds ``encoder2`` (reason). The two embeddings are
    concatenated and passed through a 16 -> 4 -> 1 dense head ending in a
    sigmoid. The shape of the concatenated tensor is printed while building.

    Returns:
        A Keras ``Model`` taking ``[text_inputs, reason_inputs]`` (two dicts
        keyed by ``input_word_ids``, ``input_mask``, ``input_type_ids``) and
        producing one sigmoid output.
    """
    feature_keys = ('input_word_ids', 'input_mask', 'input_type_ids')

    # Text branch
    inputText = {
        key: tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name=key + "1")
        for key in feature_keys
    }
    text_embedd = encoder1("textBertEncoder")(inputText)

    # Reason branch
    inputReason = {
        key: tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name=key + "2")
        for key in feature_keys
    }
    reason_embedd = encoder2("reasonBertEncoder")(inputReason)

    merged = tfl.concatenate([text_embedd, reason_embedd], name = 'merge_features')
    print(merged.shape)

    head = tfl.Dropout(0.2)(merged)
    # Each stage: L2-regularised linear projection -> batch norm -> ReLU.
    for units in (16, 4):
        head = tfl.Dense(units, activation = 'linear', kernel_regularizer=l2(1e-3))(head)
        head = tfl.BatchNormalization()(head)
        head = tfl.Activation('relu')(head)
    probability = tfl.Dense(1, activation = 'sigmoid')(head)

    return Model(inputs = [inputText, inputReason], outputs = [probability], name="BertModel")

In [103]:
# Build the classifier; get_model() prints the shape of the concatenated embedding as a side effect.
model = get_model()

(None, 64)


In [104]:
# Print the layer-by-layer structure of the assembled model.
model.summary()

Model: "BertModel"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_mask1 (InputLayer)       [(None, 128)]        0           []                               
                                                                                                  
 input_type_ids1 (InputLayer)   [(None, 128)]        0           []                               
                                                                                                  
 input_word_ids1 (InputLayer)   [(None, 128)]        0           []                               
                                                                                                  
 input_mask2 (InputLayer)       [(None, 128)]        0           []                               
                                                                                          

In [105]:
# Compile for binary classification: Adam with default settings, binary cross-entropy loss,
# and binary accuracy as the reported metric.
model.compile(optimizer=tf.keras.optimizers.Adam(), loss = "binary_crossentropy", metrics = [tf.metrics.BinaryAccuracy()])

In [None]:
# Train on the endless stream of balanced batches from siam_gen (batch size 128).
# The generator never terminates, so steps_per_epoch defines the length of an epoch.
# - validation_steps is omitted: no validation_data is supplied, so it had no effect.
# - use_multiprocessing is left at its default (False): the option needs picklable
#   generator arguments, and this generator runs a TF Hub layer on every step.
#   NOTE(review): this is the likely reason the earlier run stopped at "Epoch 1/50" — confirm.
loss_history = model.fit(siam_gen([df["clean_text"], df["reason"]], 128),
                         steps_per_epoch=50, epochs = 50,
                         verbose = True)

Epoch 1/50


In [None]:
from sklearn.metrics import brier_score_loss as brier_loss
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

In [None]:
# The model's inputs are BERT token-id dicts (int32, length 128), not raw strings, so the
# training columns go through the same preprocessing_layer that siam_gen applies during fit.
train_text_tokens = preprocessing_layer(tf.constant(df['clean_text'].tolist()))
train_reason_tokens = preprocessing_layer(tf.constant(df['reason'].tolist()))
y_preds = model.predict([train_text_tokens, train_reason_tokens])
accuracy(df['label'], y_preds)



1.0

In [None]:
# Threshold the predicted probabilities at 0.5 to obtain hard 0/1 labels.
y_hat = (y_preds >= 0.5).astype(int)

In [None]:
# Training-set report: Brier loss is scored on the raw probabilities (y_preds),
# the remaining metrics on the thresholded labels (y_hat).
train_labels = df['label']
for title, metric, preds in (
    ("Brier Loss                       : ", brier_loss, y_preds),
    ("Precision Score                  : ", precision_score, y_hat),
    ("Recall Score                     : ", recall_score, y_hat),
    ("F1 Score                         : ", f1_score, y_hat),
):
    print(title, metric(train_labels, preds))

In [None]:
# The model's inputs are BERT token-id dicts (int32, length 128), not raw strings, so the
# evaluation columns go through the same preprocessing_layer that siam_gen applies during fit.
test_text_tokens = preprocessing_layer(tf.constant(df_test['clean_text'].tolist()))
test_reason_tokens = preprocessing_layer(tf.constant(df_test['reason'].tolist()))
y_preds = model.predict([test_text_tokens, test_reason_tokens])
accuracy(df_test['label'], y_preds)

0.513

In [None]:
# Threshold the evaluation-set probabilities at 0.5 to obtain hard 0/1 labels.
y_hat = (y_preds >= 0.5).astype(int)

In [None]:
# Evaluation-set report: Brier loss, ROC AUC and BCE are scored on the raw
# probabilities (y_preds); precision, recall and F1 on the thresholded labels (y_hat).
test_labels = df_test['label']
for title, metric, preds in (
    ("Brier Loss                       : ", brier_loss, y_preds),
    ("Precision Score                  : ", precision_score, y_hat),
    ("Recall Score                     : ", recall_score, y_hat),
    ("F1 Score                         : ", f1_score, y_hat),
    ("Roc AUC Score                    : ", roc_auc_score, y_preds),
    ("BCE Loss                         : ", log_loss, y_preds),
):
    print(title, metric(test_labels, preds))

In [None]:
from sklearn.metrics import roc_curve, auc
# Compute the ROC curve points for the evaluation set; the third value returned by
# roc_curve is discarded. auc() then reduces the (fpr, tpr) curve to a single area value.
fpr, tpr, _ = roc_curve(df_test['label'], y_preds)
roc_auc = auc(fpr, tpr)

In [None]:
# Plot the evaluation-set ROC curve against the chance diagonal.
# NOTE(review): `plt` is not imported in any cell visible here — confirm that
# `import matplotlib.pyplot as plt` runs before this cell.
plt.figure()
lw = 2
plt.plot(
    fpr,
    tpr,
    color="darkorange",
    lw=lw,
    # roc_auc is the single value returned by auc(fpr, tpr); indexing it as
    # roc_auc[2] raised a TypeError, so it is formatted directly.
    label="ROC curve (area = %0.2f)" % roc_auc,
)
# Dashed diagonal from (0, 0) to (1, 1) as the chance-level reference.
plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic example")
plt.legend(loc="lower right")
plt.show()