In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Sat Apr 10 04:43:14 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.67       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/81/91/61d69d58a1af1bd81d9ca9d62c90a6de3ab80d77f27c5df65d9a2c1f5626/transformers-4.5.0-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.2MB 14.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |████████████████████████████████| 870kB 54.6MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 51.2MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=9d613a37d3

In [4]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertModel, DistilBertConfig



referring to this article: https://towardsdatascience.com/hugging-face-transformers-fine-tuning-distilbert-for-binary-classification-tasks-490f1d192379

Read the data in

In [5]:
lav_path = '/content/gdrive/MyDrive/W266Project_Lav_Shalz/train-balanced-sarcasm.csv'
shalz_path = '/content/gdrive/MyDrive/Colab Notebooks/train-balanced-sarcasm.csv'

In [6]:
df = pd.read_csv(shalz_path)

In [7]:
df.shape

(1010826, 10)

In [8]:
df.isna().sum()

label              0
comment           53
author             0
subreddit          0
score              0
ups                0
downs              0
date               0
created_utc        0
parent_comment     0
dtype: int64

In [9]:
df = df[df['comment'].notna()]
df.isna().sum()

label             0
comment           0
author            0
subreddit         0
score             0
ups               0
downs             0
date              0
created_utc       0
parent_comment    0
dtype: int64

In [10]:
# check label distribution after removing NA
df['label'].value_counts()

0    505405
1    505368
Name: label, dtype: int64

In this notebook, we want to do hyperparameter tuning in order to improve our model. This means that unlike before, we want to be able to use all of the data we have to build the model and then tune the parameters.

In [11]:
# # select a fraction of the data
# s0 = df.label[df.label.eq(0)].sample(505368).index
# s1 = df.label[df.label.eq(1)].sample(505368).index 

# df = df.loc[s0.union(s1)]
# df

In [12]:
tokenizer_case = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
tokenizer_uncase = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [13]:
# check what max length should be based on the sentence lengths in the full data

comment_lengths = list(df['comment'].str.split().apply(len))
parent_comment_lengths = list(df['parent_comment'].str.split().apply(len))
total_comment_lengths = [a + b for a, b in zip(comment_lengths, parent_comment_lengths)]


In [14]:
print("Comment Length Distribution")
print(min(comment_lengths))
print(np.percentile(comment_lengths, [25, 50, 75]))
print(max(comment_lengths))


print("Parent Comment Length Distribution")
print(min(parent_comment_lengths))
print(np.percentile(parent_comment_lengths, [25, 50, 75]))
print(max(parent_comment_lengths))


print("Total Comment Length Distribution")
print(min(total_comment_lengths))
print(np.percentile(total_comment_lengths, [25, 50, 75]))
print(max(total_comment_lengths))




Comment Length Distribution
1
[ 5.  9. 14.]
2222
Parent Comment Length Distribution
1
[ 8. 14. 26.]
4198
Total Comment Length Distribution
2
[16. 24. 40.]
4444


We have some sentences that are very long, but most of the data (75% percentile) is below 50 so we will use this as our max_length value.

Trying out the tokenizer in order to the two methods we want to try out: 

`Approach A: [CLS] [comment] [SEP] [Masking]
id: 0`


`Approach B: [CLS] [parent_comment] [SEP] [comment] [SEP] [Masking]`


In [17]:
# understanding the tokenizer
temp_sentence = df["comment"][10]
temp_parent_comment = df["parent_comment"][10]
print(temp_sentence)
print(temp_parent_comment)
temp_tokens = tokenizer_uncase.tokenize(temp_sentence)
print(temp_tokens)
print(temp_parent_comment)

I think a significant amount would be against spending their tax dollars on other people.
I bet if that money was poured into college debt or health debt relief, 81% of Americans would have been for it instead.
['i', 'think', 'a', 'significant', 'amount', 'would', 'be', 'against', 'spending', 'their', 'tax', 'dollars', 'on', 'other', 'people', '.']
I bet if that money was poured into college debt or health debt relief, 81% of Americans would have been for it instead.


In [19]:
inputs = tokenizer_uncase(temp_sentence,
          padding = 'max_length', max_length = 50, truncation = True)

inputs
# 101 at the beginning is the CLS token
# 102 in between comment and parent comment is SEP token
# 0 is padding based on the max_length

{'input_ids': [101, 1045, 2228, 1037, 3278, 3815, 2052, 2022, 2114, 5938, 2037, 4171, 6363, 2006, 2060, 2111, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [21]:
encoded_sequence = inputs["input_ids"]
encoded_sequence
decoded_sequence = tokenizer_uncase.decode(encoded_sequence)
decoded_sequence

'[CLS] i think a significant amount would be against spending their tax dollars on other people. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [22]:
inputs = tokenizer_uncase([[temp_sentence, temp_parent_comment]],
          padding = 'max_length', max_length = 50, truncation = True)
inputs

{'input_ids': [[101, 1045, 2228, 1037, 3278, 3815, 2052, 2022, 2114, 5938, 2037, 4171, 6363, 2006, 2060, 2111, 1012, 102, 1045, 6655, 2065, 2008, 2769, 2001, 8542, 2046, 2267, 7016, 2030, 2740, 7016, 4335, 1010, 6282, 1003, 1997, 4841, 2052, 2031, 2042, 2005, 2009, 2612, 1012, 102, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]}

Diff version of tokenizing that we would do below

In [None]:
MAX_LENGTH = 30
tokenizer.batch_encode_plus(temp_tokens,
                            max_length=MAX_LENGTH,
                            padding='longest', #implements dynamic padding
                            truncation=True,
                            return_attention_mask=True,
                            return_token_type_ids=False
                            )

{'input_ids': [[101, 146, 102], [101, 1341, 102], [101, 170, 102], [101, 2418, 102], [101, 2971, 102], [101, 1156, 102], [101, 1129, 102], [101, 1222, 102], [101, 5369, 102], [101, 1147, 102], [101, 3641, 102], [101, 5860, 102], [101, 1113, 102], [101, 1168, 102], [101, 1234, 102], [101, 119, 102]], 'attention_mask': [[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]]}

In [None]:
tokenizer(temp_sentence,
          padding = 'max_length', 
          max_length = MAX_LENGTH, 
          truncation = True,
          return_attention_mask=True,
          return_token_type_ids=False
          )

{'input_ids': [101, 146, 1341, 170, 2418, 2971, 1156, 1129, 1222, 5369, 1147, 3641, 5860, 1113, 1168, 1234, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

## Train Test Split
Let's split the data in train, val, test and then tokenize all of it

In [None]:
X_train, temp_text, y_train, temp_labels = train_test_split(df['comment'], df['label'], 
                                                                    random_state=0, 
                                                                    test_size=0.3, 
                                                                    stratify=df['label'])

# we will use temp_text and temp_labels to create validation and test set
X_val, X_test, y_val, y_test = train_test_split(temp_text, temp_labels, 
                                                                random_state=0, 
                                                                test_size=0.5, 
                                                                stratify=temp_labels)

In [None]:
MAX_LENGTH = 50

In [None]:
def batch_encode(tokenizer, texts, batch_size=256, max_length=MAX_LENGTH):
    """""""""
    A function that encodes a batch of texts and returns the texts'
    corresponding encodings and attention masks that are ready to be fed 
    into a pre-trained transformer model.
    
    Input:
        - tokenizer:   Tokenizer object from the PreTrainedTokenizer Class
        - texts:       List of strings where each string represents a text
        - batch_size:  Integer controlling number of texts in a batch
        - max_length:  Integer controlling max number of words to tokenize in a given text
    Output:
        - input_ids:       sequence of texts encoded as a tf.Tensor object
        - attention_mask:  the texts' attention mask encoded as a tf.Tensor object
    """""""""
    
    input_ids = []
    attention_mask = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        # inputs = tokenizer.batch_encode_plus(batch,
        #                                      max_length=max_length,
        #                                      padding='longest', #implements dynamic padding
        #                                      truncation=True,
        #                                      return_attention_mask=True,
        #                                      return_token_type_ids=False
        #                                      )

        inputs = tokenizer(batch,
                          padding = 'max_length', 
                          max_length = MAX_LENGTH, 
                          truncation = True,
                          return_attention_mask=True,
                          return_token_type_ids=False
                          )
        
        input_ids.extend(inputs['input_ids'])
        attention_mask.extend(inputs['attention_mask'])
    
    
    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)
    
  

In [None]:
### This cell takes a few minutes to run

# Encode X_train
X_train_ids, X_train_attention = batch_encode(tokenizer, X_train.tolist())

# Encode X_valid
X_val_ids, X_val_attention = batch_encode(tokenizer, X_val.tolist())

# Encode X_test
X_test_ids, X_test_attention = batch_encode(tokenizer, X_test.tolist())

In [None]:
DISTILBERT_DROPOUT = 0.2
DISTILBERT_ATT_DROPOUT = 0.2
 
# Configure DistilBERT's initialization
config = DistilBertConfig(dropout=DISTILBERT_DROPOUT, 
                          attention_dropout=DISTILBERT_ATT_DROPOUT, 
                          output_hidden_states=True)
                          
# The bare, pre-trained DistilBERT transformer model outputting raw hidden-states 
# and without any specific head on top.
distilBERT = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

# Make DistilBERT layers untrainable
for layer in distilBERT.layers:
    layer.trainable = False

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=363423424.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_transform', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Trying to understand the DistilBERT model layers a bit more

In [None]:
distilBERT.layers

[<transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertMainLayer at 0x7fc2d849c050>]

In [None]:
len(distilBERT.layers[0].weights)


100

We have 100 layers in the model, let's look at the first 10.

In [None]:
for layer in range(10):
    print(layer)
    print('Layer name: \t', distilBERT.layers[0].weights[layer].name)
    print('Layer shape: \t', distilBERT.layers[0].weights[layer].shape)


0
Layer name: 	 tf_distil_bert_model/distilbert/embeddings/word_embeddings/weight:0
Layer shape: 	 (30522, 768)
1
Layer name: 	 tf_distil_bert_model/distilbert/embeddings/position_embeddings/embeddings:0
Layer shape: 	 (512, 768)
2
Layer name: 	 tf_distil_bert_model/distilbert/embeddings/LayerNorm/gamma:0
Layer shape: 	 (768,)
3
Layer name: 	 tf_distil_bert_model/distilbert/embeddings/LayerNorm/beta:0
Layer shape: 	 (768,)
4
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._0/attention/q_lin/kernel:0
Layer shape: 	 (768, 768)
5
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._0/attention/q_lin/bias:0
Layer shape: 	 (768,)
6
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._0/attention/k_lin/kernel:0
Layer shape: 	 (768, 768)
7
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._0/attention/k_lin/bias:0
Layer shape: 	 (768,)
8
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._0/attention/v_lin/kernel:0
Layer 

Last 5 layers:

In [None]:
for layer in [99, 98, 97, 96, 95]:
    print(layer)
    print('Layer name: \t', distilBERT.layers[0].weights[layer].name)
    print('Layer shape: \t', distilBERT.layers[0].weights[layer].shape)


99
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._5/output_layer_norm/beta:0
Layer shape: 	 (768,)
98
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._5/output_layer_norm/gamma:0
Layer shape: 	 (768,)
97
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._5/ffn/lin2/bias:0
Layer shape: 	 (768,)
96
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._5/ffn/lin2/kernel:0
Layer shape: 	 (3072, 768)
95
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._5/ffn/lin1/bias:0
Layer shape: 	 (3072,)


We see the embedding layer which maps the token id to a 768 dim vector.
Next is the positional encoding which encodes the 512 BERT input positions. 
Layers 5-10 hold the weights and biases for the first self-attention layer

In [None]:
LAYER_DROPOUT = 0.2
LEARNING_RATE = 5e-5
RANDOM_STATE = 42

def build_model(transformer, max_length=MAX_LENGTH):
    """
    Template for building a model off of the BERT or DistilBERT architecture
    for a binary classification task.
    
    Input:
      - transformer:  a base Hugging Face transformer model object (BERT or DistilBERT)
                      with no added classification head attached.
      - max_length:   integer controlling the maximum number of encoded tokens 
                      in a given sequence.
    
    Output:
      - model:        a compiled tf.keras.Model with added classification layers 
                      on top of the base pre-trained model architecture.
    """
    
    # Define weight initializer with a random seed to ensure reproducibility
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=RANDOM_STATE) 
    
    # Define input layers
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,), 
                                            name='input_ids', 
                                            dtype='int32')
    input_attention_layer = tf.keras.layers.Input(shape=(max_length,), 
                                                  name='input_attention', 
                                                  dtype='int32')
    
    # DistilBERT outputs a tuple where the first element at index 0
    # represents the hidden-state at the output of the model's last layer.
    # It is a tf.Tensor of shape (batch_size, sequence_length, hidden_size=768).
    last_hidden_state = transformer([input_ids_layer, input_attention_layer])[0]
    
    # We only care about DistilBERT's output for the [CLS] token, 
    # which is located at index 0 of every encoded sequence.  
    # Splicing out the [CLS] tokens gives us 2D data.
    cls_token = last_hidden_state[:, 0, :]
    
    ##                                                 ##
    ## Define additional dropout and dense layers here ##
    ##                                                 ##
    
    # Define a single node that makes up the output layer (for binary classification)
    output = tf.keras.layers.Dense(1, 
                                   activation='sigmoid',
                                   kernel_initializer=weight_initializer,  
                                   kernel_constraint=None,
                                   bias_initializer='zeros'
                                   )(cls_token)
    
    # Define the model
    model = tf.keras.Model([input_ids_layer, input_attention_layer], output)
    
    # Compile the model
    model.compile(tf.keras.optimizers.Adam(lr=LEARNING_RATE), 
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    
    return model

In [None]:
model = build_model(distilBERT)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported


In [None]:
EPOCHS = 6
BATCH_SIZE = 64
NUM_STEPS = len(X_train.index) // BATCH_SIZE

# Train the model
train_history1 = model.fit(
    x = [X_train_ids, X_train_attention],
    y = y_train.to_numpy(),
    epochs = EPOCHS,
    batch_size = BATCH_SIZE,
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids, X_val_attention], y_val.to_numpy()),
    verbose=2
)



Epoch 1/6
11055/11055 - 929s - loss: 0.6894 - accuracy: 0.5331 - val_loss: 0.6826 - val_accuracy: 0.5583
Epoch 2/6
11055/11055 - 920s - loss: 0.6832 - accuracy: 0.5550 - val_loss: 0.6786 - val_accuracy: 0.5692
Epoch 3/6
11055/11055 - 920s - loss: 0.6810 - accuracy: 0.5604 - val_loss: 0.6766 - val_accuracy: 0.5741
Epoch 4/6
11055/11055 - 925s - loss: 0.6797 - accuracy: 0.5637 - val_loss: 0.6757 - val_accuracy: 0.5759
Epoch 5/6
11055/11055 - 928s - loss: 0.6787 - accuracy: 0.5666 - val_loss: 0.6742 - val_accuracy: 0.5795
Epoch 6/6
11055/11055 - 928s - loss: 0.6778 - accuracy: 0.5685 - val_loss: 0.6731 - val_accuracy: 0.5817


In [None]:
pd.DataFrame.from_dict(train_history1.history).to_csv('PRO_history1.csv',index=False)


In [None]:
FT_EPOCHS = 7
BATCH_SIZE = 64
NUM_STEPS = len(X_train.index)

# Unfreeze distilBERT layers and make available for training
for layer in distilBERT.layers:
    layer.trainable = True
    
# Recompile model after unfreezing
model.compile(optimizer=tf.keras.optimizers.Adam(lr=2e-5), 
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
train_history2 = model.fit(
    x = [X_train_ids, X_train_attention],
    y = y_train.to_numpy(),
    epochs = FT_EPOCHS,
    batch_size = BATCH_SIZE,
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids, X_val_attention], y_val.to_numpy()),
    verbose=2
)

Epoch 1/7
707541/707541 - 14672s - loss: 0.4802 - accuracy: 0.7646 - val_loss: 0.6034 - val_accuracy: 0.7331


In [None]:
pd.DataFrame.from_dict(train_history2.history).to_csv('PRO_history2.csv',index=False)

In [None]:
tf.keras.backend.clear_session()


## Adding additional layers

In [None]:
# resetting some of the global parameters here
LAYER_DROPOUT = 0.2
LEARNING_RATE = 5e-5
RANDOM_STATE = 42
DISTILBERT_DROPOUT = 0.2
DISTILBERT_ATT_DROPOUT = 0.2
MAX_LENGTH = 50
L2REG = 0.01

In [None]:
def build_model2(transformer, max_length=MAX_LENGTH, l2reg = L2REG, dropout_rate = LAYER_DROPOUT):
    """
    Template for building a model off of the BERT or DistilBERT architecture
    for a binary classification task.
    
    Input:
      - transformer:  a base Hugging Face transformer model object (BERT or DistilBERT)
                      with no added classification head attached.
      - max_length:   integer controlling the maximum number of encoded tokens 
                      in a given sequence.
    
    Output:
      - model:        a compiled tf.keras.Model with added classification layers 
                      on top of the base pre-trained model architecture.
    """
    
    # Define weight initializer with a random seed to ensure reproducibility
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=RANDOM_STATE) 
    
    # Define input layers
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,), 
                                            name='input_ids', 
                                            dtype='int32')
    input_attention_layer = tf.keras.layers.Input(shape=(max_length,), 
                                                  name='input_attention', 
                                                  dtype='int32')
    
    # DistilBERT outputs a tuple where the first element at index 0
    # represents the hidden-state at the output of the model's last layer.
    # It is a tf.Tensor of shape (batch_size, sequence_length, hidden_size=768).
    last_hidden_state = transformer([input_ids_layer, input_attention_layer])[0]
    
    # We only care about DistilBERT's output for the [CLS] token, 
    # which is located at index 0 of every encoded sequence.  
    # Splicing out the [CLS] tokens gives us 2D data.
    cls_token = last_hidden_state[:, 0, :]
    
    ##                                                 ##
    ## Define additional dropout and dense layers here ##
    ##                                                 ##
    dense = tf.keras.layers.Dense(512, activation='relu', kernel_regularizer= tf.keras.regularizers.l2(l2reg))(cls_token)
    dropout= tf.keras.layers.Dropout(dropout_rate)(dense)
    
    # Define a single node that makes up the output layer (for binary classification)
    output = tf.keras.layers.Dense(1, 
                                   activation='sigmoid',
                                   kernel_initializer=weight_initializer,  
                                   kernel_constraint=None,
                                   bias_initializer='zeros'
                                   )(dropout)
    
    # Define the model
    model = tf.keras.Model([input_ids_layer, input_attention_layer], output)
    
    # Compile the model
    model.compile(tf.keras.optimizers.Adam(lr=LEARNING_RATE), 
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    
    return model


In [None]:
model2 = build_model2(distilBERT)



In [None]:
EPOCHS = 7
BATCH_SIZE = 64
NUM_STEPS = len(X_train.index) // BATCH_SIZE


In [None]:
# Train the model
train_history3 = model2.fit(
    x = [X_train_ids, X_train_attention],
    y = y_train.to_numpy(),
    epochs = EPOCHS,
    batch_size = BATCH_SIZE,
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids, X_val_attention], y_val.to_numpy()),
    verbose=2
)

Epoch 1/7
11055/11055 - 879s - loss: 0.9772 - accuracy: 0.5378 - val_loss: 0.6909 - val_accuracy: 0.5607
Epoch 2/7
11055/11055 - 871s - loss: 0.6917 - accuracy: 0.5468 - val_loss: 0.6880 - val_accuracy: 0.5526
Epoch 3/7
11055/11055 - 873s - loss: 0.6889 - accuracy: 0.5511 - val_loss: 0.6847 - val_accuracy: 0.5650
Epoch 4/7
11055/11055 - 870s - loss: 0.6877 - accuracy: 0.5535 - val_loss: 0.6896 - val_accuracy: 0.5416
Epoch 5/7
11055/11055 - 870s - loss: 0.6869 - accuracy: 0.5550 - val_loss: 0.6841 - val_accuracy: 0.5615
Epoch 6/7
11055/11055 - 870s - loss: 0.6863 - accuracy: 0.5566 - val_loss: 0.6822 - val_accuracy: 0.5737
Epoch 7/7
11055/11055 - 869s - loss: 0.6859 - accuracy: 0.5581 - val_loss: 0.6857 - val_accuracy: 0.5520


In [None]:
pd.DataFrame.from_dict(train_history3.history).to_csv('PRO_history3.csv',index=False)

# from google.colab import files
# files.download('PRO_history3.csv') 

In [None]:
# try with diff hyper parameters






## Add the parent comment

In [23]:
tf.keras.backend.clear_session()


In [24]:
X_train_p, temp_text, y_train_p, temp_labels = train_test_split(df[['comment', 'parent_comment']], df['label'], 
                                                                    random_state=0, 
                                                                    test_size=0.3, 
                                                                    stratify=df['label'])

# we will use temp_text and temp_labels to create validation and test set
X_val_p, X_test_p, y_val_p, y_test_p = train_test_split(temp_text, temp_labels, 
                                                                random_state=0, 
                                                                test_size=0.5, 
                                                                stratify=temp_labels)

In [25]:
X_train_p.head()

Unnamed: 0,comment,parent_comment
715875,They do in neutral!,The only main thing you'll notice (vs driving ...
348688,But but... He was on Howard Stern voting for t...,Trump was against the war from the very beginn...
323243,"Kvothe, is that you?",Now imagine a hoodie with a great number of po...
56970,"Never ""meta"" in pro scene so it must be a shit...",What's wrong with scarab?
267644,I'm almost shocked not to see TJ McConnell here,Best and Worst Catch &amp; Shooters


In [26]:
MAX_LENGTH = 30

In [27]:
# try with parent comment connected
def batch_encode_parent(tokenizer, texts, parent, batch_size=256, max_length=MAX_LENGTH):
    """""""""
    A function that encodes a batch of texts and returns the texts'
    corresponding encodings and attention masks that are ready to be fed 
    into a pre-trained transformer model.
    
    Input:
        - tokenizer:   Tokenizer object from the PreTrainedTokenizer Class
        - texts:       List of strings where each string represents a text
        - batch_size:  Integer controlling number of texts in a batch
        - max_length:  Integer controlling max number of words to tokenize in a given text
    Output:
        - input_ids:       sequence of texts encoded as a tf.Tensor object
        - attention_mask:  the texts' attention mask encoded as a tf.Tensor object
    """""""""
    
    input_ids = []
    attention_mask = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        parent_batch = parent[i:i+batch_size]

        combined = [list(i) for i in zip(parent_batch, batch)]


        inputs = tokenizer(combined,
                          padding = 'max_length', 
                          max_length = MAX_LENGTH, 
                          truncation = True,
                          return_attention_mask=True,
                          return_token_type_ids=False
                          )
        
        input_ids.extend(inputs['input_ids'])
        attention_mask.extend(inputs['attention_mask'])
    
    
    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)
    
  

In [None]:
# temp1 = X_train_p["comment"].head(2).tolist()
# temp2 = X_train_p["parent_comment"].head(2).tolist()

# [[temp2, temp1]]

# print(temp1)
# print(temp2)

# # zip(temp1, temp2)


# [list(i) for i in zip(temp1, temp2)]


# # batch_encode_parent(tokenizer, temp1, temp2)

['They do in neutral!', "But but... He was on Howard Stern voting for the war and that's even worse than if he were a sitting senator doing the same... Because reasons..."]
["The only main thing you'll notice (vs driving a car with automatic transmission) is that the car will slow down a lot more quickly than an automatic would, manuals generally dont coast as freely.", "Trump was against the war from the very beginning too. Hillary literally voted for it. It is always such doublethink with the left it's unreal."]


[['They do in neutral!',
  "The only main thing you'll notice (vs driving a car with automatic transmission) is that the car will slow down a lot more quickly than an automatic would, manuals generally dont coast as freely."],
 ["But but... He was on Howard Stern voting for the war and that's even worse than if he were a sitting senator doing the same... Because reasons...",
  "Trump was against the war from the very beginning too. Hillary literally voted for it. It is always such doublethink with the left it's unreal."]]

In [28]:
### This cell takes a few minutes to run

# tokenizer_case
# tokenizer_uncase

# Encode X_train
X_train_ids_p, X_train_attention_p = batch_encode_parent(tokenizer_uncase, 
                                                  X_train_p["comment"].tolist(), 
                                                  X_train_p["parent_comment"].tolist())

# Encode X_valid
X_val_ids_p, X_val_attention_p = batch_encode_parent(tokenizer_uncase, 
                                              X_val_p["comment"].tolist(), 
                                              X_val_p["parent_comment"].tolist())

# Encode X_test
X_test_ids_p, X_test_attention_p = batch_encode_parent(tokenizer_uncase, 
                                                X_test_p["comment"].tolist(), 
                                                X_test_p["parent_comment"].tolist())


In [29]:
DISTILBERT_DROPOUT = 0.4
DISTILBERT_ATT_DROPOUT = 0.4
 
# Configure DistilBERT's initialization
config = DistilBertConfig(dropout=DISTILBERT_DROPOUT, 
                          attention_dropout=DISTILBERT_ATT_DROPOUT, 
                          output_hidden_states=True)
                          
# The bare, pre-trained DistilBERT transformer model outputting raw hidden-states 
# and without any specific head on top.
distilBERT = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

# Make DistilBERT layers untrainable
for layer in distilBERT.layers:
    layer.trainable = False


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=363423424.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_transform', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [30]:
LAYER_DROPOUT = 0.4
LEARNING_RATE = 5e-5
RANDOM_STATE = 42

def build_model_parent(transformer, max_length=MAX_LENGTH):
    """
    Template for building a model off of the BERT or DistilBERT architecture
    for a binary classification task.
    
    Input:
      - transformer:  a base Hugging Face transformer model object (BERT or DistilBERT)
                      with no added classification head attached.
      - max_length:   integer controlling the maximum number of encoded tokens 
                      in a given sequence.
    
    Output:
      - model:        a compiled tf.keras.Model with added classification layers 
                      on top of the base pre-trained model architecture.
    """
    
    # Define weight initializer with a random seed to ensure reproducibility
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=RANDOM_STATE) 
    
    # Define input layers
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,), 
                                            name='input_ids', 
                                            dtype='int32')
    input_attention_layer = tf.keras.layers.Input(shape=(max_length,), 
                                                  name='input_attention', 
                                                  dtype='int32')
    
    # DistilBERT outputs a tuple where the first element at index 0
    # represents the hidden-state at the output of the model's last layer.
    # It is a tf.Tensor of shape (batch_size, sequence_length, hidden_size=768).
    last_hidden_state = transformer([input_ids_layer, input_attention_layer])[0]
    
    # We only care about DistilBERT's output for the [CLS] token, 
    # which is located at index 0 of every encoded sequence.  
    # Splicing out the [CLS] tokens gives us 2D data.
    cls_token = last_hidden_state[:, 0, :]
    
    ##                                                 ##
    ## Define additional dropout and dense layers here ##
    ##                                                 ##
    
    # Define a single node that makes up the output layer (for binary classification)
    output = tf.keras.layers.Dense(1, 
                                   activation='sigmoid',
                                   kernel_initializer=weight_initializer,  
                                   kernel_constraint=None,
                                   bias_initializer='zeros'
                                   )(cls_token)
    
    # Define the model
    model = tf.keras.Model([input_ids_layer, input_attention_layer], output)
    
    # Compile the model
    model.compile(tf.keras.optimizers.Adam(lr=LEARNING_RATE), 
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    
    return model


In [31]:
model_parent = build_model_parent(distilBERT)


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported



In [32]:
EPOCHS = 6
BATCH_SIZE = 64
NUM_STEPS = len(X_train_p.index) // BATCH_SIZE

# Train the model
train_history_parent1 = model_parent.fit(
    x = [X_train_ids_p, X_train_attention_p],
    y = y_train_p.to_numpy(),
    epochs = EPOCHS,
    batch_size = BATCH_SIZE,
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids_p, X_val_attention_p], y_val_p.to_numpy()),
    verbose=2
)

Epoch 1/6
11055/11055 - 570s - loss: 0.6776 - accuracy: 0.5704 - val_loss: 0.6617 - val_accuracy: 0.6028
Epoch 2/6
11055/11055 - 561s - loss: 0.6669 - accuracy: 0.5945 - val_loss: 0.6574 - val_accuracy: 0.6071
Epoch 3/6
11055/11055 - 561s - loss: 0.6640 - accuracy: 0.5991 - val_loss: 0.6546 - val_accuracy: 0.6120
Epoch 4/6
11055/11055 - 561s - loss: 0.6627 - accuracy: 0.6016 - val_loss: 0.6529 - val_accuracy: 0.6145
Epoch 5/6
11055/11055 - 561s - loss: 0.6614 - accuracy: 0.6030 - val_loss: 0.6510 - val_accuracy: 0.6171
Epoch 6/6
11055/11055 - 562s - loss: 0.6613 - accuracy: 0.6036 - val_loss: 0.6508 - val_accuracy: 0.6175


## Testing

In [74]:
FT_EPOCHS = 4
BATCH_SIZE = 32
NUM_STEPS = len(X_train_p.index)



# Unfreeze distilBERT layers and make available for training
for layer in distilBERT.layers:
    layer.trainable = True
    
# Recompile model after unfreezing
model_parent.compile(optimizer=tf.keras.optimizers.Adam(lr=2e-5), 
              loss='binary_crossentropy',
              metrics=['accuracy'])



# Train the model
train_history_parent2 = model_parent.fit(
    x = [X_train_ids_p, X_train_attention_p],
    y = y_train_p.to_numpy(),
    epochs = FT_EPOCHS,
    batch_size = BATCH_SIZE,
    
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids_p, X_val_attention_p], y_val_p.to_numpy()),
    verbose=2
)

Epoch 1/4
707541/707541 - 6494s - loss: 0.5134 - accuracy: 0.7433 - val_loss: 0.5505 - val_accuracy: 0.7586


In [75]:
train_history_parent22 = model_parent.fit(
    x = [X_train_ids_p, X_train_attention_p],
    y = y_train_p.to_numpy(),
    epochs = FT_EPOCHS,
    batch_size = BATCH_SIZE,
    
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids_p, X_val_attention_p], y_val_p.to_numpy()),
    verbose=2
)

Epoch 1/4


KeyboardInterrupt: ignored

## Error Analysis on Comment + Parent Model

In [76]:
y_prob = model_parent.predict(x = [X_test_ids_p, X_test_attention_p], verbose=2)
y_pred = [1 if x >= 0.5 else 0 for x in y_prob]
y_true = y_test_p.values

4738/4738 - 90s


In [77]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# accuracy
acc = accuracy_score(y_true, y_pred)
print('Accuracy: %.3f' % acc)

# precision
precision = precision_score(y_true, y_pred, average='binary', zero_division = 0)
print('Precision: %.3f' % precision)

# recall
recall = recall_score(y_true, y_pred, average='binary', zero_division = 0)
print('Recall: %.3f' % recall)

#f1
score = f1_score(y_true, y_pred, average='binary')
print('F-Measure: %.3f' % score)

target_names = ['not sarc', 'sarc']
print(classification_report(y_true, y_pred, target_names=target_names))


Accuracy: 0.760
Precision: 0.757
Recall: 0.767
F-Measure: 0.762
              precision    recall  f1-score   support

    not sarc       0.76      0.75      0.76     75811
        sarc       0.76      0.77      0.76     75805

    accuracy                           0.76    151616
   macro avg       0.76      0.76      0.76    151616
weighted avg       0.76      0.76      0.76    151616



In [78]:
test_np = X_test_ids_p.numpy()
untokenized_comments = []
for elem in test_np:
  # elem_input_ids = elem[0]
  elem_tokens = tokenizer_uncase.convert_ids_to_tokens(elem)
  untokenized_comments.append(" ".join(elem_tokens))

In [79]:
untokenized_comments[:5]

['[CLS] i would like for the usa to continue to exist as a country after [SEP] and demon ##izing russia and threatening them will definitely not bring on w [SEP]',
 "[CLS] i mean , poland did kind of start the war . i can see why that wouldn ' t be pleasant for poland to [SEP] you forgot the [SEP]",
 "[CLS] when people say the colts wouldn ' t have made the playoffs with [SEP] nah , don ' t you know temporary football re ##tar ##dation is a [SEP]",
 "[CLS] a real man wouldn ' t need the beer . in fact , he [SEP] honestly , i don ' t think he needed the beer , he [SEP]",
 '[CLS] 1984 is a book basically a out communism . being forced into things . [SEP] thank you [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]']

In [81]:
fp = []
fn = []
tp = []
tn = []

for ind, (l1, l2) in enumerate(zip(y_true, y_pred)):
  if l1 == 0 and l2 == 1: #predicted as sarcastic but actually not
    fp.append(ind)
  elif l1 == 1 and l2 == 0: # predicted as not but actually are sarcastic
    fn.append(ind)
  elif l1 == 1 and l2 == 1: # predicted as sarc and correct
    tp.append(ind)
  else: # predicted as not and it's correct
    tn.append(ind)

In [82]:
[untokenized_comments[i] for i in fp]


["[CLS] i don ' t think lo ##d is even near the same level as double [SEP] don ##t you mean lucian is the best double ##lift na kappa [SEP]",
 "[CLS] so what you ' re saying is . . . po ##g stack ##ing is [SEP] br ##b , converting all my bit ##co ##ins to po ##gs [SEP]",
 '[CLS] new vi ##d so f ##nn ##y pl ##s fa ##v [SEP] thanks for telling me this was funny . . . otherwise i would not have known [SEP]',
 '[CLS] miguel cot ##to vs . daniel ge ##ale 6 . 06 . 2015 hd ( 720 ##p @ 60 f ##ps ) [SEP] video buffer ##ing [SEP] [PAD] [PAD]',
 '[CLS] because of him or because he didn \' t want to take carlisle [SEP] i \' m sure he would have dealt with pop \' s " bullshit [SEP]',
 "[CLS] hey , # ow ##s ? look what i just found : you don ' t need [SEP] the problem is , now there is no room ! [SEP]",
 '[CLS] ep ##2 [SEP] drop the accents [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] i feel like s

In [83]:
[untokenized_comments[i] for i in fn]


["[CLS] i ' ve been hearing about ks ##p for a while , but [SEP] yeah , unfortunately all the interest has died down , no one really plays [SEP]",
 '[CLS] one god damn line . half - way through the season and not a single line from kenny . [SEP] hey you have the intro every episode . [SEP]',
 "[CLS] she doesn ' t serve the lord of light . she served the seven . [SEP] sorry , i ' m drunk . [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]",
 "[CLS] josh ho - sang is wide awake [SEP] he should get to bed so he doesn ' t miss the start of training camp again . [SEP] [PAD] [PAD]",
 '[CLS] my mouse stopped working , driver update maybe ? [SEP] i tried to look up drivers for it based off of what the model of your mouse looks [SEP]',
 '[CLS] anyone else pay their followers wages ? i throw my boy der ##kee ##thus [SEP] what part of " sworn to carry your burden ##s " don \' [SEP]',
 "[CLS] woman at a doctors office laughed at me today . so i ' ve decided to [SEP] if anyone knows what ##s best for you its som

In [84]:
[untokenized_comments[i] for i in tp]


['[CLS] i would like for the usa to continue to exist as a country after [SEP] and demon ##izing russia and threatening them will definitely not bring on w [SEP]',
 "[CLS] i mean , poland did kind of start the war . i can see why that wouldn ' t be pleasant for poland to [SEP] you forgot the [SEP]",
 "[CLS] when people say the colts wouldn ' t have made the playoffs with [SEP] nah , don ' t you know temporary football re ##tar ##dation is a [SEP]",
 '[CLS] watch dogs xbox one graphic comparison [SEP] a difference you could bra ##g about [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] saying " all muslims are terrorists or terrorist sy ##mp ##athi ##zers who hate america [SEP] no no no , that \' s simply all gay people ! [SEP]',
 '[CLS] house . . . . . republican . . . . . cent ##rist ##s ? did we actually discover [SEP] you and your fancy pants science ! [SEP]',
 "[CLS] speak for yourself . i ' m not a selfish person . [SEP] ah yes , 

In [85]:
[untokenized_comments[i] for i in tn]

["[CLS] a real man wouldn ' t need the beer . in fact , he [SEP] honestly , i don ' t think he needed the beer , he [SEP]",
 '[CLS] 1984 is a book basically a out communism . being forced into things . [SEP] thank you [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 "[CLS] in halo 4 , you could use a pistol too and it was still a one hit [SEP] yes , but you couldn ' t drop the flag [SEP]",
 "[CLS] what ##s the difference between hitler and a gay guy ? about 45 [SEP] i don ' t think 45 degrees difference from room temperature is oven levels [SEP]",
 '[CLS] apple far ##ts [SEP] if ##arts ? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] for everyone excited about christopher nolan ##s dunkirk - check out the amazing 5 minute dunkirk tracking shot [SEP] this gets posted at least once a month . [SEP]',
 "[CLS] no singing so the other day , during lunch , a girl stood up [

## Extra

In [None]:
train_history_parent222 = model_parent.fit(
    x = [X_train_ids_p, X_train_attention_p],
    y = y_train_p.to_numpy(),
    epochs = FT_EPOCHS,
    batch_size = BATCH_SIZE,
    
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids_p, X_val_attention_p], y_val_p.to_numpy()),
    verbose=2
)

Epoch 1/4
707541/707541 - 6737s - loss: 0.3842 - accuracy: 0.8268 - val_loss: 0.7022 - val_accuracy: 0.7507


In [None]:
train_history_parent2222 = model_parent.fit(
    x = [X_train_ids_p, X_train_attention_p],
    y = y_train_p.to_numpy(),
    epochs = FT_EPOCHS,
    batch_size = BATCH_SIZE,
    
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids_p, X_val_attention_p], y_val_p.to_numpy()),
    verbose=2
)

Epoch 1/4
707541/707541 - 6744s - loss: 0.3246 - accuracy: 0.8586 - val_loss: 0.8182 - val_accuracy: 0.7514


In [None]:
train_history_parent22222 = model_parent.fit(
    x = [X_train_ids_p, X_train_attention_p],
    y = y_train_p.to_numpy(),
    epochs = FT_EPOCHS,
    batch_size = BATCH_SIZE,
    
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids_p, X_val_attention_p], y_val_p.to_numpy()),
    verbose=2
)

Epoch 1/4
707541/707541 - 6740s - loss: 0.2684 - accuracy: 0.8865 - val_loss: 0.8614 - val_accuracy: 0.7485


In [None]:
train_history_parent222222 = model_parent.fit(
    x = [X_train_ids_p, X_train_attention_p],
    y = y_train_p.to_numpy(),
    epochs = FT_EPOCHS,
    batch_size = BATCH_SIZE,
    
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids_p, X_val_attention_p], y_val_p.to_numpy()),
    verbose=2
)

Epoch 1/4
707541/707541 - 6753s - loss: 0.2187 - accuracy: 0.9098 - val_loss: 0.9422 - val_accuracy: 0.7442
