In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |▏                               | 10kB 24.9MB/s eta 0:00:01[K     |▎                               | 20kB 32.3MB/s eta 0:00:01[K     |▌                               | 30kB 22.5MB/s eta 0:00:01[K     |▋                               | 40kB 26.0MB/s eta 0:00:01[K     |▉                               | 51kB 24.1MB/s eta 0:00:01[K     |█                               | 61kB 26.8MB/s eta 0:00:01[K     |█▏                              | 71kB 17.9MB/s eta 0:00:01[K     |█▎                              | 81kB 19.2MB/s eta 0:00:01[K     |█▌                              | 92kB 18.0MB/s eta 0:00:01[K     |█▋                              | 102kB 18.3MB/s eta 0:00:01[K     |█▉                              | 112kB 18.3MB/s eta 0:00:01[K     |██                              | 

In [3]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertModel, DistilBertConfig



referring to this article: https://towardsdatascience.com/hugging-face-transformers-fine-tuning-distilbert-for-binary-classification-tasks-490f1d192379

Read the data in

In [4]:
lav_path = '/content/gdrive/MyDrive/W266Project_Lav_Shalz/train-balanced-sarcasm.csv'
shalz_path = '/content/gdrive/MyDrive/Colab Notebooks/train-balanced-sarcasm.csv'

In [5]:
df = pd.read_csv(shalz_path)

In [6]:
df.shape

(1010826, 10)

In [7]:
df.isna().sum()

label              0
comment           53
author             0
subreddit          0
score              0
ups                0
downs              0
date               0
created_utc        0
parent_comment     0
dtype: int64

In [8]:
df = df[df['comment'].notna()]
df.isna().sum()

label             0
comment           0
author            0
subreddit         0
score             0
ups               0
downs             0
date              0
created_utc       0
parent_comment    0
dtype: int64

In [9]:
# check label distribution after removing NA
df['label'].value_counts()

0    505405
1    505368
Name: label, dtype: int64

In this notebook, we want to do hyperparameter tuning in order to improve our model. This means that unlike before, we want to be able to use all of the data we have to build the model and then tune the parameters.

In [None]:
# # select a fraction of the data
# s0 = df.label[df.label.eq(0)].sample(505368).index
# s1 = df.label[df.label.eq(1)].sample(505368).index 

# df = df.loc[s0.union(s1)]
# df

In [10]:
# tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
# model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
# model.layers


tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [11]:
# check what max length should be based on the sentence lengths in the full data

comment_lengths = list(df['comment'].str.split().apply(len))
parent_comment_lengths = list(df['parent_comment'].str.split().apply(len))
total_comment_lengths = [a + b for a, b in zip(comment_lengths, parent_comment_lengths)]


In [37]:
print("Comment Length Distribution")
print(min(comment_lengths))
print(np.percentile(comment_lengths, [25, 50, 75]))
print(max(comment_lengths))


print("Parent Comment Length Distribution")
print(min(parent_comment_lengths))
print(np.percentile(parent_comment_lengths, [25, 50, 75]))
print(max(parent_comment_lengths))


print("Total Comment Length Distribution")
print(min(total_comment_lengths))
print(np.percentile(total_comment_lengths, [25, 50, 75]))
print(max(total_comment_lengths))




Comment Length Distribution
1
[ 5.  9. 14.]
2222
Parent Comment Length Distribution
1
[ 8. 14. 26.]
4198
Total Comment Length Distribution
2
[16. 24. 40.]
4444


We have some sentences that are very long, but most of the data (75% percentile) is below 50 so we will use this as our max_length value.

Trying out the tokenizer in order to the two methods we want to try out: 

`Approach A: [CLS] [comment] [SEP] [Masking]
id: 0`


`Approach B: [CLS] [parent_comment] [SEP] [comment] [SEP] [Masking]`


In [18]:
# understanding the tokenizer
temp_sentence = df["comment"][10]
temp_parent_comment = df["parent_comment"][10]
print(temp_sentence)
print(temp_parent_comment)
temp_tokens = tokenizer.tokenize(temp_sentence)
print(temp_tokens)
print(temp_parent_comment)

I think a significant amount would be against spending their tax dollars on other people.
I bet if that money was poured into college debt or health debt relief, 81% of Americans would have been for it instead.
['i', 'think', 'a', 'significant', 'amount', 'would', 'be', 'against', 'spending', 'their', 'tax', 'dollars', 'on', 'other', 'people', '.']
I bet if that money was poured into college debt or health debt relief, 81% of Americans would have been for it instead.


In [49]:
inputs = tokenizer(temp_sentence,
          padding = 'max_length', max_length = 50, truncation = True)

inputs
# 101 at the beginning is the CLS token
# 102 in between comment and parent comment is SEP token
# 0 is padding based on the max_length

{'input_ids': [101, 1045, 2228, 1037, 3278, 3815, 2052, 2022, 2114, 5938, 2037, 4171, 6363, 2006, 2060, 2111, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [50]:
encoded_sequence = inputs["input_ids"]
encoded_sequence
decoded_sequence = tokenizer.decode(encoded_sequence)
decoded_sequence

'[CLS] i think a significant amount would be against spending their tax dollars on other people. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [51]:
inputs = tokenizer([[temp_sentence, temp_parent_comment]],
          padding = 'max_length', max_length = 50, truncation = True)
inputs

{'input_ids': [[101, 1045, 2228, 1037, 3278, 3815, 2052, 2022, 2114, 5938, 2037, 4171, 6363, 2006, 2060, 2111, 1012, 102, 1045, 6655, 2065, 2008, 2769, 2001, 8542, 2046, 2267, 7016, 2030, 2740, 7016, 4335, 1010, 6282, 1003, 1997, 4841, 2052, 2031, 2042, 2005, 2009, 2612, 1012, 102, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]}

Diff version of tokenizing that we would do below

In [54]:
MAX_LENGTH = 30
tokenizer.batch_encode_plus(temp_tokens,
                            max_length=MAX_LENGTH,
                            padding='longest', #implements dynamic padding
                            truncation=True,
                            return_attention_mask=True,
                            return_token_type_ids=False
                            )

{'input_ids': [[101, 1045, 102], [101, 2228, 102], [101, 1037, 102], [101, 3278, 102], [101, 3815, 102], [101, 2052, 102], [101, 2022, 102], [101, 2114, 102], [101, 5938, 102], [101, 2037, 102], [101, 4171, 102], [101, 6363, 102], [101, 2006, 102], [101, 2060, 102], [101, 2111, 102], [101, 1012, 102]], 'attention_mask': [[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]]}

In [55]:
tokenizer(temp_sentence,
          padding = 'max_length', 
          max_length = MAX_LENGTH, 
          truncation = True,
          return_attention_mask=True,
          return_token_type_ids=False
          )

{'input_ids': [101, 1045, 2228, 1037, 3278, 3815, 2052, 2022, 2114, 5938, 2037, 4171, 6363, 2006, 2060, 2111, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

Let's split the data in train, val, test and then tokenize all of it

In [35]:
X_train, temp_text, y_train, temp_labels = train_test_split(df['comment'], df['label'], 
                                                                    random_state=0, 
                                                                    test_size=0.3, 
                                                                    stratify=df['label'])

# we will use temp_text and temp_labels to create validation and test set
X_val, X_test, y_val, y_test = train_test_split(temp_text, temp_labels, 
                                                                random_state=0, 
                                                                test_size=0.5, 
                                                                stratify=temp_labels)

In [57]:
MAX_LENGTH = 50

In [58]:
def batch_encode(tokenizer, texts, batch_size=256, max_length=MAX_LENGTH):
    """""""""
    A function that encodes a batch of texts and returns the texts'
    corresponding encodings and attention masks that are ready to be fed 
    into a pre-trained transformer model.
    
    Input:
        - tokenizer:   Tokenizer object from the PreTrainedTokenizer Class
        - texts:       List of strings where each string represents a text
        - batch_size:  Integer controlling number of texts in a batch
        - max_length:  Integer controlling max number of words to tokenize in a given text
    Output:
        - input_ids:       sequence of texts encoded as a tf.Tensor object
        - attention_mask:  the texts' attention mask encoded as a tf.Tensor object
    """""""""
    
    input_ids = []
    attention_mask = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        # inputs = tokenizer.batch_encode_plus(batch,
        #                                      max_length=max_length,
        #                                      padding='longest', #implements dynamic padding
        #                                      truncation=True,
        #                                      return_attention_mask=True,
        #                                      return_token_type_ids=False
        #                                      )

        inputs = tokenizer(batch,
                          padding = 'max_length', 
                          max_length = MAX_LENGTH, 
                          truncation = True,
                          return_attention_mask=True,
                          return_token_type_ids=False
                          )
        
        input_ids.extend(inputs['input_ids'])
        attention_mask.extend(inputs['attention_mask'])
    
    
    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)
    
  

In [59]:
### This cell takes a few minutes to run

# Encode X_train
X_train_ids, X_train_attention = batch_encode(tokenizer, X_train.tolist())

# Encode X_valid
X_val_ids, X_val_attention = batch_encode(tokenizer, X_val.tolist())

# Encode X_test
X_test_ids, X_test_attention = batch_encode(tokenizer, X_test.tolist())

In [60]:
DISTILBERT_DROPOUT = 0.2
DISTILBERT_ATT_DROPOUT = 0.2
 
# Configure DistilBERT's initialization
config = DistilBertConfig(dropout=DISTILBERT_DROPOUT, 
                          attention_dropout=DISTILBERT_ATT_DROPOUT, 
                          output_hidden_states=True)
                          
# The bare, pre-trained DistilBERT transformer model outputting raw hidden-states 
# and without any specific head on top.
distilBERT = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

# Make DistilBERT layers untrainable
for layer in distilBERT.layers:
    layer.trainable = False

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=363423424.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_layer_norm', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Trying to understand the DistilBERT model layers a bit more

In [61]:
distilBERT.layers

[<transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertMainLayer at 0x7f864e154ed0>]

In [62]:
len(distilBERT.layers[0].weights)


100

We have 100 layers in the model, let's look at the first 10.

In [63]:
for layer in range(10):
    print(layer)
    print('Layer name: \t', distilBERT.layers[0].weights[layer].name)
    print('Layer shape: \t', distilBERT.layers[0].weights[layer].shape)


0
Layer name: 	 tf_distil_bert_model/distilbert/embeddings/word_embeddings/weight:0
Layer shape: 	 (30522, 768)
1
Layer name: 	 tf_distil_bert_model/distilbert/embeddings/position_embeddings/embeddings:0
Layer shape: 	 (512, 768)
2
Layer name: 	 tf_distil_bert_model/distilbert/embeddings/LayerNorm/gamma:0
Layer shape: 	 (768,)
3
Layer name: 	 tf_distil_bert_model/distilbert/embeddings/LayerNorm/beta:0
Layer shape: 	 (768,)
4
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._0/attention/q_lin/kernel:0
Layer shape: 	 (768, 768)
5
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._0/attention/q_lin/bias:0
Layer shape: 	 (768,)
6
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._0/attention/k_lin/kernel:0
Layer shape: 	 (768, 768)
7
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._0/attention/k_lin/bias:0
Layer shape: 	 (768,)
8
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._0/attention/v_lin/kernel:0
Layer 

Last 5 layers:

In [64]:
for layer in [99, 98, 97, 96, 95]:
    print(layer)
    print('Layer name: \t', distilBERT.layers[0].weights[layer].name)
    print('Layer shape: \t', distilBERT.layers[0].weights[layer].shape)


99
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._5/output_layer_norm/beta:0
Layer shape: 	 (768,)
98
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._5/output_layer_norm/gamma:0
Layer shape: 	 (768,)
97
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._5/ffn/lin2/bias:0
Layer shape: 	 (768,)
96
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._5/ffn/lin2/kernel:0
Layer shape: 	 (3072, 768)
95
Layer name: 	 tf_distil_bert_model/distilbert/transformer/layer_._5/ffn/lin1/bias:0
Layer shape: 	 (3072,)


We see the embedding layer which maps the token id to a 768 dim vector.
Next is the positional encoding which encodes the 512 BERT input positions. 
Layers 5-10 hold the weights and biases for the first self-attention layer

In [65]:
LAYER_DROPOUT = 0.2
LEARNING_RATE = 5e-5
RANDOM_STATE = 42

def build_model(transformer, max_length=MAX_LENGTH):
    """
    Template for building a model off of the BERT or DistilBERT architecture
    for a binary classification task.
    
    Input:
      - transformer:  a base Hugging Face transformer model object (BERT or DistilBERT)
                      with no added classification head attached.
      - max_length:   integer controlling the maximum number of encoded tokens 
                      in a given sequence.
    
    Output:
      - model:        a compiled tf.keras.Model with added classification layers 
                      on top of the base pre-trained model architecture.
    """
    
    # Define weight initializer with a random seed to ensure reproducibility
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=RANDOM_STATE) 
    
    # Define input layers
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,), 
                                            name='input_ids', 
                                            dtype='int32')
    input_attention_layer = tf.keras.layers.Input(shape=(max_length,), 
                                                  name='input_attention', 
                                                  dtype='int32')
    
    # DistilBERT outputs a tuple where the first element at index 0
    # represents the hidden-state at the output of the model's last layer.
    # It is a tf.Tensor of shape (batch_size, sequence_length, hidden_size=768).
    last_hidden_state = transformer([input_ids_layer, input_attention_layer])[0]
    
    # We only care about DistilBERT's output for the [CLS] token, 
    # which is located at index 0 of every encoded sequence.  
    # Splicing out the [CLS] tokens gives us 2D data.
    cls_token = last_hidden_state[:, 0, :]
    
    ##                                                 ##
    ## Define additional dropout and dense layers here ##
    ##                                                 ##
    
    # Define a single node that makes up the output layer (for binary classification)
    output = tf.keras.layers.Dense(1, 
                                   activation='sigmoid',
                                   kernel_initializer=weight_initializer,  
                                   kernel_constraint=None,
                                   bias_initializer='zeros'
                                   )(cls_token)
    
    # Define the model
    model = tf.keras.Model([input_ids_layer, input_attention_layer], output)
    
    # Compile the model
    model.compile(tf.keras.optimizers.Adam(lr=LEARNING_RATE), 
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    
    return model

In [66]:
model = build_model(distilBERT)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported


In [67]:
EPOCHS = 6
BATCH_SIZE = 64
NUM_STEPS = len(X_train.index) // BATCH_SIZE

# Train the model
train_history1 = model.fit(
    x = [X_train_ids, X_train_attention],
    y = y_train.to_numpy(),
    epochs = EPOCHS,
    batch_size = BATCH_SIZE,
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids, X_val_attention], y_val.to_numpy()),
    verbose=2
)



Epoch 1/6
11055/11055 - 1427s - loss: 0.6525 - accuracy: 0.6149 - val_loss: 0.6288 - val_accuracy: 0.6450
Epoch 2/6
11055/11055 - 1419s - loss: 0.6338 - accuracy: 0.6401 - val_loss: 0.6190 - val_accuracy: 0.6563
Epoch 3/6
11055/11055 - 1419s - loss: 0.6281 - accuracy: 0.6467 - val_loss: 0.6133 - val_accuracy: 0.6615
Epoch 4/6
11055/11055 - 1420s - loss: 0.6248 - accuracy: 0.6498 - val_loss: 0.6102 - val_accuracy: 0.6655
Epoch 5/6
11055/11055 - 1420s - loss: 0.6229 - accuracy: 0.6518 - val_loss: 0.6075 - val_accuracy: 0.6677
Epoch 6/6
11055/11055 - 1420s - loss: 0.6214 - accuracy: 0.6537 - val_loss: 0.6056 - val_accuracy: 0.6689


In [69]:
pd.DataFrame.from_dict(train_history1.history).to_csv('history1.csv',index=False)


In [70]:

FT_EPOCHS = 4
BATCH_SIZE = 64
NUM_STEPS = len(X_train.index)



# Unfreeze distilBERT layers and make available for training
for layer in distilBERT.layers:
    layer.trainable = True
    
# Recompile model after unfreezing
model.compile(optimizer=tf.keras.optimizers.Adam(lr=2e-5), 
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
train_history2 = model.fit(
    x = [X_train_ids, X_train_attention],
    y = y_train.to_numpy(),
    epochs = FT_EPOCHS,
    batch_size = BATCH_SIZE,
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids, X_val_attention], y_val.to_numpy()),
    verbose=2
)

Epoch 1/4


KeyboardInterrupt: ignored

In [71]:
tf.keras.backend.clear_session()


## Adding additional layers

In [72]:
# resetting some of the global parameters here
LAYER_DROPOUT = 0.2
LEARNING_RATE = 5e-5
RANDOM_STATE = 42
DISTILBERT_DROPOUT = 0.2
DISTILBERT_ATT_DROPOUT = 0.2
MAX_LENGTH = 40
L2REG = 0.01

In [73]:
def build_model2(transformer, max_length=MAX_LENGTH, l2reg = L2REG, dropout_rate = LAYER_DROPOUT):
    """
    Template for building a model off of the BERT or DistilBERT architecture
    for a binary classification task.
    
    Input:
      - transformer:  a base Hugging Face transformer model object (BERT or DistilBERT)
                      with no added classification head attached.
      - max_length:   integer controlling the maximum number of encoded tokens 
                      in a given sequence.
    
    Output:
      - model:        a compiled tf.keras.Model with added classification layers 
                      on top of the base pre-trained model architecture.
    """
    
    # Define weight initializer with a random seed to ensure reproducibility
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=RANDOM_STATE) 
    
    # Define input layers
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,), 
                                            name='input_ids', 
                                            dtype='int32')
    input_attention_layer = tf.keras.layers.Input(shape=(max_length,), 
                                                  name='input_attention', 
                                                  dtype='int32')
    
    # DistilBERT outputs a tuple where the first element at index 0
    # represents the hidden-state at the output of the model's last layer.
    # It is a tf.Tensor of shape (batch_size, sequence_length, hidden_size=768).
    last_hidden_state = transformer([input_ids_layer, input_attention_layer])[0]
    
    # We only care about DistilBERT's output for the [CLS] token, 
    # which is located at index 0 of every encoded sequence.  
    # Splicing out the [CLS] tokens gives us 2D data.
    cls_token = last_hidden_state[:, 0, :]
    
    ##                                                 ##
    ## Define additional dropout and dense layers here ##
    ##                                                 ##
    dense = tf.keras.layers.Dense(512, activation='relu', kernel_regularizer= tf.keras.regularizers.l2(l2reg))(cls_token)
    dropout= tf.keras.layers.Dropout(dropout_rate)(dense)
    
    # Define a single node that makes up the output layer (for binary classification)
    output = tf.keras.layers.Dense(1, 
                                   activation='sigmoid',
                                   kernel_initializer=weight_initializer,  
                                   kernel_constraint=None,
                                   bias_initializer='zeros'
                                   )(dropout)
    
    # Define the model
    model = tf.keras.Model([input_ids_layer, input_attention_layer], output)
    
    # Compile the model
    model.compile(tf.keras.optimizers.Adam(lr=LEARNING_RATE), 
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    
    return model


In [74]:
model2 = build_model2(distilBERT)



In [75]:
EPOCHS = 6
BATCH_SIZE = 64
NUM_STEPS = len(X_train.index) // BATCH_SIZE


In [None]:
# Train the model
train_history3 = model2.fit(
    x = [X_train_ids, X_train_attention],
    y = y_train.to_numpy(),
    epochs = EPOCHS,
    batch_size = BATCH_SIZE,
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids, X_val_attention], y_val.to_numpy()),
    verbose=2
)

Epoch 1/6
11055/11055 - 3578s - loss: 0.7288 - accuracy: 0.7964 - val_loss: 0.5064 - val_accuracy: 0.7680
Epoch 2/6
11055/11055 - 3573s - loss: 0.4023 - accuracy: 0.8194 - val_loss: 0.5135 - val_accuracy: 0.7654
Epoch 3/6
11055/11055 - 3572s - loss: 0.3594 - accuracy: 0.8423 - val_loss: 0.5569 - val_accuracy: 0.7602
Epoch 4/6


In [1]:
pd.DataFrame.from_dict(train_history3.history).to_csv('history2.csv',index=False)

from google.colab import files
files.download('history2.csv') 

NameError: ignored

In [None]:
# try with diff hyper parameters






## Add the parent comment

In [None]:
tf.keras.backend.clear_session()


In [12]:
X_train_p, temp_text, y_train_p, temp_labels = train_test_split(df[['comment', 'parent_comment']], df['label'], 
                                                                    random_state=0, 
                                                                    test_size=0.3, 
                                                                    stratify=df['label'])

# we will use temp_text and temp_labels to create validation and test set
X_val_p, X_test_p, y_val_p, y_test_p = train_test_split(temp_text, temp_labels, 
                                                                random_state=0, 
                                                                test_size=0.5, 
                                                                stratify=temp_labels)

In [13]:
X_train_p.head()

Unnamed: 0,comment,parent_comment
715875,They do in neutral!,The only main thing you'll notice (vs driving ...
348688,But but... He was on Howard Stern voting for t...,Trump was against the war from the very beginn...
323243,"Kvothe, is that you?",Now imagine a hoodie with a great number of po...
56970,"Never ""meta"" in pro scene so it must be a shit...",What's wrong with scarab?
267644,I'm almost shocked not to see TJ McConnell here,Best and Worst Catch &amp; Shooters


In [16]:
MAX_LENGTH = 30

In [29]:
# try with parent comment connected
def batch_encode_parent(tokenizer, texts, parent, batch_size=256, max_length=MAX_LENGTH):
    """""""""
    A function that encodes a batch of texts and returns the texts'
    corresponding encodings and attention masks that are ready to be fed 
    into a pre-trained transformer model.
    
    Input:
        - tokenizer:   Tokenizer object from the PreTrainedTokenizer Class
        - texts:       List of strings where each string represents a text
        - batch_size:  Integer controlling number of texts in a batch
        - max_length:  Integer controlling max number of words to tokenize in a given text
    Output:
        - input_ids:       sequence of texts encoded as a tf.Tensor object
        - attention_mask:  the texts' attention mask encoded as a tf.Tensor object
    """""""""
    
    input_ids = []
    attention_mask = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        parent_batch = parent[i:i+batch_size]

        combined = [list(i) for i in zip(parent_batch, batch)]


        inputs = tokenizer(combined,
                          padding = 'max_length', 
                          max_length = MAX_LENGTH, 
                          truncation = True,
                          return_attention_mask=True,
                          return_token_type_ids=False
                          )
        
        input_ids.extend(inputs['input_ids'])
        attention_mask.extend(inputs['attention_mask'])
    
    
    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)
    
  

In [28]:
# temp1 = X_train_p["comment"].head(2).tolist()
# temp2 = X_train_p["parent_comment"].head(2).tolist()

# [[temp2, temp1]]

# print(temp1)
# print(temp2)

# # zip(temp1, temp2)


# [list(i) for i in zip(temp1, temp2)]


# # batch_encode_parent(tokenizer, temp1, temp2)

['They do in neutral!', "But but... He was on Howard Stern voting for the war and that's even worse than if he were a sitting senator doing the same... Because reasons..."]
["The only main thing you'll notice (vs driving a car with automatic transmission) is that the car will slow down a lot more quickly than an automatic would, manuals generally dont coast as freely.", "Trump was against the war from the very beginning too. Hillary literally voted for it. It is always such doublethink with the left it's unreal."]


[['They do in neutral!',
  "The only main thing you'll notice (vs driving a car with automatic transmission) is that the car will slow down a lot more quickly than an automatic would, manuals generally dont coast as freely."],
 ["But but... He was on Howard Stern voting for the war and that's even worse than if he were a sitting senator doing the same... Because reasons...",
  "Trump was against the war from the very beginning too. Hillary literally voted for it. It is always such doublethink with the left it's unreal."]]

In [30]:
### This cell takes a few minutes to run

# Encode X_train
X_train_ids_p, X_train_attention_p = batch_encode_parent(tokenizer, 
                                                  X_train_p["comment"].tolist(), 
                                                  X_train_p["parent_comment"].tolist())

# Encode X_valid
X_val_ids_p, X_val_attention_p = batch_encode_parent(tokenizer, 
                                              X_val_p["comment"].tolist(), 
                                              X_val_p["parent_comment"].tolist())

# Encode X_test
X_test_ids_p, X_test_attention_p = batch_encode_parent(tokenizer, 
                                                X_test_p["comment"].tolist(), 
                                                X_test_p["parent_comment"].tolist())


In [31]:
DISTILBERT_DROPOUT = 0.2
DISTILBERT_ATT_DROPOUT = 0.2
 
# Configure DistilBERT's initialization
config = DistilBertConfig(dropout=DISTILBERT_DROPOUT, 
                          attention_dropout=DISTILBERT_ATT_DROPOUT, 
                          output_hidden_states=True)
                          
# The bare, pre-trained DistilBERT transformer model outputting raw hidden-states 
# and without any specific head on top.
distilBERT = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

# Make DistilBERT layers untrainable
for layer in distilBERT.layers:
    layer.trainable = False


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=363423424.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'activation_13', 'vocab_projector', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [32]:
LAYER_DROPOUT = 0.2
LEARNING_RATE = 5e-5
RANDOM_STATE = 42

def build_model_parent(transformer, max_length=MAX_LENGTH):
    """
    Template for building a model off of the BERT or DistilBERT architecture
    for a binary classification task.
    
    Input:
      - transformer:  a base Hugging Face transformer model object (BERT or DistilBERT)
                      with no added classification head attached.
      - max_length:   integer controlling the maximum number of encoded tokens 
                      in a given sequence.
    
    Output:
      - model:        a compiled tf.keras.Model with added classification layers 
                      on top of the base pre-trained model architecture.
    """
    
    # Define weight initializer with a random seed to ensure reproducibility
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=RANDOM_STATE) 
    
    # Define input layers
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,), 
                                            name='input_ids', 
                                            dtype='int32')
    input_attention_layer = tf.keras.layers.Input(shape=(max_length,), 
                                                  name='input_attention', 
                                                  dtype='int32')
    
    # DistilBERT outputs a tuple where the first element at index 0
    # represents the hidden-state at the output of the model's last layer.
    # It is a tf.Tensor of shape (batch_size, sequence_length, hidden_size=768).
    last_hidden_state = transformer([input_ids_layer, input_attention_layer])[0]
    
    # We only care about DistilBERT's output for the [CLS] token, 
    # which is located at index 0 of every encoded sequence.  
    # Splicing out the [CLS] tokens gives us 2D data.
    cls_token = last_hidden_state[:, 0, :]
    
    ##                                                 ##
    ## Define additional dropout and dense layers here ##
    ##                                                 ##
    
    # Define a single node that makes up the output layer (for binary classification)
    output = tf.keras.layers.Dense(1, 
                                   activation='sigmoid',
                                   kernel_initializer=weight_initializer,  
                                   kernel_constraint=None,
                                   bias_initializer='zeros'
                                   )(cls_token)
    
    # Define the model
    model = tf.keras.Model([input_ids_layer, input_attention_layer], output)
    
    # Compile the model
    model.compile(tf.keras.optimizers.Adam(lr=LEARNING_RATE), 
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    
    return model


In [33]:
model_parent = build_model_parent(distilBERT)


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported


In [36]:
EPOCHS = 6
BATCH_SIZE = 64
NUM_STEPS = len(X_train.index) // BATCH_SIZE

# Train the model
train_history_parent1 = model_parent.fit(
    x = [X_train_ids_p, X_train_attention_p],
    y = y_train_p.to_numpy(),
    epochs = EPOCHS,
    batch_size = BATCH_SIZE,
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids_p, X_val_attention_p], y_val_p.to_numpy()),
    verbose=2
)

Epoch 1/6
11055/11055 - 890s - loss: 0.6686 - accuracy: 0.5892 - val_loss: 0.6557 - val_accuracy: 0.6136
Epoch 2/6
11055/11055 - 882s - loss: 0.6567 - accuracy: 0.6106 - val_loss: 0.6505 - val_accuracy: 0.6204
Epoch 3/6
11055/11055 - 882s - loss: 0.6529 - accuracy: 0.6159 - val_loss: 0.6475 - val_accuracy: 0.6238
Epoch 4/6
11055/11055 - 881s - loss: 0.6511 - accuracy: 0.6187 - val_loss: 0.6454 - val_accuracy: 0.6263
Epoch 5/6
11055/11055 - 880s - loss: 0.6499 - accuracy: 0.6196 - val_loss: 0.6438 - val_accuracy: 0.6274
Epoch 6/6
11055/11055 - 880s - loss: 0.6492 - accuracy: 0.6209 - val_loss: 0.6428 - val_accuracy: 0.6284


In [37]:
pd.DataFrame.from_dict(train_history_parent1.history).to_csv('parenthistory1.csv',index=False)


In [38]:
FT_EPOCHS = 4
BATCH_SIZE = 64
NUM_STEPS = len(X_train.index)



# Unfreeze distilBERT layers and make available for training
for layer in distilBERT.layers:
    layer.trainable = True
    
# Recompile model after unfreezing
model_parent.compile(optimizer=tf.keras.optimizers.Adam(lr=2e-5), 
              loss='binary_crossentropy',
              metrics=['accuracy'])



# Train the model
train_history_parent2 = model_parent.fit(
    x = [X_train_ids_p, X_train_attention_p],
    y = y_train_p.to_numpy(),
    epochs = FT_EPOCHS,
    batch_size = BATCH_SIZE,
    
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids_p, X_val_attention_p], y_val_p.to_numpy()),
    verbose=2
)

Epoch 1/4
707541/707541 - 8796s - loss: 0.4690 - accuracy: 0.7736 - val_loss: 0.5134 - val_accuracy: 0.7647


In [39]:
pd.DataFrame.from_dict(train_history_parent2.history).to_csv('train_history_parent2.csv',index=False)


In [40]:
# maybe change hyperparameters
FT_EPOCHS = 4
BATCH_SIZE = 32
NUM_STEPS = len(X_train.index)



# Unfreeze distilBERT layers and make available for training
for layer in distilBERT.layers:
    layer.trainable = True
    
# Recompile model after unfreezing
model_parent.compile(optimizer=tf.keras.optimizers.Adam(lr=2e-3), 
              loss='binary_crossentropy',
              metrics=['accuracy'])



# Train the model
train_history_parent3 = model_parent.fit(
    x = [X_train_ids_p, X_train_attention_p],
    y = y_train_p.to_numpy(),
    epochs = FT_EPOCHS,
    batch_size = BATCH_SIZE,
    
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids_p, X_val_attention_p], y_val_p.to_numpy()),
    verbose=2
)


Epoch 1/4
707541/707541 - 9831s - loss: 0.6940 - accuracy: 0.5000 - val_loss: 0.6932 - val_accuracy: 0.5000


In [None]:
model = build_model(distilBERT)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported


In [None]:
EPOCHS = 6
BATCH_SIZE = 64
NUM_STEPS = len(X_train.index) // BATCH_SIZE

# Train the model
train_history1 = model.fit(
    x = [X_train_ids, X_train_attention],
    y = y_train.to_numpy(),
    epochs = EPOCHS,
    batch_size = BATCH_SIZE,
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids, X_val_attention], y_val.to_numpy()),
    verbose=2
)

Epoch 1/6
11055/11055 - 869s - loss: 0.6538 - accuracy: 0.6120 - val_loss: 0.6299 - val_accuracy: 0.6437
Epoch 2/6
11055/11055 - 861s - loss: 0.6352 - accuracy: 0.6382 - val_loss: 0.6207 - val_accuracy: 0.6545
Epoch 3/6
11055/11055 - 861s - loss: 0.6301 - accuracy: 0.6442 - val_loss: 0.6157 - val_accuracy: 0.6595
Epoch 4/6
11055/11055 - 861s - loss: 0.6274 - accuracy: 0.6468 - val_loss: 0.6127 - val_accuracy: 0.6627
Epoch 5/6
11055/11055 - 861s - loss: 0.6253 - accuracy: 0.6492 - val_loss: 0.6103 - val_accuracy: 0.6650
Epoch 6/6
11055/11055 - 862s - loss: 0.6240 - accuracy: 0.6513 - val_loss: 0.6083 - val_accuracy: 0.6658


In [None]:
FT_EPOCHS = 4
BATCH_SIZE = 64
NUM_STEPS = len(X_train.index)

# Unfreeze distilBERT layers and make available for training
for layer in distilBERT.layers:
    layer.trainable = True
    
# Recompile model after unfreezing
model.compile(optimizer=tf.keras.optimizers.Adam(lr=2e-5), 
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
train_history2 = model.fit(
    x = [X_train_ids, X_train_attention],
    y = y_train.to_numpy(),
    epochs = FT_EPOCHS,
    batch_size = BATCH_SIZE,
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids, X_val_attention], y_val.to_numpy()),
    verbose=2
)

Epoch 1/4
707541/707541 - 8453s - loss: 0.4618 - accuracy: 0.7783 - val_loss: 0.5063 - val_accuracy: 0.7696


All of the above code was run the full data using random parameters. We get a training accuracy of 77.83% and a validation accuracy of 76.96%, which are good signs we are not overfitting.

Next, let's try changing the model architecture slightly (adding some dense layers) and hyperparameter tuning the model. Based on some research, the parameters that lead to the biggest change in accuracy are: learning rate, dropout, and batch size so I am going to focus on these parameters. It may also be good to try adding more dense/dropout layers to the model.

In [None]:
tf.keras.backend.clear_session()

## Add additional layers

In [None]:
# resetting some of the global parameters here
LAYER_DROPOUT = 0.2
LEARNING_RATE = 5e-5
RANDOM_STATE = 42
DISTILBERT_DROPOUT = 0.2
DISTILBERT_ATT_DROPOUT = 0.2
MAX_LENGTH = 30
L2REG = 0.01

In [None]:
def build_model2(transformer, max_length=MAX_LENGTH, l2reg = L2REG, dropout_rate = LAYER_DROPOUT):
    """
    Template for building a model off of the BERT or DistilBERT architecture
    for a binary classification task.
    
    Input:
      - transformer:  a base Hugging Face transformer model object (BERT or DistilBERT)
                      with no added classification head attached.
      - max_length:   integer controlling the maximum number of encoded tokens 
                      in a given sequence.
    
    Output:
      - model:        a compiled tf.keras.Model with added classification layers 
                      on top of the base pre-trained model architecture.
    """
    
    # Define weight initializer with a random seed to ensure reproducibility
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=RANDOM_STATE) 
    
    # Define input layers
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,), 
                                            name='input_ids', 
                                            dtype='int32')
    input_attention_layer = tf.keras.layers.Input(shape=(max_length,), 
                                                  name='input_attention', 
                                                  dtype='int32')
    
    # DistilBERT outputs a tuple where the first element at index 0
    # represents the hidden-state at the output of the model's last layer.
    # It is a tf.Tensor of shape (batch_size, sequence_length, hidden_size=768).
    last_hidden_state = transformer([input_ids_layer, input_attention_layer])[0]
    
    # We only care about DistilBERT's output for the [CLS] token, 
    # which is located at index 0 of every encoded sequence.  
    # Splicing out the [CLS] tokens gives us 2D data.
    cls_token = last_hidden_state[:, 0, :]
    
    ##                                                 ##
    ## Define additional dropout and dense layers here ##
    ##                                                 ##
    dense = tf.keras.layers.Dense(512, activation='relu', kernel_regularizer= tf.keras.regularizers.l2(l2reg))(cls_token)
    dropout= tf.keras.layers.Dropout(dropout_rate)(dense)
    
    # Define a single node that makes up the output layer (for binary classification)
    output = tf.keras.layers.Dense(1, 
                                   activation='sigmoid',
                                   kernel_initializer=weight_initializer,  
                                   kernel_constraint=None,
                                   bias_initializer='zeros'
                                   )(dropout)
    
    # Define the model
    model = tf.keras.Model([input_ids_layer, input_attention_layer], output)
    
    # Compile the model
    model.compile(tf.keras.optimizers.Adam(lr=LEARNING_RATE), 
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    
    return model



    # inps = Input(shape = (max_len,), dtype='int64')
    # masks= Input(shape = (max_len,), dtype='int64')
    # dbert_layer = dbert_model(inps, attention_mask=masks)[0][:,0,:]
    # dense = Dense(512,activation='relu',kernel_regularizer=regularizers.l2(0.01))(dbert_layer)
    # dropout= Dropout(0.5)(dense)
    # pred = Dense(num_classes, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)
    # model = tf.keras.Model(inputs=[inps,masks], outputs=pred)
    # print(model.summary())
    # return model

In [None]:
model2 = build_model2(distilBERT)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported


In [None]:
EPOCHS = 6
BATCH_SIZE = 64
NUM_STEPS = len(X_train.index) // BATCH_SIZE


In [None]:

# Train the model
train_history3 = model2.fit(
    x = [X_train_ids, X_train_attention],
    y = y_train.to_numpy(),
    epochs = EPOCHS,
    batch_size = BATCH_SIZE,
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids, X_val_attention], y_val.to_numpy()),
    verbose=2
)

Epoch 1/6
11055/11055 - 2234s - loss: 0.7305 - accuracy: 0.8047 - val_loss: 0.5110 - val_accuracy: 0.7629
Epoch 2/6
11055/11055 - 2226s - loss: 0.3899 - accuracy: 0.8259 - val_loss: 0.5051 - val_accuracy: 0.7584
Epoch 3/6
11055/11055 - 2226s - loss: 0.3469 - accuracy: 0.8492 - val_loss: 0.5824 - val_accuracy: 0.7495
Epoch 4/6
11055/11055 - 2223s - loss: 0.3076 - accuracy: 0.8692 - val_loss: 0.5959 - val_accuracy: 0.7534
Epoch 5/6


with additional dense + dropout layers


11055/11055 - 2234s - loss: 0.7305 - accuracy: 0.8047 - val_loss: 0.5110 - val_accuracy: 0.7629
Epoch 2/6
11055/11055 - 2226s - loss: 0.3899 - accuracy: 0.8259 - val_loss: 0.5051 - val_accuracy: 0.7584
Epoch 3/6
11055/11055 - 2226s - loss: 0.3469 - accuracy: 0.8492 - val_loss: 0.5824 - val_accuracy: 0.7495
Epoch 4/6
11055/11055 - 2223s - loss: 0.3076 - accuracy: 0.8692 - val_loss: 0.5959 - val_accuracy: 0.7534
Epoch 5/6

In [None]:
FT_EPOCHS = 10

In [None]:
# Unfreeze distilBERT layers and make available for training
for layer in distilBERT.layers:
    layer.trainable = True
    
# Recompile model after unfreezing
model2.compile(optimizer=tf.keras.optimizers.Adam(lr=2e-5), 
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
train_history4 = model2.fit(
    x = [X_train_ids, X_train_attention],
    y = y_train.to_numpy(),
    epochs = FT_EPOCHS,
    batch_size = BATCH_SIZE,
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids, X_val_attention], y_val.to_numpy()),
    verbose=2
)

Epoch 1/10
11055/11055 - 2283s - loss: 0.3801 - accuracy: 0.8299 - val_loss: 0.5387 - val_accuracy: 0.7676
Epoch 2/10
11055/11055 - 2290s - loss: 0.3486 - accuracy: 0.8474 - val_loss: 0.5328 - val_accuracy: 0.7625
Epoch 3/10
11055/11055 - 2283s - loss: 0.3183 - accuracy: 0.8637 - val_loss: 0.5681 - val_accuracy: 0.7627
Epoch 4/10
11055/11055 - 2287s - loss: 0.2878 - accuracy: 0.8786 - val_loss: 0.5962 - val_accuracy: 0.7601
Epoch 5/10
11055/11055 - 2280s - loss: 0.2621 - accuracy: 0.8912 - val_loss: 0.6502 - val_accuracy: 0.7570
Epoch 6/10
11055/11055 - 2286s - loss: 0.2371 - accuracy: 0.9028 - val_loss: 0.6628 - val_accuracy: 0.7553
Epoch 7/10
11055/11055 - 2286s - loss: 0.2152 - accuracy: 0.9131 - val_loss: 0.7298 - val_accuracy: 0.7532
Epoch 8/10
11055/11055 - 2287s - loss: 0.1963 - accuracy: 0.9218 - val_loss: 0.7284 - val_accuracy: 0.7539
Epoch 9/10
11055/11055 - 2285s - loss: 0.1788 - accuracy: 0.9290 - val_loss: 0.7987 - val_accuracy: 0.7509
Epoch 10/10
11055/11055 - 2283s - los

In [None]:
model2.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 30)]         0                                            
__________________________________________________________________________________________________
input_attention (InputLayer)    [(None, 30)]         0                                            
__________________________________________________________________________________________________
tf_distil_bert_model (TFDistilB TFBaseModelOutput(la 66362880    input_ids[0][0]                  
                                                                 input_attention[0][0]            
__________________________________________________________________________________________________
tf.__operators__.getitem (Slici (None, 768)          0           tf_distil_bert_model[0][7]   

In [None]:
# save this model
# !mkdir -p distilbert

In [None]:
import torch
torch.save(model2, 'model2_saved')

# saved_model = torch.load('path/to/model')

NameError: ignored

In [None]:
# hyper parameter values to try
lr_vals = [5e-7, 5e-6, 5e-5, 5e-4, 5e-3, 5e-2, 5e-1] 
EPOCHS = 6
BATCH_SIZE = 64
NUM_STEPS = len(X_train.index) // BATCH_SIZE


LEARNING_RATE_TUNE0 = lr_vals[0]

def build_model_tune0(transformer, max_length=MAX_LENGTH, l2reg = L2REG, dropout_rate = LAYER_DROPOUT, lr = LEARNING_RATE_TUNE0):
    """
    Template for building a model off of the BERT or DistilBERT architecture
    for a binary classification task.
    
    Input:
      - transformer:  a base Hugging Face transformer model object (BERT or DistilBERT)
                      with no added classification head attached.
      - max_length:   integer controlling the maximum number of encoded tokens 
                      in a given sequence.
    
    Output:
      - model:        a compiled tf.keras.Model with added classification layers 
                      on top of the base pre-trained model architecture.
    """
    
    # Define weight initializer with a random seed to ensure reproducibility
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=RANDOM_STATE) 
    
    # Define input layers
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,), 
                                            name='input_ids', 
                                            dtype='int32')
    input_attention_layer = tf.keras.layers.Input(shape=(max_length,), 
                                                  name='input_attention', 
                                                  dtype='int32')
    
    # DistilBERT outputs a tuple where the first element at index 0
    # represents the hidden-state at the output of the model's last layer.
    # It is a tf.Tensor of shape (batch_size, sequence_length, hidden_size=768).
    last_hidden_state = transformer([input_ids_layer, input_attention_layer])[0]
    
    # We only care about DistilBERT's output for the [CLS] token, 
    # which is located at index 0 of every encoded sequence.  
    # Splicing out the [CLS] tokens gives us 2D data.
    cls_token = last_hidden_state[:, 0, :]
    
    ##                                                 ##
    ## Define additional dropout and dense layers here ##
    ##                                                 ##
    dense = tf.keras.layers.Dense(512, activation='relu', kernel_regularizer= tf.keras.regularizers.l2(l2reg))(cls_token)
    dropout= tf.keras.layers.Dropout(dropout_rate)(dense)
    
    # Define a single node that makes up the output layer (for binary classification)
    output = tf.keras.layers.Dense(1, 
                                   activation='sigmoid',
                                   kernel_initializer=weight_initializer,  
                                   kernel_constraint=None,
                                   bias_initializer='zeros'
                                   )(dropout)
    
    # Define the model
    model = tf.keras.Model([input_ids_layer, input_attention_layer], output)
    
    # Compile the model
    model.compile(tf.keras.optimizers.Adam(lr=lr), 
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    
    return model


In [None]:
hyperparam_models = []
train_histories = []

for lr in lr_vals:
  model3 = build_model_tune0(distilBERT, max_length=MAX_LENGTH, l2reg = L2REG, dropout_rate = LAYER_DROPOUT, lr = lr)

  train_history3 = model3.fit(
    x = [X_train_ids, X_train_attention],
    y = y_train.to_numpy(),
    epochs = EPOCHS,
    batch_size = BATCH_SIZE,
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids, X_val_attention], y_val.to_numpy()),
    verbose=2
)
  
  hyperparam_models.append(model3)
  train_histories.append(train_history3)

  print("Learning Rate: ", lr)
  print()
  print(model3.summary())




Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported

Epoch 1/6
11055/11055 - 871s - loss: 6.2524 - accuracy: 0.5203 - val_loss: 5.6675 - val_accuracy: 0.5885
Epoch 2/6
11055/11055 - 867s - loss: 5.1588 - accuracy: 0.5595 - val_loss: 4.6573 - val_accuracy: 0.6148
Epoch 3/6
11055/11055 - 873s - loss: 4.2299 - accuracy: 0.5814 - val_loss: 3.8060 - val_accuracy: 0.6243
Epoch 4/6
11055/11055 - 872s - loss: 3

In [None]:
# had to rerun some of them
lr_vals = [5e-4, 5e-3, 5e-2, 5e-1]


hyperparam_models1 = []
train_histories1 = []

for lr in lr_vals:
  model3 = build_model_tune0(distilBERT, max_length=MAX_LENGTH, l2reg = L2REG, dropout_rate = LAYER_DROPOUT, lr = lr)

  train_history3 = model3.fit(
    x = [X_train_ids, X_train_attention],
    y = y_train.to_numpy(),
    epochs = EPOCHS,
    batch_size = BATCH_SIZE,
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids, X_val_attention], y_val.to_numpy()),
    verbose=2
)
  
  hyperparam_models1.append(model3)
  train_histories1.append(train_history3)

  print("Learning Rate: ", lr)
  print()
  print(model3.summary())



# which learning rate was the best? 


In [None]:
# add other metrics - accuracy, precision, recall, f1
