#### Prepared by: Sneha Kumar


# Mount Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd "/content/drive/MyDrive/cs4248-project/DistilBERT"

/content/drive/.shortcut-targets-by-id/1xEJE2h7GM3zlZE8uzzEx5XTAwZaOR0Kb/cs4248-project/Saved Items


#Install & Import Libraries

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.1-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.1/311.1 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
# import libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertModel

# Prepare Dataset

In [None]:
X_train_valid_directory = "../Datasets/clean_data/X_train_dev.csv"
y_train_valid_directory = "../Datasets/clean_data/y_train_dev.csv"
X_train_dev = pd.read_csv(X_train_valid_directory)
y_train_dev = pd.read_csv(y_train_valid_directory)


In [None]:
num_rows = 10000      #take a subset of data for hyperparameter tuning purposes
rd_X_train_dev = X_train_dev.sample(n=num_rows, random_state=4248)
rd_y_train_dev = y_train_dev.iloc[rd_X_train_dev.index]
X_train, X_valid, y_train, y_valid = train_test_split(rd_X_train_dev['text'], rd_y_train_dev['label'], test_size=0.11, random_state=4248)

In [None]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU used")
else:
    device = torch.device("cpu")
    print("GPU not found")

GPU used


Tokenization with DistilBERT tokenizer

In [None]:
model_name = 'distilbert-base-uncased'

tokenizer = DistilBertTokenizer.from_pretrained(model_name)
max_len = 150

x_train = list(X_train)
x_valid = list(X_valid)

print('Encodings Started')
train_encodings = tokenizer(x_train, truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')
valid_encodings = tokenizer(x_valid, truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')
print("Encodings Created")

train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(list(y_train), dtype=torch.float32))
valid_dataset = TensorDataset(valid_encodings['input_ids'], valid_encodings['attention_mask'], torch.tensor(list(y_valid), dtype=torch.float32))

batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
print("Data Loaded")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Encodings Started
Encodings Created
Data Loaded


# Find Suitable LR

## Base Model


In [None]:
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=1)
model.to(device)

loss_func = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)

epochs = 3

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
for epoch in range(epochs):
    running_loss = 0
    running_f1 = 0
    model.train()
    for step, batch in enumerate(train_loader, 0):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask).logits.view(-1)

        loss = loss_func(outputs, labels)
        running_loss += loss.item()

        f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs.detach()).cpu().numpy()), average = 'binary')
        running_f1 += f1_batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 10 == 9:
            avg_loss = running_loss / 10
            avg_f1 = running_f1 / 10

            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Train Loss: {avg_loss:.4f} | Train F1: {avg_f1:.4f}")
            running_loss = 0.0
            running_f1 = 0.0

            model.eval()
            val_loss = 0.0
            val_f1 = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[2].to(device)

                    outputs = model(input_ids, attention_mask).logits.view(-1)

                    loss = loss_func(outputs, labels)
                    val_loss += loss.item()

                    f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs).cpu().numpy()), average = 'binary')
                    val_f1 += f1_batch

                avg_val_loss = val_loss / len(valid_loader)
                avg_val_f1 = val_f1 / len(valid_loader)

                print(f"Validation Loss: {avg_val_loss:.4f} | Validation F1: {avg_val_f1:.4f}")

        model.train()


Epoch 1/3
Train Loss: 0.6903 | Train F1: 0.4511
Validation Loss: 0.6772 | Validation F1: 0.6644
Epoch 1/3
Train Loss: 0.6635 | Train F1: 0.6013
Validation Loss: 0.6496 | Validation F1: 0.5344
Epoch 1/3
Train Loss: 0.6242 | Train F1: 0.6148
Validation Loss: 0.6402 | Validation F1: 0.5640
Epoch 1/3
Train Loss: 0.6063 | Train F1: 0.6422
Validation Loss: 0.6456 | Validation F1: 0.5248
Epoch 1/3
Train Loss: 0.6286 | Train F1: 0.5779
Validation Loss: 0.6090 | Validation F1: 0.6902
Epoch 1/3
Train Loss: 0.5850 | Train F1: 0.6700
Validation Loss: 0.6040 | Validation F1: 0.6863
Epoch 1/3
Train Loss: 0.5850 | Train F1: 0.6726
Validation Loss: 0.5879 | Validation F1: 0.6710
Epoch 2/3
Train Loss: 0.5241 | Train F1: 0.7311
Validation Loss: 0.6209 | Validation F1: 0.6987
Epoch 2/3
Train Loss: 0.4628 | Train F1: 0.7488
Validation Loss: 0.6520 | Validation F1: 0.7090
Epoch 2/3
Train Loss: 0.5460 | Train F1: 0.6806
Validation Loss: 0.6608 | Validation F1: 0.7164
Epoch 2/3
Train Loss: 0.4981 | Train F1:

In [None]:
for epoch in range(epochs):
    running_loss = 0
    running_f1 = 0
    model.train()
    for step, batch in enumerate(train_loader, 0):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask).logits.view(-1)

        loss = loss_func(outputs, labels)
        running_loss += loss.item()

        f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs.detach()).cpu().numpy()), average = 'binary')
        running_f1 += f1_batch
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % 10 == 9:
            avg_loss = running_loss / 10
            avg_f1 = running_f1 / 10

            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Train Loss: {avg_loss:.4f} | Train F1: {avg_f1:.4f}")
            running_loss = 0.0
            running_f1 = 0.0

            model.eval()
            val_loss = 0.0
            val_f1 = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[2].to(device)

                    outputs = model(input_ids, attention_mask).logits.view(-1)

                    loss = loss_func(outputs, labels)
                    val_loss += loss.item()

                    f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs).cpu().numpy()), average = 'binary')
                    val_f1 += f1_batch

                avg_val_loss = val_loss / len(valid_loader)
                avg_val_f1 = val_f1 / len(valid_loader)

                print(f"Validation Loss: {avg_val_loss:.4f} | Validation F1: {avg_val_f1:.4f}")

        model.train()


Epoch 1/3
Train Loss: 0.6875 | Train F1: 0.4903
Validation Loss: 0.6605 | Validation F1: 0.6590
Epoch 1/3
Train Loss: 0.6302 | Train F1: 0.6524
Validation Loss: 0.6319 | Validation F1: 0.6145
Epoch 1/3
Train Loss: 0.6231 | Train F1: 0.6013
Validation Loss: 0.6274 | Validation F1: 0.6500
Epoch 1/3
Train Loss: 0.6034 | Train F1: 0.6756
Validation Loss: 0.6751 | Validation F1: 0.6650
Epoch 1/3
Train Loss: 0.6414 | Train F1: 0.5900
Validation Loss: 0.6169 | Validation F1: 0.6690
Epoch 1/3
Train Loss: 0.6124 | Train F1: 0.6617
Validation Loss: 0.6097 | Validation F1: 0.6244
Epoch 1/3
Train Loss: 0.6001 | Train F1: 0.6407
Validation Loss: 0.6060 | Validation F1: 0.6373
Epoch 2/3
Train Loss: 0.5136 | Train F1: 0.7489
Validation Loss: 0.6361 | Validation F1: 0.6385
Epoch 2/3
Train Loss: 0.4865 | Train F1: 0.7525
Validation Loss: 0.6112 | Validation F1: 0.6720
Epoch 2/3
Train Loss: 0.4814 | Train F1: 0.7424
Validation Loss: 0.6215 | Validation F1: 0.6408
Epoch 2/3
Train Loss: 0.5107 | Train F1:

## Base Model with Step LR Decay (0.5)

In [None]:
del model
torch.cuda.empty_cache()

In [None]:
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=1)
model.to(device)

loss_func = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.5)

epochs = 3

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
for epoch in range(epochs):
    running_loss = 0
    running_f1 = 0
    model.train()
    for step, batch in enumerate(train_loader, 0):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask).logits.view(-1)

        loss = loss_func(outputs, labels)
        running_loss += loss.item()

        f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs.detach()).cpu().numpy()), average = 'binary')
        running_f1 += f1_batch
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        if step % 10 == 9:
            avg_loss = running_loss / 10
            avg_f1 = running_f1 / 10

            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Train Loss: {avg_loss:.4f} | Train F1: {avg_f1:.4f}")
            running_loss = 0.0
            running_f1 = 0.0

            model.eval()
            val_loss = 0.0
            val_f1 = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[2].to(device)

                    outputs = model(input_ids, attention_mask).logits.view(-1)

                    loss = loss_func(outputs, labels)
                    val_loss += loss.item()

                    f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs).cpu().numpy()), average = 'binary')
                    val_f1 += f1_batch

                avg_val_loss = val_loss / len(valid_loader)
                avg_val_f1 = val_f1 / len(valid_loader)

                print(f"Validation Loss: {avg_val_loss:.4f} | Validation F1: {avg_val_f1:.4f}")

        model.train()


Epoch 1/3
Train Loss: 0.6791 | Train F1: 0.3840
Validation Loss: 0.6629 | Validation F1: 0.6131
Epoch 1/3
Train Loss: 0.6605 | Train F1: 0.5667
Validation Loss: 0.6511 | Validation F1: 0.6626
Epoch 1/3
Train Loss: 0.6323 | Train F1: 0.6176
Validation Loss: 0.6467 | Validation F1: 0.5791
Epoch 1/3
Train Loss: 0.6280 | Train F1: 0.6299
Validation Loss: 0.6390 | Validation F1: 0.6286
Epoch 1/3
Train Loss: 0.6316 | Train F1: 0.6719
Validation Loss: 0.6380 | Validation F1: 0.6498
Epoch 1/3
Train Loss: 0.6254 | Train F1: 0.6591
Validation Loss: 0.6374 | Validation F1: 0.6462
Epoch 1/3
Train Loss: 0.6253 | Train F1: 0.6525
Validation Loss: 0.6371 | Validation F1: 0.6462
Epoch 2/3
Train Loss: 0.6116 | Train F1: 0.6816
Validation Loss: 0.6371 | Validation F1: 0.6456
Epoch 2/3
Train Loss: 0.6075 | Train F1: 0.6804
Validation Loss: 0.6370 | Validation F1: 0.6462
Epoch 2/3
Train Loss: 0.6096 | Train F1: 0.6884
Validation Loss: 0.6370 | Validation F1: 0.6450
Epoch 2/3
Train Loss: 0.6229 | Train F1:

## Base LR (0.6) with Step LR Decay (0.5)

In [None]:
del model
torch.cuda.empty_cache()

In [None]:
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=1)
model.to(device)

loss_func = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.6)

epochs = 3

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
for epoch in range(epochs):
    running_loss = 0
    running_f1 = 0
    model.train()
    for step, batch in enumerate(train_loader, 0):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask).logits.view(-1)

        loss = loss_func(outputs, labels)
        running_loss += loss.item()

        f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs.detach()).cpu().numpy()), average = 'binary')
        running_f1 += f1_batch
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        if step % 10 == 9:
            avg_loss = running_loss / 10
            avg_f1 = running_f1 / 10

            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Train Loss: {avg_loss:.4f} | Train F1: {avg_f1:.4f}")
            running_loss = 0.0
            running_f1 = 0.0

            model.eval()
            val_loss = 0.0
            val_f1 = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[2].to(device)

                    outputs = model(input_ids, attention_mask).logits.view(-1)

                    loss = loss_func(outputs, labels)
                    val_loss += loss.item()

                    f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs).cpu().numpy()), average = 'binary')
                    val_f1 += f1_batch

                avg_val_loss = val_loss / len(valid_loader)
                avg_val_f1 = val_f1 / len(valid_loader)

                print(f"Validation Loss: {avg_val_loss:.4f} | Validation F1: {avg_val_f1:.4f}")

        model.train()


Epoch 1/3
Train Loss: 0.6804 | Train F1: 0.5748
Validation Loss: 0.6621 | Validation F1: 0.6538
Epoch 1/3
Train Loss: 0.6497 | Train F1: 0.6025
Validation Loss: 0.6406 | Validation F1: 0.5889
Epoch 1/3
Train Loss: 0.6229 | Train F1: 0.6178
Validation Loss: 0.6252 | Validation F1: 0.6694
Epoch 1/3
Train Loss: 0.6113 | Train F1: 0.6377
Validation Loss: 0.6225 | Validation F1: 0.6199
Epoch 1/3
Train Loss: 0.6242 | Train F1: 0.6154
Validation Loss: 0.6179 | Validation F1: 0.6533
Epoch 1/3
Train Loss: 0.6129 | Train F1: 0.6599
Validation Loss: 0.6174 | Validation F1: 0.6675
Epoch 1/3
Train Loss: 0.6023 | Train F1: 0.6670
Validation Loss: 0.6170 | Validation F1: 0.6618
Epoch 2/3
Train Loss: 0.5891 | Train F1: 0.6931
Validation Loss: 0.6169 | Validation F1: 0.6635
Epoch 2/3
Train Loss: 0.5812 | Train F1: 0.6919
Validation Loss: 0.6167 | Validation F1: 0.6626
Epoch 2/3
Train Loss: 0.5745 | Train F1: 0.7136
Validation Loss: 0.6167 | Validation F1: 0.6626
Epoch 2/3
Train Loss: 0.5899 | Train F1:

## Base LR (0.7) with Step LR Decay (0.5)

In [None]:
del model
torch.cuda.empty_cache()

In [None]:
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=1)
model.to(device)

loss_func = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.7)

epochs = 3

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
for epoch in range(epochs):
    running_loss = 0
    running_f1 = 0
    model.train()
    for step, batch in enumerate(train_loader, 0):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask).logits.view(-1)

        loss = loss_func(outputs, labels)
        running_loss += loss.item()

        f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs.detach()).cpu().numpy()), average = 'binary')
        running_f1 += f1_batch
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        if step % 10 == 9:
            avg_loss = running_loss / 10
            avg_f1 = running_f1 / 10

            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Train Loss: {avg_loss:.4f} | Train F1: {avg_f1:.4f}")
            running_loss = 0.0
            running_f1 = 0.0

            model.eval()
            val_loss = 0.0
            val_f1 = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[2].to(device)

                    outputs = model(input_ids, attention_mask).logits.view(-1)

                    loss = loss_func(outputs, labels)
                    val_loss += loss.item()

                    f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs).cpu().numpy()), average = 'binary')
                    val_f1 += f1_batch

                avg_val_loss = val_loss / len(valid_loader)
                avg_val_f1 = val_f1 / len(valid_loader)

                print(f"Validation Loss: {avg_val_loss:.4f} | Validation F1: {avg_val_f1:.4f}")

        model.train()


Epoch 1/3
Train Loss: 0.6808 | Train F1: 0.5764
Validation Loss: 0.6604 | Validation F1: 0.6555
Epoch 1/3
Train Loss: 0.6454 | Train F1: 0.6390
Validation Loss: 0.6319 | Validation F1: 0.5678
Epoch 1/3
Train Loss: 0.6277 | Train F1: 0.6193
Validation Loss: 0.6303 | Validation F1: 0.6849
Epoch 1/3
Train Loss: 0.6204 | Train F1: 0.6312
Validation Loss: 0.6135 | Validation F1: 0.6418
Epoch 1/3
Train Loss: 0.5905 | Train F1: 0.6417
Validation Loss: 0.6115 | Validation F1: 0.6540
Epoch 1/3
Train Loss: 0.5945 | Train F1: 0.6388
Validation Loss: 0.6117 | Validation F1: 0.6324
Epoch 1/3
Train Loss: 0.6084 | Train F1: 0.6300
Validation Loss: 0.6082 | Validation F1: 0.6453
Epoch 2/3
Train Loss: 0.5700 | Train F1: 0.7024
Validation Loss: 0.6086 | Validation F1: 0.6527
Epoch 2/3
Train Loss: 0.5657 | Train F1: 0.6857
Validation Loss: 0.6081 | Validation F1: 0.6485
Epoch 2/3
Train Loss: 0.5686 | Train F1: 0.7111
Validation Loss: 0.6078 | Validation F1: 0.6464
Epoch 2/3
Train Loss: 0.5589 | Train F1:

## Base with Plateau LR (0.5)

In [None]:
del model
torch.cuda.empty_cache()

In [None]:
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=1)
model.to(device)

loss_func = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)

# Use ReduceLROnPlateau scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

epochs = 3

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
for epoch in range(epochs):
    running_loss = 0
    running_f1 = 0
    model.train()
    for step, batch in enumerate(train_loader, 0):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask).logits.view(-1)

        loss = loss_func(outputs, labels)
        running_loss += loss.item()

        f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs.detach()).cpu().numpy()), average = 'binary')
        running_f1 += f1_batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 10 == 9:
            avg_loss = running_loss / 10
            avg_f1 = running_f1 / 10

            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Train Loss: {avg_loss:.4f} | Train F1: {avg_f1:.4f}")
            running_loss = 0.0
            running_f1 = 0.0

            model.eval()
            val_loss = 0.0
            val_f1 = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[2].to(device)

                    outputs = model(input_ids, attention_mask).logits.view(-1)

                    loss = loss_func(outputs, labels)
                    val_loss += loss.item()

                    f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs).cpu().numpy()), average = 'binary')
                    val_f1 += f1_batch

                avg_val_loss = val_loss / len(valid_loader)
                avg_val_f1 = val_f1 / len(valid_loader)

                print(f"Validation Loss: {avg_val_loss:.4f} | Validation F1: {avg_val_f1:.4f}")
                scheduler.step(avg_val_loss)

        model.train()


Epoch 1/3
Train Loss: 0.6851 | Train F1: 0.5715
Validation Loss: 0.6732 | Validation F1: 0.3714
Epoch 1/3
Train Loss: 0.6346 | Train F1: 0.5934
Validation Loss: 0.6357 | Validation F1: 0.6671
Epoch 1/3
Train Loss: 0.6093 | Train F1: 0.6795
Validation Loss: 0.6170 | Validation F1: 0.6268
Epoch 1/3
Train Loss: 0.6029 | Train F1: 0.6490
Validation Loss: 0.6119 | Validation F1: 0.6723
Epoch 1/3
Train Loss: 0.6027 | Train F1: 0.6696
Validation Loss: 0.6036 | Validation F1: 0.6433
Epoch 1/3
Train Loss: 0.6283 | Train F1: 0.6432
Validation Loss: 0.5973 | Validation F1: 0.6773
Epoch 1/3
Train Loss: 0.6023 | Train F1: 0.6275
Validation Loss: 0.5820 | Validation F1: 0.6837
Epoch 2/3
Train Loss: 0.4880 | Train F1: 0.7717
Validation Loss: 0.6576 | Validation F1: 0.6825
Epoch 2/3
Train Loss: 0.5086 | Train F1: 0.7477
Validation Loss: 0.6007 | Validation F1: 0.6571
Epoch 2/3
Train Loss: 0.4846 | Train F1: 0.7606
Validation Loss: 0.6111 | Validation F1: 0.6829
Epoch 2/3
Train Loss: 0.4523 | Train F1:

## Base with Plateau LR (0.3)

In [None]:
del model
torch.cuda.empty_cache()

In [None]:
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=1)
model.to(device)

loss_func = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)

# Use ReduceLROnPlateau scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=1)

epochs = 3

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
for epoch in range(epochs):
    running_loss = 0
    running_f1 = 0
    model.train()
    for step, batch in enumerate(train_loader, 0):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask).logits.view(-1)

        loss = loss_func(outputs, labels)
        running_loss += loss.item()

        f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs.detach()).cpu().numpy()), average = 'binary')
        running_f1 += f1_batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 10 == 9:
            avg_loss = running_loss / 10
            avg_f1 = running_f1 / 10

            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Train Loss: {avg_loss:.4f} | Train F1: {avg_f1:.4f}")
            running_loss = 0.0
            running_f1 = 0.0

            model.eval()
            val_loss = 0.0
            val_f1 = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[2].to(device)

                    outputs = model(input_ids, attention_mask).logits.view(-1)

                    loss = loss_func(outputs, labels)
                    val_loss += loss.item()

                    f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs).cpu().numpy()), average = 'binary')
                    val_f1 += f1_batch

                avg_val_loss = val_loss / len(valid_loader)
                avg_val_f1 = val_f1 / len(valid_loader)

                print(f"Validation Loss: {avg_val_loss:.4f} | Validation F1: {avg_val_f1:.4f}")

            scheduler.step(avg_val_loss)

        model.train()


Epoch 1/3
Train Loss: 0.6991 | Train F1: 0.3035
Validation Loss: 0.6913 | Validation F1: 0.0000
Epoch 1/3
Train Loss: 0.6886 | Train F1: 0.4930
Validation Loss: 0.6759 | Validation F1: 0.6586
Epoch 1/3
Train Loss: 0.6508 | Train F1: 0.6386
Validation Loss: 0.6434 | Validation F1: 0.5872
Epoch 1/3
Train Loss: 0.6446 | Train F1: 0.5951
Validation Loss: 0.6315 | Validation F1: 0.5928
Epoch 1/3
Train Loss: 0.6148 | Train F1: 0.6550
Validation Loss: 0.6174 | Validation F1: 0.6591
Epoch 1/3
Train Loss: 0.5985 | Train F1: 0.6418
Validation Loss: 0.6120 | Validation F1: 0.6677
Epoch 1/3
Train Loss: 0.5849 | Train F1: 0.6972
Validation Loss: 0.6154 | Validation F1: 0.6137
Epoch 2/3
Train Loss: 0.5229 | Train F1: 0.7387
Validation Loss: 0.6491 | Validation F1: 0.6611
Epoch 2/3
Train Loss: 0.5052 | Train F1: 0.7340
Validation Loss: 0.6319 | Validation F1: 0.6875
Epoch 2/3
Train Loss: 0.4872 | Train F1: 0.7589
Validation Loss: 0.6127 | Validation F1: 0.6780
Epoch 2/3
Train Loss: 0.4991 | Train F1:

## Base with Plateau LR (0.6)



In [None]:
del model
torch.cuda.empty_cache()

In [None]:
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=1)
model.to(device)

loss_func = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)

# Use ReduceLROnPlateau scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.6, patience=1)

epochs = 3

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
for epoch in range(epochs):
    running_loss = 0
    running_f1 = 0
    model.train()
    for step, batch in enumerate(train_loader, 0):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask).logits.view(-1)

        loss = loss_func(outputs, labels)
        running_loss += loss.item()

        f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs.detach()).cpu().numpy()), average = 'binary')
        running_f1 += f1_batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 10 == 9:
            avg_loss = running_loss / 10
            avg_f1 = running_f1 / 10

            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Train Loss: {avg_loss:.4f} | Train F1: {avg_f1:.4f}")
            running_loss = 0.0
            running_f1 = 0.0

            model.eval()
            val_loss = 0.0
            val_f1 = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[2].to(device)

                    outputs = model(input_ids, attention_mask).logits.view(-1)

                    loss = loss_func(outputs, labels)
                    val_loss += loss.item()

                    f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs).cpu().numpy()), average = 'binary')
                    val_f1 += f1_batch

                avg_val_loss = val_loss / len(valid_loader)
                avg_val_f1 = val_f1 / len(valid_loader)

                print(f"Validation Loss: {avg_val_loss:.4f} | Validation F1: {avg_val_f1:.4f}")

            scheduler.step(avg_val_loss)

        model.train()

Epoch 1/3
Train Loss: 0.6977 | Train F1: 0.5098
Validation Loss: 0.6908 | Validation F1: 0.6722
Epoch 1/3
Train Loss: 0.6832 | Train F1: 0.6466
Validation Loss: 0.6582 | Validation F1: 0.6694
Epoch 1/3
Train Loss: 0.6152 | Train F1: 0.6492
Validation Loss: 0.6461 | Validation F1: 0.5413
Epoch 1/3
Train Loss: 0.6238 | Train F1: 0.6390
Validation Loss: 0.6094 | Validation F1: 0.6448
Epoch 1/3
Train Loss: 0.6232 | Train F1: 0.6184
Validation Loss: 0.6144 | Validation F1: 0.6913
Epoch 1/3
Train Loss: 0.6175 | Train F1: 0.5637
Validation Loss: 0.6153 | Validation F1: 0.6795
Epoch 1/3
Train Loss: 0.5996 | Train F1: 0.6815
Validation Loss: 0.6055 | Validation F1: 0.6500
Epoch 2/3
Train Loss: 0.5144 | Train F1: 0.7621
Validation Loss: 0.6524 | Validation F1: 0.6687
Epoch 2/3
Train Loss: 0.5136 | Train F1: 0.7416
Validation Loss: 0.6137 | Validation F1: 0.6501
Epoch 2/3
Train Loss: 0.5073 | Train F1: 0.7433
Validation Loss: 0.6089 | Validation F1: 0.6800
Epoch 2/3
Train Loss: 0.4821 | Train F1:

## Base No LR, smaller LR rate

In [None]:
del model
torch.cuda.empty_cache()

In [None]:
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=1)
model.to(device)

loss_func = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.00001)

epochs = 3

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
for epoch in range(epochs):
    running_loss = 0
    running_f1 = 0
    model.train()
    for step, batch in enumerate(train_loader, 0):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask).logits.view(-1)

        loss = loss_func(outputs, labels)
        running_loss += loss.item()

        f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs.detach()).cpu().numpy()), average = 'binary')
        running_f1 += f1_batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 10 == 9:
            avg_loss = running_loss / 10
            avg_f1 = running_f1 / 10

            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Train Loss: {avg_loss:.4f} | Train F1: {avg_f1:.4f}")
            running_loss = 0.0
            running_f1 = 0.0

            model.eval()
            val_loss = 0.0
            val_f1 = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[2].to(device)

                    outputs = model(input_ids, attention_mask).logits.view(-1)

                    loss = loss_func(outputs, labels)
                    val_loss += loss.item()

                    f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs).cpu().numpy()), average = 'binary')
                    val_f1 += f1_batch

                avg_val_loss = val_loss / len(valid_loader)
                avg_val_f1 = val_f1 / len(valid_loader)

                print(f"Validation Loss: {avg_val_loss:.4f} | Validation F1: {avg_val_f1:.4f}")


        model.train()


Epoch 1/3
Train Loss: 0.6942 | Train F1: 0.2000
Validation Loss: 0.6912 | Validation F1: 0.1793
Epoch 1/3
Train Loss: 0.6895 | Train F1: 0.3962
Validation Loss: 0.6877 | Validation F1: 0.5070
Epoch 1/3
Train Loss: 0.6865 | Train F1: 0.5317
Validation Loss: 0.6822 | Validation F1: 0.6088
Epoch 1/3
Train Loss: 0.6789 | Train F1: 0.5767
Validation Loss: 0.6735 | Validation F1: 0.5508
Epoch 1/3
Train Loss: 0.6631 | Train F1: 0.6108
Validation Loss: 0.6586 | Validation F1: 0.6348
Epoch 1/3
Train Loss: 0.6429 | Train F1: 0.6504
Validation Loss: 0.6465 | Validation F1: 0.6655
Epoch 1/3
Train Loss: 0.6428 | Train F1: 0.6702
Validation Loss: 0.6396 | Validation F1: 0.5930
Epoch 2/3
Train Loss: 0.6119 | Train F1: 0.6490
Validation Loss: 0.6346 | Validation F1: 0.6586
Epoch 2/3
Train Loss: 0.6060 | Train F1: 0.6830
Validation Loss: 0.6281 | Validation F1: 0.6296
Epoch 2/3
Train Loss: 0.6072 | Train F1: 0.6628
Validation Loss: 0.6215 | Validation F1: 0.6702
Epoch 2/3
Train Loss: 0.5856 | Train F1:

## Conclusion

For LR decay, Reduce LR on Plateau performed the best with factor of 0.5 and patience level of 1

# Find Dropout; Final Plateau LR

In [None]:
class DistilBERTAdd(torch.nn.Module):
    def __init__(self):
        super(DistilBERTAdd, self).__init__()
        self.base = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.5)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        base_output = self.base(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = base_output[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        pooler = self.dropout(pooler)
        pooler = self.dropout(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [None]:
model = DistilBERTAdd()
model.to(device)

loss_func = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

epochs = 3

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
for epoch in range(epochs):
    running_loss = 0
    running_f1 = 0
    model.train()
    for step, batch in enumerate(train_loader, 0):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask).view(-1)

        loss = loss_func(outputs, labels)
        running_loss += loss.item()

        f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs.detach()).cpu().numpy()), average = 'binary')
        running_f1 += f1_batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 10 == 9:
            avg_loss = running_loss / 10
            avg_f1 = running_f1 / 10

            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Train Loss: {avg_loss:.4f} | Train F1: {avg_f1:.4f}")

            running_loss = 0.0
            running_f1 = 0.0

            model.eval()
            val_loss = 0.0
            val_f1 = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[2].to(device)

                    outputs = model(input_ids, attention_mask).view(-1)

                    loss = loss_func(outputs, labels)
                    val_loss += loss.item()

                    f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs).cpu().numpy()), average = 'binary')
                    val_f1 += f1_batch

                avg_val_loss = val_loss / len(valid_loader)
                avg_val_f1 = val_f1 / len(valid_loader)

                print(f"Validation Loss: {avg_val_loss:.4f} | Validation F1: {avg_val_f1:.4f}")
                scheduler.step(avg_val_loss)

        model.train()

Epoch 1/3
Train Loss: 0.6968 | Train F1: 0.5235
Validation Loss: 0.6889 | Validation F1: 0.6643
Epoch 1/3
Train Loss: 0.6979 | Train F1: 0.5620
Validation Loss: 0.6847 | Validation F1: 0.6169
Epoch 1/3
Train Loss: 0.6933 | Train F1: 0.5468
Validation Loss: 0.6745 | Validation F1: 0.6652
Epoch 1/3
Train Loss: 0.6622 | Train F1: 0.5971
Validation Loss: 0.6398 | Validation F1: 0.5656
Epoch 1/3
Train Loss: 0.6477 | Train F1: 0.6027
Validation Loss: 0.6201 | Validation F1: 0.6468
Epoch 1/3
Train Loss: 0.6234 | Train F1: 0.6396
Validation Loss: 0.6614 | Validation F1: 0.6788
Epoch 1/3
Train Loss: 0.6414 | Train F1: 0.5869
Validation Loss: 0.6191 | Validation F1: 0.6912
Epoch 2/3
Train Loss: 0.5832 | Train F1: 0.7031
Validation Loss: 0.6242 | Validation F1: 0.6809
Epoch 2/3
Train Loss: 0.5544 | Train F1: 0.7344
Validation Loss: 0.6107 | Validation F1: 0.6822
Epoch 2/3
Train Loss: 0.5210 | Train F1: 0.7442
Validation Loss: 0.6430 | Validation F1: 0.6785
Epoch 2/3
Train Loss: 0.5622 | Train F1:

## Dropout 0.9

In [None]:
class DistilBERTAdd(torch.nn.Module):
    def __init__(self):
        super(DistilBERTAdd, self).__init__()
        self.base = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.9)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        base_output = self.base(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = base_output[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [None]:
model = DistilBERTAdd()
model.to(device)

loss_func = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

epochs = 3

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
for epoch in range(epochs):
    running_loss = 0
    running_f1 = 0
    model.train()
    for step, batch in enumerate(train_loader, 0):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask).view(-1)

        loss = loss_func(outputs, labels)
        running_loss += loss.item()

        f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs.detach()).cpu().numpy()), average = 'binary')
        running_f1 += f1_batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 10 == 9:
            avg_loss = running_loss / 10
            avg_f1 = running_f1 / 10

            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Train Loss: {avg_loss:.4f} | Train F1: {avg_f1:.4f}")

            running_loss = 0.0
            running_f1 = 0.0

            model.eval()
            val_loss = 0.0
            val_f1 = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[2].to(device)

                    outputs = model(input_ids, attention_mask).view(-1)

                    loss = loss_func(outputs, labels)
                    val_loss += loss.item()

                    f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs).cpu().numpy()), average = 'binary')
                    val_f1 += f1_batch

                avg_val_loss = val_loss / len(valid_loader)
                avg_val_f1 = val_f1 / len(valid_loader)

                print(f"Validation Loss: {avg_val_loss:.4f} | Validation F1: {avg_val_f1:.4f}")
                scheduler.step(avg_val_loss)

        model.train()

Epoch 1/3
Train Loss: 0.7010 | Train F1: 0.3960
Validation Loss: 0.6967 | Validation F1: 0.0000
Epoch 1/3
Train Loss: 0.7059 | Train F1: 0.5037
Validation Loss: 0.6925 | Validation F1: 0.6722
Epoch 1/3
Train Loss: 0.6962 | Train F1: 0.5148
Validation Loss: 0.6868 | Validation F1: 0.1472
Epoch 1/3
Train Loss: 0.6850 | Train F1: 0.4641
Validation Loss: 0.6725 | Validation F1: 0.6433
Epoch 1/3
Train Loss: 0.6808 | Train F1: 0.5795
Validation Loss: 0.6485 | Validation F1: 0.6260
Epoch 1/3
Train Loss: 0.6519 | Train F1: 0.6053
Validation Loss: 0.6428 | Validation F1: 0.6448
Epoch 1/3
Train Loss: 0.6457 | Train F1: 0.6417
Validation Loss: 0.6425 | Validation F1: 0.6782
Epoch 2/3
Train Loss: 0.5939 | Train F1: 0.7027
Validation Loss: 0.6228 | Validation F1: 0.6456
Epoch 2/3
Train Loss: 0.5808 | Train F1: 0.6801
Validation Loss: 0.6133 | Validation F1: 0.5877
Epoch 2/3
Train Loss: 0.5800 | Train F1: 0.6947
Validation Loss: 0.6052 | Validation F1: 0.6488
Epoch 2/3
Train Loss: 0.5563 | Train F1:

## Dropout 0.3

In [None]:
class DistilBERTAdd(torch.nn.Module):
    def __init__(self):
        super(DistilBERTAdd, self).__init__()
        self.base = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        base_output = self.base(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = base_output[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [None]:
del model
torch.cuda.empty_cache()

model = DistilBERTAdd()
model.to(device)

loss_func = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

epochs = 3

In [None]:
for epoch in range(epochs):
    running_loss = 0
    running_f1 = 0
    model.train()
    for step, batch in enumerate(train_loader, 0):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask).view(-1)

        loss = loss_func(outputs, labels)
        running_loss += loss.item()

        f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs.detach()).cpu().numpy()), average = 'binary')
        running_f1 += f1_batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 10 == 9:
            avg_loss = running_loss / 10
            avg_f1 = running_f1 / 10

            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Train Loss: {avg_loss:.4f} | Train F1: {avg_f1:.4f}")

            running_loss = 0.0
            running_f1 = 0.0

            model.eval()
            val_loss = 0.0
            val_f1 = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[2].to(device)

                    outputs = model(input_ids, attention_mask).view(-1)

                    loss = loss_func(outputs, labels)
                    val_loss += loss.item()

                    f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs).cpu().numpy()), average = 'binary')
                    val_f1 += f1_batch

                avg_val_loss = val_loss / len(valid_loader)
                avg_val_f1 = val_f1 / len(valid_loader)

                print(f"Validation Loss: {avg_val_loss:.4f} | Validation F1: {avg_val_f1:.4f}")
                scheduler.step(avg_val_loss)

        model.train()

Epoch 1/3
Train Loss: 0.6808 | Train F1: 0.5724
Validation Loss: 0.6716 | Validation F1: 0.6813
Epoch 1/3
Train Loss: 0.6313 | Train F1: 0.5817
Validation Loss: 0.6392 | Validation F1: 0.5810
Epoch 1/3
Train Loss: 0.6286 | Train F1: 0.5616
Validation Loss: 0.6319 | Validation F1: 0.6212
Epoch 1/3
Train Loss: 0.6048 | Train F1: 0.6877
Validation Loss: 0.6198 | Validation F1: 0.6074
Epoch 1/3
Train Loss: 0.6215 | Train F1: 0.5916
Validation Loss: 0.6321 | Validation F1: 0.5325
Epoch 1/3
Train Loss: 0.6037 | Train F1: 0.6493
Validation Loss: 0.6141 | Validation F1: 0.5965
Epoch 1/3
Train Loss: 0.6111 | Train F1: 0.6328
Validation Loss: 0.5960 | Validation F1: 0.6544
Epoch 2/3
Train Loss: 0.4898 | Train F1: 0.7533
Validation Loss: 0.6386 | Validation F1: 0.6807
Epoch 2/3
Train Loss: 0.5121 | Train F1: 0.7571
Validation Loss: 0.6024 | Validation F1: 0.6356
Epoch 2/3
Train Loss: 0.4972 | Train F1: 0.7544
Validation Loss: 0.5926 | Validation F1: 0.6916
Epoch 2/3
Train Loss: 0.5138 | Train F1:

### Frozen Layers for Pre-trained

In [None]:
class DistilBERTAdd(torch.nn.Module):
    def __init__(self):
        super(DistilBERTAdd, self).__init__()
        self.base = DistilBertModel.from_pretrained("distilbert-base-uncased")

        # Freeze the DistilBert model
        for param in self.base.parameters():
            param.requires_grad = False

        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        base_output = self.base(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = base_output[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output


In [None]:
del model
torch.cuda.empty_cache()

model = DistilBERTAdd()
model.to(device)

loss_func = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

epochs = 3

In [None]:
for epoch in range(epochs):
    running_loss = 0
    running_f1 = 0
    model.train()
    for step, batch in enumerate(train_loader, 0):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask).view(-1)

        loss = loss_func(outputs, labels)
        running_loss += loss.item()

        f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs.detach()).cpu().numpy()), average = 'binary')
        running_f1 += f1_batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 10 == 9:
            avg_loss = running_loss / 10
            avg_f1 = running_f1 / 10

            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Train Loss: {avg_loss:.4f} | Train F1: {avg_f1:.4f}")

            running_loss = 0.0
            running_f1 = 0.0

            model.eval()
            val_loss = 0.0
            val_f1 = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[2].to(device)

                    outputs = model(input_ids, attention_mask).view(-1)

                    loss = loss_func(outputs, labels)
                    val_loss += loss.item()

                    f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs).cpu().numpy()), average = 'binary')
                    val_f1 += f1_batch

                avg_val_loss = val_loss / len(valid_loader)
                avg_val_f1 = val_f1 / len(valid_loader)

                print(f"Validation Loss: {avg_val_loss:.4f} | Validation F1: {avg_val_f1:.4f}")
                scheduler.step(avg_val_loss)

        model.train()

Epoch 1/3
Train Loss: 0.6939 | Train F1: 0.6037
Validation Loss: 0.6895 | Validation F1: 0.4734
Epoch 1/3
Train Loss: 0.6921 | Train F1: 0.2416
Validation Loss: 0.6870 | Validation F1: 0.2171
Epoch 1/3
Train Loss: 0.6850 | Train F1: 0.5431
Validation Loss: 0.6805 | Validation F1: 0.6497
Epoch 1/3
Train Loss: 0.6774 | Train F1: 0.6185
Validation Loss: 0.6762 | Validation F1: 0.6264
Epoch 1/3
Train Loss: 0.6737 | Train F1: 0.6252
Validation Loss: 0.6720 | Validation F1: 0.6540
Epoch 1/3
Train Loss: 0.6697 | Train F1: 0.6680
Validation Loss: 0.6690 | Validation F1: 0.6710
Epoch 1/3
Train Loss: 0.6643 | Train F1: 0.6523
Validation Loss: 0.6652 | Validation F1: 0.6301
Epoch 2/3
Train Loss: 0.6624 | Train F1: 0.6049
Validation Loss: 0.6633 | Validation F1: 0.5720
Epoch 2/3
Train Loss: 0.6573 | Train F1: 0.5698
Validation Loss: 0.6603 | Validation F1: 0.6396
Epoch 2/3
Train Loss: 0.6575 | Train F1: 0.6519
Validation Loss: 0.6584 | Validation F1: 0.6454
Epoch 2/3
Train Loss: 0.6536 | Train F1:

## Dropout 0.2

In [None]:
class DistilBERTAdd(torch.nn.Module):
    def __init__(self):
        super(DistilBERTAdd, self).__init__()
        self.base = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.2)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        base_output = self.base(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = base_output[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [None]:
del model
torch.cuda.empty_cache()

model = DistilBERTAdd()
model.to(device)

loss_func = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

epochs = 3

In [None]:
for epoch in range(epochs):
    running_loss = 0
    running_f1 = 0
    model.train()
    for step, batch in enumerate(train_loader, 0):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask).view(-1)

        loss = loss_func(outputs, labels)
        running_loss += loss.item()

        f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs.detach()).cpu().numpy()), average = 'binary')
        running_f1 += f1_batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 10 == 9:
            avg_loss = running_loss / 10
            avg_f1 = running_f1 / 10

            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Train Loss: {avg_loss:.4f} | Train F1: {avg_f1:.4f}")

            running_loss = 0.0
            running_f1 = 0.0

            model.eval()
            val_loss = 0.0
            val_f1 = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[2].to(device)

                    outputs = model(input_ids, attention_mask).view(-1)

                    loss = loss_func(outputs, labels)
                    val_loss += loss.item()

                    f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs).cpu().numpy()), average = 'binary')
                    val_f1 += f1_batch

                avg_val_loss = val_loss / len(valid_loader)
                avg_val_f1 = val_f1 / len(valid_loader)

                print(f"Validation Loss: {avg_val_loss:.4f} | Validation F1: {avg_val_f1:.4f}")
                scheduler.step(avg_val_loss)

        model.train()

Epoch 1/3
Train Loss: 0.6790 | Train F1: 0.5244
Validation Loss: 0.6520 | Validation F1: 0.6436
Epoch 1/3
Train Loss: 0.6540 | Train F1: 0.5942
Validation Loss: 0.6394 | Validation F1: 0.5452
Epoch 1/3
Train Loss: 0.6262 | Train F1: 0.5858
Validation Loss: 0.6050 | Validation F1: 0.6590
Epoch 1/3
Train Loss: 0.6216 | Train F1: 0.6146
Validation Loss: 0.6273 | Validation F1: 0.7093
Epoch 1/3
Train Loss: 0.5874 | Train F1: 0.7062
Validation Loss: 0.6029 | Validation F1: 0.6399
Epoch 1/3
Train Loss: 0.6019 | Train F1: 0.6232
Validation Loss: 0.6383 | Validation F1: 0.5354
Epoch 1/3
Train Loss: 0.5975 | Train F1: 0.6586
Validation Loss: 0.6039 | Validation F1: 0.6550
Epoch 2/3
Train Loss: 0.5230 | Train F1: 0.7053
Validation Loss: 0.6268 | Validation F1: 0.6847
Epoch 2/3
Train Loss: 0.4935 | Train F1: 0.7478
Validation Loss: 0.6240 | Validation F1: 0.6677
Epoch 2/3
Train Loss: 0.4552 | Train F1: 0.7827
Validation Loss: 0.6276 | Validation F1: 0.6936
Epoch 2/3
Train Loss: 0.4843 | Train F1:

### Frozen Layers for Pre-trained

In [None]:
class DistilBERTAdd(torch.nn.Module):
    def __init__(self):
        super(DistilBERTAdd, self).__init__()
        self.base = DistilBertModel.from_pretrained("distilbert-base-uncased")

        # Freeze the DistilBert model
        for param in self.base.parameters():
            param.requires_grad = False

        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.2)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        base_output = self.base(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = base_output[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output


In [None]:
del model
torch.cuda.empty_cache()

model = DistilBERTAdd()
model.to(device)

loss_func = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

epochs = 3

In [None]:
for epoch in range(epochs):
    running_loss = 0
    running_f1 = 0
    model.train()
    for step, batch in enumerate(train_loader, 0):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask).view(-1)

        loss = loss_func(outputs, labels)
        running_loss += loss.item()

        f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs.detach()).cpu().numpy()), average = 'binary')
        running_f1 += f1_batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 10 == 9:
            avg_loss = running_loss / 10
            avg_f1 = running_f1 / 10

            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Train Loss: {avg_loss:.4f} | Train F1: {avg_f1:.4f}")

            running_loss = 0.0
            running_f1 = 0.0

            model.eval()
            val_loss = 0.0
            val_f1 = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[2].to(device)

                    outputs = model(input_ids, attention_mask).view(-1)

                    loss = loss_func(outputs, labels)
                    val_loss += loss.item()

                    f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs).cpu().numpy()), average = 'binary')
                    val_f1 += f1_batch

                avg_val_loss = val_loss / len(valid_loader)
                avg_val_f1 = val_f1 / len(valid_loader)

                print(f"Validation Loss: {avg_val_loss:.4f} | Validation F1: {avg_val_f1:.4f}")
                scheduler.step(avg_val_loss)

        model.train()

Epoch 1/3
Train Loss: 0.6945 | Train F1: 0.2587
Validation Loss: 0.6884 | Validation F1: 0.6585
Epoch 1/3
Train Loss: 0.6886 | Train F1: 0.6731
Validation Loss: 0.6874 | Validation F1: 0.6710
Epoch 1/3
Train Loss: 0.6827 | Train F1: 0.6607
Validation Loss: 0.6804 | Validation F1: 0.6069
Epoch 1/3
Train Loss: 0.6802 | Train F1: 0.4328
Validation Loss: 0.6797 | Validation F1: 0.3792
Epoch 1/3
Train Loss: 0.6745 | Train F1: 0.5280
Validation Loss: 0.6727 | Validation F1: 0.6498
Epoch 1/3
Train Loss: 0.6676 | Train F1: 0.6415
Validation Loss: 0.6691 | Validation F1: 0.6585
Epoch 1/3
Train Loss: 0.6660 | Train F1: 0.6638
Validation Loss: 0.6654 | Validation F1: 0.6462
Epoch 2/3
Train Loss: 0.6584 | Train F1: 0.6351
Validation Loss: 0.6627 | Validation F1: 0.6237
Epoch 2/3
Train Loss: 0.6624 | Train F1: 0.6475
Validation Loss: 0.6614 | Validation F1: 0.6637
Epoch 2/3
Train Loss: 0.6530 | Train F1: 0.5868
Validation Loss: 0.6628 | Validation F1: 0.5015
Epoch 2/3
Train Loss: 0.6500 | Train F1:

##Dropout 0.4

In [None]:
class DistilBERTAdd(torch.nn.Module):
    def __init__(self):
        super(DistilBERTAdd, self).__init__()
        self.base = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.4)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        base_output = self.base(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = base_output[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [None]:
del model
torch.cuda.empty_cache()

model = DistilBERTAdd()
model.to(device)

loss_func = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

epochs = 3

In [None]:
for epoch in range(epochs):
    running_loss = 0
    running_f1 = 0
    model.train()
    for step, batch in enumerate(train_loader, 0):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask).view(-1)

        loss = loss_func(outputs, labels)
        running_loss += loss.item()

        f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs.detach()).cpu().numpy()), average = 'binary')
        running_f1 += f1_batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 10 == 9:
            avg_loss = running_loss / 10
            avg_f1 = running_f1 / 10

            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Train Loss: {avg_loss:.4f} | Train F1: {avg_f1:.4f}")

            running_loss = 0.0
            running_f1 = 0.0

            model.eval()
            val_loss = 0.0
            val_f1 = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[2].to(device)

                    outputs = model(input_ids, attention_mask).view(-1)

                    loss = loss_func(outputs, labels)
                    val_loss += loss.item()

                    f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs).cpu().numpy()), average = 'binary')
                    val_f1 += f1_batch

                avg_val_loss = val_loss / len(valid_loader)
                avg_val_f1 = val_f1 / len(valid_loader)

                print(f"Validation Loss: {avg_val_loss:.4f} | Validation F1: {avg_val_f1:.4f}")
                scheduler.step(avg_val_loss)

        model.train()

Epoch 1/3
Train Loss: 0.6954 | Train F1: 0.3270
Validation Loss: 0.6914 | Validation F1: 0.6722
Epoch 1/3
Train Loss: 0.6812 | Train F1: 0.5946
Validation Loss: 0.6621 | Validation F1: 0.6339
Epoch 1/3
Train Loss: 0.6524 | Train F1: 0.6532
Validation Loss: 0.6392 | Validation F1: 0.6406
Epoch 1/3
Train Loss: 0.6290 | Train F1: 0.6487
Validation Loss: 0.6137 | Validation F1: 0.6686
Epoch 1/3
Train Loss: 0.6008 | Train F1: 0.6445
Validation Loss: 0.6065 | Validation F1: 0.6549
Epoch 1/3
Train Loss: 0.6001 | Train F1: 0.6424
Validation Loss: 0.6011 | Validation F1: 0.6591
Epoch 1/3
Train Loss: 0.5883 | Train F1: 0.6797
Validation Loss: 0.6018 | Validation F1: 0.6156
Epoch 2/3
Train Loss: 0.5123 | Train F1: 0.7474
Validation Loss: 0.6949 | Validation F1: 0.7035
Epoch 2/3
Train Loss: 0.5326 | Train F1: 0.7206
Validation Loss: 0.5802 | Validation F1: 0.6725
Epoch 2/3
Train Loss: 0.4964 | Train F1: 0.7765
Validation Loss: 0.5896 | Validation F1: 0.6642
Epoch 2/3
Train Loss: 0.4862 | Train F1:

##Dropout 0.1 (**best performing**)

In [None]:
class DistilBERTAdd(torch.nn.Module):
    def __init__(self):
        super(DistilBERTAdd, self).__init__()
        self.base = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        base_output = self.base(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = base_output[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [None]:
del model
torch.cuda.empty_cache()

model = DistilBERTAdd()
model.to(device)

loss_func = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

epochs = 3

In [None]:
for epoch in range(epochs):
    running_loss = 0
    running_f1 = 0
    model.train()
    for step, batch in enumerate(train_loader, 0):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask).view(-1)

        loss = loss_func(outputs, labels)
        running_loss += loss.item()

        f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs.detach()).cpu().numpy()), average = 'binary')
        running_f1 += f1_batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 10 == 9:
            avg_loss = running_loss / 10
            avg_f1 = running_f1 / 10

            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Train Loss: {avg_loss:.4f} | Train F1: {avg_f1:.4f}")

            running_loss = 0.0
            running_f1 = 0.0

            model.eval()
            val_loss = 0.0
            val_f1 = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[2].to(device)

                    outputs = model(input_ids, attention_mask).view(-1)

                    loss = loss_func(outputs, labels)
                    val_loss += loss.item()

                    f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs).cpu().numpy()), average = 'binary')
                    val_f1 += f1_batch

                avg_val_loss = val_loss / len(valid_loader)
                avg_val_f1 = val_f1 / len(valid_loader)

                print(f"Validation Loss: {avg_val_loss:.4f} | Validation F1: {avg_val_f1:.4f}")
                scheduler.step(avg_val_loss)

        model.train()

Epoch 1/3
Train Loss: 0.6887 | Train F1: 0.6028
Validation Loss: 0.6716 | Validation F1: 0.4152
Epoch 1/3
Train Loss: 0.6523 | Train F1: 0.5619
Validation Loss: 0.6296 | Validation F1: 0.6471
Epoch 1/3
Train Loss: 0.6237 | Train F1: 0.6191
Validation Loss: 0.6196 | Validation F1: 0.6773
Epoch 1/3
Train Loss: 0.5960 | Train F1: 0.6838
Validation Loss: 0.6031 | Validation F1: 0.6621
Epoch 1/3
Train Loss: 0.5948 | Train F1: 0.6829
Validation Loss: 0.5945 | Validation F1: 0.6916
Epoch 1/3
Train Loss: 0.5807 | Train F1: 0.6525
Validation Loss: 0.6046 | Validation F1: 0.6630
Epoch 1/3
Train Loss: 0.5612 | Train F1: 0.6921
Validation Loss: 0.5962 | Validation F1: 0.6807
Epoch 2/3
Train Loss: 0.4773 | Train F1: 0.7736
Validation Loss: 0.6299 | Validation F1: 0.6963
Epoch 2/3
Train Loss: 0.4393 | Train F1: 0.7740
Validation Loss: 0.6563 | Validation F1: 0.7124
Epoch 2/3
Train Loss: 0.4564 | Train F1: 0.7947
Validation Loss: 0.6106 | Validation F1: 0.6778
Epoch 2/3
Train Loss: 0.4557 | Train F1:

## no dropout

In [None]:
class DistilBERTAdd(torch.nn.Module):
    def __init__(self):
        super(DistilBERTAdd, self).__init__()
        self.base = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        base_output = self.base(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = base_output[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        output = self.classifier(pooler)
        return output

In [None]:
del model
torch.cuda.empty_cache()

model = DistilBERTAdd()
model.to(device)

loss_func = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

epochs = 3

In [None]:
for epoch in range(epochs):
    running_loss = 0
    running_f1 = 0
    model.train()
    for step, batch in enumerate(train_loader, 0):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask).view(-1)

        loss = loss_func(outputs, labels)
        running_loss += loss.item()

        f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs.detach()).cpu().numpy()), average = 'binary')
        running_f1 += f1_batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 10 == 9:
            avg_loss = running_loss / 10
            avg_f1 = running_f1 / 10

            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Train Loss: {avg_loss:.4f} | Train F1: {avg_f1:.4f}")

            running_loss = 0.0
            running_f1 = 0.0

            model.eval()
            val_loss = 0.0
            val_f1 = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[2].to(device)

                    outputs = model(input_ids, attention_mask).view(-1)

                    loss = loss_func(outputs, labels)
                    val_loss += loss.item()

                    f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs).cpu().numpy()), average = 'binary')
                    val_f1 += f1_batch

                avg_val_loss = val_loss / len(valid_loader)
                avg_val_f1 = val_f1 / len(valid_loader)

                print(f"Validation Loss: {avg_val_loss:.4f} | Validation F1: {avg_val_f1:.4f}")
                scheduler.step(avg_val_loss)

        model.train()

Epoch 1/3
Train Loss: 0.6822 | Train F1: 0.4642
Validation Loss: 0.6772 | Validation F1: 0.5071
Epoch 1/3
Train Loss: 0.6575 | Train F1: 0.5581
Validation Loss: 0.6292 | Validation F1: 0.6664
Epoch 1/3
Train Loss: 0.6169 | Train F1: 0.6799
Validation Loss: 0.6179 | Validation F1: 0.6256
Epoch 1/3
Train Loss: 0.5935 | Train F1: 0.6632
Validation Loss: 0.6043 | Validation F1: 0.6878
Epoch 1/3
Train Loss: 0.5979 | Train F1: 0.6742
Validation Loss: 0.6083 | Validation F1: 0.6076
Epoch 1/3
Train Loss: 0.5858 | Train F1: 0.6348
Validation Loss: 0.5972 | Validation F1: 0.6629
Epoch 1/3
Train Loss: 0.5979 | Train F1: 0.6386
Validation Loss: 0.5965 | Validation F1: 0.6188
Epoch 2/3
Train Loss: 0.4899 | Train F1: 0.7554
Validation Loss: 0.6469 | Validation F1: 0.6944
Epoch 2/3
Train Loss: 0.4604 | Train F1: 0.7906
Validation Loss: 0.5930 | Validation F1: 0.6795
Epoch 2/3
Train Loss: 0.4566 | Train F1: 0.7840
Validation Loss: 0.6349 | Validation F1: 0.7070
Epoch 2/3
Train Loss: 0.4730 | Train F1:

## sequence of dropouts

In [None]:
class DistilBERTAdd(torch.nn.Module):
    def __init__(self):
        super(DistilBERTAdd, self).__init__()
        self.base = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pooler = torch.nn.AdaptiveAvgPool1d(1)
        self.relu = torch.nn.ReLU()
        self.norm = torch.nn.LayerNorm(normalized_shape=768, eps=1e-6)
        self.classifier = torch.nn.Linear(768, 1)
        self.dropout = torch.nn.Dropout(0.5)

    def forward(self, input_ids, attention_mask):
        base_output = self.base(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.pooler(base_output.last_hidden_state.permute(0,2,1))
        pooled = self.relu(pooled.squeeze(-1))
        norm = self.norm(pooled)
        drop = self.dropout(norm)
        drop = self.dropout(drop)
        drop = self.dropout(drop)
        drop = self.dropout(drop)
        output = self.classifier(drop)
        return output

In [None]:
model = DistilBERTAdd()
model.to(device)

loss_func = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

epochs = 3

In [None]:
for epoch in range(epochs):
    running_loss = 0
    running_f1 = 0
    model.train()
    for step, batch in enumerate(train_loader, 0):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask).view(-1)

        loss = loss_func(outputs, labels)
        running_loss += loss.item()

        f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs.detach()).cpu().numpy()), average = 'binary')
        running_f1 += f1_batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 10 == 9:
            avg_loss = running_loss / 10
            avg_f1 = running_f1 / 10

            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Train Loss: {avg_loss:.4f} | Train F1: {avg_f1:.4f}")

            running_loss = 0.0
            running_f1 = 0.0

            model.eval()
            val_loss = 0.0
            val_f1 = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[2].to(device)

                    outputs = model(input_ids, attention_mask).view(-1)

                    loss = loss_func(outputs, labels)
                    val_loss += loss.item()

                    f1_batch = f1_score(labels.cpu().numpy(), np.round(torch.sigmoid(outputs).cpu().numpy()), average = 'binary')
                    val_f1 += f1_batch

                avg_val_loss = val_loss / len(valid_loader)
                avg_val_f1 = val_f1 / len(valid_loader)

                print(f"Validation Loss: {avg_val_loss:.4f} | Validation F1: {avg_val_f1:.4f}")
                scheduler.step(avg_val_loss)

        model.train()

Epoch 1/3
Train Loss: 1.1080 | Train F1: 0.4902
Validation Loss: 0.7191 | Validation F1: 0.0282
Epoch 1/3
Train Loss: 0.9343 | Train F1: 0.5628
Validation Loss: 0.6971 | Validation F1: 0.6715
Epoch 1/3
Train Loss: 0.8667 | Train F1: 0.4990
Validation Loss: 0.6788 | Validation F1: 0.4496
Epoch 1/3
Train Loss: 0.8401 | Train F1: 0.5426
Validation Loss: 0.6691 | Validation F1: 0.5858
Epoch 1/3
Train Loss: 0.7743 | Train F1: 0.4751
Validation Loss: 0.6657 | Validation F1: 0.6508
Epoch 1/3
Train Loss: 0.7419 | Train F1: 0.5948
Validation Loss: 0.6574 | Validation F1: 0.5667
Epoch 1/3
Train Loss: 0.7298 | Train F1: 0.5622
Validation Loss: 0.6386 | Validation F1: 0.6488
Epoch 2/3
Train Loss: 0.7259 | Train F1: 0.5884
Validation Loss: 0.6371 | Validation F1: 0.6557
Epoch 2/3
Train Loss: 0.6916 | Train F1: 0.5918
Validation Loss: 0.6226 | Validation F1: 0.6329
Epoch 2/3
Train Loss: 0.6431 | Train F1: 0.6395
Validation Loss: 0.6192 | Validation F1: 0.6360
Epoch 2/3
Train Loss: 0.6173 | Train F1:

### Conclusion

Dropout rate of 0.1 performs the best with no frozen layerss