## Old version: fine-tuning BERT with a custom data loader and a hand-written training *loop*

In [None]:
!pip install datasets transformers

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2

In [None]:
from datasets import load_dataset
from transformers import BertTokenizer, BertModel, DataCollatorWithPadding, get_linear_schedule_with_warmup, AdamW
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import os

# Data preprocessing

In [None]:
# Fetch the "stsb" configuration of the GLUE benchmark (all available splits).
dataset = load_dataset(path="glue", name="stsb")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/502k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/151k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/114k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

In [None]:
# Show the available splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 5749
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1379
    })
})

In [None]:
# Peek at the first training example to see the raw fields of one record.
dataset['train'][0]

{'sentence1': 'A plane is taking off.',
 'sentence2': 'An air plane is taking off.',
 'label': 5.0,
 'idx': 0}

In [None]:
# Single checkpoint id, reused for the tokenizer here and for the encoder loaded further down.
model_id = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(model_id)

def tokenize_fun(example):
  """Tokenize a batch of sentence pairs and attach the rescaled labels.

  Args:
    example: mapping with 'sentence1' and 'sentence2' (the two sides of each
      pair, passed straight to the tokenizer) and 'label' (an iterable of
      numeric scores).

  Returns:
    The tokenizer output for the pairs (truncation enabled) with an additional
    'label' entry holding every score divided by 5.0.
  """
  encoded = tokenizer(example["sentence1"], example["sentence2"], truncation=True)
  # Return the rescaled labels explicitly instead of overwriting them on the
  # input: the result no longer depends on the caller keeping in-place edits,
  # and calling this twice on the same batch cannot rescale the labels twice.
  encoded['label'] = [score / 5.0 for score in example['label']]
  return encoded

# Min-max normalization: (current_value - min) / (max - min); with min = 0 and max = 5 this reduces to value / 5.0
def standardize_label(example):
  """Rescale a single example's label by dividing it by 5.0.

  The example is updated in place and the same object is returned.
  """
  scaled = example['label'] / 5.0
  example['label'] = scaled
  return example

## Check the dataloader and experiment with the similarity calculation

In [None]:
# Shuffled batches of 4 raw (untokenized) training examples; tokenization happens per batch later on.
train_dataloader = DataLoader(dataset=dataset['train'], shuffle=True, batch_size=4)

In [None]:
# Draw one shuffled batch to inspect the structure the DataLoader yields.
next(iter(train_dataloader))

{'sentence1': ['A man is punching through boards.',
  'A black and white dog looking at the camera.',
  'A close-up of a sheep in the grass.',
  'Chera Larkins, 32, of Manhattan, charged with three sham marriages, is also charged with perjury and filing a false instrument.'],
 'sentence2': ['A dog is running through water.',
  'A grey, black, and white cat looking at the camera.',
  'A close-up of a lamb with its ear tagged, standing on grass.',
  'Chera Larkins, 32, of Manhattan, charged with perjury and filing a false instrument in three marriage applications.'],
 'label': tensor([0.4000, 1.0000, 1.8000, 4.6000], dtype=torch.float64),
 'idx': tensor([ 213, 1453, 1484, 3392])}

In [None]:
# Load the pretrained BERT encoder from the same checkpoint id as the tokenizer.
model = BertModel.from_pretrained(model_id)

In [None]:
# Smoke test: embed one batch of sentence pairs and compare the cosine
# similarity of the pooled embeddings with the rescaled labels.

# Put the model on the same device as its inputs explicitly. Without this the
# cell only works if an earlier run of the training cell already moved the model.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model.to(device)

batch = next(iter(train_dataloader))
s1 = tokenizer(list(batch['sentence1']), return_tensors="pt", padding=True, truncation=True)
s2 = tokenizer(list(batch['sentence2']), return_tensors="pt", padding=True, truncation=True)

emb_s1 = model(**s1.to(device)).pooler_output
emb_s2 = model(**s2.to(device)).pooler_output

# Row-wise cosine similarity: one score per (sentence1, sentence2) pair.
similarity = torch.nn.functional.cosine_similarity(emb_s1, emb_s2)

# Rescale the labels with one tensor operation instead of a Python loop
# over the individual elements.
labels = (batch['label'] / 5.0).to(device=device, dtype=torch.float)

In [None]:
# Compare the rescaled labels with the computed similarities, and check the embedding shape.
labels, similarity, emb_s1.shape

(tensor([1.0000, 0.4400, 0.3600, 1.0000], device='cuda:0'),
 tensor([0.7295, 0.3117, 0.8007, 0.7553], device='cuda:0',
        grad_fn=<SumBackward1>),
 torch.Size([4, 768]))

## Fine-tuning

In [None]:
# Mount Google Drive at /content/drive; the training cell below writes its
# checkpoints under ./drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Print the working directory and list its contents.
!pwd
!ls

/content
sample_data


In [None]:
def train_loop(model, tokenizer, data_loader, output_dir,
               lr=2e-5, epochs=10, device=None, label_scale=5.0):
  """Fine-tune `model` so that pair-wise cosine similarity matches the labels.

  For every batch both sentence lists are tokenized and encoded separately,
  the cosine similarity of the two `pooler_output` embeddings is computed per
  pair, and the L1 distance to the rescaled labels is minimized with AdamW.
  The running loss is printed every 100 batches, the average loss after every
  epoch, and the model is saved after every epoch.

  Args:
    model: module whose call result exposes `pooler_output` and which provides
      `save_pretrained(path)`.
    tokenizer: callable that turns a list of sentences into model inputs; its
      result must support `.to(device)` and `**` unpacking.
    data_loader: iterable of batches with 'sentence1', 'sentence2' and 'label'.
    output_dir: directory that receives one `epoch_<n>` sub-directory per epoch.
    lr: learning rate for AdamW.
    epochs: number of passes over `data_loader`.
    device: device to train on; defaults to "cuda:0" when CUDA is available,
      otherwise "cpu".
    label_scale: divisor applied to the labels before computing the loss.

  Returns:
    None.
  """
  if device is None:
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

  # torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
  # weight_decay are pinned to the values transformers.AdamW used by default,
  # because the PyTorch defaults differ (1e-8 and 0.01).
  optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)
  loss_fn = torch.nn.L1Loss()

  model.to(device)
  model.train()

  def train_one_epoch():
    """Run one pass over `data_loader`; return the mean batch loss (NaN if empty)."""
    running_loss = 0.
    total_loss = 0.
    num_batches = 0
    for i, batch in enumerate(data_loader):
      s1 = tokenizer(list(batch['sentence1']), return_tensors="pt", padding=True, truncation=True)
      s2 = tokenizer(list(batch['sentence2']), return_tensors="pt", padding=True, truncation=True)

      optimizer.zero_grad()
      emb_s1 = model(**s1.to(device)).pooler_output
      emb_s2 = model(**s2.to(device)).pooler_output

      # One cosine score per (sentence1, sentence2) pair.
      similarity = torch.nn.functional.cosine_similarity(emb_s1, emb_s2)

      # Rescale all labels in a single tensor operation.
      labels = (torch.as_tensor(batch['label']) / label_scale).to(device=device, dtype=torch.float)

      # Plain regression loss: L1 distance between predicted similarity and label.
      loss = loss_fn(similarity, labels)

      loss.backward()
      optimizer.step()

      batch_loss = loss.item()
      running_loss += batch_loss
      total_loss += batch_loss
      num_batches += 1
      if i % 100 == 99:
        print('  batch {} loss: {}'.format(i + 1, running_loss / 100))
        running_loss = 0.

    # Count batches instead of calling len(data_loader): this works for
    # loaders without __len__ and avoids dividing by zero on an empty loader.
    return total_loss / num_batches if num_batches else float('nan')

  for epoch_idx in range(epochs):
    print(f'EPOCH {epoch_idx}:')

    epoch_avg_loss = train_one_epoch()
    print(f'   avg. loss={epoch_avg_loss}')

    output_dir_epoch = os.path.join(output_dir, f"epoch_{epoch_idx + 1}")
    model.save_pretrained(output_dir_epoch)


# Checkpoints are written below the mounted Drive folder, one sub-directory per epoch.
model_name = "{}-finetuned".format(model_id)
output_dir = './drive/MyDrive/ml_class_group_project/Lili/' + model_name

train_loop(model=model, tokenizer=tokenizer, data_loader=train_dataloader, output_dir=output_dir)




EPOCH 0:
  batch 100 loss: 0.38540428794920445
  batch 200 loss: 0.42578324377536775
  batch 300 loss: 0.3683255960047245
  batch 400 loss: 0.32705690547823907
  batch 500 loss: 0.3468583682179451
  batch 600 loss: 0.3300378703325987
  batch 700 loss: 0.3201250653713942
  batch 800 loss: 0.3124663881957531
  batch 900 loss: 0.30931282564997675
  batch 1000 loss: 0.2970354448258877
  batch 1100 loss: 0.31974761314690114
  batch 1200 loss: 0.2682490088790655
  batch 1300 loss: 0.282981647849083
  batch 1400 loss: 0.27470863699913023
   avg. loss=0.3256882147627762
EPOCH 1:
  batch 100 loss: 0.295395962074399
  batch 200 loss: 0.28367391012609006
  batch 300 loss: 0.29192663833498955
  batch 400 loss: 0.26591820411384104
  batch 500 loss: 0.27516653571277855
  batch 600 loss: 0.2593534374982119
  batch 700 loss: 0.28844262577593327
  batch 800 loss: 0.25565898314118385
  batch 900 loss: 0.2823531371355057
  batch 1000 loss: 0.2585633732378483
  batch 1100 loss: 0.2712113002687693
  batch 