In [4]:
import pandas as pd

import torch.nn as nn
from transformers import BertTokenizer, BertModel
from datasets import load_dataset

In [1]:
import os, sys
# Make the project sources importable (the statement that follows imports
# `transfer_learning.bert_plus`). The original machine-specific location is
# kept as the default; set the SATURN_SRC environment variable to point at a
# different checkout without editing this cell. An empty value falls back to
# the default instead of silently resolving to the current directory.
project_root = os.path.abspath(
    os.environ.get('SATURN_SRC') or '/Users/subhojit/workspace/saturn/src'
)
# Guarded append keeps the cell idempotent: re-running never adds a duplicate.
if project_root not in sys.path:
    sys.path.append(project_root)

from transfer_learning.bert_plus import *

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the 'imdb' dataset once and keep a handle on each of its two splits.
dataset = load_dataset('imdb')
train_dataset, test_dataset = (dataset[split_name] for split_name in ('train', 'test'))


In [6]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize(x, *, max_length=512):
    """Tokenize the ``"text"`` field of ``x`` with the module-level ``tokenizer``.

    Args:
        x: Mapping with a ``"text"`` entry (a single example or a batch of
            examples, as handed over by ``Dataset.map``).
        max_length: Length every sequence is padded to and truncated at.
            Keyword-only so extra positional arguments can never be mistaken
            for it. The default is the value that used to be hard-coded.

    Returns:
        Whatever the tokenizer call returns for the given text.
    """
    return tokenizer(x["text"], padding="max_length", truncation=True, max_length=max_length)
import torch.nn as nn


In [7]:
# Run the tokenizer over both splits in batched mode (train first, then test).
train_tokenized, test_tokenized = [
    split.map(tokenize, batched=True) for split in (train_dataset, test_dataset)
]

In [8]:
# Expose only the model inputs and the label, as torch tensors, on both splits.
# Each call receives its own fresh column list so the two datasets share no state.
for tokenized_split in (train_tokenized, test_tokenized):
    tokenized_split.set_format(
        type='torch',
        columns=['input_ids', 'attention_mask', 'label'],
    )

In [9]:
from torch.utils.data import DataLoader
import torch  # needed here for the seeded generator; `torch` itself is only imported in a later cell

batch_size = 64
shuffle_seed = 42
# A dedicated, explicitly seeded generator makes the shuffled batch order
# repeatable after a kernel restart instead of depending on global RNG state.
shuffle_generator = torch.Generator().manual_seed(shuffle_seed)
train_loader = DataLoader(
    train_tokenized,
    batch_size=batch_size,
    shuffle=True,
    generator=shuffle_generator,
)
# The test loader is not shuffled, so it walks the split in dataset order.
test_loader = DataLoader(test_tokenized, batch_size=batch_size)


In [10]:
# Peek at the first training batch and show which fields it carries.
# An empty loader prints nothing, exactly like a loop that never runs.
try:
    batch = next(iter(train_loader))
except StopIteration:
    pass
else:
    print(batch.keys())

dict_keys(['label', 'input_ids', 'attention_mask'])


In [11]:
# Size-related settings.
embedding_dim, hidden_size, output_size = 32, 64, 2
seq_len = 10

# Optimisation settings; `learning_rate` feeds the AdamW optimizer of the
# frozen-BERT training cell.
learning_rate = 1e-3
max_iter, eval_interval = 5000, 500

import torch
# Pick the compute backend by priority: MPS first, then CUDA, otherwise CPU.
# The chained conditional short-circuits, so CUDA is only probed when MPS is
# unavailable.
device = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

In [13]:
# 1-batch overfit: sanity check that repeatedly training on one fixed batch
# drives the loss down.
batch = next(iter(train_loader))
# The batch never changes, so move its tensors to the device once instead of
# repeating the host-to-device copy on every optimisation step.
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['label'].to(device)

model = SemiFrozenBERTClassifier().to(device)
# Per-group learning rates: 2e-4 for the classifier head, 1e-5 for encoder
# layers 11 and 10. Only these three parameter groups are optimised.
optimizer = torch.optim.AdamW([
    {'params': model.classifier.parameters(), 'lr': 2e-4},
    {'params': model.bert.encoder.layer[11].parameters(), 'lr': 1e-5},
    {'params': model.bert.encoder.layer[10].parameters(), 'lr': 1e-5},
])
criterion = nn.CrossEntropyLoss()

# Nothing in this loop switches the model out of training mode, so setting it
# once up front is enough.
model.train()
for step in range(100):
    logits = model(input_ids, attention_mask)
    loss = criterion(logits, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(loss.item())


0.6787136793136597
0.6700606346130371
0.6636359095573425
0.6454020738601685
0.6270042061805725
0.6250820755958557
0.6177058219909668
0.5975170135498047
0.5820800065994263
0.5659152269363403
0.550234854221344
0.526050865650177
0.5141038298606873
0.4885838031768799
0.4771472215652466
0.4631776213645935
0.4466305375099182
0.4283401370048523
0.40658828616142273
0.37958675622940063
0.35350340604782104
0.3469786047935486
0.32961970567703247
0.3212670087814331
0.28489410877227783
0.2770461440086365
0.2505478858947754
0.2250167429447174
0.21140539646148682
0.20824676752090454
0.19729341566562653
0.15676163136959076
0.15898293256759644
0.18108299374580383
0.11832742393016815
0.12122823297977448
0.10366817563772202
0.07910440117120743
0.10409163683652878
0.061882488429546356
0.058903373777866364
0.07141883671283722
0.05030558258295059
0.05123717337846756
0.035245999693870544
0.029277050867676735
0.034901637583971024
0.026213137432932854
0.04502221569418907
0.022775467485189438
0.0206679869443178

In [14]:
model = FrozenBERTClassifier().to(device)
# Only the classifier's parameters are handed to the optimizer.
optimizer = torch.optim.AdamW(model.classifier.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

num_epochs = 3   # full passes over train_loader
log_every = 100  # print the loss every `log_every` steps

model.train()

for epoch in range(num_epochs):
    # enumerate supplies the per-epoch step index, so there is no counter to
    # reset or increment by hand.
    for step, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % log_every == 0:
            print(f"Epoch: {epoch}, Step {step} Loss: {loss.item():.4f}")

Step 0 Loss: 0.6896
Step 100 Loss: 0.3358
Step 200 Loss: 0.2901
Step 300 Loss: 0.3526


In [17]:
from sklearn.metrics import accuracy_score

@torch.no_grad()
def compute_accuracy(model, dataloader, device=None):
    """Return the fraction of samples in ``dataloader`` that ``model`` classifies correctly.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning per-class scores with the classes on the last axis.
        dataloader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the inputs are moved to. Defaults to the device of the
            model's first parameter (CPU for a parameter-free model), so the
            function no longer depends on a notebook-level ``device`` global.

    Returns:
        Accuracy as a Python float in ``[0, 1]``, or ``nan`` when the
        dataloader yields no samples.

    Note:
        The model is switched to eval mode and left there. Call
        ``model.train()`` again before resuming training.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    n_correct = 0
    n_seen = 0

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        logits = model(input_ids, attention_mask)
        predictions = torch.argmax(logits, dim=-1).cpu()
        # Labels are only compared on the host, so they are never sent to the device.
        labels = batch['label'].cpu()
        n_correct += int((predictions == labels).sum())
        n_seen += labels.numel()

    if n_seen == 0:
        # Nothing was evaluated; report "undefined" rather than dividing by zero.
        return float("nan")
    return n_correct / n_seen

# Score the current `model` on the test loader; as the cell's last expression
# the returned accuracy is displayed.
compute_accuracy(model=model, dataloader=test_loader)

KeyboardInterrupt: 