# Finetuning a pretrained BERT model for IMDB dataset
## This notebook outlines the concepts behind finetuning an already existing pretrained BERT model for a Sentence Classification problem using IMDB dataset

### Import the Transformers library

In [1]:
! pip install transformers

### Import the datasets library

In [2]:
! pip install datasets

In [3]:
from datasets import load_dataset

### Load the IMDB dataset

In [4]:
# Load the IMDB dataset by name; the returned object holds all of its splits.
raw_datasets = load_dataset(path="imdb")

In [5]:
# Display the loaded dataset object to inspect its splits.
raw_datasets

The **raw_datasets** object is a dictionary with three keys, one for each split of the dataset:
- "train"
- "test"
- "unsupervised"

Use the "train" split for training and the "test" split for validation

### Import Tokenizer

In [6]:
from transformers import AutoTokenizer

### Create a BERT tokenizer
- Use **albert-base-v2** (ALBERT, a lighter BERT variant), which is the checkpoint loaded in the cell below

In [7]:
# Tokenizer for the "albert-base-v2" checkpoint. Any model fed with its output
# has to be loaded from the same checkpoint so the token ids line up.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="albert-base-v2",
)

### Preprocess the input data for the model
Ex: inputs = tokenizer(sentences, padding="max_length", truncation=True)

### tokenize_function

In [8]:
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: mapping with a "text" entry holding the raw review texts.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns when called with
        ``padding="max_length"`` and ``truncation=True``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

In [9]:
# Tokenize every split; batched=True passes a batch of examples to each call.
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

# Run the following cells only if you have enough GPU memory

### Generate a small version dataset (1000 samples) with sampling 

In [10]:
# Complete train / test splits.
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]

# 1000-example subsets, shuffled with a fixed seed so the selection is repeatable.
small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(1000))
small_eval_dataset = full_eval_dataset.shuffle(seed=42).select(range(1000))

In [11]:
# Sizes of the small and full train/eval datasets, in that order.
tuple(
    len(split)
    for split in (
        small_train_dataset,
        small_eval_dataset,
        full_train_dataset,
        full_eval_dataset,
    )
)

### Import the Model for Sequence Classification

In [12]:
from transformers import AutoModelForSequenceClassification

### Create the model from pretrained version
- Use **albert-base-v2**, the same checkpoint the tokenizer was created from
- Use **num_labels** as the number of target labels

In [13]:
# Sequence-classification model built from the "albert-base-v2" checkpoint,
# configured for two target labels.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="albert-base-v2",
    num_labels=2,
)

### Import the Trainer from transformers

In [14]:
from transformers import Trainer

### Import TrainingArguments

In [15]:
from transformers import TrainingArguments

### Specify TrainingArguments
- Destination to store checkpoints
- evaluation_strategy as **epoch**

In [16]:
# Only the output directory is set here; every other training argument
# (including the evaluation strategy) keeps its library default.
training_args = TrainingArguments(output_dir="test_trainer")

### Create a Trainer
- model
- training arguments
- train dataset
- eval dataset

In [17]:
# Trainer over the full train/test splits; no metric function is passed here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_eval_dataset,
)

### Train (Finetune) the model

In [18]:
# Start fine-tuning; the value returned by train() is shown as the cell output.
trainer.train()

### Import load_metric from datasets

In [19]:
import numpy as np
from datasets import load_metric

### Load the Accuracy metric

In [20]:
# Accuracy metric object; it is used by compute_metrics() during evaluation.
metric = load_metric(path="accuracy")

### compute_metrics() function

In [21]:
def compute_metrics(eval_pred):
    """Score a set of predictions with the notebook-level ``metric``.

    Args:
        eval_pred: a pair ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        ``logits`` compared against ``labels``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predicted_classes, references=labels)

### Trainer
- Pass compute_metrics to the Trainer so that evaluation reports accuracy

In [22]:
# Rebuild the Trainer around the same model object, this time supplying the
# metric function so evaluate() can report it.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=full_train_dataset,
    eval_dataset=full_eval_dataset,
)

### Evaluate the finetuned model

In [23]:
# Run evaluation on the Trainer's eval dataset; the value returned by
# evaluate() is shown as the cell output.
trainer.evaluate()

### Clear Cache if you run into Cuda Out of Memory issues

In [None]:
import torch
# Ask PyTorch to release the GPU memory it is caching but not currently using.
# NOTE(review): memory still referenced by live objects (e.g. a model that is
# still on the GPU) is presumably not freed by this call — confirm.
torch.cuda.empty_cache()

### Preprocessing for DataLoader

In [None]:
# Drop the raw text column, rename the target column to "labels", and switch
# the output format to "torch".
# NOTE(review): this cell rebinds tokenized_datasets, so re-running it likely
# fails because "text" has already been removed — confirm.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")

# 1000-example subsets of each split, shuffled with a fixed seed.
small_train_dataset = (
    tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
)
small_eval_dataset = (
    tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
)

### Create DataLoaders

In [None]:
from torch.utils.data import DataLoader

# Training batches are reshuffled; evaluation batches keep the dataset order.
train_dataloader = DataLoader(
    dataset=small_train_dataset,
    batch_size=2,
    shuffle=True,
)
eval_dataloader = DataLoader(
    dataset=small_eval_dataset,
    batch_size=2,
)

### Create the model

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint that matches the tokenizer: the datasets were encoded with
# the "albert-base-v2" tokenizer, so the model must use the same vocabulary.
# A "bert-base-cased" model would receive token ids that mean something else
# (or fall outside its embedding table).
model = AutoModelForSequenceClassification.from_pretrained("albert-base-v2", num_labels=2)

### Optimizer

In [None]:
# Use PyTorch's own AdamW: the implementation exported by transformers is
# deprecated and not available in recent releases.
from torch.optim import AdamW

# eps and weight_decay are pinned to the defaults of the transformers
# implementation (1e-6 and 0.0) so the optimisation settings stay the same.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

### Scheduler

In [None]:
from transformers import get_scheduler

num_epochs = 3
# Total number of batches processed over all epochs.
num_training_steps = len(train_dataloader) * num_epochs

# "linear" schedule with no warm-up steps, spanning the whole run.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

### Check if GPU is available and send the model to the corresponding device

In [None]:
import torch

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to that device; the returned model is shown as the cell output.
model.to(device)

In [None]:
# Show which device was selected.
device

### Training loop

In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Put the model in training mode before iterating over the batches.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch is passed as keyword arguments; the loss is read from the output.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the learning-rate schedule, then clear the
        # gradients so they do not accumulate into the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

### Evaluation on test dataset

In [None]:
from datasets import load_metric
# Fresh accuracy metric that is filled batch by batch below.
metric = load_metric(path="accuracy")

# Switch the model to evaluation mode before scoring.
model.eval()
for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    # No gradients are needed for the forward pass during evaluation.
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    # Predicted class = index of the largest logit along the last dimension.
    predictions = logits.argmax(dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

# Aggregate everything added above; the result is shown as the cell output.
metric.compute()