# Fine tunning Bert Model

In [16]:
%pip install scikit-learn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Collecting joblib>=1.2.0
  Downloading joblib-1.5.1-py3-none-any.whl (307 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.7/307.7 KB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m
[?25hCollecting scipy>=1.8.0
  Downloading scipy-1.15.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.7/37.7 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: threadpoolctl, scipy, joblib, sc

In [27]:
%pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl (84 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 KB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.4
Note: you may need to restart the kernel to use updated packages.


## 1.Loading the dataset

In [3]:
from datasets import load_dataset

data = load_dataset("fancyzhx/amazon_polarity")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
data

DatasetDict({
    train: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 3600000
    })
    test: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 400000
    })
})

In [5]:
# label 1-> positive and 0-> negative
data["train"][0]

{'label': 1,
 'title': 'Stuning even for the non-gamer',
 'content': 'This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'}

In [7]:
import torch
print(torch.cuda.is_available())
device = "cuda" if torch.cuda.is_available() else "cpu"
device 

True


'cuda'

In [14]:
train_data = data["train"].shuffle(seed=42).select(range(20000))
test_data = data["test"].shuffle(seed=42).select(range(2000))

In [15]:
train_data,test_data

(Dataset({
     features: ['label', 'title', 'content'],
     num_rows: 20000
 }),
 Dataset({
     features: ['label', 'title', 'content'],
     num_rows: 2000
 }))

In [17]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

labels = data["train"]["label"]
class_weights = compute_class_weight("balanced", classes=np.unique(labels), y=labels)
print(class_weights)

[1. 1.]


## 2.Loading the tokenizer

In [None]:
from transformers import AutoTokenizer

# Initialize the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [10]:
text = data["train"][0]["content"]
tokens = tokenizer(text,padding="max_length",truncation=True,max_length=216)
print(f"Text:{text}")
print("="*30)
print(f"tokens:{tokens}")

Text:This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
tokens:{'input_ids': [101, 2023, 2614, 2650, 2001, 3376, 999, 2009, 23262, 1996, 12411, 7301, 1999, 2115, 2568, 2061, 2092, 1045, 2052, 28667, 8462, 4859, 2009, 2130, 2000, 2111, 2040, 5223, 6819, 2094, 1012, 2208, 2189, 999, 1045, 2031, 2209, 1996, 2208, 10381, 4948, 2080, 2892, 2021, 2041, 1997, 2035, 1997, 1996, 2399, 1045, 2031, 2412, 2209, 2009, 2038, 1996, 2190, 2189, 999, 2009, 10457, 2185, 2013, 13587, 9019, 2075, 1998, 3138, 1037, 4840, 2121, 3357, 2007, 24665, 3686, 7334, 1998, 3969, 3993, 19505, 1012, 2009, 2052, 17894, 3087, 2040, 14977, 2000, 4952, 999, 1034, 1035, 1034, 102, 0, 0

In [18]:
# Tokenizing the data 
def tokenizer_function(examples):
  return tokenizer(examples["content"],padding="max_length",truncation=True,max_length=256)

tokenized_train_data = train_data.map(tokenizer_function,batched=True)
tokenized_test_data = test_data.map(tokenizer_function,batched=True)

Map: 100%|██████████| 20000/20000 [00:03<00:00, 5012.92 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 5240.75 examples/s]


In [20]:
tokenized_train_data[0].keys()

dict_keys(['label', 'title', 'content', 'input_ids', 'token_type_ids', 'attention_mask'])

## 3.Loading the Model

In [None]:
from transformers import AutoModelForSequenceClassification

model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

print(model.config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.53.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



## 3.Freezing the layers

In [23]:
# Freeze all layers except the classifier
for param in model.bert.parameters():
    param.requires_grad = False

# Keep only the classification head trainable
for param in model.classifier.parameters():
    param.requires_grad = True

print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

Trainable parameters: 1538


## 4.Defining the training Arguments

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",           
    eval_strategy="epoch",     
    save_strategy="epoch",
    learning_rate=5e-5,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=16,
    num_train_epochs=3,             
    weight_decay=0.01,               
    save_total_limit=2,              
    load_best_model_at_end=True,     
    logging_dir="./logs",            
    logging_steps=100,               
    fp16=True                        
)

print(training_args)

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=epoch,
eval_use_gather_object=False,
fp16=True,
fp16

## 5.Defing the trainer

In [None]:
from transformers import Trainer
from evaluate import load


metric = load("accuracy") 


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script: 4.20kB [00:00, 8.94MB/s]


In [29]:
predictions = [0, 1, 1, 0, 1]
references =  [0, 1, 0, 0, 1]

results = metric.compute(predictions=predictions, references=references)

print(results)

{'accuracy': 0.8}


In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # for dynamic paadding

In [None]:
trainer = Trainer(
    model=model,                        
    args=training_args,                 
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_test_data,
    tokenizer=tokenizer,
    data_collator=data_collator,        
    compute_metrics=compute_metrics     
)

  trainer = Trainer(


In [32]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6515,0.6425,0.6745
2,0.6337,0.618662,0.7545
3,0.6181,0.610606,0.7455


TrainOutput(global_step=3750, training_loss=0.6449039916992187, metrics={'train_runtime': 497.4175, 'train_samples_per_second': 120.623, 'train_steps_per_second': 7.539, 'total_flos': 7893331660800000.0, 'train_loss': 0.6449039916992187, 'epoch': 3.0})

In [33]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6024,0.586796,0.723
2,0.5839,0.567042,0.776
3,0.5756,0.562973,0.77


TrainOutput(global_step=3750, training_loss=0.5919004740397136, metrics={'train_runtime': 501.8527, 'train_samples_per_second': 119.557, 'train_steps_per_second': 7.472, 'total_flos': 7893331660800000.0, 'train_loss': 0.5919004740397136, 'epoch': 3.0})

In [34]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.5606,0.54863,0.7515
2,0.5574,0.533319,0.7875
3,0.5493,0.530634,0.788


TrainOutput(global_step=3750, training_loss=0.557756825764974, metrics={'train_runtime': 478.7053, 'train_samples_per_second': 125.338, 'train_steps_per_second': 7.834, 'total_flos': 7893331660800000.0, 'train_loss': 0.557756825764974, 'epoch': 3.0})