<a href="https://colab.research.google.com/github/sdeshmukh99/Generative-AI-Showcase/blob/main/Showcase_05/Fine_Tuning_BERT_for_Intent_Classification_on_User_Queries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1: Setup Steps

In [1]:
## 1.1 Import Google Drive
from google.colab import drive
drive.mount('/content/drive')

## 1.2 Install Required Libraries
!pip install transformers datasets

## 1.3 Download Dataset
!wget https://cdn.exec.talentsprint.com/static/aimlops/c3/Intent.json

## 1.4 Import Necessary Packages
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import time
import datetime
import random
from datasets import load_dataset, Dataset

## 1.5 Check for GPU Availability
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

print(device)  # Print the device type (CPU or GPU)

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:0

# 2: Data Preparation

In [2]:
## 2.1 Load Dataset from JSON File
with open('Intent.json') as f:
    data = json.load(f)

## 2.2 Exploratory Data Analysis
for i in data['intents']:
  print(i)

## 2.3 Convert JSON Data to DataFrame and Create a Copy
raw_dataset = pd.DataFrame(data['intents'][0:])
raw_dataset_base = raw_dataset.copy()

## 2.4 Drop Unnecessary Columns and Display Dataset
raw_dataset.drop(columns=['extension', 'context', 'entityType', 'entities', 'responses'], inplace=True)
raw_dataset.head(2)  # Display first few rows after dropping columns

## 2.5 Explode the 'text' Column
raw_dataset = raw_dataset.explode('text')

## 2.6 Encode Intent Labels to Numeric Values
encoder = LabelEncoder()
raw_dataset['intent']= encoder.fit_transform(raw_dataset['intent'])

## 2.7 Split the Data into Training, Validation, and Test Sets
train_val_df, test_df = train_test_split(raw_dataset, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42)

## 2.8 Convert DataFrames to Dataset Format and Display Lengths
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)
len(train_dataset), len(val_dataset), len(test_dataset)  # Display the lengths of train, validation, and test sets

{'intent': 'Greeting', 'text': ['Hi', 'Hi there', 'Hola', 'Hello', 'Hello there', 'Hya', 'Hya there'], 'responses': ['Hi human, please tell me your GeniSys user', 'Hello human, please tell me your GeniSys user', 'Hola human, please tell me your GeniSys user'], 'extension': {'function': '', 'entities': False, 'responses': []}, 'context': {'in': '', 'out': 'GreetingUserRequest', 'clear': False}, 'entityType': 'NA', 'entities': []}
{'intent': 'GreetingResponse', 'text': ['My user is Adam', 'This is Adam', 'I am Adam', 'It is Adam', 'My user is Bella', 'This is Bella', 'I am Bella', 'It is Bella'], 'responses': ['Great! Hi <HUMAN>! How can I help?', 'Good! Hi <HUMAN>, how can I help you?', 'Cool! Hello <HUMAN>, what can I do for you?', 'OK! Hola <HUMAN>, how can I help you?', 'OK! hi <HUMAN>, what can I do for you?'], 'extension': {'function': 'extensions.gHumans.updateHuman', 'entities': True, 'responses': ['Hi %%HUMAN%%! How can I help?', 'Hi %%HUMAN%%, how can I help you?', 'Hello %%HUM

(91, 23, 29)

# 3: Data Tokenization Steps

In [3]:
## 3.1 Load Pre-trained BERT Tokenizer
ckpt = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(ckpt)

## 3.2 Define Tokenization Function
def tokenize_function(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=512)

## 3.3 Tokenize Train, Validation, and Test Datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

## 3.4 Prepare the Tokenized Datasets for Training
def prepare_dataset(dataset):
  dataset = dataset.remove_columns(['text','__index_level_0__'])  # Remove unnecessary columns
  dataset = dataset.rename_column("intent", "labels")  # Rename intent column to labels for training
  dataset.set_format('torch')  # Set format to PyTorch for compatibility
  return dataset

# Apply Preparation to Tokenized Datasets
tokenized_train = prepare_dataset(tokenized_train)
tokenized_val = prepare_dataset(tokenized_val)
tokenized_test = prepare_dataset(tokenized_test)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



Map:   0%|          | 0/91 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

# 4: Model Loading and Training

In [4]:
## 4.1 Load a Pre-Trained BERT Model
model = (AutoModelForSequenceClassification.from_pretrained(
                        ckpt,
                        num_labels=22,  # Number of intent labels
                       ).to(device))

## 4.2 Define Performance Metrics for Evaluation
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  f1 = f1_score(labels, preds, average= 'weighted')
  acc = accuracy_score(labels, preds)
  return {'accuracy': acc, 'f1': f1}

## 4.3 Set Training Parameters
batch_size = 16
logging_steps = len(tokenized_train) // batch_size
model_name = f"{ckpt}-finetuned-intent"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=40,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  eval_strategy='epoch',
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=True,
                                  log_level='error')

## 4.4 Train the Model Using the Trainer API
trainer = Trainer(model=model,
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=tokenized_train,
                  eval_dataset=tokenized_val,
                  tokenizer=tokenizer,
                  callbacks=[EarlyStoppingCallback(early_stopping_patience=21)])

trainer.train()  # Start training the model

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,3.1166,3.048063,0.130435,0.110559
2,3.088,3.038755,0.043478,0.005797
3,3.0524,3.006458,0.173913,0.142857
4,3.0206,2.93633,0.434783,0.426087
5,2.8605,2.855461,0.521739,0.485507
6,2.7411,2.760912,0.608696,0.594203
7,2.7066,2.648484,0.608696,0.57971
8,2.5532,2.535587,0.652174,0.608696
9,2.4052,2.412622,0.695652,0.643892
10,2.1783,2.313383,0.73913,0.710145


TrainOutput(global_step=240, training_loss=1.445779659350713, metrics={'train_runtime': 201.463, 'train_samples_per_second': 18.068, 'train_steps_per_second': 1.191, 'total_flos': 482353311498240.0, 'train_loss': 1.445779659350713, 'epoch': 40.0})

# 5: Prediction and Response Steps


In [9]:
## 5.1 Define Function to Predict Intent for a Given Question
def predict_with_bert(sentence, trainer, tokenizer, encoder, device):
    inputs = tokenizer(sentence, padding=True, truncation=True, max_length=512, return_tensors="pt")  # Tokenize input
    dataset = Dataset.from_dict(inputs)  # Create dataset from input
    dataset = dataset.with_format("torch")  # Set format to PyTorch
    preds_output = trainer.predict(dataset)  # Get predictions from model
    preds = torch.tensor(preds_output.predictions).argmax(dim=-1)  # Get predicted label
    preds = preds.cpu().numpy()  # Move predictions to CPU
    predicted_label = encoder.inverse_transform(preds)[0]  # Convert numeric label back to original intent label
    return predicted_label  # Return the first (and only) prediction

## 5.2 Test the Prediction Function
sentence = 'What can I call you?'
intent = predict_with_bert(sentence, trainer, tokenizer,encoder,device)
print(f"Predicted intent: {intent}")

## 5.3 Define Function to Get a Response Based on Intent
def get_response(pred_intent, data):
    result = (data.loc[data['intent'] == pred_intent, 'responses']).explode()  # Find responses for the given intent
    for i in result:  # Get the first response from the list
      response = i
      break
    return response

## 5.4 Test the Response Function
intent = "RealNameQuery"
response = get_response(intent, raw_dataset_base)
print(response)

## 5.5 Integrate the above two. Define Function to Predict Intent and Provide a Response
def pred_intent_response(sentence, trainer, tokenizer, encoder, data , device):
  inputs = tokenizer(sentence, padding=True, truncation=True, max_length=512, return_tensors="pt")
  dataset = Dataset.from_dict(inputs)
  dataset = dataset.with_format("torch")
  preds_output = trainer.predict(dataset)
  preds = torch.tensor(preds_output.predictions).argmax(dim=-1)
  preds = preds.cpu().numpy()
  predicted_label = encoder.inverse_transform(preds)[0]
  result_response = (data.loc[data['intent'] == predicted_label, 'responses']).explode()
  for i in result_response:
    response = i
    break
  return predicted_label, response

## 5.6 Test the Integrated Prediction and Response Function
sentence = "can you see me ?"
pred_label, pred_response = pred_intent_response(sentence, trainer, tokenizer, encoder, raw_dataset_base, device)
print(f" predicted label : {pred_label} \n predicted response : {pred_response}")

Predicted intent: NameQuery
My name is GeniSys


 predicted label : NameQuery 
 predicted response : You can call me Geni
