In [1]:
# Fine Tuning Example Banking

In [2]:
from datasets import load_dataset, DatasetDict, Dataset
import datasets

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [3]:
# Get data

In [4]:
import pandas as pd

df = pd.read_parquet("hf://datasets/bitext/Bitext-retail-banking-llm-chatbot-training-dataset/bitext-retail-banking-llm-chatbot-training-dataset.parquet")
print (df.shape)

(25545, 5)


In [5]:
df.head(2)

Unnamed: 0,tags,instruction,category,intent,response
0,BCIPZ,"I would like to acivate a card, can you help me?",CARD,activate_card,I'm here to assist you with that! Activating y...
1,BCILZ,"I have to activate an Visa online, how can I d...",CARD,activate_card,I'm here to assist you with activating your {{...


In [6]:
print (df.columns)

Index(['tags', 'instruction', 'category', 'intent', 'response'], dtype='object')


In [7]:
df = df.rename({'instruction': 'text'}, axis=1)

In [8]:
print (df.columns)

Index(['tags', 'text', 'category', 'intent', 'response'], dtype='object')


In [9]:
df_new = df.groupby("category").apply(lambda x: x.sample(frac=0.1, random_state=42)).reset_index(drop=True)
columns_required = ['text', 'category']
df_new = df_new[df_new['category'].isin(['CARD', 'LOAN', 'TRANSFER', 'FEES', 'ACCOUNT', 'CONTACT', 'ATM'])][columns_required].reset_index()

  df_new = df.groupby("category").apply(lambda x: x.sample(frac=0.1, random_state=42)).reset_index(drop=True)


In [10]:
df_new['category'].unique()

array(['ACCOUNT', 'ATM', 'CARD', 'CONTACT', 'FEES', 'LOAN', 'TRANSFER'],
      dtype=object)

In [11]:
df_new.shape

(2184, 3)

In [12]:
# preprocess dataframe

from sklearn.preprocessing import LabelEncoder

labels = list(df_new['category'].unique())

label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

df_new["label"] = label_encoder.fit_transform(df_new["category"])

In [13]:
from sklearn.model_selection import train_test_split

X = ['text']
Y = ['label']

xtrain, xtest, ytrain, ytest = train_test_split(df_new[X], df_new[Y], test_size=0.2, stratify = df_new[Y], random_state=42)

xtrain['label'] = ytrain
xtest['label'] = ytest

xtrain = xtrain.reset_index()
xtest = xtest.reset_index()

xtrain = xtrain.drop('index', axis=1)
xtest = xtest.drop('index', axis=1)

train_dataset = Dataset.from_pandas(xtrain)
test_dataset = Dataset.from_pandas(xtest)


dataset = datasets.DatasetDict({"train":train_dataset,"validation":test_dataset})

In [14]:
# dataset = Dataset.from_pandas(df_new)

In [15]:
xtrain

Unnamed: 0,text,label
0,need to see the mortgage payments,5
1,"I have to dispute a transaction, can I get som...",1
2,i want help to activate an visa on mobile,2
3,"I would like to cancel a Master Card online, I...",2
4,can you help me to apply for a mortgage?,5
...,...,...
1742,can uhelp me to take out a loan,5
1743,wanma see my account fees can i get some help,4
1744,my master card has been stolen can i lock it,2
1745,"I have to cancel a account, how can I do it?",0


In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1747
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 437
    })
})

In [17]:
model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base' # you can alternatively use roberta-base but this model is bigger thus training will take longer

# define label maps
id2label = {0: "ACCOUNT", 1: "ATM", 2:"CARD", 3:"CONTACT", 4:"FEES", 5:"LOAN", 6:"TRANSFER"}
label2id = {"ACCOUNT":0, "ATM":1, "CARD":2, "CONTACT":3, "FEES":4, "LOAN":5, "TRANSFER":6}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=7, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [19]:
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [20]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset


Map:   0%|          | 0/1747 [00:00<?, ? examples/s]

Map:   0%|          | 0/437 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1747
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 437
    })
})

In [21]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [22]:
accuracy = evaluate.load("accuracy")

In [23]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

In [24]:
text_list = ["What is the eligibility criteria for a home loan", "How to apply for credit card", "What is the process of getting loan approved.", "Can I get a loan", "I want to transfer money"]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
What is the eligibility criteria for a home loan - CARD
How to apply for credit card - FEES
What is the process of getting loan approved. - CARD
Can I get a loan - FEES
I want to transfer money - CARD


In [25]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['q_lin'])

In [26]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 632,839 || all params: 67,591,694 || trainable%: 0.9363


In [27]:
lr = 1e-3
batch_size = 4
num_epochs = 5

In [28]:
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [29]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.057838,{'accuracy': 0.9816933638443935}
2,0.243300,0.046114,{'accuracy': 0.9839816933638444}
3,0.039600,0.054895,{'accuracy': 0.9839816933638444}
4,0.020500,0.046321,{'accuracy': 0.9839816933638444}
5,0.008400,0.038865,{'accuracy': 0.9862700228832952}


TrainOutput(global_step=2185, training_loss=0.07174713663432909, metrics={'train_runtime': 133.6958, 'train_samples_per_second': 65.335, 'train_steps_per_second': 16.343, 'total_flos': 42235168207056.0, 'train_loss': 0.07174713663432909, 'epoch': 5.0})

In [30]:
model.to('mps') # moving to mps for Mac (can alternatively do 'cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("mps") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
What is the eligibility criteria for a home loan - LOAN
How to apply for credit card - CARD
What is the process of getting loan approved. - LOAN
Can I get a loan - LOAN
I want to transfer money - TRANSFER


In [36]:
model.save_pretrained("bank_fineuned_model.pt")
# tokenizer.save_pretrained()

In [37]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModel

# Define paths
MODEL_CHECKPOINT = 'distilbert-base-uncased'
SAVED_MODEL_PATH = "bank_fineuned_model.pt"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Load base model
base_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=7  # Adjust this based on your training labels
)

# Load LoRA fine-tuned model
model = PeftModel.from_pretrained(base_model, SAVED_MODEL_PATH)

# Move to device (CPU/GPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print("LoRA Fine-tuned Model Loaded Successfully!")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LoRA Fine-tuned Model Loaded Successfully!


In [39]:
def predict(model, tokenizer, id2label, text_list, device="cuda" if torch.cuda.is_available() else "cpu"):
    model.to(device)
    model.eval()  # Set the model to evaluation mode

    print("Trained Model Predictions:")
    print("--------------------------")
    
    for text in text_list:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}  # Move input tensors to the correct device
        
        with torch.no_grad():  # Disable gradient computation for inference
            logits = model(**inputs).logits
        
        predictions = torch.argmax(logits, dim=1).item()
        print(f"{text} - {id2label[predictions]}")

In [38]:
text_samples = [
    "What is the eligibility criteria for a home loan",
    "How to apply for a credit card",
    "What is the process of getting a loan approved.",
    "Can I get a loan",
    "I want to transfer money"
]

predict(model, tokenizer, id2label, text_samples, device)


NameError: name 'predict' is not defined