In [1]:
from datasets import load_dataset, DatasetDict, Dataset
import datasets

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [2]:
import pandas as pd

df = pd.read_parquet("hf://datasets/bitext/Bitext-retail-banking-llm-chatbot-training-dataset/bitext-retail-banking-llm-chatbot-training-dataset.parquet")
print (df.shape)

(25545, 5)


In [3]:
df.head(2)

Unnamed: 0,tags,instruction,category,intent,response
0,BCIPZ,"I would like to acivate a card, can you help me?",CARD,activate_card,I'm here to assist you with that! Activating y...
1,BCILZ,"I have to activate an Visa online, how can I d...",CARD,activate_card,I'm here to assist you with activating your {{...


In [4]:
print (df.columns)

Index(['tags', 'instruction', 'category', 'intent', 'response'], dtype='object')


In [5]:
df = df.rename({'instruction': 'text'}, axis=1)

In [6]:
print (df.columns)

Index(['tags', 'text', 'category', 'intent', 'response'], dtype='object')


In [15]:
df_new = df.groupby("category").apply(lambda x: x.sample(frac=0.1, random_state=42)).reset_index(drop=True)
columns_required = ['text', 'category']
df_new = df_new[df_new['category'].isin(['CARD', 'LOAN', 'TRANSFER', 'FEES', 'ACCOUNT', 'CONTACT', 'ATM'])][columns_required].reset_index()

  df_new = df.groupby("category").apply(lambda x: x.sample(frac=0.1, random_state=42)).reset_index(drop=True)


In [16]:
df_new['category'].unique()

array(['ACCOUNT', 'ATM', 'CARD', 'CONTACT', 'LOAN', 'TRANSFER'],
      dtype=object)

In [17]:
df_new["category"].value_counts()

category
CARD        598
LOAN        595
ACCOUNT     299
CONTACT     200
TRANSFER    199
ATM         198
Name: count, dtype: int64

In [18]:
# preprocess dataframe

from sklearn.preprocessing import LabelEncoder

labels = list(df_new['category'].unique())

label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

df_new["label"] = label_encoder.fit_transform(df_new["category"])

In [19]:
from sklearn.model_selection import train_test_split

X = ['text']
Y = ['label']

xtrain, xtest, ytrain, ytest = train_test_split(df_new[X], df_new[Y], test_size=0.2, stratify = df_new[Y], random_state=42)

xtrain['label'] = ytrain
xtest['label'] = ytest

xtrain = xtrain.reset_index()
xtest = xtest.reset_index()

xtrain = xtrain.drop('index', axis=1)
xtest = xtest.drop('index', axis=1)

train_dataset = Dataset.from_pandas(xtrain)
test_dataset = Dataset.from_pandas(xtest)


dataset = datasets.DatasetDict({"train":train_dataset,"validation":test_dataset})

In [20]:
# dataset = Dataset.from_pandas(df_new)

In [21]:
xtrain

Unnamed: 0,text,label
0,id like to activate a master card where could ...,2
1,i got to check the <fucking> current balance o...,0
2,i got to check the loan payments hbelp me,4
3,"I want a loan, can you help me to applu for it?",4
4,i have made a mistakeand i transferred money t...,5
...,...,...
1666,"I don't want myt Visa, I need help cancelling it",2
1667,help me to check the loan paymenys,4
1668,need to ativate a credit card for internationa...,2
1669,I'd like to dispute an atm transaction help me,1


In [22]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1671
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 418
    })
})

In [23]:
model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base' # you can alternatively use roberta-base but this model is bigger thus training will take longer

# define label maps
id2label = {0: "ACCOUNT", 1: "ATM", 2:"CARD", 3:"CONTACT", 4:"FEES", 5:"LOAN", 6:"TRANSFER"}
label2id = {"ACCOUNT":0, "ATM":1, "CARD":2, "CONTACT":3, "FEES":4, "LOAN":5, "TRANSFER":6}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=7, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [25]:
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [26]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset


Map:   0%|          | 0/1671 [00:00<?, ? examples/s]

Map:   0%|          | 0/418 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1671
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 418
    })
})

In [27]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [28]:
accuracy = evaluate.load("accuracy")

In [29]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

In [30]:
text_list = ["What is the eligibility criteria for a home loan", "How to apply for credit card", "What is the process of getting loan approved.", "Can I get a loan", "I want to transfer money"]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
What is the eligibility criteria for a home loan - FEES
How to apply for credit card - CARD
What is the process of getting loan approved. - CONTACT
Can I get a loan - FEES
I want to transfer money - FEES


In [31]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['q_lin'])

In [32]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 632,839 || all params: 67,591,694 || trainable%: 0.9363


In [33]:
lr = 1e-3
batch_size = 4
num_epochs = 5

In [34]:
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [35]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.02141,{'accuracy': 0.9904306220095693}
2,0.215300,0.040021,{'accuracy': 0.9904306220095693}
3,0.022700,0.0217,{'accuracy': 0.992822966507177}
4,0.000600,0.023622,{'accuracy': 0.992822966507177}
5,0.001800,0.023737,{'accuracy': 0.992822966507177}


TrainOutput(global_step=2090, training_loss=0.05755511112154671, metrics={'train_runtime': 77.7786, 'train_samples_per_second': 107.42, 'train_steps_per_second': 26.871, 'total_flos': 40707153443616.0, 'train_loss': 0.05755511112154671, 'epoch': 5.0})

In [38]:
model.to('mps') # moving to mps for Mac (can alternatively do 'cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("mps") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
What is the eligibility criteria for a home loan - FEES
How to apply for credit card - CARD
What is the process of getting loan approved. - FEES
Can I get a loan - FEES
I want to transfer money - LOAN


In [32]:
model.save_pretrained("bank_fineuned_model.pt")

In [34]:
# from transformers import AutoTokenizer,AutoModelForCausalLM

model_finetuned = AutoModelForSequenceClassification.from_pretrained("bank_fineuned_model.pt")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: Error(s) in loading state_dict for DistilBertForSequenceClassification:
	size mismatch for classifier.modules_to_save.default.weight: copying a param with shape torch.Size([7, 768]) from checkpoint, the shape in current model is torch.Size([2, 768]).
	size mismatch for classifier.modules_to_save.default.bias: copying a param with shape torch.Size([7]) from checkpoint, the shape in current model is torch.Size([2]).