## Import necessary dependencies

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch

## Load the dataset

In [5]:
splits = {'train': 'train.jsonl', 'test': 'test.jsonl'}

train_df = pd.read_json("hf://datasets/mteb/banking77/" + splits["train"], lines=True)

test_df = pd.read_json("hf://datasets/mteb/banking77/" + splits["test"], lines=True)

In [6]:
train_df.head()

Unnamed: 0,text,label,label_text
0,I am still waiting on my card?,11,card_arrival
1,What can I do if my card still hasn't arrived ...,11,card_arrival
2,I have been waiting over a week. Is the card s...,11,card_arrival
3,Can I track my card while it is in the process...,11,card_arrival
4,"How do I know if I will get my card, or if it ...",11,card_arrival


In [7]:
test_df.head()

Unnamed: 0,text,label,label_text
0,How do I locate my card?,11,card_arrival
1,"I still have not received my new card, I order...",11,card_arrival
2,I ordered a card but it has not arrived. Help ...,11,card_arrival
3,Is there a way to know when my card will arrive?,11,card_arrival
4,My card has not arrived yet.,11,card_arrival


In [8]:
print("No. of rows in train set: ", len(train_df))
print("No. of rows in test set: ", len(test_df))

No. of rows in train set:  10003
No. of rows in test set:  3080


In [9]:
# no. of unique labels (intents)
train_df.nunique()

text          10003
label            77
label_text       77
dtype: int64

## Data Preprocessing

#### Analyze and Select Key Intents

We will focus on the top 10 most common label_text (intent) instead of all 77. Working with a smaller subset can make the training process faster and more manageable.

In [10]:
# Select the top 20 most common intents
top_intents = train_df['label_text'].value_counts().head(20).index
print("Top 20 Intents:", top_intents)

Top 20 Intents: Index(['card_payment_fee_charged', 'direct_debit_payment_not_recognised',
       'balance_not_updated_after_cheque_or_cash_deposit',
       'wrong_amount_of_cash_received', 'cash_withdrawal_charge',
       'transaction_charged_twice', 'declined_cash_withdrawal',
       'transfer_fee_charged', 'transfer_not_received_by_recipient',
       'balance_not_updated_after_bank_transfer', 'request_refund',
       'card_payment_not_recognised', 'card_payment_wrong_exchange_rate',
       'extra_charge_on_statement', 'wrong_exchange_rate_for_cash_withdrawal',
       'Refund_not_showing_up', 'reverted_card_payment?',
       'cash_withdrawal_not_recognised', 'activate_my_card',
       'pending_card_payment'],
      dtype='object', name='label_text')


In [11]:
# Add 'card_arrival' and 'lost_or_stolen_card' to the list of intents if they're not already included
top_intents = top_intents.append(pd.Index(['card_arrival', 'lost_or_stolen_card']))
print("Updated Top Intents:", top_intents)

Updated Top Intents: Index(['card_payment_fee_charged', 'direct_debit_payment_not_recognised',
       'balance_not_updated_after_cheque_or_cash_deposit',
       'wrong_amount_of_cash_received', 'cash_withdrawal_charge',
       'transaction_charged_twice', 'declined_cash_withdrawal',
       'transfer_fee_charged', 'transfer_not_received_by_recipient',
       'balance_not_updated_after_bank_transfer', 'request_refund',
       'card_payment_not_recognised', 'card_payment_wrong_exchange_rate',
       'extra_charge_on_statement', 'wrong_exchange_rate_for_cash_withdrawal',
       'Refund_not_showing_up', 'reverted_card_payment?',
       'cash_withdrawal_not_recognised', 'activate_my_card',
       'pending_card_payment', 'card_arrival', 'lost_or_stolen_card'],
      dtype='object')


In [12]:
# Filter data for updated top intents
train_df = train_df[train_df['label_text'].isin(top_intents)]
test_df = test_df[test_df['label_text'].isin(top_intents)]

In [16]:
train_df.tail()

Unnamed: 0,text,label,label_text
9510,I used my card to get cash then I was charged ...,19,cash_withdrawal_charge
9511,I noticed a different charge while I was takin...,19,cash_withdrawal_charge
9512,Why was I charged a fee for withdrawing money?,19,cash_withdrawal_charge
9513,Do cash withdrawals cost anything?,19,cash_withdrawal_charge
9514,Why is there a fee for getting cash?,19,cash_withdrawal_charge


In [17]:
test_df.head()

Unnamed: 0,text,label,label_text
0,How do I locate my card?,11,card_arrival
1,"I still have not received my new card, I order...",11,card_arrival
2,I ordered a card but it has not arrived. Help ...,11,card_arrival
3,Is there a way to know when my card will arrive?,11,card_arrival
4,My card has not arrived yet.,11,card_arrival


In [18]:
# Predefined responses for each responses
responses = {
    "card_payment_fee_charged": "Please check your account statement for details on card payment fees.",
    "direct_debit_payment_not_recognised": "If a direct debit payment isn't recognized, please contact our support team for assistance.",
    "balance_not_updated_after_cheque_or_cash_deposit": "Balance updates may take a few hours. If it's still not updated, please contact us.",
    "wrong_amount_of_cash_received": "If you received the wrong amount of cash, please report it to the nearest branch immediately.",
    "cash_withdrawal_charge": "Cash withdrawal charges are detailed in your account terms. Please review them for more information.",
    "transaction_charged_twice": "If you notice a duplicate charge, please reach out to customer support for a refund.",
    "declined_cash_withdrawal": "A declined cash withdrawal may be due to insufficient funds or account restrictions. Check with us for details.",
    "transfer_fee_charged": "Transfer fees are applied as per the terms of your account. For details, refer to your account agreement.",
    "balance_not_updated_after_bank_transfer": "Transfers may take a few hours to reflect in your balance. Contact us if the issue persists.",
    "request_refund": "To request a refund, please fill out the refund request form on our website or contact our support team.",
    "card_arrival": "Your card will arrive within the expected delivery timeframe. If it does not arrive, please contact us.",
    "lost_or_stolen_card": "If your card is lost or stolen, please report it to our support team immediately to prevent unauthorized use.",
    "card_payment_not_recognised": "If a card payment is not recognized, please contact our support team for assistance.",
    "card_payment_wrong_exchange_rate": "If there's an issue with the exchange rate on your card payment, please reach out to customer support.",
    "extra_charge_on_statement": "If you see an extra charge on your statement, please contact our support team for clarification.",
    "wrong_exchange_rate_for_cash_withdrawal": "If you encounter a wrong exchange rate for a cash withdrawal, please contact support for assistance.",
    "refund_not_showing_up": "If a refund is not showing up, please contact our support team to check the status of your refund.",
    "reverted_card_payment?": "If your card payment was reverted, please contact support for details on why it happened.",
    "cash_withdrawal_not_recognised": "If a cash withdrawal is not recognized, please reach out to our support team for assistance.",
    "activate_my_card": "To activate your card, please follow the instructions provided in your card package or contact our support team.",
    "pending_card_payment": "If your card payment is pending, it may take a few hours to process. Contact support if the issue persists."
}

In [19]:
# Add responses to the datasets
def add_response(row):
    return responses.get(row['label_text'], "Cannot answer this question. Please contact support.")

train_df['response'] = train_df.apply(add_response, axis=1)
test_df['response'] = test_df.apply(add_response, axis=1)

In [20]:
train_df.head()

Unnamed: 0,text,label,label_text,response
0,I am still waiting on my card?,11,card_arrival,Your card will arrive within the expected deli...
1,What can I do if my card still hasn't arrived ...,11,card_arrival,Your card will arrive within the expected deli...
2,I have been waiting over a week. Is the card s...,11,card_arrival,Your card will arrive within the expected deli...
3,Can I track my card while it is in the process...,11,card_arrival,Your card will arrive within the expected deli...
4,"How do I know if I will get my card, or if it ...",11,card_arrival,Your card will arrive within the expected deli...


In [21]:
test_df.tail()

Unnamed: 0,text,label,label_text,response
2915,Was charged an ATM fee despite it being a smal...,19,cash_withdrawal_charge,Cash withdrawal charges are detailed in your a...
2916,I had a wrong fee charged at this ATM.,19,cash_withdrawal_charge,Cash withdrawal charges are detailed in your a...
2917,"When I got cash, I'm pretty sure there was a g...",19,cash_withdrawal_charge,Cash withdrawal charges are detailed in your a...
2918,"My account got charged for taking out cash, wh...",19,cash_withdrawal_charge,Cash withdrawal charges are detailed in your a...
2919,Why is there a fee for withdrawing?,19,cash_withdrawal_charge,Cash withdrawal charges are detailed in your a...


#### Preprocess the data

In [22]:
def preprocess_data(df):
    inputs = []
    targets = []
    for _, row in df.iterrows():
        inputs.append(row['text'])
        targets.append(f"{row['response']}") 
    return inputs, targets

train_inputs, train_targets = preprocess_data(train_df)
test_inputs, test_targets = preprocess_data(test_df)

In [24]:
train_inputs[:3]

['I am still waiting on my card?',
 "What can I do if my card still hasn't arrived after 2 weeks?",
 'I have been waiting over a week. Is the card still coming?']

In [25]:
train_targets[:3]

['Your card will arrive within the expected delivery timeframe. If it does not arrive, please contact us.',
 'Your card will arrive within the expected delivery timeframe. If it does not arrive, please contact us.',
 'Your card will arrive within the expected delivery timeframe. If it does not arrive, please contact us.']

In [26]:
# Further split the train dataset into train and validation sets
train_inputs, val_inputs, train_targets, val_targets = train_test_split(train_inputs, train_targets, test_size=0.1)

## Fine-tune the T5 Model

In [27]:
# Initialize the T5 tokenizer and model

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [28]:
# Tokenize the data
def tokenize_data(inputs, targets):
    encodings = tokenizer(inputs, truncation=True, padding=True, max_length=128)
    decodings = tokenizer(targets, truncation=True, padding=True, max_length=128)
    return encodings, decodings

train_encodings, train_decodings = tokenize_data(train_inputs, train_targets)
val_encodings, val_decodings = tokenize_data(val_inputs, val_targets)

In [29]:
class ChatDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, decodings):
        self.encodings = encodings
        self.decodings = decodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.decodings['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = ChatDataset(train_encodings, train_decodings)
val_dataset = ChatDataset(val_encodings, val_decodings)

In [30]:
# Fine-tune the T5 model
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir='./results',
    num_train_epochs=3,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

 41%|████      | 500/1230 [35:05<46:25,  3.82s/it]  

{'loss': 1.2917, 'grad_norm': 2.5409693717956543, 'learning_rate': 2.9674796747967482e-05, 'epoch': 1.22}


 81%|████████▏ | 1000/1230 [1:16:49<21:01,  5.48s/it]

{'loss': 0.2428, 'grad_norm': 0.7909019589424133, 'learning_rate': 9.34959349593496e-06, 'epoch': 2.44}


100%|██████████| 1230/1230 [1:39:30<00:00,  4.85s/it]

{'train_runtime': 5970.0476, 'train_samples_per_second': 1.645, 'train_steps_per_second': 0.206, 'train_loss': 0.6580180253439802, 'epoch': 3.0}





TrainOutput(global_step=1230, training_loss=0.6580180253439802, metrics={'train_runtime': 5970.0476, 'train_samples_per_second': 1.645, 'train_steps_per_second': 0.206, 'total_flos': 262230555820032.0, 'train_loss': 0.6580180253439802, 'epoch': 3.0})

In [33]:
model.save_pretrained("./t5-main-chatbot-model")
tokenizer.save_pretrained("./t5-main-chatbot-model")

('./t5-main-chatbot-model\\tokenizer_config.json',
 './t5-main-chatbot-model\\special_tokens_map.json',
 './t5-main-chatbot-model\\spiece.model',
 './t5-main-chatbot-model\\added_tokens.json')

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./t5-main-chatbot-model")
tokenizer = T5Tokenizer.from_pretrained("./t5-main-chatbot-model")

# Define the list of valid intents
valid_intents = [
    'card_payment_fee_charged', 'direct_debit_payment_not_recognised',
    'balance_not_updated_after_cheque_or_cash_deposit', 'wrong_amount_of_cash_received',
    'cash_withdrawal_charge', 'transaction_charged_twice', 'declined_cash_withdrawal',
    'transfer_fee_charged', 'transfer_not_received_by_recipient',
    'balance_not_updated_after_bank_deposit', 'request_refund', 'card_payment_not_recognised',
    'card_payment_wrong_exchange_rate', 'extra_charge_on_statement',
    'wrong_exchange_rate_for_cash_withdrawal', 'Refund_not_showing_up',
    'reverted_card_payment?', 'cash_withdrawal_not_recognised',
    'activate_my_card', 'pending_card_payment', 'card_arrival', 'lost_or_stolen_card'
]

def generate_response(text):
    input_ids = tokenizer.encode(f"Intent: {text}", return_tensors='pt')
    output = model.generate(input_ids, max_new_tokens=50)
    response = tokenizer.decode(output[0], skip_special_tokens=True).strip()

    # Debugging output to understand what the model is generating
    print(f"Raw model output: {response}")

    # Check if the response contains the separator "|||"
    if "|||" in response:
        response_text = response.split("||| ")[1].strip()
        
        # Check if the response text starts with any valid intent
        if any(intent in response for intent in valid_intents):
            return response_text
        else:
            return "Cannot answer that question. Please contact support."
    else:
        return "Cannot answer that question. Please contact support."

test_examples = [
    "I need to check my card payment fee.",
    "Why is my direct debit not recognized?",
    "My balance isn't updated after a deposit.",
    "I received the wrong amount of cash.",
    "There is a charge for cash withdrawal.",
    "I ordered my card one month ago but I still haven't obtained it.",
    "who is the goat?",
]

for example in test_examples:
    print(f"Input: {example}")
    print(f"Response: {generate_response(example)}")


  from .autonotebook import tqdm as notebook_tqdm


Input: I need to check my card payment fee.
Raw model output: card_payment_fee_charged ||| If you notice a duplicate charge, please reach out to customer support for a refund.
Response: If you notice a duplicate charge, please reach out to customer support for a refund.
Input: Why is my direct debit not recognized?
Raw model output: direct_debit_payment_not_recognised ||| If a direct debit payment isn't recognized, please contact our support team for assistance.
Response: If a direct debit payment isn't recognized, please contact our support team for assistance.
Input: My balance isn't updated after a deposit.
Raw model output: balance_not_updated_after_bank_deposit ||| Balance updates may take a few hours. If it's still not updated, please contact us.
Response: Balance updates may take a few hours. If it's still not updated, please contact us.
Input: I received the wrong amount of cash.
Raw model output: wrong_amount_of_cash_received ||| If you received the wrong amount of cash, pleas

In [None]:
# Top 20 Intents: Index(['card_payment_fee_charged', 'direct_debit_payment_not_recognised',
#        'balance_not_updated_after_cheque_or_cash_deposit',
#        'wrong_amount_of_cash_received', 'cash_withdrawal_charge',
#        'transaction_charged_twice', 'declined_cash_withdrawal',
#        'transfer_fee_charged', 'transfer_not_received_by_recipient',
#        'balance_not_updated_after_bank_transfer', 'request_refund',
#        'card_payment_not_recognised', 'card_payment_wrong_exchange_rate',
#        'extra_charge_on_statement', 'wrong_exchange_rate_for_cash_withdrawal',
#        'Refund_not_showing_up', 'reverted_card_payment?',
#        'cash_withdrawal_not_recognised', 'activate_my_card',
#        'pending_card_payment', 'card_arrival', 'lost_or_stolen_card'],

In [19]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./t5-main-chatbot-model")
tokenizer = T5Tokenizer.from_pretrained("./t5-main-chatbot-model")

# Define the list of valid intents
valid_intents = [
    'card_payment_fee_charged', 'direct_debit_payment_not_recognised',
    'balance_not_updated_after_cheque_or_cash_deposit', 'wrong_amount_of_cash_received',
    'cash_withdrawal_charge', 'transaction_charged_twice', 'declined_cash_withdrawal',
    'transfer_fee_charged', 'transfer_not_received_by_recipient',
    'balance_not_updated_after_bank_deposit', 'request_refund', 'card_payment_not_recognised',
    'card_payment_wrong_exchange_rate', 'extra_charge_on_statement',
    'wrong_exchange_rate_for_cash_withdrawal', 'Refund_not_showing_up',
    'reverted_card_payment?', 'cash_withdrawal_not_recognised',
    'activate_my_card', 'pending_card_payment', 'card_arrival', 'lost_or_stolen_card'
]

def generate_response(text):
    input_ids = tokenizer.encode(f"Intent: {text}", return_tensors='pt')
    output = model.generate(input_ids, max_new_tokens=50)
    response = tokenizer.decode(output[0], skip_special_tokens=True).strip()

    # Check if the response contains the separator "|||"
    if "|||" in response:
        # Extract intent and response text
        intent, response_text = response.split("||| ")
        intent = intent.strip()

        # Check if the intent is valid
        if any(valid_intent in intent for valid_intent in valid_intents):
            return f"Input: {text}\nIntent: {intent}\nResponse: {response_text}"
        else:
            return f"Input: {text}\nResponse: Cannot answer that question. Please contact support."
    else:
        return f"Input: {text}\nResponse: Cannot answer that question. Please contact support."

# Test examples
test_examples = [
    "I need to check my card payment fee.",
    "Why is my direct debit not recognized?",
    "My balance isn't updated after a deposit.",
    "I received the wrong amount of cash.",
    "There is a charge for cash withdrawal.",
    "I ordered my card one month ago but I still haven't obtained it.",
    "what is python?",
    "I think my card is lost."
]

for example in test_examples:
    print(generate_response(example))
    print("\n" + "="*50 + "\n")


Input: I need to check my card payment fee.
Intent: card_payment_fee_charged
Response: If you notice a duplicate charge, please reach out to customer support for a refund.


Input: Why is my direct debit not recognized?
Intent: direct_debit_payment_not_recognised
Response: If a direct debit payment isn't recognized, please contact our support team for assistance.


Input: My balance isn't updated after a deposit.
Intent: balance_not_updated_after_bank_deposit
Response: Balance updates may take a few hours. If it's still not updated, please contact us.


Input: I received the wrong amount of cash.
Intent: wrong_amount_of_cash_received
Response: If you received the wrong amount of cash, please report it to the nearest branch immediately.


Input: There is a charge for cash withdrawal.
Intent: cash_withdrawal_charge
Response: Cash withdrawal charges are detailed in your account terms. Please review them for more information.


Input: I ordered my card one month ago but I still haven't obt

## Audio Input Processing

In [1]:
import pyttsx3
import speech_recognition as sr
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./t5-main-chatbot-model")
tokenizer = T5Tokenizer.from_pretrained("./t5-main-chatbot-model")

# Initialize text-to-speech engine
engine = pyttsx3.init()
rate = engine.getProperty('rate')
engine.setProperty('rate', rate - 35 )

def speak(text):
    engine.say(text)
    engine.runAndWait()

def listen():
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print("Listening...")
        audio = recognizer.listen(source)
    
    try:
        user_input = recognizer.recognize_google(audio)
        print(f"User: {user_input}")
    except sr.UnknownValueError:
        print("Sorry, I didn't understand that.")
        return ""
    except sr.RequestError:
        print("Sorry, I'm having trouble with my speech recognition service.")
        return ""
    
    return user_input

def generate_response(text):
    input_ids = tokenizer.encode(f"Intent: {text}", return_tensors='pt')
    output = model.generate(input_ids, max_new_tokens=50)
    response = tokenizer.decode(output[0], skip_special_tokens=True).strip()

    # Check if the response contains the separator "|||"
    if "|||" in response:
        # Extract intent and response text
        intent, response_text = response.split("||| ")
        intent = intent.strip()

        # Return formatted response
        return response_text
    else:
        return "Cannot answer that question. Please contact support."

while True:
    # Listen for user input
    user_input = listen()
    
    if user_input:
        # Generate response based on user input
        response = generate_response(user_input)
        print(f"Response: {response}")
        
        # Speak the response
        speak(response)
        
        # Exit after providing the response
        break


  from .autonotebook import tqdm as notebook_tqdm


Listening...
User: my balance is not updated after deposit
Response: Balance updates may take a few hours. If it's still not updated, please contact us.
