In [15]:
#importing all the necessary libraries
import os
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments, logging
import torch

In [16]:
#Disabling weights & biases logging
os.environ['WANDB_DISABLED'] = "true"
#Disabling Transformers advisory and info logs
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = "true"
logging.set_verbosity_error()

In [17]:
#loading the dataset
df = pd.read_csv('https://raw.githubusercontent.com/psabhay2003/BCGX-GenAI/refs/heads/main/financial_chatbot_data.csv')
df

Unnamed: 0,question,response
0,What is Apple's revenue in 2022?,3.943280e+11
1,What is Apple's net income in 2022?,9.980300e+10
2,What are Apple's total assets in 2022?,3.527550e+11
3,What are Apple's total liabilities in 2022?,3.020830e+11
4,What is Apple's operating cashflow in 2022?,1.221510e+11
...,...,...
142,What is Tesla's investing cashflow growth in 2...,2.055313e+01
143,What is Tesla's gross margin growth in 2024?,-1.189128e+00
144,What is Tesla's profit margin in 2024?,7.322141e+00
145,What is Tesla's return on assets in 2024?,5.859753e+00


In [18]:
#Defining intent mapping based on keywords
intent_keywords = {
    'revenue': ['revenue'],
    'net_income': ['net income'],
    'assets': ['total assets'],
    'liabilities': ['total liabilities'],
    'cashflow': ['operating cashflow', 'financing cashflow', 'investing cashflow'],
    'profit_margin': ['profit margin'],
    'gross_margin': ['gross margin'],
}

def assign_intent(question):
    q = question.lower() #lowercasing 'question' column so that it is not case-sensitive
    for intent, keywords in intent_keywords.items():
        for kw in keywords:
            if kw in q:
                return intent
    return 'other'

#Applying intent labels on dataframe
df['intent'] = df['question'].apply(assign_intent)
intents = sorted(df['intent'].unique()) #extracting all unique names from the new column, and sorting them alphabetically
label2id = {label: i for i, label in enumerate(intents)} #mapping each intent name to a unique integer index (0, 1, 2, …)
id2label = {i: label for label, i in label2id.items()} #invert mapping
df['label'] = df['intent'].map(label2id)
df

Unnamed: 0,question,response,intent,label
0,What is Apple's revenue in 2022?,3.943280e+11,revenue,7
1,What is Apple's net income in 2022?,9.980300e+10,net_income,4
2,What are Apple's total assets in 2022?,3.527550e+11,assets,0
3,What are Apple's total liabilities in 2022?,3.020830e+11,liabilities,3
4,What is Apple's operating cashflow in 2022?,1.221510e+11,cashflow,1
...,...,...,...,...
142,What is Tesla's investing cashflow growth in 2...,2.055313e+01,cashflow,1
143,What is Tesla's gross margin growth in 2024?,-1.189128e+00,gross_margin,2
144,What is Tesla's profit margin in 2024?,7.322141e+00,profit_margin,6
145,What is Tesla's return on assets in 2024?,5.859753e+00,other,5


In [19]:
#Train-test split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
#stratify ensures each label class appears in the train and test sets in roughly the same proportions as in the full dataset.

#Tokenization using BertTokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

#Dataset class
class QADataset(torch.utils.data.Dataset):
    def __init__(self, questions, labels, tokenizer):
        self.encodings = tokenizer(questions.tolist(), truncation=True, padding=True)
        self.labels = labels.tolist()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = QADataset(train_df['question'], train_df['label'], tokenizer)
test_dataset = QADataset(test_df['question'], test_df['label'], tokenizer)

In [24]:
#TrainingArguments & Trainer
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    report_to=[]  # disable all logging integrations, including wandb
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {'accuracy': accuracy_score(labels, preds)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

#Training the model
trainer.train()

{'loss': 1.2325, 'grad_norm': 7.597434997558594, 'learning_rate': 4e-05, 'epoch': 0.6666666666666666}
{'eval_loss': 0.7443996667861938, 'eval_accuracy': 0.9333333333333333, 'eval_runtime': 0.0733, 'eval_samples_per_second': 408.998, 'eval_steps_per_second': 54.533, 'epoch': 1.0}
{'loss': 0.7799, 'grad_norm': 6.353825092315674, 'learning_rate': 2.8888888888888888e-05, 'epoch': 1.3333333333333333}
{'loss': 0.5833, 'grad_norm': 4.936748027801514, 'learning_rate': 1.777777777777778e-05, 'epoch': 2.0}
{'eval_loss': 0.4402664303779602, 'eval_accuracy': 0.9333333333333333, 'eval_runtime': 0.0401, 'eval_samples_per_second': 747.315, 'eval_steps_per_second': 99.642, 'epoch': 2.0}
{'loss': 0.4129, 'grad_norm': 8.81442642211914, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.6666666666666665}
{'eval_loss': 0.3705931007862091, 'eval_accuracy': 0.9333333333333333, 'eval_runtime': 0.0721, 'eval_samples_per_second': 416.259, 'eval_steps_per_second': 55.501, 'epoch': 3.0}
{'train_runtime': 83.0416

TrainOutput(global_step=45, training_loss=0.7219531642066108, metrics={'train_runtime': 83.0416, 'train_samples_per_second': 4.227, 'train_steps_per_second': 0.542, 'train_loss': 0.7219531642066108, 'epoch': 3.0})

In [25]:
#Model Evaluation
eval_results = trainer.evaluate()
print("Evaluation results:\n", eval_results)
print("Classification report on test set:")
preds_output = trainer.predict(test_dataset)
preds = np.argmax(preds_output.predictions, axis=1)
print(classification_report(test_df['label'], preds, target_names=intents))

{'eval_loss': 0.7443996667861938, 'eval_accuracy': 0.9333333333333333, 'eval_runtime': 0.0742, 'eval_samples_per_second': 404.34, 'eval_steps_per_second': 53.912, 'epoch': 3.0}
Evaluation results:
 {'eval_loss': 0.7443996667861938, 'eval_accuracy': 0.9333333333333333, 'eval_runtime': 0.0742, 'eval_samples_per_second': 404.34, 'eval_steps_per_second': 53.912, 'epoch': 3.0}
Classification report on test set:
               precision    recall  f1-score   support

       assets       1.00      1.00      1.00         3
     cashflow       1.00      1.00      1.00         9
 gross_margin       0.60      1.00      0.75         3
  liabilities       1.00      1.00      1.00         3
   net_income       1.00      1.00      1.00         3
        other       1.00      1.00      1.00         4
profit_margin       0.00      0.00      0.00         2
      revenue       1.00      1.00      1.00         3

     accuracy                           0.93        30
    macro avg       0.82      0.88    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
# 9. Define extraction & retrieval pipeline
class FinanceQA:
    def __init__(self, df, model, tokenizer, label2intent_map):
        self.df = df
        self.model = model
        self.tokenizer = tokenizer
        self.label2intent = label2intent_map
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
        # Precompile regex
        self.pattern = re.compile(r"What (?:is|are) (?P<company>.+?)'s (?P<entity>.+?) in (?P<year>\d{4})\?")

    def predict_intent(self, question):
        inputs = self.tokenizer(question, return_tensors='pt', truncation=True, padding=True).to(self.device)
        outputs = self.model(**inputs)
        logits = outputs.logits.detach().cpu().numpy()
        intent_id = np.argmax(logits, axis=1)[0]
        return self.label2intent[intent_id]

    def extract_slots(self, question):
        match = self.pattern.match(question)
        if not match:
            raise ValueError("Could not extract slots from question: {}".format(question))
        return match.group('company'), match.group('entity'), match.group('year')

    def lookup(self, intent, company, year):
        intent_key = intent
        subset = self.df[
            (self.df['intent'] == intent_key) &
            (self.df['question'].str.contains(company, case=False)) &
            (self.df['question'].str.contains(year))
        ]
        if subset.empty:
            raise ValueError("No data found for {}, {}, {}".format(company, intent, year))
        return subset['response'].values[0]

    def answer(self, question):
        intent = self.predict_intent(question)
        company, entity, year = self.extract_slots(question)
        value = self.lookup(intent, company, year)
        return value

# 10. Test pipeline
pipeline = FinanceQA(df, model, tokenizer, id2label)

sample_questions = [
    "What is Apple's revenue in 2022?",
    "What are Apple's total liabilities in 2022?",
]

for q in sample_questions:
    try:
        ans = pipeline.answer(q)
        print(f"Q: {q}\nA: {ans}\n")
    except Exception as e:
        print(f"Error for question '{q}': {e}")

Q: What is Apple's revenue in 2022?
A: 394328000000.0

Q: What are Apple's total liabilities in 2022?
A: 302083000000.0



In [28]:
# ─── Gradio UI ─────────────────────────────────────────────────────────────────
!pip install gradio
import gradio as gr

def gradio_answer_fn(question: str) -> str:
    try:
        return pipeline.answer(question)
    except Exception as e:
        return f"Error: {e}"

iface = gr.Interface(
    fn=gradio_answer_fn,
    inputs=gr.Textbox(lines=1, placeholder="e.g. What is Tesla's profit margin in 2024?"),
    outputs=gr.Textbox(label="Answer"),
    title="Finance Q&A Chatbot",
    description="Ask for revenue, profit margin, assets, liabilities, etc. in the format:\n"
                "`What is <Company>'s <Metric> in <Year>?`"
)

# Launch the UI (in Colab set share=True if you need a public link)
iface.launch()


Collecting gradio
  Downloading gradio-5.25.2-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (

