Submitted by: Muhammad Uzair - 372609, Fatima Binte Tanveer - 373630, Saleha Ahmed - 369182

#Installations and Imports

In [1]:
!pip install -U transformers datasets accelerate peft bitsandbytes faiss-cpu sentence_transformers

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerat

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, pipeline
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from huggingface_hub import login
from datasets import load_dataset
import pandas as pd
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

#Data Preprocessing

##Data Extraction

In [3]:
def extract_qa_pairs(excel_file):
    all_qa_pairs = []
    xls = pd.ExcelFile(excel_file)
    sheet_names = xls.sheet_names

    for i, sheet_name in enumerate(sheet_names):
        if i < 2:
            continue

        df = pd.read_excel(xls, sheet_name, header=None)
        qa_pairs = []

        for index, row in df.iterrows():
            first_non_empty_index = -1
            first_non_empty_value = None
            for col_index, cell in enumerate(row):
                if pd.notna(cell):
                    first_non_empty_index = col_index
                    first_non_empty_value = str(cell).strip()
                    break

            if first_non_empty_value and first_non_empty_value.endswith('?'):
                question = first_non_empty_value
                answer = ""
                for next_index in range(index + 1, len(df)):
                    next_row = df.iloc[next_index]
                    next_first_non_empty_index = -1
                    next_first_non_empty_value = None
                    for col_index, cell in enumerate(next_row):
                        if pd.notna(cell):
                            next_first_non_empty_index = col_index
                            next_first_non_empty_value = str(cell).strip()
                            break

                    if next_first_non_empty_value and not next_first_non_empty_value.endswith('?'):
                        answer += next_first_non_empty_value + " "
                    elif next_first_non_empty_value and next_first_non_empty_value.endswith('?'):
                        break
                    elif all(pd.isna(cell) for cell in next_row):
                        break
                if question and answer:
                    qa_pairs.append({'question': question, 'answer': answer.strip()})

        all_qa_pairs.extend(qa_pairs)

    return all_qa_pairs

def extract_qa_from_json(json_file):
    qa_pairs = []
    try:
        with open(json_file, 'r') as f:
            data = json.load(f)
            if 'categories' in data:
                for category in data['categories']:
                    if 'questions' in category:
                        for item in category['questions']:
                            if 'question' in item and 'answer' in item:
                                qa_pairs.append({'question': item['question'], 'answer': item['answer']})
    except FileNotFoundError:
        print(f"Error: File not found at {json_file}")
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {json_file}")
    return qa_pairs

excel_file_path = '/content/NUST Bank-Product-Knowledge.xlsx'
json_file_path = 'faqs.json'

excel_qa_pairs = extract_qa_pairs(excel_file_path)
json_qa_pairs = extract_qa_from_json(json_file_path)

combined_qa_pairs = excel_qa_pairs + json_qa_pairs

output_json_path = 'qa_pairs.json'
with open(output_json_path, 'w') as f:
    json.dump(combined_qa_pairs, f, indent=4)

print(f"\nCombined question-answer pairs saved to {output_json_path}")


Combined question-answer pairs saved to qa_pairs.json


##Train Set Preparation

In [4]:
with open('qa_pairs.json', 'r') as f:
    qa_data = json.load(f)

train_data = [{
    "prompt": f"[INST] {item['question']} [/INST]",
    "response": item['answer']
} for item in combined_qa_pairs]

with open('train_data.json', 'w') as f:
    json.dump(train_data, f, indent=2)


#Model Loading and Fine Tuning

##Connect to Huggingface and Load the Tokenizer

In [5]:
login()

In [6]:
model_name = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, padding_side="right", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

dataset = load_dataset("json", data_files="train_data.json")["train"]

def tokenize(example):
    full_texts = [p + " " + r for p, r in zip(example["prompt"], example["response"])]
    return tokenizer(full_texts, padding="max_length", truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

##Load the Model with LoRA

In [7]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)

model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)


config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

##Fine Tune the Model

In [8]:
training_args = TrainingArguments(
    output_dir="./llama3-qa-finetuned",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=3,
    fp16=True,
    save_strategy="epoch",
    logging_steps=10,
    report_to="none"
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

trainer.train()


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,3.2551
20,2.7432
30,2.5265
40,2.3509
50,2.2426


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=57, training_loss=2.5771824853462086, metrics={'train_runtime': 1025.3862, 'train_samples_per_second': 0.901, 'train_steps_per_second': 0.056, 'total_flos': 7714391407460352.0, 'train_loss': 2.5771824853462086, 'epoch': 2.883116883116883})

##Save the Fine Tuned Model

In [9]:
model.save_pretrained("llama3-qa-finetuned")
tokenizer.save_pretrained("llama3-qa-finetuned")

('llama3-qa-finetuned/tokenizer_config.json',
 'llama3-qa-finetuned/special_tokens_map.json',
 'llama3-qa-finetuned/tokenizer.json')

#RAG Implementation

In [10]:
with open('/content/qa_pairs.json') as f:
    data = json.load(f)

texts = [f"Q: {entry['question']}\nA: {entry['answer']}" for entry in data]

device = 'cuda' if torch.cuda.is_available() else 'cpu'
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
embeddings = embedding_model.encode(texts, convert_to_tensor=False, show_progress_bar=True)
embeddings = np.array(embeddings).astype('float32')
faiss.normalize_L2(embeddings)

embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(embeddings)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

#Inference

In [11]:
model_path = "/content/llama3-qa-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [12]:
def get_answer(user_query, k=3, max_new_tokens=150):
    query_embedding = embedding_model.encode([user_query], convert_to_tensor=False)
    query_embedding = np.array(query_embedding).astype('float32')
    faiss.normalize_L2(query_embedding)

    D, I = index.search(query_embedding, k)
    relevant_chunks = [texts[i] for i in I[0]]
    retrieved_qa = "\n\n".join(relevant_chunks)

    prompt = f"""You are a helpful, polite customer service agent for NUST Bank. Answer the customer's question clearly and concisely using the relevant information below. Do not explain your reasoning. Just give a direct, informative paragraph.

              Relevant information:
              {retrieved_qa}

              Customer: {user_query}
              Agent:"""

    prompt_ids = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).input_ids
    prompt = tokenizer.decode(prompt_ids[0])

    result = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        temperature=0.3,
        top_p=0.9,
    )[0]["generated_text"]

    return result.split("Agent:")[-1].strip()

In [14]:
query = "Does NUST Bank offer Any account for Non-Resident Pakistanis (NRPs)?"
response = get_answer(query)
print("\nResponse:\n", response)


Response:
 Yes PakWatan Remittance Account. Through the PakWatan Remittance Account, beneficiaries can quickly and securely receive remittances from their loved one living abroad directly into their bank account without visiting the branch.


In [15]:
query = "What are the main features of the Roshan Digital Account?"
response = get_answer(query)
print("\nResponse:\n", response)


Response:
 NRPs OPF Members Foreign Nationals having Pakistan Origin Card (POC) Pakistani Nationals having declared Assets abroad Govt. Employees & Officials posted abroad Profit Payment All bank charges are applicable as per prevailing Schedule of Charges Zakat and WHT taxes on profit are not applicable on R
