Submitted by: Muhammad Uzair - 372609, Fatima Binte Tanveer - 373630, Saleha Ahmed - 369182

# Installations and Imports

In [None]:
!pip install -U transformers datasets accelerate peft bitsandbytes faiss-cpu sentence_transformers



In [None]:
import getpass
import io
import json
import os

import faiss
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import login
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from sentence_transformers import SentenceTransformer
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    pipeline,
)

# Data Preprocessing

## Data Extraction

In [None]:
def _first_non_empty(row):
    """Return the first non-NaN cell of ``row`` as a stripped string, or None if every cell is NaN."""
    for cell in row:
        if pd.notna(cell):
            return str(cell).strip()
    return None


def extract_qa_pairs(excel_file, skip_sheets=2):
    """Extract question/answer pairs from an Excel workbook.

    Only the first non-empty cell of each row is considered. A row whose first
    non-empty cell ends with '?' is a question; the first non-empty cells of the
    following rows are joined with single spaces to form its answer, until the
    next question, a completely empty row, or the end of the sheet. Questions
    that end up with no answer text are dropped.

    Args:
        excel_file: Anything ``pd.ExcelFile`` accepts (path, bytes, file-like object).
        skip_sheets: Number of leading sheets to ignore. Defaults to 2, matching
            the previously hard-coded behaviour.

    Returns:
        A list of ``{'question': str, 'answer': str}`` dicts in sheet/row order.
    """
    all_qa_pairs = []

    # The context manager closes the workbook handle even if a sheet fails to parse.
    with pd.ExcelFile(excel_file) as xls:
        for sheet_name in xls.sheet_names[skip_sheets:]:
            df = pd.read_excel(xls, sheet_name, header=None)

            # Reduce every row to its first non-empty cell once, instead of
            # re-scanning rows for each question.
            first_cells = [_first_non_empty(row) for _, row in df.iterrows()]

            for position, question in enumerate(first_cells):
                if not question or not question.endswith('?'):
                    continue

                answer_parts = []
                for next_position in range(position + 1, len(first_cells)):
                    value = first_cells[next_position]
                    # None marks a fully empty row; a trailing '?' marks the next question.
                    if value is None or value.endswith('?'):
                        break
                    # Whitespace-only cells strip to '' and are skipped without ending the answer.
                    if value:
                        answer_parts.append(value)

                if answer_parts:
                    all_qa_pairs.append({'question': question, 'answer': " ".join(answer_parts)})

    return all_qa_pairs

def extract_qa_from_json(json_file):
    """Read question/answer pairs from a FAQ JSON file.

    The expected layout is
    ``{"categories": [{"questions": [{"question": ..., "answer": ...}, ...]}, ...]}``.
    Categories without a ``questions`` key and items missing either field are skipped.

    Args:
        json_file: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        A list of ``{'question': ..., 'answer': ...}`` dicts. The list is empty
        when the file is missing, is not valid JSON, or has no ``categories``
        key; the first two cases also print an error message.
    """
    try:
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File not found at {json_file}")
        return []
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {json_file}")
        return []

    # A top-level array or scalar has no categories to walk.
    if not isinstance(data, dict) or 'categories' not in data:
        return []

    return [
        {'question': item['question'], 'answer': item['answer']}
        for category in data['categories']
        if 'questions' in category
        for item in category['questions']
        if 'question' in item and 'answer' in item
    ]

# Input knowledge sources and the location of the merged output.
excel_file_path = '/content/NUST Bank-Product-Knowledge.xlsx'
json_file_path = 'faqs.json'
output_json_path = 'qa_pairs.json'

# Pull pairs from each source; the Excel pairs come first in the merged list.
excel_qa_pairs = extract_qa_pairs(excel_file_path)
json_qa_pairs = extract_qa_from_json(json_file_path)
combined_qa_pairs = [*excel_qa_pairs, *json_qa_pairs]

# Persist the merged pairs so later cells can reload them from disk.
with open(output_json_path, 'w') as f:
    f.write(json.dumps(combined_qa_pairs, indent=4))

print(f"\nCombined question-answer pairs saved to {output_json_path}")

Error: File not found at faqs.json

Combined question-answer pairs saved to qa_pairs.json


## Train Set Preparation

In [None]:
# Reload the merged pairs from disk so this cell does not depend on variables
# left in memory by the extraction cell.
with open('qa_pairs.json', 'r') as f:
    qa_data = json.load(f)

# One prompt/response record per Q&A pair; the question is wrapped in [INST] tags.
train_data = [
    {
        "prompt": f"[INST] {item['question']} [/INST]",
        "response": item['answer'],
    }
    for item in qa_data
]

with open('train_data.json', 'w') as f:
    json.dump(train_data, f, indent=2)


# Model Loading and Fine Tuning

## Connect to Huggingface and Load the Tokenizer

In [None]:
# Authenticate with the Hugging Face Hub before the tokenizer and model are
# downloaded below; in a notebook this renders an interactive token prompt.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model_name = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, padding_side="right", trust_remote_code=True)

# DataCollatorForLanguageModeling(mlm=False) replaces every label equal to
# pad_token_id with -100. If padding reuses the EOS token, the EOS labels are
# masked too and the model is never trained to stop generating. Prefer a
# dedicated padding token when the vocabulary has one; otherwise fall back to
# the previous behaviour (pad == EOS).
RESERVED_PAD_TOKEN = "<|finetune_right_pad_id|>"
if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
    if RESERVED_PAD_TOKEN in tokenizer.get_vocab():
        tokenizer.pad_token = RESERVED_PAD_TOKEN
    else:
        tokenizer.pad_token = tokenizer.eos_token

dataset = load_dataset("json", data_files="train_data.json")["train"]

def tokenize(example):
    """Tokenize a batch of prompt/response records into fixed-length training sequences.

    Each text is "<prompt> <response>" followed by the EOS token so the model
    sees an explicit end-of-answer marker. Sequences are padded or truncated to
    512 tokens.
    """
    full_texts = [
        p + " " + r + tokenizer.eos_token
        for p, r in zip(example["prompt"], example["response"])
    ]
    return tokenizer(full_texts, padding="max_length", truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/293 [00:00<?, ? examples/s]

## Load the Model with LoRA

In [None]:
# Load the base model in 8-bit. The bare `load_in_8bit=` keyword is deprecated;
# the quantization settings are passed through a BitsAndBytesConfig instead.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)

# Prepare the quantized model for training before attaching adapters.
model = prepare_model_for_kbit_training(model)

# LoRA adapters on the four attention projections.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Fine Tune the Model

In [None]:
# The KV cache is incompatible with gradient checkpointing; switch it off
# explicitly rather than relying on the automatic override (and its warning).
model.config.use_cache = False

training_args = TrainingArguments(
    output_dir="./llama3-qa-finetuned",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # effective batch size of 16 per device
    learning_rate=2e-4,
    num_train_epochs=3,
    fp16=True,
    save_strategy="epoch",
    logging_steps=10,
    report_to="none",
    # Trainer cannot infer the label column through the PEFT wrapper, so name it.
    label_names=["labels"],
)

trainer = Trainer(
    model=model,
    # `tokenizer=` is deprecated on Trainer; `processing_class` is its replacement.
    processing_class=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset,
    # mlm=False: causal-LM objective, labels are a copy of input_ids with padding masked.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

trainer.train()


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt



## Save the Fine Tuned Model

In [None]:
# Write the fine-tuned model (the object returned by get_peft_model) and its
# tokenizer into the same directory; the Inference section reloads both from it.
model.save_pretrained("llama3-qa-finetuned")
tokenizer.save_pretrained("llama3-qa-finetuned")

In [None]:
!pip install fastapi uvicorn transformers sentence-transformers faiss-cpu pyngrok




# RAG Implementation

In [None]:
# Read the same relative file that the extraction cell wrote, instead of a
# hard-coded absolute /content path.
with open('qa_pairs.json') as f:
    data = json.load(f)

# Each retrievable document is one "Q: ...\nA: ..." string.
texts = [f"Q: {entry['question']}\nA: {entry['answer']}" for entry in data]
if not texts:
    # Without this, embeddings.shape[1] below fails with an opaque IndexError.
    raise ValueError("qa_pairs.json contains no question/answer pairs to index.")

device = 'cuda' if torch.cuda.is_available() else 'cpu'
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
embeddings = embedding_model.encode(texts, convert_to_tensor=False, show_progress_bar=True)
embeddings = np.array(embeddings).astype('float32')
# Unit-normalise in place so the inner-product index below ranks by cosine similarity.
faiss.normalize_L2(embeddings)

embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(embeddings)

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

# Inference

In [None]:
# Same relative directory the save cell wrote to, rather than a hard-coded
# absolute /content path.
model_path = "llama3-qa-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    # Half precision on GPU; full precision on CPU.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)

# The model is already loaded and placed on its device(s) above, so no
# device_map is passed here.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [None]:
from fastapi import FastAPI, Request, File, UploadFile
from pydantic import BaseModel
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import json
import numpy as np
from typing import List

app = FastAPI()  # application object; the route decorators below register handlers on it

# The route handlers read these notebook-level globals at request time, so the
# cells that create them must run first: embedding_model, index, texts, pipe.

class QueryRequest(BaseModel):
    # JSON body of POST /generate: {"query": "<customer question>"}
    query: str

@app.post("/generate")
async def generate_answer(req: QueryRequest):
    """Answer a customer query with retrieval-augmented generation.

    Embeds the query, retrieves up to three of the most similar Q&A texts from
    the FAISS index, places them in the prompt and returns the model's reply.

    Args:
        req: Request body carrying the customer's ``query`` string.

    Returns:
        ``{"answer": <generated text>}``.
    """
    query_embedding = np.array(embedding_model.encode([req.query])).astype('float32')
    faiss.normalize_L2(query_embedding)

    # Never request more neighbours than the index holds: FAISS pads missing
    # results with -1, which would silently select texts[-1].
    top_k = min(3, index.ntotal)
    retrieved = []
    if top_k > 0:
        _, neighbor_ids = index.search(query_embedding, top_k)
        retrieved = [texts[i] for i in neighbor_ids[0] if 0 <= i < len(texts)]
    retrieved_qa = "\n\n".join(retrieved)

    prompt = f"""You are a helpful, polite customer service agent for NUST Bank. Answer the customer's question clearly and concisely using the relevant information below. Do not explain your reasoning. Just give a direct, informative paragraph.

    Relevant information:
    {retrieved_qa}

    Customer: {req.query}
    Agent:"""

    # Greedy decoding. temperature/top_p only apply when sampling, so they are
    # not passed. return_full_text=False yields just the continuation, so the
    # prompt does not have to be stripped off by searching for "Agent:".
    generated = pipe(
        prompt,
        max_new_tokens=150,
        do_sample=False,
        return_full_text=False,
    )[0]["generated_text"]

    # Keep only the first reply if the model goes on to invent further turns.
    answer = generated.split("Customer:")[0].strip()
    return {"answer": answer}

@app.post("/upload")
async def upload_documents(files: List[UploadFile] = File(...)):
    """Ingest Q&A pairs from uploaded Excel workbooks into the live RAG index.

    Args:
        files: One or more uploaded Excel files.

    Returns:
        A dict with a human-readable ``message`` and a ``skipped_files`` list
        naming the uploads that could not be processed.
    """
    print("Files received:", [f.filename for f in files])
    global texts, embeddings, index

    new_qa_format = []
    skipped_files = []

    for file in files:
        try:
            # extract_qa_pairs hands its argument to pd.ExcelFile, which needs a
            # path, bytes or a synchronous file-like object. UploadFile.read is
            # async, so read the bytes here and wrap them in a buffer.
            workbook_bytes = await file.read()
            extracted_pairs = extract_qa_pairs(io.BytesIO(workbook_bytes))
            for pair in extracted_pairs:
                q = pair.get("question", "").strip()
                a = pair.get("answer", "").strip()
                if q and a:
                    new_qa_format.append(f"Q: {q}\nA: {a}")
        except Exception as e:
            # Best effort: keep processing the other files, but report the failure.
            print(f"Skipping {file.filename}: {e}")
            skipped_files.append(file.filename)
            continue

    if not new_qa_format:
        return {
            "message": "No valid Q&A pairs found in uploaded Excel files.",
            "skipped_files": skipped_files,
        }

    # Encode and normalize
    new_embeddings = np.array(embedding_model.encode(new_qa_format)).astype('float32')
    faiss.normalize_L2(new_embeddings)

    # Update global RAG state. The index is extended first so that a failure
    # there cannot leave `texts` longer than the index.
    index.add(new_embeddings)
    texts.extend(new_qa_format)
    embeddings = np.vstack([embeddings, new_embeddings])

    return {
        "message": f"{len(new_qa_format)} Q&A pairs ingested successfully from Excel.",
        "skipped_files": skipped_files,
    }



In [None]:
!pip install python-multipart



In [None]:
import nest_asyncio
import threading
from pyngrok import ngrok
import uvicorn

In [None]:
# SECURITY: an ngrok authtoken used to be hard-coded in this cell. Treat that
# token as leaked and revoke it in the ngrok dashboard.
# The token is now read from the NGROK_AUTHTOKEN environment variable, with an
# interactive prompt as fallback, so it is never stored in the notebook.
ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass.getpass("ngrok authtoken: "))

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
# Uvicorn setup for Colab
nest_asyncio.apply()
public_url = ngrok.connect(8000)
print(f"Your FastAPI app is live at: {public_url}")

threading.Thread(target=uvicorn.run, args=(app,), kwargs={"host": "0.0.0.0", "port": 8000}).start()

Your FastAPI app is live at: NgrokTunnel: "https://f606-34-125-96-116.ngrok-free.app" -> "http://localhost:8000"


In [None]:
import requests

# Replace with the actual query you want to test
query = "How can I open a new bank account?"

# Take the address from the tunnel opened above; a pasted URL goes stale
# whenever the tunnel is recreated.
ngrok_url = public_url.public_url

# Generation can be slow, so allow a generous timeout, but do not wait forever.
response = requests.post(f"{ngrok_url}/generate", json={"query": query}, timeout=120)
# Surface HTTP errors here instead of failing later while decoding the body.
response.raise_for_status()

print("Response from the API:")
print(response.json())


ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-47' coro=<Server.serve() done, defined at /usr/local/lib/python3.11/dist-packages/uvicorn/server.py:68> exception=SystemExit(1)>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/server.py", line 163, in startup
    server = await loop.create_server(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/asyncio/base_events.py", line 1536, in create_server
    raise OSError(err.errno, msg) from None
OSError: [Errno 98] error while attempting to bind on address ('0.0.0.0', 8000): address already in use

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/m

INFO:     34.125.96.116:0 - "POST /generate HTTP/1.1" 200 OK
Response from the API:
{'answer': 'You can open a new bank account by visiting any of our branches in person or by applying online through our website or mobile app. You can also apply through our digital channel (NUST Digital) or through our branch’s digital channel (NUST Digital Kiosk). You can also apply through our branch’s digital channel (NUST Digital Kiosk) or through our branch’s digital channel (NUST Digital Kiosk). You can also apply through our branch’s digital channel (NUST Digital Kiosk) or through our branch’s digital channel (NUST Digital Kiosk). You can also apply through our branch’s digital channel (NUST Digital Kiosk) or through our branch’s digital channel (NUST Digital Kiosk). You can also apply'}


In [None]:
from pyngrok import ngrok

# Manual teardown helper. It is switched off, so running the cell does nothing;
# set the flag to True to close the ngrok tunnels and stop uvicorn.
TEARDOWN_ENABLED = False

if TEARDOWN_ENABLED:
    import os

    # Close every tunnel managed by pyngrok.
    ngrok.kill()

    # Terminate any process whose command line mentions uvicorn.
    os.system("pkill -f uvicorn")
    print("This cell is skipped")



15