Submitted by: Muhammad Uzair - 372609, Fatima Binte Tanveer - 373630, Saleha Ahmed - 369182

#Installations and Imports

In [2]:
!pip install -U transformers==4.50.3 datasets accelerate==1.6.0 peft bitsandbytes faiss-cpu sentence_transformers python-multipart fastapi uvicorn pyngrok

Collecting transformers==4.50.3
  Downloading transformers-4.50.3-py3-none-any.whl.metadata (39 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting python-multipart
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.8-py3-none-any.whl.metadata (10 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.2-py3-none-any

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, pipeline
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from huggingface_hub import login
from datasets import load_dataset
import pandas as pd
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import re
from langchain.text_splitter import MarkdownTextSplitter, Language

#Data Preprocessing

##Clean Text

In [4]:
def clean_text(text):
    """
    Cleans text by removing non-standard characters, extra spaces,
    and common escape sequences.
    """
    if not isinstance(text, str):
        return str(text) # Ensure it's a string for processing

    # Remove common escape characters
    text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')

    # Remove unicode non-breaking space and similar invisible characters
    text = re.sub(r'[\u200b\u00a0\xa0]', ' ', text)

    # Remove specific problematic characters like bullet points (•, ○, etc.) and similar
    text = re.sub(r'[•●■▪\t\v\f\x0b\x0c\u2022\u25cf\u25ba\u25c6\u2013\u2014\u2018\u2019\u201c\u201d\u2026]', '', text)

    # Remove ASCII control characters and extended ASCII control characters
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text)
    # Remove non-ASCII printable characters that are not spaces, but allow common symbols
    # This pattern keeps alphanumeric, common punctuation, and basic math symbols.
    text = re.sub(r'[^\w\s.,!?;:"\'\/\-\(\)\&\%\$#@=+*<>\xA0-\xFF]', '', text)


    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()
    return text

##Data Extraction

In [5]:
def extract_data_in_markdown(excel_file, faqs_json_file, output_markdown_file="output.md"):
    xls = pd.ExcelFile(excel_file)
    sheet_names = xls.sheet_names

    with open(output_markdown_file, 'w', encoding='utf-8') as md_file:
        md_file.write("# Rate Sheet July 1 2024\n\n")

        # --- Handle the second sheet (Tables) ---
        if len(sheet_names) > 1:
            table_sheet_name = sheet_names[1]
            df_table = pd.read_excel(xls, table_sheet_name, header=None)

            md_file.write("## Indicative Profit Rates\n\n")

            # --- Pass 1: Process Set 1 tables (Profit Payment / Profit Rate - Columns B, C, D) ---
            r = 12 # Start from Excel row 13 (index 12)
            while r < len(df_table):
                is_row_b_empty = (1 >= len(df_table.iloc[r]) or pd.isna(df_table.iloc[r][1]) or clean_text(str(df_table.iloc[r][1])) == "")
                is_row_f_empty = (5 >= len(df_table.iloc[r]) or pd.isna(df_table.iloc[r][5]) or clean_text(str(df_table.iloc[r][5])) == "")

                if is_row_b_empty and is_row_f_empty and r > 12:
                    is_end_of_all_tables = True
                    for check_r in range(r, min(r + 3, len(df_table))):
                        check_b_empty = (1 >= len(df_table.iloc[check_r]) or pd.isna(df_table.iloc[check_r][1]) or clean_text(str(df_table.iloc[check_r][1])) == "")
                        check_f_empty = (5 >= len(df_table.iloc[check_r]) or pd.isna(df_table.iloc[check_r][5]) or clean_text(str(df_table.iloc[check_r][5])) == "")
                        if not (check_b_empty and check_f_empty):
                            is_end_of_all_tables = False
                            break
                    if is_end_of_all_tables:
                        break

                row_data = df_table.iloc[r]
                main_title_b = clean_text(str(row_data[1])) if 1 < len(row_data) and pd.notna(row_data[1]) else ""

                if main_title_b and ("Account" in main_title_b or "Savings" in main_title_b or "Remittance" in main_title_b or "Pensioners" in main_title_b):
                    account_name = main_title_b

                    header_found = False
                    header_row_idx = -1
                    current_row_idx = r + 1
                    while current_row_idx < len(df_table):
                        header_candidate_row = df_table.iloc[current_row_idx]

                        col_b_val = clean_text(str(header_candidate_row[1])) if 1 < len(header_candidate_row) and pd.notna(header_candidate_row[1]) else ""
                        col_d_val = clean_text(str(header_candidate_row[3])) if 3 < len(header_candidate_row) and pd.notna(header_candidate_row[3]) else ""

                        if "Profit Payment" in col_b_val and "Profit Rate" in col_d_val:
                            headers_set1 = ["Account Name", col_b_val, col_d_val]
                            md_file.write("| " + " | ".join(headers_set1) + " |\n")
                            md_file.write("| " + " | ".join(['---'] * len(headers_set1)) + " |\n")
                            header_found = True
                            header_row_idx = current_row_idx
                            break

                        next_title_b_check = clean_text(str(header_candidate_row[1])) if 1 < len(header_candidate_row) and pd.notna(header_candidate_row[1]) else ""
                        if next_title_b_check and ("Account" in next_title_b_check or "Savings" in next_title_b_check or "Remittance" in next_title_b_check or "Pensioners" in next_title_b_check):
                            break
                        current_row_idx += 1

                    if header_found:
                        data_row_start_idx = header_row_idx + 1

                        data_row_set1 = df_table.iloc[data_row_start_idx]

                        val_b = clean_text(str(data_row_set1[1])) if 1 < len(data_row_set1) and pd.notna(data_row_set1[1]) else ""
                        val_d = clean_text(str(data_row_set1[3])) if 3 < len(data_row_set1) and pd.notna(data_row_set1[3]) else ""

                        if val_b and val_d:
                            md_file.write(f"| {account_name} | {val_b} | {val_d} |\n")

                        md_file.write("\n")
                        r = data_row_start_idx + 1
                    else:
                        r += 1
                else:
                    r += 1

            # --- Pass 2: Process Set 2 tables (Tenor / Payout / Profit Rate - Columns F, G, H, I) ---
            r = 12
            while r < len(df_table):
                main_title_f_check = clean_text(str(df_table.iloc[r][5])) if 5 < len(df_table.iloc[r]) and pd.notna(df_table.iloc[r][5]) else ""

                if "FCY" in main_title_f_check:
                    break

                is_row_b_empty = (1 >= len(df_table.iloc[r]) or pd.isna(df_table.iloc[r][1]) or clean_text(str(df_table.iloc[r][1])) == "")
                is_row_f_empty = (5 >= len(df_table.iloc[r]) or pd.isna(df_table.iloc[r][5]) or clean_text(str(df_table.iloc[r][5])) == "")

                if is_row_b_empty and is_row_f_empty and r > 12:
                    is_end_of_all_tables = True
                    for check_r in range(r, min(r + 3, len(df_table))):
                        check_b_empty = (1 >= len(df_table.iloc[check_r]) or pd.isna(df_table.iloc[check_r][1]) or clean_text(str(df_table.iloc[check_r][1])) == "")
                        check_f_empty = (5 >= len(df_table.iloc[check_r]) or pd.isna(df_table.iloc[check_r][5]) or clean_text(str(df_table.iloc[check_r][5])) == "")
                        if not (check_b_empty and check_f_empty):
                            is_end_of_all_tables = False
                            break
                    if is_end_of_all_tables:
                        break

                row_data = df_table.iloc[r]
                main_title_f = clean_text(str(row_data[5])) if 5 < len(row_data) and pd.notna(row_data[5]) else ""

                if main_title_f and ("Deposit" in main_title_f or "SNDR" in main_title_f or "Term" in main_title_f or "NUST Bachat Account" in main_title_f or "NUST Waqaar Account - Senior Citizen - Term Deposit" in main_title_f):
                    table_title = main_title_f

                    header_found = False
                    header_row_idx = -1
                    current_row_idx = r + 1
                    while current_row_idx < len(df_table):
                        header_candidate_row = df_table.iloc[current_row_idx]

                        col_f_val = clean_text(str(header_candidate_row[5])) if 5 < len(header_candidate_row) and pd.notna(header_candidate_row[5]) else ""
                        col_g_val = clean_text(str(header_candidate_row[6])) if 6 < len(header_candidate_row) and pd.notna(header_candidate_row[6]) else ""
                        col_i_val = clean_text(str(header_candidate_row[8])) if 8 < len(header_candidate_row) and pd.notna(header_candidate_row[8]) else ""
                        col_j_val = clean_text(str(header_candidate_row[9])) if 9 < len(header_candidate_row) and pd.notna(header_candidate_row[9]) else ""

                        headers_set2 = []
                        if "Tenor" in col_f_val and "Payout" in col_g_val:
                            headers_set2 = ["Product/Account", col_f_val, col_g_val, col_i_val]
                        else:
                            next_title_f_check = clean_text(str(header_candidate_row[5])) if 5 < len(header_candidate_row) and pd.notna(header_candidate_row[5]) else ""
                            if next_title_f_check and ("Deposit" in next_title_f_check or "SNDR" in next_title_f_check or "Term" in next_title_f_check or "NUST Bachat Account" in next_title_f_check or "NUST Waqaar Account - Senior Citizen - Term Deposit" in next_title_f_check):
                                break
                            current_row_idx += 1
                            continue

                        headers_set2 = [h for h in headers_set2 if h.strip()]
                        if len(headers_set2) > 1:
                            md_file.write("| " + " | ".join(headers_set2) + " |\n")
                            md_file.write("| " + " | ".join(['---'] * len(headers_set2)) + " |\n")
                            header_found = True
                            header_row_idx = current_row_idx
                            break
                        current_row_idx += 1

                    if header_found:
                        data_row_start_idx = header_row_idx + 1

                        while data_row_start_idx < len(df_table):
                            data_row_set2 = df_table.iloc[data_row_start_idx]

                            is_relevant_row_empty = True
                            cols_to_check = [5, 6, 7, 8, 9]
                            for col_idx in cols_to_check:
                                if col_idx < len(data_row_set2) and pd.notna(data_row_set2[col_idx]) and clean_text(str(data_row_set2[col_idx])) != "":
                                    is_relevant_row_empty = False
                                    break

                            # Apply cleaning before checking for keywords like "Discontinued"
                            cleaned_val_f_for_check = clean_text(str(data_row_set2[5])) if 5 < len(data_row_set2) and pd.notna(data_row_set2[5]) else ""
                            if "NUST Bachat Account" in table_title and ("Discontinued" in cleaned_val_f_for_check or "*" in cleaned_val_f_for_check):
                                md_file.write(f"**Note:** {cleaned_val_f_for_check} - {clean_text(str(data_row_set2[6])) if 6 < len(data_row_set2) and pd.notna(data_row_set2[6]) else ''} - {clean_text(str(data_row_set2[8])) if 8 < len(data_row_set2) and pd.notna(data_row_set2[8]) else ''}\n\n")
                                data_row_start_idx += 1
                                continue

                            if is_relevant_row_empty:
                                break

                            is_new_table_start = False
                            next_table_title_f = clean_text(str(data_row_set2[5])) if 5 < len(data_row_set2) and pd.notna(data_row_set2[5]) else ""
                            if next_table_title_f and ("Deposit" in next_table_title_f or "SNDR" in next_table_title_f or "Term" in next_table_title_f or "NUST Bachat Account" in next_table_title_f or "NUST Waqaar Account - Senior Citizen - Term Deposit" in next_table_title_f):
                                is_new_table_start = True

                            next_account_name_b = clean_text(str(data_row_set2[1])) if 1 < len(data_row_set2) and pd.notna(data_row_set2[1]) else ""
                            if next_account_name_b and ("Account" in next_account_name_b or "Savings" in next_account_name_b or "Remittance" in next_account_name_b or "Pensioners" in next_account_name_b):
                                is_new_table_start = True

                            if is_new_table_start:
                                break

                            row_vals = []
                            val_f = clean_text(str(data_row_set2[5])) if 5 < len(data_row_set2) and pd.notna(data_row_set2[5]) else ""
                            val_g = clean_text(str(data_row_set2[6])) if 6 < len(data_row_set2) and pd.notna(data_row_set2[6]) else ""
                            val_h = clean_text(str(data_row_set2[7])) if 7 < len(data_row_set2) and pd.notna(data_row_set2[7]) else ""
                            val_i = clean_text(str(data_row_set2[8])) if 8 < len(data_row_set2) and pd.notna(data_row_set2[8]) else ""
                            val_j = clean_text(str(data_row_set2[9])) if 9 < len(data_row_set2) and pd.notna(data_row_set2[9]) else ""

                            row_vals.append(table_title)

                            payout_val = val_g if val_g else val_h
                            row_vals.extend([val_f, payout_val, val_i])

                            if any(x.strip() for x in row_vals):
                                md_file.write("| " + " | ".join(row_vals) + " |\n")
                            data_row_start_idx += 1
                        md_file.write("\n")
                        r = data_row_start_idx
                    else:
                        r += 1
                else:
                    r += 1
        else:
            md_file.write("Excel file has less than 2 sheets. Cannot process table sheet.\n\n")

        # --- Process sheets from the third onwards for Q&A pairs (from Excel) ---
        if len(sheet_names) > 2:
            question_keywords = ["what", "how", "why", "where", "when", "which", "who", "whom", "whose"]
            for sheet_name in sheet_names[2:]:
                md_file.write(f"## {clean_text(sheet_name.strip())}\n\n")
                df_qa = pd.read_excel(xls, sheet_name, header=None)
                current_account_context = clean_text(sheet_name.strip())

                # Buffer to accumulate lines that might be part of an answer
                answer_buffer = []
                current_question = None

                for index, row in df_qa.iterrows():
                    first_non_empty = None
                    q_col_index = -1
                    for cell_idx, cell in enumerate(row):
                        if pd.notna(cell):
                            first_non_empty = clean_text(str(cell))
                            q_col_index = cell_idx
                            break

                    is_question = False
                    if first_non_empty:
                        normalized_text = first_non_empty.lower()
                        if normalized_text.endswith('?'):
                            is_question = True
                        else:
                            for keyword in question_keywords:
                                if normalized_text.startswith(keyword + " "):
                                    is_question = True
                                    break

                    if is_question:
                        # If a new question starts, finalize the previous Q&A (if any)
                        if current_question and answer_buffer:
                            full_question = f"Regarding {current_account_context}: {current_question}"
                            full_answer = f"For {current_account_context}, the answer is: {' '.join(answer_buffer).strip()}"
                            md_file.write(f"**Question:** {full_question}\n")
                            md_file.write(f"**Answer:** {full_answer}\n\n")

                        # Start new Q&A
                        current_question = first_non_empty
                        answer_buffer = []
                    elif current_question:
                        # If it's not a new question, and we have a current question, assume it's part of the answer
                        answer_part = ""
                        if q_col_index != -1 and q_col_index < len(row) and pd.notna(row[q_col_index]):
                            answer_part = clean_text(str(row[q_col_index]))
                        if answer_part:
                            answer_buffer.append(answer_part)

                # Finalize the last Q&A pair after the loop
                if current_question and answer_buffer:
                    full_question = f"Regarding {current_account_context}: {current_question}"
                    full_answer = f"For {current_account_context}, the answer is: {' '.join(answer_buffer).strip()}"
                    md_file.write(f"**Question:** {full_question}\n")
                    md_file.write(f"**Answer:** {full_answer}\n\n")

                md_file.write("\n")

        # --- Add content from faqs.json to the Markdown file ---
        try:
            with open(faqs_json_file, 'r', encoding='utf-8') as f:
                faqs_data = json.load(f)

            if 'categories' in faqs_data:
                for category_data in faqs_data['categories']:
                    category_name = clean_text(category_data.get('category', 'General FAQs'))
                    md_file.write(f"## {category_name}\n\n")
                    if 'questions' in category_data:
                        for item in category_data['questions']:
                            question = clean_text(item.get('question', ''))
                            answer = clean_text(item.get('answer', ''))
                            if question and answer:
                                full_question = f"Regarding {category_name}: {question}"
                                full_answer = f"For {category_name}, the answer is: {answer}"
                                md_file.write(f"**Question:** {full_question}\n")
                                md_file.write(f"**Answer:** {full_answer}\n\n")
                    md_file.write("\n")
        except FileNotFoundError:
            print(f"Warning: {faqs_json_file} not found. Skipping its inclusion in Markdown.")
        except json.JSONDecodeError:
            print(f"Warning: Could not decode JSON from {faqs_json_file}. Skipping its inclusion in Markdown.")


##Train Set Preparation

In [6]:
def parse_markdown_to_json(md_file_path, output_json_path='train_data.json'):
    with open(md_file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    train_data = []
    table_buffer = []
    in_table = False
    current_headers = []
    current_section = None

    def format_profit_rate(rate_str):
        try:
            rate = float(rate_str)
            return f"{rate * 100:.1f}%" # Format as percentage with one decimal place
        except ValueError:
            return rate_str # Return as is if not a valid number

    def flush_table():
        nonlocal table_buffer, current_headers
        for row in table_buffer:
            if len(row) != len(current_headers):
                continue
            row_data = dict(zip(current_headers, row))

            # Format 'Profit Rate' if it exists in the row data
            if "Profit Rate" in row_data:
                row_data["Profit Rate"] = format_profit_rate(row_data["Profit Rate"])

            # Convert row into a synthetic Q/A pair
            if "Account Name" in row_data and "Profit Rate" in row_data:
                q = f"What is the profit rate for {row_data['Account Name']}?"
                a = f"The profit rate for {row_data['Account Name']} is {row_data['Profit Rate']}."
                train_data.append({"prompt": q, "response": a})
            elif "Product/Account" in row_data and "Profit Rate" in row_data:
                q = f"What is the profit rate for {row_data['Product/Account']} with tenor {row_data.get('Tenor', 'N/A')}?"
                a = f"The profit rate for {row_data['Product/Account']} with tenor {row_data.get('Tenor', 'N/A')} is {row_data['Profit Rate']}."
                train_data.append({"prompt": q, "response": a})
        table_buffer = []

    for line in lines:
        line = line.strip()

        # Handle tables
        if re.match(r"^\|.*\|$", line):
            cells = [cell.strip() for cell in line.strip('|').split('|')]
            if any(h in cells for h in ["Account Name", "Product/Account", "FCY"]): # Check for headers that indicate a table
                flush_table()
                current_headers = cells
                in_table = True
                continue
            elif in_table:
                if all(cell.startswith('---') for cell in cells):
                    continue  # Skip header separator
                else:
                    table_buffer.append(cells)
        else:
            if in_table:
                flush_table()
                in_table = False

        # Handle Q/A pairs
        q_match = re.match(r"\*\*Question:\*\* (.+)", line)
        if q_match:
            question = q_match.group(1).strip()
            continue  # response will come in next line(s)

        a_match = re.match(r"\*\*Answer:\*\* (.+)", line)
        if a_match and 'question' in locals():
            answer = a_match.group(1).strip()
            train_data.append({"prompt": question, "response": answer})
            del question  # reset for next pair

    # Flush any remaining table rows
    flush_table()

    # Save to JSON
    with open(output_json_path, 'w', encoding='utf-8') as out_file:
        json.dump(train_data, out_file, indent=2, ensure_ascii=False)

    print(f"Generated {output_json_path} with {len(train_data)} prompt/response pairs.")

In [7]:
excel_file_path = '/content/NUST Bank-Product-Knowledge.xlsx'
faqs_json_file_path = '/content/faqs.json'
output_markdown_path = 'nust_bank_data_final_structured.md'

print("Generating Markdown file from Excel and JSON FAQs...")
extract_data_in_markdown(excel_file_path, faqs_json_file_path, output_markdown_path)
print(f"Markdown file generated: {output_markdown_path}")

parse_markdown_to_json(output_markdown_path)

Generating Markdown file from Excel and JSON FAQs...
Markdown file generated: nust_bank_data_final_structured.md
Generated train_data.json with 333 prompt/response pairs.


#Model Loading and Fine Tuning

##Connect to Huggingface and Load the Tokenizer

In [None]:
login("HF TOKEN")

In [9]:
model_name = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, padding_side="right", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

dataset = load_dataset("json", data_files="train_data.json")["train"]

def tokenize(example):
    full_texts = [p + " " + r for p, r in zip(example["prompt"], example["response"])]
    return tokenizer(full_texts, padding="max_length", truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/333 [00:00<?, ? examples/s]

##Load the Model with LoRA

In [10]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)

model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)


config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

##Fine Tune the Model

In [11]:
training_args = TrainingArguments(
    output_dir="./llama3-qa-finetuned",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=3,
    fp16=True,
    save_strategy="epoch",
    logging_steps=10,
    report_to="none"
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

trainer.train()


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,3.3123
20,2.6223
30,2.5768
40,2.238
50,2.2663
60,2.0356


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=60, training_loss=2.5085528055826822, metrics={'train_runtime': 1155.0681, 'train_samples_per_second': 0.865, 'train_steps_per_second': 0.052, 'total_flos': 8513630156881920.0, 'train_loss': 2.5085528055826822, 'epoch': 2.9580838323353293})

##Save the Fine Tuned Model

In [12]:
model.save_pretrained("llama3-qa-finetuned")
tokenizer.save_pretrained("llama3-qa-finetuned")

('llama3-qa-finetuned/tokenizer_config.json',
 'llama3-qa-finetuned/special_tokens_map.json',
 'llama3-qa-finetuned/tokenizer.json')

#RAG Implementation

In [13]:
# Load the SentenceTransformer model for embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Load the training data from JSON
with open("train_data.json", "r", encoding="utf-8") as f:
    qa_pairs = json.load(f)

# Combine each prompt-response pair into a single document
texts = [f"Q: {item['prompt']}\nA: {item['response']}" for item in qa_pairs]

# Embed the documents
embeddings = embedding_model.encode(texts, convert_to_numpy=True, show_progress_bar=True)

# Create FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print(f"FAISS index built with {len(texts)} documents from train_data.json.")

# Save the index and corresponding texts
faiss.write_index(index, "faiss_index.idx")
with open("indexed_docs.json", "w", encoding="utf-8") as f:
    json.dump(texts, f)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

FAISS index built with 333 documents from train_data.json.


##Save Embeddings and Index

In [14]:
embeddings_path = "nust_bank_embeddings.npy"
index_path = "nust_bank_faiss_index.faiss"

np.save(embeddings_path, embeddings)
faiss.write_index(index, index_path)

print(f"Embeddings saved to {embeddings_path}")
print(f"FAISS index saved to {index_path}")

Embeddings saved to nust_bank_embeddings.npy
FAISS index saved to nust_bank_faiss_index.faiss


In [15]:
!pip install fastapi uvicorn transformers sentence-transformers faiss-cpu pyngrok python-multipart PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
!ngrok config add-authtoken #ADD AUTH TOKEN

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [8]:
!pip install guardrails-ai

Collecting guardrails-ai
  Downloading guardrails_ai-0.6.6-py3-none-any.whl.metadata (12 kB)
Collecting diff-match-patch<20230431,>=20230430 (from guardrails-ai)
  Downloading diff_match_patch-20230430-py3-none-any.whl.metadata (5.2 kB)
Collecting faker<26.0.0,>=25.2.0 (from guardrails-ai)
  Downloading Faker-25.9.2-py3-none-any.whl.metadata (15 kB)
Collecting griffe<0.37.0,>=0.36.9 (from guardrails-ai)
  Downloading griffe-0.36.9-py3-none-any.whl.metadata (6.1 kB)
Collecting guardrails-api-client<0.5.0,>=0.4.0a1 (from guardrails-ai)
  Downloading guardrails_api_client-0.4.0a1-py3-none-any.whl.metadata (19 kB)
Collecting guardrails-hub-types<0.0.5,>=0.0.4 (from guardrails-ai)
  Downloading guardrails_hub_types-0.0.4-py3-none-any.whl.metadata (15 kB)
Collecting jsonref<2.0.0,>=1.1.0 (from guardrails-ai)
  Downloading jsonref-1.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting litellm<2.0.0,>=1.37.14 (from guardrails-ai)
  Downloading litellm-1.70.0-py3-none-any.whl.metadata (38 kB)
Coll

In [1]:
from guardrails import Guard

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [15]:
!python backend.py

2025-05-20 22:25:14.288236: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747779914.307774   40423 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747779914.313770   40423 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-20 22:25:14.333699: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Traceback (most recent call last):
  File "/content/backend.py", line 23, in <module>
    from guardrails.hub import 