In [1]:
import pandas as pd

def extract_data(excel_file):
    all_qa_pairs = []
    xls = pd.ExcelFile(excel_file)
    sheet_names = xls.sheet_names

    for i, sheet_name in enumerate(sheet_names):
        if i < 2:
            continue

        df = pd.read_excel(xls, sheet_name, header=None)
        qa_pairs = []

        for index, row in df.iterrows():
            first_non_empty_index = -1
            first_non_empty_value = None
            for col_index, cell in enumerate(row):
                if pd.notna(cell):
                    first_non_empty_index = col_index
                    first_non_empty_value = str(cell).strip()
                    break

            if first_non_empty_value and first_non_empty_value.endswith('?'):
                question = first_non_empty_value
                answer = ""
                for next_index in range(index + 1, len(df)):
                    next_row = df.iloc[next_index]
                    next_first_non_empty_index = -1
                    next_first_non_empty_value = None
                    for col_index, cell in enumerate(next_row):
                        if pd.notna(cell):
                            next_first_non_empty_index = col_index
                            next_first_non_empty_value = str(cell).strip()
                            break

                    if next_first_non_empty_value and not next_first_non_empty_value.endswith('?'):
                        answer += next_first_non_empty_value + " "
                    elif next_first_non_empty_value and next_first_non_empty_value.endswith('?'):
                        break
                    elif all(pd.isna(cell) for cell in next_row):
                        break
                if question and answer:
                    qa_pairs.append({'question': question, 'answer': answer.strip()})

        all_qa_pairs.extend(qa_pairs)

    return all_qa_pairs

excel_file_path = '/content/NUST Bank-Product-Knowledge.xlsx'

# Extract the basic question-answer pairs
extracted_data = extract_data(excel_file_path)

# Print the extracted data
for item in extracted_data:
    print(f"Question: {item['question']}")
    print(f"Answer: {item['answer']}")
    print("-" * 20)

# Save the extracted data to JSON
import json
with open('basic_qa_pairs.json', 'w') as f:
    json.dump(extracted_data, f, indent=4)

Question: I would like to open an account with my son, do u have any product for kids?
Answer: Yes our product is Little Champs Account. It is designed specifically for minors (individuals below the age of 18 years). A child requires the help of a parental/legal guardian to open this account and avail its facilities. Little Champs get a Debit Card and chequebook which is free the first time What are the main Features  of the Little Champs Account. Minimum initial deposit of Rs.100/- Free first chequebook* Free debit card* (annual/replacement fees apply). This debit card has the following limits Daily funds Transfer Limit:     Rs.100,000/- Daily ATM Withdrawal Limit:  Rs.25,000/- Daily POS Limit:                     Rs.50,000/- * For Current Account only
--------------------
Question: What other Value added features does the Little Champs Account have?
Answer: Attractive returns on savings account SMS alert service on digital transactions I Net banking services Free education insurance 

In [19]:
!pip install transformers datasets peft bitsandbytes accelerate



In [20]:
import json

# Load merged QA data
with open('basic_qa_pairs.json', 'r') as f:
    qa_data = json.load(f)

# Convert to prompt-response format
train_data = [{"prompt": f"Q: {item['question']}\nA:", "response": item['answer']} for item in qa_data]

# Save as Hugging Face-style dataset
with open('train_data.json', 'w') as f:
    json.dump(train_data, f, indent=2)


In [22]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [23]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load tokenizer
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load your dataset
dataset = load_dataset("json", data_files="train_data.json")["train"]

# Tokenize
def tokenize(batch):
    full_texts = [p + " " + r for p, r in zip(batch["prompt"], batch["response"])]
    return tokenizer(full_texts, truncation=True, padding="max_length", max_length=512)


tokenized_dataset = dataset.map(tokenize, batched=True)


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/293 [00:00<?, ? examples/s]

In [27]:
from transformers import AutoModelForCausalLM
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch

from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch


model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)


# Prepare for LoRA fine-tuning
model = prepare_model_for_kbit_training(model)

# LoRA configuration
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA
model = get_peft_model(model, config)




In [29]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./deepseek-qa-finetuned",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=6,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
trainer.train()



NotImplementedError: Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device.

In [8]:
model.save_pretrained("deepseek-qa-finetuned")
tokenizer.save_pretrained("deepseek-qa-finetuned")

('deepseek-qa-finetuned/tokenizer_config.json',
 'deepseek-qa-finetuned/special_tokens_map.json',
 'deepseek-qa-finetuned/tokenizer.json')

In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the fine-tuned model (local or from Hugging Face Hub)
model_path = "./deepseek-qa-finetuned"  # or "your-username/deepseek-qa-finetuned" if uploaded
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto").eval()


In [10]:
def generate_answer(question, max_tokens=256):
    prompt = f"Q: {question}\nA:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Remove the question part and only return the answer
    return generated_text.split("A:")[-1].strip()


In [11]:
question = "What is NUST4Car?"
answer = generate_answer(question)
print("Answer:", answer)


Answer: The benefits of NUST4Car are as follows:
1. Free insurance coverage of 100,000 Rupees
2. 30 days insurance coverage
3. No monthly premium
4. No claim processing charges
5. No service charges
6. No application charges
7. No transfer charges
8. No late fees
9. No understatements
10. No reinsurance charges
11. No reinsurance delays
12. No underwritten insurance charges
13. No underwritten delays
14. No underwritten claim processing charges
15. No underwritten reinsurance charges
16. No underwritten reinsurance delays
17. No underwritten reinsurance reinsurance charges
18. No underwritten reinsurance reinsurance
