In [1]:
import pandas as pd

def _first_non_empty(row):
    """Return the first non-NaN cell of ``row`` as a stripped string, or None if every cell is NaN."""
    for cell in row:
        if pd.notna(cell):
            return str(cell).strip()
    return None


def extract_data(excel_file):
    """Extract question/answer pairs from every sheet of a workbook except the first two.

    Each sheet is read without a header row. For every row, only the first
    non-empty cell is considered. A row whose first non-empty cell ends with
    '?' is treated as a question; the first non-empty cells of the rows that
    follow are concatenated (space-separated) into its answer until the next
    question row or a completely empty row is reached.

    Args:
        excel_file: Path (or any object accepted by ``pd.ExcelFile``) of the
            workbook to read.

    Returns:
        A list of ``{'question': str, 'answer': str}`` dicts, in sheet order
        and then row order. Questions with no answer text are omitted.
    """
    all_qa_pairs = []

    # The context manager guarantees the workbook handle is released, even if
    # reading one of the sheets raises.
    with pd.ExcelFile(excel_file) as xls:
        # The first two sheets are skipped.
        for sheet_name in xls.sheet_names[2:]:
            df = pd.read_excel(xls, sheet_name, header=None)

            # Compute the leading text of every row once, in positional order,
            # so the answer scan below never depends on the index labels.
            leading = [_first_non_empty(row) for _, row in df.iterrows()]

            for position, question in enumerate(leading):
                if not question or not question.endswith('?'):
                    continue

                answer_parts = []
                for next_position in range(position + 1, len(leading)):
                    value = leading[next_position]
                    # A fully empty row or the next question ends the answer.
                    if value is None or value.endswith('?'):
                        break
                    # Rows whose first cell is only whitespace are skipped
                    # without terminating the answer.
                    if value:
                        answer_parts.append(value)

                if answer_parts:
                    all_qa_pairs.append(
                        {'question': question, 'answer': " ".join(answer_parts)}
                    )

    return all_qa_pairs

import json

# Workbook to read; this is an absolute path.
excel_file_path = '/content/NUST Bank-Product-Knowledge.xlsx'

# Pull the question/answer pairs out of the workbook.
extracted_data = extract_data(excel_file_path)

# Show each pair, followed by a 20-dash separator line.
for item in extracted_data:
    print(
        f"Question: {item['question']}",
        f"Answer: {item['answer']}",
        "-" * 20,
        sep="\n",
    )

# Persist the pairs as indented JSON for the later cells.
with open('basic_qa_pairs.json', 'w') as json_file:
    json.dump(extracted_data, json_file, indent=4)

FileNotFoundError: [Errno 2] No such file or directory: '/content/NUST Bank-Product-Knowledge.xlsx'

In [None]:
!pip install transformers datasets peft bitsandbytes accelerate

In [None]:
import json

# Read the question/answer pairs from basic_qa_pairs.json
with open('basic_qa_pairs.json', 'r') as f:
    qa_data = json.load(f)

# Turn every pair into a prompt/response record: the prompt carries the
# question and ends with "A:", the response is the answer text.
train_data = []
for item in qa_data:
    train_data.append({
        "prompt": f"Q: {item['question']}\nA:",
        "response": item['answer'],
    })

# Write the records to a JSON file
with open('train_data.json', 'w') as f:
    json.dump(train_data, f, indent=2)


In [None]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub.
# NOTE(review): called without arguments, this presumably prompts for an
# access token interactively — confirm before running unattended.
login()

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load tokenizer
# model_name is the Hub checkpoint id; later cells reuse it to load the model.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load your dataset
# train_data.json is read with the "json" loader; only its "train" split is kept.
dataset = load_dataset("json", data_files="train_data.json")["train"]

# Tokenize
def tokenize(batch):
    """Tokenize a batch of prompt/response records as single training texts.

    Each prompt is joined to its response with one space, and the resulting
    texts are passed to the module-level ``tokenizer`` with truncation and
    padding to a fixed length of 512.

    Args:
        batch: Mapping with parallel ``"prompt"`` and ``"response"`` sequences.

    Returns:
        Whatever the tokenizer returns for the joined texts.
    """
    full_texts = []
    for prompt_text, response_text in zip(batch["prompt"], batch["response"]):
        full_texts.append(prompt_text + " " + response_text)
    return tokenizer(full_texts, truncation=True, padding="max_length", max_length=512)


# Apply tokenize to the dataset in batches; the result is the training set
# handed to the Trainer in a later cell.
tokenized_dataset = dataset.map(tokenize, batched=True)


In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch

# Load in 8-bit
# The quantization settings are passed as a BitsAndBytesConfig; passing
# load_in_8bit=True directly to from_pretrained is deprecated.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto"
)

# Prepare for LoRA fine-tuning
model = prepare_model_for_kbit_training(model)

# LoRA configuration
# Adapters of rank 16 are attached to the q_proj and v_proj modules only.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA
model = get_peft_model(model, config)


In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Training hyper-parameters: with a per-device batch of 2 and 8 accumulation
# steps, gradients from 16 examples per device are accumulated before each
# optimizer update. A checkpoint is saved at the end of every epoch.
training_args = TrainingArguments(
    output_dir="./deepseek-qa-finetuned",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=3,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
)

# mlm=False selects the causal (next-token) language-modeling objective.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()


In [None]:
# Save the trained model and its tokenizer side by side so the directory can
# be passed to from_pretrained in the next cell.
# NOTE(review): model is the PEFT-wrapped model here, so this presumably
# writes only the LoRA adapter weights rather than a merged model — confirm.
model.save_pretrained("deepseek-qa-finetuned")
tokenizer.save_pretrained("deepseek-qa-finetuned")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Reload the fine-tuned artifacts for inference. The path is a local directory
# here; a Hub repo id such as "your-username/deepseek-qa-finetuned" can be
# used instead if the model was uploaded.
model_path = "./deepseek-qa-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
# Switch to evaluation mode before generating.
model = model.eval()


In [None]:
def generate_answer(question, max_tokens=256):
    """Generate an answer to ``question`` with the module-level model and tokenizer.

    The question is wrapped in the same ``"Q: ...\\nA:"`` prompt format used
    for the training data, and a sampled completion is generated.

    Args:
        question: The question text to ask.
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        The generated completion with the prompt removed and surrounding
        whitespace stripped.
    """
    prompt = f"Q: {question}\nA:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Remember how many tokens belong to the prompt so they can be dropped
    # from the generated sequence below.
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. Splitting the decoded text on
    # "A:" would cut off any answer (or question) that itself contains "A:".
    new_token_ids = outputs[0][prompt_length:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()


In [None]:
# Smoke-test the fine-tuned model with a single sample question.
question = "What is NUST4Car?"
answer = generate_answer(question)
print("Answer:", answer)
