In [5]:
# ======================================================
# 1. Install Dependencies
# ======================================================
!pip install -U transformers datasets bitsandbytes peft accelerate gradio sentence-transformers faiss-cpu --quiet

# ======================================================
# 2. Load Retriever and Build FAISS Index
# ======================================================
import torch
import faiss
import numpy as np
import pandas as pd
import json
from sentence_transformers import SentenceTransformer, CrossEncoder

# Load nodes and definitions
df_nodes = pd.read_csv('kg_nodes.csv', encoding='utf-8-sig')
with open('condition_definitions.json', 'r', encoding='utf-8') as f:
    condition_definitions = json.load(f)

# Build corpus
corpus = []
for idx, row in df_nodes.iterrows():
    if row['type'] == 'Condition':
        label = row['label_malay']
        definition = condition_definitions.get(label, '')
        if definition:
            corpus.append(f"{label}: {definition}")
        else:
            corpus.append(label)

# Load embedding model (bi-encoder)
embedder = SentenceTransformer('mesolitica/llama2-embedding-600m-8k-contrastive')

# Encode corpus
corpus_embeddings = embedder.encode(corpus, convert_to_numpy=True, show_progress_bar=True)

# Build FAISS index
dimension = corpus_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(corpus_embeddings)

# Save index and corpus
faiss.write_index(index, 'faiss_index')
with open('corpus.json', 'w', encoding='utf-8') as f:
    json.dump(corpus, f, ensure_ascii=False, indent=2)

# Load cross-encoder for re-ranking
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Load FAISS index and corpus for inference
index = faiss.read_index('faiss_index')
with open('corpus.json', 'r', encoding='utf-8') as f:
    corpus = json.load(f)

# Retrieval function with cross-encoder re-ranking
def retrieve_context(query, top_k=5):
    query_embedding = embedder.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)
    retrieved = [corpus[idx] for idx in indices[0]]

    cross_inp = [[query, ctx] for ctx in retrieved]
    scores = cross_encoder.predict(cross_inp)
    ranked = sorted(zip(retrieved, scores), key=lambda x: x[1], reverse=True)
    top_context = "\n".join([ctx for ctx, _ in ranked[:3]])
    return top_context

# ======================================================
# 3. Load Generator Model and Prepare for LoRA
# ======================================================
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

token = "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"

tokenizer = AutoTokenizer.from_pretrained(
    "mesolitica/Malaysian-Qwen2.5-0.5B-Instruct", use_auth_token=token
)

model = AutoModelForCausalLM.from_pretrained(
    "mesolitica/Malaysian-Qwen2.5-0.5B-Instruct", use_auth_token=token
)

# Prepare for LoRA fine-tuning
model = prepare_model_for_kbit_training(model)

# Configure LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=['q_proj', 'v_proj'],
    lora_dropout=0.05,
    bias='none',
    task_type='CAUSAL_LM'
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()




Batches:   0%|          | 0/4 [00:00<?, ?it/s]



trainable params: 540,672 || all params: 630,708,096 || trainable%: 0.0857


In [6]:

# ======================================================
# 4. Load and Prepare Dataset with Context
# ======================================================
from datasets import Dataset

# Load your training data
with open('mental_health_data.json', 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# Build new dataset with retrieved context
new_data = []
for entry in raw_data:
    question = entry['title']
    context = retrieve_context(question)
    merged_text = f"Context: {context}\nSoalan: {question}\nJawapan: {entry['content']}"
    new_data.append({'text': merged_text})

dataset = Dataset.from_list(new_data)

# Tokenise dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
    batch_size=8,  # You can even try batch_size=4 if it still crashes
)


# ======================================================
# 5. Fine-Tuning Setup with Trainer
# ======================================================
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./finetuned_model",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=10,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none"
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

# Start fine-tuning
trainer.train()

# Save final model
model.save_pretrained('./finetuned_model')
tokenizer.save_pretrained('./finetuned_model')

# ======================================================
# 6. Gradio UI for Inference
# ======================================================
import gradio as gr

# Reload model (optional)
model = AutoModelForCausalLM.from_pretrained('./finetuned_model', device_map='auto', load_in_8bit=True)
tokenizer = AutoTokenizer.from_pretrained('./finetuned_model')
model.eval()

def generate_answer(question):
    context = retrieve_context(question)
    prompt = f"Context: {context}\nSoalan: {question}\nJawapan:"
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.7)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer.replace(prompt, '').strip()

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## Sistem Tanya Jawab Perubatan Bahasa Melayu")

    with gr.Row():
        with gr.Column():
            question = gr.Textbox(
                label="Soalan Perubatan",
                placeholder="Contoh: Apakah gejala anxiety?",
                lines=3
            )
            submit_btn = gr.Button("Dapatkan Jawapan", variant="primary")

        with gr.Column():
            answer = gr.Textbox(
                label="Jawapan",
                interactive=False,
                lines=5
            )

    gr.Examples(
        examples=[
            ["Apakah rawatan untuk anxiety?"],
            ["Apakah gejala anxiety attack?"],
            ["Bagaimana cara mengurangkan anxiety tanpa ubat?"]
        ],
        inputs=question
    )

    submit_btn.click(fn=generate_answer, inputs=question, outputs=answer)
    question.submit(fn=generate_answer, inputs=question, outputs=answer)

demo.launch(share=True)


Map:   0%|          | 0/122 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,2.2207
20,2.141
30,2.0702
40,2.0273
50,1.9788
60,1.9661
70,1.9445
80,1.9189


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ff9ece313cadf63a33.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [7]:
import shutil
from google.colab import files

# Step 1: Compress the finetuned_model folder
shutil.make_archive('finetuned_model', 'zip', 'finetuned_model')

# Step 2: Download the zipped file
files.download('finetuned_model.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>