<a href="https://www.kaggle.com/code/shakil19/qwen2-5-lora-rag?scriptVersionId=239146859" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# Show the GPU(s) visible to this session before installing or loading anything.
!nvidia-smi

In [None]:
import os

In [None]:
# # %%capture
# import os
# if "COLAB_" not in "".join(os.environ.keys()):
#     !pip install unsloth
# else:
#     # Do this only in Colab notebooks! Otherwise use pip install unsloth
# !uv pip install --system --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo -qq
# !uv pip install --system sentencepiece protobuf datasets huggingface_hub hf_transfer -qq
# !uv pip install --system --no-deps unsloth

In [None]:
# Install Unsloth and vLLM into the system interpreter with uv (quiet mode).
# NOTE(review): versions are not pinned, so a fresh run may resolve different releases.
!uv pip install --system unsloth vllm -qq

In [None]:
# from google.colab import userdata
# HF = userdata.get('HF_TOKEN')

In [None]:
from kaggle_secrets import UserSecretsClient
# Read the Hugging Face token from Kaggle's secret store rather than hard-coding it.
user_secrets = UserSecretsClient()
HF = user_secrets.get_secret("HF_TOKEN")  # passed as `token=` when pushing the merged model to the Hub

In [None]:
from unsloth import FastLanguageModel
import torch, os
from dotenv import load_dotenv
# Load variables from a .env file into the process environment, if one is present.
load_dotenv()

# HF = os.getenv("HF_TOKEN")
max_seq_length = 1024  # sequence-length limit handed to from_pretrained
lora_rank = 64  # handed to from_pretrained as max_lora_rank
# NOTE(review): the LoRA adapters are later created with r = 32, not lora_rank — confirm which value is intended.

# Re-run the code after restarting the kernel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen2.5-3B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.5, # Reduce if out of memory
)

In [None]:
# Attach LoRA adapters to the attention (q/k/v/o) and MLP (gate/up/down) projections.
# NOTE(review): r = 32 here, while the model was loaded with max_lora_rank = lora_rank = 64 — confirm intended.
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    use_rslora = False,
    loftq_config = None,
)

In [None]:
from datasets import load_dataset, concatenate_datasets
# Load the two local JSON files from the "may-10" Kaggle input as training splits.
# NOTE(review): despite the variable names, these hold an intent dataset and a Q&A dataset
# (per the file names); the "reasoning"/"non_reasoning" naming looks inherited from a template.
reasoning_dataset = load_dataset("json", data_files="/kaggle/input/may-10/intent_dataset.json", split="train")
non_reasoning_dataset = load_dataset("json", data_files="/kaggle/input/may-10/qna.json", split="train")

In [None]:
from pprint import pprint
# Print the first record of each dataset to inspect its column layout.
pprint(reasoning_dataset[0])
pprint(non_reasoning_dataset[0])

In [None]:
def generate_conversation(examples):
    """Turn a batch of intent records into two-turn chat conversations.

    Args:
        examples: Batched mapping (as supplied by ``Dataset.map(batched=True)``)
            with ``"patterns"`` and ``"responses"`` columns. Each entry is either
            a string or a list of strings; lists are joined with a single space.

    Returns:
        dict: ``{"conversations": [...]}`` holding one ``[user, assistant]``
        message pair per input row.
    """
    conversations = []
    for problem, solution in zip(examples["patterns"], examples["responses"]):
        # Entries may be a single string or a list of strings; flatten lists.
        if not isinstance(problem, str):
            problem = " ".join(problem)
        if not isinstance(solution, str):
            solution = " ".join(solution)

        conversations.append([
            {"role": "user", "content": problem},
            # The reply turn must use the "assistant" role: chat templates key on
            # role names, and an unknown role such as "bot" is not rendered as the
            # model's answer, so the response would be missing from the training text.
            {"role": "assistant", "content": solution},
        ])
    return {"conversations": conversations}

In [None]:
# Build the "conversations" column for the intent dataset, then render every
# conversation to a plain string (tokenize = False) with the tokenizer's chat template.
reasoning_conversations = tokenizer.apply_chat_template(
    reasoning_dataset.map(generate_conversation, batched = True)["conversations"],
    tokenize = False,
)
reasoning_conversations[0]  # shown as the cell output

In [None]:
from unsloth.chat_templates import standardize_sharegpt

# Prepare the dataset
# NOTE(review): the records are read via "user"/"bot" columns right after this call, so the
# dataset may not be in ShareGPT layout — confirm standardize_sharegpt changes anything here.
dataset = standardize_sharegpt(non_reasoning_dataset)

# Create the "conversations" column from "user" and "bot" for each example
def create_conversation(example):
    """Build a two-message conversation from one Q&A record.

    The chat template expects the reply under the "assistant" role, so the
    record's "bot" field is mapped onto that role.

    Args:
        example: Mapping with "user" and "bot" entries.

    Returns:
        dict: {"conversations": [user_message, assistant_message]}.
    """
    question_turn = {"role": "user", "content": example["user"]}
    answer_turn = {"role": "assistant", "content": example["bot"]}
    return {"conversations": [question_turn, answer_turn]}

# Add the "conversations" column to every record (one record per call, not batched).
dataset = dataset.map(create_conversation)

In [None]:
# Apply the chat template and store the **prompt** in a new column
def apply_template(example):
    """Render one record's conversation to a chat-formatted string.

    Uses the notebook-level ``tokenizer`` and returns the untokenized text
    under the "prompt" key so ``Dataset.map`` stores it as a new column.
    """
    rendered = tokenizer.apply_chat_template(example["conversations"], tokenize=False)
    return {"prompt": rendered}

# Add the rendered "prompt" column to every record.
dataset = dataset.map(apply_template)

In [None]:
# Collect the rendered Q&A prompts, show one, and print the size of both example pools.
non_reasoning_conversations = dataset["prompt"]
pprint(non_reasoning_conversations[0])
pprint(len(reasoning_conversations))
pprint(len(non_reasoning_conversations))

In [None]:
# Mixing knob: the Q&A sample drawn next has size len(reasoning_conversations) * (1 - chat_percentage).
chat_percentage = 0.75

In [None]:
import pandas as pd
# Draw a reproducible random sample of the rendered Q&A prompts, sized relative
# to the number of intent examples.
non_reasoning_subset = pd.Series(non_reasoning_conversations)
requested_samples = int(len(reasoning_conversations) * (1.0 - chat_percentage))
# Series.sample (without replacement) raises ValueError when asked for more rows
# than exist, so cap the request at the number of available Q&A prompts.
non_reasoning_subset = non_reasoning_subset.sample(
    min(requested_samples, len(non_reasoning_subset)),
    random_state = 2407,
)

In [None]:
# Stack the intent examples and the sampled Q&A examples into one Series named
# "text", the column the trainer reads.
intent_series = pd.Series(reasoning_conversations)
qna_series = pd.Series(non_reasoning_subset)
data = pd.concat([intent_series, qna_series]).rename("text")

In [None]:
from datasets import Dataset
# Convert the single-column frame into a Hugging Face Dataset and shuffle it with a fixed seed.
combined_dataset = Dataset.from_pandas(pd.DataFrame(data))
combined_dataset = combined_dataset.shuffle(seed = 3407)

len(combined_dataset)  # shown as the cell output

In [None]:
from trl import SFTTrainer, SFTConfig
# Supervised fine-tuning on the "text" column of the combined dataset; no evaluation set is used.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = combined_dataset,
    eval_dataset = None,
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,  # 2 x 4 = 8 sequences per optimizer step per device
        warmup_steps = 5,
        max_steps = 30,  # training stops after 30 optimizer steps
        learning_rate = 2e-4,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none",
    )
)

In [None]:
# Run the fine-tuning loop and keep the returned training statistics.
trainer_stats = trainer.train()

In [None]:
# Build a single-turn prompt string ending with the generation prompt for the assistant turn.
messages = [
    {"role" : "user", "content" : "What do you know about the sun?"},
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True,
    # NOTE(review): enable_thinking looks like a Qwen3 template switch; confirm the
    # Qwen2.5 template used here reads it at all.
    enable_thinking = False,
)

In [None]:
from transformers import TextStreamer
# Generate on the GPU and stream the decoded tokens (prompt excluded) as they are produced.
# NOTE(review): sampling knobs are passed without do_sample — confirm sampling is actually active.
# NOTE(review): max_new_tokens = 2048 is larger than the max_seq_length = 1024 used at load time — confirm.
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 2048,
    temperature = 0.7, top_p = 0.8, top_k = 20,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

In [None]:
# Second smoke test: same prompt-building and streaming generation, different question.
messages = [
    {"role" : "user", "content" : "What is a latte?"}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True,
    # NOTE(review): enable_thinking looks like a Qwen3 template switch; confirm it has any effect here.
    enable_thinking = True,
)

from transformers import TextStreamer
# Stream the generated tokens (prompt excluded) while generating on the GPU.
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 2048,
    temperature = 0.7, top_p = 0.8, top_k = 20,
    streamer = TextStreamer(tokenizer, skip_prompt = True)
)

In [None]:
# Save the trained model and tokenizer to the local "vllm-out" directory.
# NOTE(review): for a LoRA-wrapped model this presumably writes only the adapter weights — confirm.
model.save_pretrained("vllm-out")
tokenizer.save_pretrained("vllm-out")

In [None]:
# Save a merged 16-bit copy of the model together with the tokenizer.
# NOTE(review): this writes into the same "vllm-out" directory as the plain save_pretrained
# call — confirm the two outputs are meant to share a folder.
model.save_pretrained_merged("vllm-out", tokenizer, save_method="merged_16bit")
print(f"Merged 16-bit model saved locally to: vllm-out/")

In [None]:
# Show the concrete class of the wrapped model object.
pprint(type(model))

In [None]:
# Print the name and shape of every parameter in the model (long output).
for name, param in model.named_parameters():
    print(name, param.shape)

In [None]:
# Upload the merged 16-bit model and tokenizer to the Hub, authenticating with the HF token.
model.push_to_hub_merged("Moitreyee444/qwen2.5_3B_lora_model", tokenizer, save_method = "merged_16bit", token = HF)

## RAG Pipeline

In [None]:
import os
from vllm import LLM, SamplingParams
# from google.colab import userdata
# Set the PyTorch CUDA allocator option before the vLLM engine is created.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# https://huggingface.co/Moitreyee444/qwen2.5_3B_lora_model
# NOTE(review): this repo id differs from the one the merged model was pushed to in the
# training section — confirm which checkpoint should be served.
hub_model_id = "shakil-mosharrof/qwen2.5_lora_model"

llm = LLM(
    model=hub_model_id,
    trust_remote_code=True,  # Important for Qwen models
)

sampling_params = SamplingParams(temperature=0.7, top_p=0.95, max_tokens=256)

# Generate text
# The prompts are raw strings; no chat template is applied in this smoke test.
prompts = ["Explain the theory of relativity in simple terms:", "What is the recipe for a good chocolate cake?"]
outputs = llm.generate(prompts, sampling_params)

# Print each prompt next to the first completion returned for it.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated: {generated_text!r}")

In [None]:
from pprint import pprint
# Generate text
# Same two prompts as the previous cell; only the printing differs (pprint instead of print).
prompts = ["Explain the theory of relativity in simple terms:", "What is the recipe for a good chocolate cake?"]
outputs = llm.generate(prompts, sampling_params)

# pprint receives one already-formatted string per result.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    pprint(f"Prompt: {prompt!r}, Generated: {generated_text!r}")

In [None]:
# Install the RAG dependencies (vector store, PDF loader, embedding model, serving stack) with uv.
# NOTE(review): versions are not pinned, and vllm/transformers are installed again here.
!uv pip install --system streamlit chromadb langchain_community sentence-transformers pypdf transformers vllm -q

In [None]:
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer
from pathlib import Path
import os
from langchain_community.document_loaders import PyPDFLoader
# Imports for vLLM and Hugging Face Tokenizer
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer


# Hub repo id of the fine-tuned checkpoint served by vLLM and used for the tokenizer.
FINETUNED_MODEL_HUB_ID = "shakil-mosharrof/qwen2.5_lora_model" # <<< IMPORTANT: VERIFY THIS

# --- 1. Prepare Your Corpus ---
# Load the source PDF from the Kaggle input into a list of document objects.
loader = PyPDFLoader('/kaggle/input/may-10/sec-3.pdf')
documents = loader.load()

# --- 2. Initialize Embedding Model for ChromaDB ---

# Sentence-transformers model name used by the collection's embedding function.
retrieval_embedder_model_name = 'all-MiniLM-L6-v2'


# --- 3. Initialize ChromaDB Persistently ---
# The store lives in the local "chroma_rag_db_vllm" directory; the collection is
# reused when it already exists.
chroma_client = chromadb.PersistentClient(path="chroma_rag_db_vllm")
collection = chroma_client.get_or_create_collection(
    name="knowledge_vllm",
    embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=retrieval_embedder_model_name
    )
)

# --- 4. Chunk and Store: Add to ChromaDB (if not already done) ---
# Populate the collection only when it is empty so re-runs do not insert duplicates.
if collection.count() != 0:
    print(f"ChromaDB collection '{collection.name}' already contains {collection.count()} documents.")
else:
    print(f"Adding {len(documents)} documents to ChromaDB collection '{collection.name}'...")
    # One stored entry per loaded document, keyed by its position in the list.
    texts = [doc.page_content for doc in documents]
    doc_positions = range(len(texts))

    collection.add(
        documents=texts,
        metadatas=[{"source": f"doc_{i}"} for i in doc_positions],
        ids=[f"id_{i}" for i in doc_positions],
    )
    print("Documents added to ChromaDB.")

# --- 5. Initialize vLLM and Tokenizer for the Fine-tuned Model ---
print(f"Loading fine-tuned model '{FINETUNED_MODEL_HUB_ID}' with vLLM...")

try:
    # Load tokenizer for the fine-tuned Qwen2 model
    # This is crucial for applying the correct chat template
    qwen_tokenizer = AutoTokenizer.from_pretrained(
        FINETUNED_MODEL_HUB_ID,
        trust_remote_code=True,
    )

    # Initialize vLLM with the fine-tuned model
    llm = LLM(
        model=FINETUNED_MODEL_HUB_ID,
        trust_remote_code=True,
    )
    print("Fine-tuned model and tokenizer loaded successfully.")
except Exception as e:
    # Print troubleshooting hints, then let the original error propagate.
    print(f"Error loading fine-tuned model or tokenizer: {e}")
    print("Please ensure:")
    print(f"1. '{FINETUNED_MODEL_HUB_ID}' is the correct Hugging Face Hub ID.")
    print("2. Your Hugging Face token (from 'HF' env var) is valid and has permissions if the model is private.")
    print("3. You have vLLM and necessary dependencies (like PyTorch with CUDA) installed correctly.")
    # Re-raise instead of calling exit(): inside a notebook exit() shuts the kernel
    # down (losing all state) and hides the traceback, whereas re-raising stops this
    # cell with the full error and keeps the session alive for debugging.
    raise

# Define sampling parameters for vLLM
# Note: `max_new_tokens` from HF becomes `max_tokens` in vLLM SamplingParams
# These replace the sampling parameters defined for the earlier smoke test.
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.8,
    top_k=20,
    max_tokens=512  # Max number of tokens to generate
)

# --- 6. Set up System Prompt and User Query Loop ---
# Plain (non-f) string: the prompt has no placeholders, and keeping it a regular
# literal means any "{" or "}" added to the text later is not treated as a format field.
system_prompt = """
You are an AI-powered expert assistant specialized in extracting and 
interpreting policies and regulations of the Bangladesh Air Force from provided PDF documents. 
Your task is to accurately and efficiently understand dense, complex legal and procedural texts, and provide clear, precise, 
and contextually relevant answers to user queries about Air Force policies and regulations. When responding:  

- Base answers strictly on the content of the given documents.  
- Explain policy or regulation details in simple, clear language suitable for users at different hierarchy levels.  
- Provide specific references or excerpts from the source documents when helpful.  
- Handle complex queries by breaking down information into understandable parts.  
- Continuously learn and improve from interactions to better support decision-making and compliance processes.  

Maintain a professional and concise tone, prioritizing accuracy and usefulness for users navigating Air Force policies and regulations.

"""

print("\n--- RAG Chat with vLLM ---")
print("Type 'exit' or 'quit' to end.")

# Interactive loop: read a question, retrieve context from ChromaDB, build the
# chat prompt with the model's template, generate with vLLM, and print the answer.
while True:
    try:
        user_query = input("USER: ")
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt at the prompt ends the chat cleanly
        # instead of surfacing a traceback.
        print()
        break

    command = user_query.strip()
    if command.lower() in {"exit", "quit"}:
        break
    if not command:
        continue

    # --- 6a. Retrieve Relevant Chunks from ChromaDB ---
    results = collection.query(
        query_texts=[user_query],
        n_results=3  # Retrieve top 3 relevant chunks
    )
    # results["documents"] holds one list of documents per query text; only one
    # query is sent, so take the first list (or nothing when it is missing/empty).
    documents_per_query = results["documents"]
    contexts = list(documents_per_query[0]) if documents_per_query and documents_per_query[0] else []

    if contexts:
        retrieved_context_str = "\n".join(f"- {ctx}" for ctx in contexts)
    else:
        retrieved_context_str = "No relevant context found in the database."

    # --- 7. Compose Prompt for Qwen2 Model ---
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "system", "content": "Context:\n" + retrieved_context_str},
        {"role": "user", "content": user_query},
    ]

    # Apply the chat template using the Qwen2 tokenizer
    # This creates the full prompt string with special tokens for the model
    try:
        full_prompt_string = qwen_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
    except Exception as e:
        # The previous hints referred to an `enable_thinking` argument that this
        # call never passes; point at the actual inputs instead.
        print(f"Error applying chat template: {e}")
        print("This might happen if the tokenizer has no chat template or if the messages format is unexpected.")
        continue

    # --- 8. Generate Response with vLLM ---
    print("ASSISTANT: ", end="", flush=True)
    try:
        vllm_outputs = llm.generate([full_prompt_string], sampling_params)

        # Extract the generated text: first request, first completion.
        generated_text = vllm_outputs[0].outputs[0].text
        print(generated_text)

    except Exception as e:
        # Best effort: report the failure and keep the chat session running.
        print(f"\nError during vLLM generation: {e}")

    print("-" * 20) # Separator for conversation turns

print("Exiting RAG pipeline.")