# Generate Dataset

Using DeepSeek, convert the extracted text into Q&A pairs. The content of each chapter is preserved exactly as extracted and sent to the model to generate the pairs.

In [None]:
!pip install requests dotenv




[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import os
import time
import json
import requests
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment
load_dotenv()

# DeepSeek API key, read from the DEEPSEEK_API_KEY environment variable
# (os.getenv returns None when the variable is not set)
API_KEY = os.getenv("DEEPSEEK_API_KEY")

# DeepSeek chat-completions endpoint that the Q&A requests are POSTed to
API_URL = "https://api.deepseek.com/v1/chat/completions"

# INPUT_DIR holds the extracted .txt files; OUTPUT_DIR receives one .json file per input
INPUT_DIR = "./data/text_extraction_data"
OUTPUT_DIR = "./data/json_dataset"

# Pause between consecutive API calls (in seconds)
WAIT_TIME = 10  # Adjust based on API rate limits

# Function to generate Q&A pairs
def generate_qa_pairs(text, timeout=60):
    """Ask the DeepSeek chat API for question-answer pairs derived from ``text``.

    Args:
        text (str): Source text the Q&A pairs are generated from.
        timeout (float): Seconds to wait for the HTTP response before giving up.

    Returns:
        The parsed JSON returned by the model (the prompt asks for a list of
        ``{"question": ..., "answer": ...}`` dictionaries), or ``None`` when the
        input is empty, the request fails, the API answers with a non-200
        status, or the reply cannot be parsed as JSON.
    """
    # Nothing to ask about: avoid spending an API call on empty input.
    if not text or not text.strip():
        print("Error: The input text is empty.")
        return None

    # Define the prompt for the API
    prompt = f"""
    Generate question-answer pairs in JSON format from the following text. 
    Ensure the questions are clear and the answers are concise and accurate.
    The output should be a list of dictionaries, where each dictionary has two keys: "question" and "answer".

    Example Output:
    [
      {{
        "question": "What is the purpose of DeepSeek's open-source initiative?",
        "answer": "DeepSeek's open-source initiative aims to share production-ready tools and frameworks to accelerate AGI exploration. By open-sourcing their code, DeepSeek fosters collaboration, transparency, and innovation within the AI community."
      }},
      {{
        "question": "What is FlashMLA, and how is it optimized for Hopper GPUs?",
        "answer": "FlashMLA is an efficient Multi-Head Latent Attention (MLA) decoding kernel optimized for Hopper GPUs. It supports BF16, uses a paged KV cache with a block size of 64, and achieves 3000 GB/s memory-bound performance and 580 TFLOPS compute-bound performance on H800 GPUs."
      }}
    ]

    Text:
    {text}
    """

    # Prepare the request payload
    payload = {
        "model": "deepseek-chat",  # Specify the model
        "messages": [
            {"role": "system", "content": "You are a helpful assistant that generates question-answer pairs in JSON format."},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 2000,  # Adjust based on the length of the text
        "temperature": 0.7,  # Controls creativity (0.7 is a good balance)
    }

    # Set up headers with the API key
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }

    # Send the request; a timeout keeps one stalled connection from hanging the
    # whole batch, and network errors are reported instead of raised.
    try:
        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error: Request failed - {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return None

    # Pull the generated text out of the response envelope.
    try:
        content = response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError):
        print("Error: Unexpected response structure from the API.")
        return None
    if not isinstance(content, str):
        print("Error: Unexpected response structure from the API.")
        return None

    # Chat models frequently wrap JSON in a Markdown code fence (``` or ```json);
    # strip the fence so the payload itself can be parsed.
    content = content.strip()
    if content.startswith("```"):
        _, _, content = content.partition("\n")
        content = content.rsplit("```", 1)[0].strip()

    # Convert the response to a Python list of dictionaries
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        print("Error: The response is not valid JSON.")
        return None

# Function to process all input files
def process_files(input_dir, output_dir, wait_time):
    """Generate a Q&A JSON file for every ``.txt`` file in ``input_dir``.

    Args:
        input_dir (str): Directory containing the extracted ``.txt`` files.
        output_dir (str): Directory that receives one ``.json`` file per input
            file; created (including parents) when missing.
        wait_time (float): Seconds to pause between two consecutive API calls.

    Returns:
        None. Results are written to ``output_dir`` and progress is printed.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Sort so the processing order is the same on every platform.
    input_files = sorted(f for f in os.listdir(input_dir) if f.endswith(".txt"))
    last_index = len(input_files) - 1

    # Process each file sequentially
    for index, file_name in enumerate(input_files):
        input_path = os.path.join(input_dir, file_name)
        # Swap only the extension; str.replace would also rewrite any ".txt"
        # that happens to occur earlier in the file name.
        stem = os.path.splitext(file_name)[0]
        output_path = os.path.join(output_dir, stem + ".json")

        # Read the input file
        with open(input_path, "r", encoding="utf-8") as f:
            text = f.read()

        # Generate Q&A pairs
        print(f"Processing {file_name}...")
        qa_pairs = generate_qa_pairs(text)

        # Save the result as a JSON file
        if qa_pairs:
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(qa_pairs, f, indent=4)
            print(f"Saved Q&A pairs to {output_path}")
        else:
            print(f"Failed to generate Q&A pairs for {file_name}")

        # Rate-limit pause; pointless after the final file, so skip it there.
        if index < last_index:
            print(f"Waiting for {wait_time} seconds before the next file...")
            time.sleep(wait_time)

# Run the script
process_files(INPUT_DIR, OUTPUT_DIR, WAIT_TIME)

Processing data_1.txt...
Error: 402 - {"error":{"message":"Insufficient Balance","type":"unknown_error","param":null,"code":"invalid_request_error"}}
Failed to generate Q&A pairs for data_1.txt
Waiting for 10 seconds before the next file...
Processing data_10.txt...
Error: 402 - {"error":{"message":"Insufficient Balance","type":"unknown_error","param":null,"code":"invalid_request_error"}}
Failed to generate Q&A pairs for data_10.txt
Waiting for 10 seconds before the next file...
Processing data_11.txt...
Error: 402 - {"error":{"message":"Insufficient Balance","type":"unknown_error","param":null,"code":"invalid_request_error"}}
Failed to generate Q&A pairs for data_11.txt
Waiting for 10 seconds before the next file...
Processing data_2.txt...
Error: 402 - {"error":{"message":"Insufficient Balance","type":"unknown_error","param":null,"code":"invalid_request_error"}}
Failed to generate Q&A pairs for data_2.txt
Waiting for 10 seconds before the next file...
Processing data_3.txt...
Error: 

## Combine all JSON data into a single file

In [4]:
import os
import json

# Output directory containing individual JSON files
OUTPUT_DIR = "./data/json_dataset"

# Output file for the combined dataset
COMBINED_OUTPUT_FILE = "./data/dataset.json"

# Function to combine JSON files
def combine_json_files(output_dir, combined_output_file):
    """Merge every ``.json`` file in ``output_dir`` into one JSON list.

    Args:
        output_dir (str): Directory holding the per-chapter JSON files.
        combined_output_file (str): Path of the combined JSON file to write.

    Returns:
        None. The merged list is written to ``combined_output_file``.

    Raises:
        json.JSONDecodeError: If one of the input files is not valid JSON.
    """
    combined_data = []
    combined_abs = os.path.abspath(combined_output_file)

    # Sort so the combined dataset has the same order on every platform.
    json_files = sorted(f for f in os.listdir(output_dir) if f.endswith(".json"))

    for file_name in json_files:
        file_path = os.path.join(output_dir, file_name)

        # A previous run may have written the combined file into this same
        # directory; reading it back would duplicate every entry.
        if os.path.abspath(file_path) == combined_abs:
            continue

        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        if isinstance(data, list):
            combined_data.extend(data)
        elif isinstance(data, dict):
            # A single Q&A object: extend() would add its keys, not the pair.
            combined_data.append(data)
        else:
            print(f"Skipped {file_name}: expected a JSON list or object.")
            continue
        print(f"Added {file_name} to the combined dataset.")

    # Save the combined data to a single JSON file
    with open(combined_output_file, "w", encoding="utf-8") as f:
        json.dump(combined_data, f, indent=4)
    print(f"Combined dataset saved to {combined_output_file}")

# Run the script
combine_json_files(OUTPUT_DIR, COMBINED_OUTPUT_FILE)

Added data_1.json to the combined dataset.
Added data_10.json to the combined dataset.
Added data_11.json to the combined dataset.
Added data_12.json to the combined dataset.
Added data_13.json to the combined dataset.
Added data_14.json to the combined dataset.
Added data_15.json to the combined dataset.
Added data_2.json to the combined dataset.
Added data_3.json to the combined dataset.
Added data_4.json to the combined dataset.
Added data_5.json to the combined dataset.
Added data_6.json to the combined dataset.
Added data_7.json to the combined dataset.
Added data_8.json to the combined dataset.
Added data_9.json to the combined dataset.
Combined dataset saved to ./data/dataset.json


## Fine-tune the model

In [None]:
!pip install -q transformers datasets peft accelerate bitsandbytes

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

def finetune_qwen_colab_lightweight(train_file, model_name="Qwen/Qwen2.5-3B-Instruct", output_dir="./qwen2.5-3b-research-qa-lora", max_length=256):
    """
    Fine-tunes a Qwen causal language model on question/answer pairs with QLoRA.
    Configured for minimal memory usage in Google Colab to avoid OutOfMemoryError.

    Memory reduction settings used:
    - per_device_train_batch_size of 1
    - gradient_accumulation_steps of 4 (effective batch size 4)
    - tokenized sequences truncated/padded to `max_length` (256 by default)
    - 4-bit quantization of the base model
    - device_map="auto" with an offload folder

    Args:
        train_file (str): Path to the JSON file containing the training data
            (records with "question" and "answer" fields).
        model_name (str): Name of the Qwen model to fine-tune.
        output_dir (str): Output directory for saving the fine-tuned LoRA adapters.
        max_length (int): Number of tokens each training example is truncated/padded to.

    Returns:
        None. Returns early (after printing an error) when the dataset cannot
        be loaded or tokenized.
    """
    # Imported here so this cell brings the extra name into scope by itself.
    from transformers import BitsAndBytesConfig

    # --- Clear GPU Cache ---
    print("Clearing GPU memory cache...")
    torch.cuda.empty_cache()

    # --- 1. Load Model and Tokenizer ---
    print(f"Loading tokenizer and model: {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        # padding="max_length" below needs a pad token to exist.
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        offload_folder="offload",
        offload_state_dict=True,
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    )

    # --- 2. Prepare Model for QLoRA ---
    print("Preparing model for QLoRA...")
    model = prepare_model_for_kbit_training(model)

    config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "v_proj"],
    )
    model = get_peft_model(model, config)

    # --- 3. Load and Tokenize Dataset ---
    print(f"Loading and tokenizing dataset from: {train_file}...")
    eos = tokenizer.eos_token or ""

    def preprocess_function(examples):
        # A causal LM learns next-token prediction over ONE sequence, so the
        # answer must be part of the input text and the labels must line up
        # with input_ids position by position. Tokenizing the answer separately
        # (text_target) pairs question tokens with unrelated answer tokens.
        texts = [
            f"Question: {q} Answer: {a}{eos}"
            for q, a in zip(examples["question"], examples["answer"])
        ]
        model_inputs = tokenizer(texts, max_length=max_length, truncation=True,
                                 padding="max_length")
        # -100 is ignored by the loss, so padding does not contribute to it.
        model_inputs["labels"] = [
            [token if keep else -100 for token, keep in zip(ids, mask)]
            for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
        ]
        return model_inputs

    try:
        dataset = load_dataset("json", data_files=train_file, split="train")
        # Drop the raw string columns so only tensor-friendly fields reach the Trainer.
        tokenized_train_dataset = dataset.map(
            preprocess_function, batched=True, remove_columns=dataset.column_names
        )
    except FileNotFoundError:
        print(f"Error: Training data file not found at {train_file}. Make sure to upload it to Colab.")
        return
    except Exception as e:
        print(f"Error: An error occurred reading the JSON file: {e}")
        return

    # --- 4. Set up Training Arguments ---
    print("Setting up training arguments (Lightweight Config)...")
    # No evaluation dataset is passed and "no" is the default evaluation
    # strategy, so the (renamed across transformers versions) keyword is omitted.
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1,  # minimal memory
        gradient_accumulation_steps=4,  # effective batch size = 4
        learning_rate=2e-4,
        num_train_epochs=3,
        logging_steps=50,
        save_steps=500,
        save_total_limit=2,
        fp16=True,
        optim="paged_adamw_8bit",
        lr_scheduler_type="cosine",
        warmup_ratio=0.05,
        report_to="none",
        push_to_hub=False,
    )

    # --- 5. Create Trainer and Train ---
    print("Initializing Trainer and starting training...")
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_train_dataset,
    )

    trainer.train()

    # --- 6. Save Trained Model (LoRA Adapters) ---
    print("Saving trained LoRA adapters...")
    model.save_pretrained(output_dir)
    print(f"Fine-tuning complete! LoRA adapters saved to {output_dir}")


if __name__ == "__main__":
    # Usage notes for Google Colab users, printed one line at a time before training starts.
    colab_instructions = (
        "\n--- Instructions for Google Colab (LIGHTWEIGHT VERSION) ---",
        "1. **Upload Dataset:** Upload your `dataset.json` file to the Colab environment.",
        "   You can do this by dragging and dropping it into the Files sidebar (left side in Colab).",
        "2. **Set `train_data_file` Path:** Ensure `train_data_file` below points to the correct path",
        "   where you uploaded `dataset.json` in Colab.  For example: `'dataset.json'` or `'./data/dataset.json'`",
        "3. **Run the Code:** Execute this Python code cell in Colab.",
        "4. **Check Output:** After training, LoRA adapters will be in `qwen2.5-3b-research-qa-lora` folder.",
        "   Download this folder from Colab's Files sidebar.",
        "---",
        "\n**This version is optimized for minimal memory usage in Colab.**",
        "**If you still get OutOfMemoryError, consider further reducing `max_length` to 128 in the code.**",
        "---",
    )
    for instruction_line in colab_instructions:
        print(instruction_line)

    # Location of the JSON training data inside the Colab environment
    train_data_file = "./data/dataset.json"
    # Kick off the lightweight fine-tuning run
    finetune_qwen_colab_lightweight(train_data_file)

## Generate the gguf File

In [None]:
# Fetch the llama.cpp sources and work from inside the checkout
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
# Build the native binaries.
# NOTE(review): newer llama.cpp revisions build with CMake — confirm `make` still works for the revision you cloned.
make

Run the command below (from inside the `llama.cpp` directory) to generate the .gguf file

In [None]:
# --base-model-id must name the model the adapter was trained on
# (the fine-tuning step uses Qwen/Qwen2.5-3B-Instruct).
py ./convert_lora_to_gguf.py ../qwen2.5-3b-research-qa-lora --outfile ../qwen2.5-3b-research-qa-lora.gguf --base-model-id Qwen/Qwen2.5-3B-Instruct

## Build a Chatbot with In-Memory Database

In [None]:
!pip install transformers torch llama-cpp-python redis datasets peft




[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Redis Server:

- Linux: `sudo apt install redis-server`

- macOS: `brew install redis`

- Windows: Download from [Microsoft's Redis repo](https://github.com/microsoftarchive/redis/releases).

In [None]:
from llama_cpp import Llama
import redis

# Load the .gguf file with llama.cpp, using a 2048-token context window.
# NOTE(review): this path is the file written by convert_lora_to_gguf.py in the
# previous section, i.e. it appears to be a LoRA adapter rather than a complete
# model. llama-cpp-python presumably needs the base model GGUF as `model_path`
# with the adapter supplied separately (e.g. `lora_path`) — confirm before use.
model_path = "./qwen2.5-3b-research-qa-lora.gguf"  # Adjust to where the .gguf file actually lives
llm = Llama(model_path=model_path, n_ctx=2048)  # Increase n_ctx if prompts plus history exceed 2048 tokens

# Redis connection (local server, default port, database 0) used to persist chat history
redis_client = redis.Redis(host="localhost", port=6379, db=0)

# Function to generate a response
def generate_response(prompt, chat_history, max_history_exchanges=10):
    """Generate the chatbot's reply to ``prompt`` given the prior conversation.

    Args:
        prompt (str): The user's new message.
        chat_history (list[str]): Earlier messages, alternating
            "User: ..." and "Chatbot: ..." entries, oldest first.
        max_history_exchanges (int): How many of the most recent exchanges
            (one user message plus one chatbot message) to include as context.

    Returns:
        str: The generated reply with surrounding whitespace removed.
    """
    # One exchange is two history entries, so keep 2 * N messages. This matches
    # the 20-message window that the chat loop keeps in Redis.
    max_messages = 2 * max(max_history_exchanges, 0)
    # Guard the zero case explicitly: a [-0:] slice would keep the WHOLE list.
    recent_history = chat_history[-max_messages:] if max_messages else []

    # Combine chat history with the new prompt
    full_prompt = "\n".join(recent_history + [f"User: {prompt}"]) + "\nChatbot:"

    # Generate response
    output = llm.create_completion(
        prompt=full_prompt,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        stop=["User:", "\nUser:"],  # Ensure bot stops before next user input
        echo=False,
    )

    return output["choices"][0]["text"].strip()

# Chat loop
def chat():
    """Run an interactive console chat whose history is persisted in Redis.

    Loads the stored history for the session, then loops: read a line from the
    user, generate a reply, print it, and append both messages to Redis. The
    loop ends when the user types "exit" or "quit", or closes the input
    stream (Ctrl-D / Ctrl-C).
    """
    chat_id = "user_123"  # Unique ID for the chat session
    history_limit = 20  # Messages kept, both in memory and in Redis

    # Load chat history (Redis returns bytes with the default client settings)
    stored_messages = redis_client.lrange(chat_id, 0, -1)
    chat_history = [msg.decode("utf-8") for msg in stored_messages]

    print("Chatbot: Hello! How can I assist you today?")
    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End the session cleanly instead of dumping a traceback.
            print("\nChatbot: Goodbye!")
            break

        # Ignore blank lines rather than sending an empty prompt to the model.
        if not user_input:
            continue
        if user_input.lower() in ("exit", "quit"):
            print("Chatbot: Goodbye!")
            break

        # Generate response
        response = generate_response(user_input, chat_history)
        print(f"Chatbot: {response}")

        # Update chat history, keeping the in-memory copy bounded the same way
        # as the Redis list so a long session does not grow without limit.
        user_line = f"User: {user_input}"
        bot_line = f"Chatbot: {response}"
        chat_history.extend((user_line, bot_line))
        del chat_history[:-history_limit]

        redis_client.rpush(chat_id, user_line, bot_line)
        # Trim Redis history to avoid excessive memory usage
        redis_client.ltrim(chat_id, -history_limit, -1)

# Start the chat
if __name__ == "__main__":
    chat()
