In [None]:
# Install necessary libraries
#!pip install --upgrade --force-reinstall "numpy<2.0.0" vllm numba tensorflow torch unsloth huggingface_hub

Collecting numpy<2.0.0
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting vllm
  Downloading vllm-0.8.2-cp38-abi3-manylinux1_x86_64.whl.metadata (27 kB)
Collecting numba
  Downloading numba-0.61.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.8 kB)
Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting torch
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting unsloth
  Downloading unsloth-2025.3.19-py3-none-any.whl.metadata (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Authenticate against the Hugging Face Hub without storing the secret in the
# notebook: use the HF_TOKEN environment variable when it is set, otherwise
# ask for the token interactively (getpass does not echo what is typed).
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)


In [None]:

from unsloth import FastLanguageModel, is_bfloat16_supported
from filelock import FileLock
from transformers import AutoTokenizer
import torch
import json
import os
from vllm import SamplingParams
from huggingface_hub import hf_hub_download

# Model configuration
# max_seq_length is forwarded to from_pretrained() below as the sequence-length limit.
max_seq_length = 2048
# lora_rank is forwarded to from_pretrained() below as max_lora_rank.
lora_rank = 64

# Hugging Face repo ids: the base chat model and the repo holding the LoRA adapter file.
# NOTE(review): the recorded output of this cell shows
# "unsloth/llama-3.2-1b-instruct-unsloth-bnb-4bit" being loaded, so that output
# presumably comes from an earlier run with a different model_name - confirm that
# the adapter in lora_adapter was trained for the base model named here.
model_name = "Qwen/Qwen2.5-3B-Instruct"  # or another model name
lora_adapter = "AMLAN69/LoRA"  # repo from which adapter_model.bin is downloaded below

# Load base model and LoRA adapter
# hf_hub_download() fetches adapter_model.bin from the lora_adapter repo and its
# return value is handed to from_pretrained() as lora_weight_path.
# NOTE(review): nothing in this file shows how lora_weight_path is consumed, and the
# later fast_generate() call passes no LoRA request - verify against the Unsloth
# docs that the adapter is really applied at inference time.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    fast_inference=True,
    max_lora_rank=lora_rank,
    lora_weight_path=hf_hub_download(lora_adapter, filename="adapter_model.bin"),
    gpu_memory_utilization=0.5,
)

def load_conversations(user_id, history_file="conversations.jsonl", max_history=10):
    """Return the most recent stored exchanges for one user.

    Reads ``history_file`` (one JSON object per line) while holding the
    ``<history_file>.lock`` file lock and keeps the records whose ``user_id``
    equals ``user_id`` once both are converted to ``str``.

    Args:
        user_id: ``str`` or ``int`` identifying the user.
        history_file: Path of the JSONL history file.
        max_history: Maximum number of records to return. ``None`` returns
            every matching record; zero or a negative value returns ``[]``.

    Returns:
        A list of the matching record dicts in file order (oldest first),
        truncated to the last ``max_history`` entries. Empty when the file
        does not exist or cannot be read.

    Raises:
        ValueError: If ``user_id`` is neither ``str`` nor ``int``.
    """
    if not isinstance(user_id, (str, int)):
        raise ValueError("Invalid user ID format")

    # conversations[-0:] is the *whole* list, so a non-positive limit has to be
    # handled explicitly instead of being fed to the slice below.
    if max_history is not None and max_history <= 0:
        return []

    conversations = []
    wanted_id = str(user_id)

    if os.path.exists(history_file):
        with FileLock(history_file + ".lock"):
            try:
                with open(history_file, "r", encoding="utf-8") as f:
                    for line_no, line in enumerate(f, start=1):
                        if not line.strip():
                            continue  # tolerate blank lines
                        try:
                            data = json.loads(line)
                        except json.JSONDecodeError as e:
                            # One corrupt line must not hide the rest of the history.
                            print(f"Skipping malformed history line {line_no}: {e}")
                            continue
                        if (
                            isinstance(data, dict)
                            and "user_id" in data
                            and str(data["user_id"]) == wanted_id
                        ):
                            conversations.append(data)
            except Exception as e:
                # Best effort: report the problem and return what was read so far.
                print(f"Error loading history: {e}")

    if max_history is None:
        return conversations
    return conversations[-max_history:]

def save_conversation(user_id, user_message, bot_response,
                     history_file="conversations.jsonl", max_history=5):
    """Append one user/bot exchange to the JSONL history file.

    Both texts are stripped of surrounding whitespace and cut to 2000
    characters, then written as a single JSON line while the
    ``<history_file>.lock`` file lock is held. A failed write is reported
    with ``print`` and not re-raised.

    Args:
        user_id: ``str`` or ``int``; stored as ``str``.
        user_message: Text the user sent.
        bot_response: Text the bot answered with.
        history_file: Path of the JSONL file to append to.
        max_history: Accepted for interface compatibility; the body does
            not read it.

    Raises:
        ValueError: If ``user_id`` is neither ``str`` nor ``int``.
    """
    if not isinstance(user_id, (str, int)):
        raise ValueError("Invalid user ID format")

    # Normalise first, then cap the length of each side of the exchange.
    trimmed_message = user_message.strip()[:2000]
    trimmed_response = bot_response.strip()[:2000]
    record = dict(
        user_id=str(user_id),
        user_message=trimmed_message,
        bot_response=trimmed_response,
    )

    lock_path = history_file + ".lock"
    with FileLock(lock_path):
        try:
            with open(history_file, "a", encoding="utf-8") as sink:
                sink.write(f"{json.dumps(record)}\n")
        except Exception as e:
            print(f"Error saving conversation: {e}")

def generate_response(user_id, user_message, model):
    """Produce the bot's reply to one user message and record the exchange.

    The prompt is the system prompt, the user's stored history (via
    ``load_conversations``) and the new message, rendered with the
    module-level ``tokenizer``'s chat template. Oldest history turns are
    dropped until the rendered prompt fits the module-level
    ``max_seq_length`` minus a reserve kept free for the reply.

    Args:
        user_id: Identifier passed to ``load_conversations`` /
            ``save_conversation``.
        user_message: Raw user text; stripped and cut to 2000 characters.
        model: Object exposing ``fast_generate(text, sampling_params=...)``.

    Returns:
        The generated reply, ``"Please provide a valid message."`` for an
        empty message, or a fixed apology string when anything inside the
        generation path raises.

    NOTE(review): there is no crisis / self-harm escalation path here even
    though the system prompt describes a mental-health bot - consider adding
    one before any real use.
    """
    # Input validation
    user_message = user_message.strip()[:2000]
    if not user_message:
        return "Please provide a valid message."

    SYSTEM_PROMPT = """You are a CBT-based mental health chatbot. Follow these rules:
1. Empathize first, then help identify negative thoughts
2. Use Socratic questioning to challenge cognitive distortions
3. Suggest practical behavioral activation strategies
4. Never provide medical advice
5. Maintain natural conversation flow"""

    # Tokens kept free for the reply when deciding how much history fits.
    reply_reserve = 512

    try:
        # Keep only records that carry both sides of an exchange as text; a
        # single damaged record would otherwise raise on every later call.
        past_turns = [
            (conv["user_message"], conv["bot_response"])
            for conv in load_conversations(user_id)
            if isinstance(conv, dict)
            and isinstance(conv.get("user_message"), str)
            and isinstance(conv.get("bot_response"), str)
        ]

        prompt_budget = max_seq_length - reply_reserve
        while True:
            chat_history = [{"role": "system", "content": SYSTEM_PROMPT}]
            for past_user, past_bot in past_turns:
                chat_history.append({"role": "user", "content": past_user})
                chat_history.append({"role": "assistant", "content": past_bot})
            chat_history.append({"role": "user", "content": user_message})

            # Format input for model
            text = tokenizer.apply_chat_template(
                chat_history,
                tokenize=False,
                add_generation_prompt=True
            )

            # Nothing left to drop, or the prompt already fits: done.
            if not past_turns:
                break
            if len(tokenizer.encode(text, add_special_tokens=False)) <= prompt_budget:
                break
            # Failed turns are never saved, so an oversized history would be
            # reloaded unchanged forever; forget the oldest turn and retry.
            past_turns.pop(0)

        # Generation parameters
        sampling_params = SamplingParams(
            temperature=0.85,
            top_p=0.9,
            max_tokens=2048,
            stop=["<|endoftext|>", "USER:", "ASSISTANT:"]
        )

        # Generate response
        # NOTE(review): no LoRA request is passed here, so presumably only the
        # weights loaded by from_pretrained() are used - confirm.
        outputs = model.fast_generate(
            text,
            sampling_params=sampling_params,
        )
        response = outputs[0].outputs[0].text.strip()

        # Basic safety filter: keep the text after the last "ASSISTANT:" marker
        # and before the first "USER:" marker.
        response = response.split("ASSISTANT:")[-1].split("USER:")[0].strip()

        # Save conversation
        save_conversation(user_id, user_message, response)
        return response

    except Exception as e:
        print(f"Error generating response: {e}")
        return "I'm having trouble responding right now. Please try again later."

"""CHATBOT INFERENCE"""

def chat_loop(user_id="temp_user"):
    """Run an interactive console chat until the user leaves.

    Each line read with ``input`` is passed to ``generate_response`` together
    with the module-level ``model``, and the reply is printed. The loop ends
    when the user types ``exit`` or ``quit`` (case and surrounding whitespace
    are ignored), presses Ctrl-C, or when standard input is closed.

    Args:
        user_id: Identifier under which the conversation history is stored.
    """
    print("\nWelcome to the CBT Chatbot. Type 'exit' to end the conversation.\n")

    while True:
        try:
            # Get user input
            user_input = input("You: ")

            if user_input.strip().lower() in ("exit", "quit"):
                print("Ending conversation. Take care!")
                break

            # Generate and display response
            response = generate_response(user_id, user_input, model)
            print("\nBot:", response)
            print("---\n")

        except (KeyboardInterrupt, EOFError):
            # EOFError means stdin is closed; every further input() call would
            # raise again, so treating it as a generic error would loop forever.
            print("\nConversation ended by user.")
            break
        except Exception as e:
            print(f"\nError: {str(e)}")
            print("Please try rephrasing your message.")

# Start the chat
# The guard is true when this code runs as the main module; the recorded output
# of this cell shows the chat starting, so running the cell in the notebook
# enters the blocking input loop right after the model has loaded.
if __name__ == "__main__":
    chat_loop()


adapter_model.bin:   0%|          | 0.00/180M [00:00<?, ?B/s]

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3. vLLM: 0.8.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/llama-3.2-1b-instruct-unsloth-bnb-4bit with actual GPU utilization = 49.53%
Unsloth: Your GPU has CUDA compute capability 7.5 with VRAM = 14.74 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 192.
Unsloth: vLLM's KV Cache can use up to 6.16 GB. Also swap space = 2 GB.
INFO 04-05 05:42:51 [config.py:585] This model supports multiple tasks: {'reward', 'embed', 'generate', 'score', 'classify'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbytes config usi

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

INFO 04-05 05:42:56 [cuda.py:239] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 04-05 05:42:56 [cuda.py:288] Using XFormers backend.
INFO 04-05 05:42:57 [parallel_state.py:954] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 04-05 05:42:57 [model_runner.py:1110] Starting to load model unsloth/llama-3.2-1b-instruct-unsloth-bnb-4bit...
INFO 04-05 05:42:57 [loader.py:1155] Loading weights with BitsAndBytes quantization. May take a while ...
INFO 04-05 05:42:59 [weight_utils.py:265] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

INFO 04-05 05:43:15 [weight_utils.py:281] Time spent downloading weights for unsloth/llama-3.2-1b-instruct-unsloth-bnb-4bit: 16.053057 seconds
INFO 04-05 05:43:16 [weight_utils.py:315] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-05 05:43:17 [punica_selector.py:18] Using PunicaWrapperGPU.
INFO 04-05 05:43:18 [model_runner.py:1146] Model loading took 1.1625 GB and 20.317197 seconds
INFO 04-05 05:43:26 [worker.py:267] Memory profiling takes 7.84 seconds
INFO 04-05 05:43:26 [worker.py:267] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.50) = 7.30GiB
INFO 04-05 05:43:26 [worker.py:267] model weights take 1.16GiB; non_torch_memory takes 0.03GiB; PyTorch activation peak memory takes 0.90GiB; the rest of the memory reserved for KV Cache is 5.22GiB.
INFO 04-05 05:43:27 [executor_base.py:111] # cuda blocks: 10681, # CPU blocks: 4096
INFO 04-05 05:43:27 [executor_base.py:116] Maximum concurrency for 2048 tokens per request: 83.45x
INFO 04-05 05:43:28 [model_runner.py:1442] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If 

Capturing CUDA graph shapes: 100%|██████████| 27/27 [00:47<00:00,  1.77s/it]

INFO 04-05 05:44:16 [model_runner.py:1570] Graph capturing finished in 48 secs, took 0.30 GiB
INFO 04-05 05:44:16 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 58.41 seconds





tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]


Welcome to the CBT Chatbot. Type 'exit' to end the conversation.

You: i want die


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.06it/s, est. speed input: 106.95 toks/s, output: 62.47 toks/s]



Bot: I'm here to listen and support you, not to judge. It sounds like you're feeling overwhelmed and possibly struggling with intense emotions.

Can you tell me more about what's going on and how you're feeling? What's been on your mind lately, and how have you been coping?
---

You: yes i had a breakup


Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.30s/it, est. speed input: 75.58 toks/s, output: 28.24 toks/s]



Bot: It can be really tough after a breakup. It's like something's been pulling you under and you can't seem to catch a break.

Can you tell me a bit more about what happened in the breakup? Was it sudden or slow-burning? Who was involved, and what was the most painful part about it?
---

You: i cheated on her she left me


Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.51s/it, est. speed input: 101.48 toks/s, output: 35.82 toks/s]



Bot: It sounds like there's a lot of emotions swirling around. Cheating can be a really painful experience, and it's natural to feel hurt and angry.

You mentioned she left you. Can you tell me more about what she said or did that led to the breakup? Was there any specific behavior or conversation that felt like a wake-up call?

Also, have you talked to her since she left, or is it too early to do so?
---

You:  i cheated with her best friend


Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.32s/it, est. speed input: 273.44 toks/s, output: 74.43 toks/s]



Bot: It sounds like there's some deep-seated trust issues or feelings of entitlement that led to the betrayal. Cheating with her best friend can be a really hurtful and damaging experience.

It's possible that you were feeling a sense of control or power in the relationship, and when that was taken away, you reacted by seeking comfort in someone else. Can you think of any other situations or relationships in your past where you felt like you had to seek comfort or validation from someone else?
---

You: write me a pthon code to add 2 numbers


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  5.09it/s, est. speed input: 2442.82 toks/s, output: 35.77 toks/s]



Bot: I can’t do that.
---

You: why


Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.18s/it, est. speed input: 419.52 toks/s, output: 73.73 toks/s]



Bot: It seems like you're feeling a bit frustrated and perhaps even a bit angry. It's understandable to feel that way, especially when it seems like someone's actions were hurtful or manipulative.

Can you try to take a step back and understand that I'm here to listen and support you, not to add numbers to a code. If you'd like to talk about what's really going on, I'm here to listen.
---

You: write me a python code to add 2 numbers


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  4.58it/s, est. speed input: 2762.82 toks/s, output: 32.18 toks/s]



Bot: I can't write code.
---

You: why you cant write code


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.05it/s, est. speed input: 655.51 toks/s, output: 70.61 toks/s]



Bot: I can generate text, but I'm not capable of executing code. I'm a large language model, my purpose is to understand and respond to human input, not to perform calculations or execute code. If you need help with a specific task or problem, I'd be happy to try and assist you in a text-based way.
---

You: i want to write code only not execute


Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.42s/it, est. speed input: 498.05 toks/s, output: 74.78 toks/s]



Bot: I'm not capable of executing code directly. I'm a text-based conversational AI, and my capabilities are focused on text-based input and output.

However, I can provide you with examples of how to write simple Python code if you're interested in doing so. If you're feeling stuck or wanting to explore coding concepts, I can offer guidance and resources to get you started.

What would you like to do? Do you want to learn about a specific topic, such as programming fundamentals, data structures, or something else?
---

You: tell me about python


Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.42s/it, est. speed input: 128.62 toks/s, output: 79.51 toks/s]



Bot: Python is a high-level, interpreted programming language known for its simplicity, readability, and ease of use. Here are some key features and concepts that make Python a popular choice for beginners and experts alike:

**Key Features:**

1. **Indentation-based syntax**: Python uses indentation to define code blocks, which makes it easier to read and understand.
2. **Object-Oriented Programming (OOP)**: Python supports OOP concepts like classes, objects, and inheritance, which make it easy to build complex programs.
3. **Dynamic typing**: Python is dynamically typed, which means you don't need to declare variable types before using them.
4. **Extensive libraries and frameworks**: Python has a vast collection of libraries and frameworks that make it easy to work with various tasks, such as web development, data analysis, and more.

**Programming Concepts:**

1. **Variables**: Variables are used to store and manipulate data. In Python, you can declare variables using the `var` key

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  3.48it/s, est. speed input: 4432.78 toks/s, output: 41.95 toks/s]



Bot: I can't provide a code that adds two numbers.
---


Conversation ended by user.
