In [None]:
# Authenticate this session with the Hugging Face Hub. Called with no
# arguments, `login()` asks for an access token interactively (in a notebook it
# renders an input widget, as the cell output below shows).
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# @title Mount Google Drive

from google.colab import drive

# Best-effort mount: a failure is reported but does not stop the notebook,
# because nothing further down in this notebook reads from Drive paths.
try:
    drive.mount('/content/drive')
except Exception as mount_error:
    print(f" Could not mount Google Drive: {mount_error}")
else:
    # Only reached when the mount call returned without raising.
    print(" Google Drive mounted successfully!")
    print(" You can now access files at /content/drive/MyDrive/")

Mounted at /content/drive
 Google Drive mounted successfully!
 You can now access files at /content/drive/MyDrive/


In [None]:
!pip install flash-attn --no-build-isolation -U bitsandbytes transformers datasets peft trl accelerate packaging ninja sentencepiece

Collecting flash-attn
  Downloading flash_attn-2.8.3.tar.gz (8.4 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/8.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/8.4 MB[0m [31m94.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━[0m [32m7.3/8.4 MB[0m [31m123.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m82.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting trl
  Downloading trl-0.23.0-py3-none-any.whl.metadata (11 kB)
Collecting ninja
  Using cached ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.1 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64

In [None]:
import os
import random
import re

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Configuration - Base model only
BASE_MODEL_PATH = "meta-llama/Llama-2-7b-chat-hf"

# Never hard-code credentials in a notebook: read the access token from the
# environment. It stays an empty string when the variable is not set, which
# keeps the constant's type (str) the same as before.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# 4-bit NF4 quantization with double quantization; matrix multiplications are
# computed in bfloat16.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# System message prepended to every prompt sent to the model
SYSTEM_MESSAGE = """You are a professional AI therapist. Provide empathetic, supportive therapeutic responses based on your training."""

def load_base_model():
    """Load the 4-bit quantized base chat model and its tokenizer.

    Reads the module-level ``BASE_MODEL_PATH``, ``HF_TOKEN`` and ``nf4_config``.

    Returns:
        tuple: ``(model, tokenizer, True)`` on success. If anything raises
        while loading, the error is printed and ``(None, None, False)`` is
        returned so the caller can fall back to canned replies.
    """
    # An empty string is not a usable access token. Passing None instead lets
    # the Hub client fall back to credentials stored by an earlier `login()`.
    hub_token = HF_TOKEN or None

    try:
        print("Loading base Llama-2 model...")
        torch.cuda.empty_cache()

        model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_PATH,
            token=hub_token,
            quantization_config=nf4_config,
            # `torch_dtype` is deprecated in the installed transformers release
            # (see the warning in this cell's output); `dtype` replaces it.
            dtype=torch.bfloat16,
            device_map="auto",
            # trust_remote_code is intentionally not enabled: the Llama
            # architecture ships with transformers, so no repository-provided
            # Python code needs to be executed to load this checkpoint.
            low_cpu_mem_usage=True,
            use_cache=True
        )

        tokenizer = AutoTokenizer.from_pretrained(
            BASE_MODEL_PATH,
            token=hub_token
        )

        # Reuse EOS as the padding token when the tokenizer defines none, and
        # pad on the left so generated tokens directly follow the prompt.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "left"

        print("Base model loaded successfully!")
        return model, tokenizer, True

    except Exception as e:
        # Deliberate best-effort: the app still starts and serves fallback
        # responses when the model cannot be loaded.
        print(f"Failed to load model: {e}")
        return None, None, False

def clean_response(response):
    """Strip prompt artifacts from generated text and end it on a full sentence.

    Args:
        response: Raw decoded model output (``None`` or empty is tolerated).

    Returns:
        str: The text with chat-template markers and role labels removed and
        whitespace collapsed to single spaces. If the text does not already end
        with ``.``, ``!`` or ``?``, it is cut back to the last such terminator;
        when it contains none at all, a ``.`` is appended. Empty input yields
        an empty string.
    """
    if not response:
        return ""

    # Chat-template markers and role labels that may appear in the output
    unwanted = ("[INST]", "[/INST]", "<s>", "</s>", "<<SYS>>", "<</SYS>>",
                "Human:", "Assistant:", "Context:", "Response:")
    for pattern in unwanted:
        response = response.replace(pattern, "")

    # Collapse every run of whitespace (including newlines) to one space
    response = re.sub(r'\s+', ' ', response).strip()

    terminators = ('.', '!', '?')
    if response and not response.endswith(terminators):
        # Consider all sentence terminators, not only '.', so that a trailing
        # fragment after "...?" or "...!" is dropped instead of being kept and
        # given an artificial full stop.
        last_end = max(response.rfind(mark) for mark in terminators)
        if last_end != -1:
            response = response[:last_end + 1]
        else:
            response += "."

    return response

def therapy_response(message, history):
    """Generate a short supportive reply to ``message`` with the loaded model.

    Relies on the module-level ``model``, ``tokenizer``, ``model_loaded``,
    ``SYSTEM_MESSAGE`` and ``clean_response``.

    Args:
        message: The user's latest message.
        history: Earlier conversation turns. Accepted so the signature matches
            the chat handler, but not used when building the prompt.

    Returns:
        str: A fixed prompt to share when ``message`` is empty or whitespace,
        a fixed supportive line when no model is loaded, otherwise the cleaned
        model reply capped at 400 characters. If generation raises or the
        cleaned reply is 10 characters or shorter, one of several canned
        fallback replies is returned at random.
    """
    if not message or not message.strip():
        return "I'm here to listen. What would you like to share?"

    if not model_loaded or model is None:
        return "I'm here to support you. Can you tell me more about what you're experiencing?"

    try:
        # Simplified single-turn prompt
        prompt = f"[INST] {SYSTEM_MESSAGE}\n\nUser: {message.strip()}\nAssistant: [/INST]"

        # NOTE(review): with truncation on the right (the usual tokenizer
        # default), a prompt longer than 200 tokens loses its closing [/INST]
        # marker - confirm whether long messages need dedicated handling.
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            max_length=200,
            truncation=True,
            padding=False
        )

        # Put the inputs on whatever device the model lives on instead of
        # assuming "cuda".
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        prompt_len = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=100,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
                top_k=50,
                repetition_penalty=1.1,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                use_cache=True,
                no_repeat_ngram_size=3
            )

        # Decode only the newly generated tokens. Searching the full decoded
        # text for "[/INST]" would match a marker typed by the user and leak
        # part of the prompt into the reply.
        response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        response = clean_response(response)

        if len(response) > 10:
            return response[:400]  # Limit length

    except Exception as e:
        # Best-effort: log the problem and fall through to a canned reply
        print(f"Generation error: {e}")

    # Canned replies for when generation failed or produced almost nothing
    fallbacks = [
        "I understand you're going through something difficult. Can you tell me more?",
        "Thank you for sharing with me. What's been on your mind lately?",
        "I'm here to listen and support you. How are you feeling right now?",
        "It sounds like you're dealing with some challenges. I'm here for you."
    ]
    return random.choice(fallbacks)

# Load the model once, when this cell runs; the chat handlers read the
# resulting module-level `model`, `tokenizer` and `model_loaded`.
print("Initializing Mental Health Assistant...")
model, tokenizer, model_loaded = load_base_model()

# Human-readable load status, printed here and shown in the UI header
if model_loaded:
    status_message = "Base model loaded successfully"
else:
    status_message = "Model loading failed - using fallback responses"
print(f"Status: {status_message}")

# Page styling: centered container with a fixed-height chat panel
custom_css = """
.gradio-container { max-width: 800px !important; margin: 0 auto !important; }
#chatbot { height: 400px !important; }
"""

# Streamlined Gradio interface
with gr.Blocks(
    css=custom_css,
    theme=gr.themes.Soft(),
    title="MindSpace - Mental Health Assistant"
) as demo:

    gr.Markdown(f"""
    # 🧠 MindSpace - Mental Health Assistant

    **Status:** {status_message}

    Share your thoughts and feelings in a supportive environment.

    *⚠️ This is an AI assistant, not a replacement for professional mental health care.*
    """)

    # Use the role/content "messages" history format explicitly; leaving `type`
    # unset falls back to the deprecated tuple format.
    chatbot = gr.Chatbot(
        elem_id="chatbot",
        height=400,
        show_label=False,
        type="messages"
    )

    with gr.Row():
        msg = gr.Textbox(
            placeholder="How are you feeling today?",
            show_label=False,
            scale=4
        )
        submit_btn = gr.Button("Send", scale=1, variant="primary")

    # Quick examples
    gr.Examples(
        examples=[
            "I'm feeling anxious",
            "I'm stressed about work",
            "I feel sad today",
            "I'm having trouble sleeping"
        ],
        inputs=msg,
        label="Quick examples:"
    )

    def respond(message, chat_history):
        """Handle one chat turn.

        Args:
            message: Text currently in the input box.
            chat_history: Chatbot history as a list of
                ``{"role": ..., "content": ...}`` dicts (may be ``None``).

        Returns:
            tuple: ``("", updated_history)`` - the empty string clears the
            input box. Blank messages leave the history unchanged.
        """
        # Work on a copy so a None or shared history object is never mutated
        chat_history = list(chat_history or [])

        if not message or not message.strip():
            return "", chat_history

        bot_message = therapy_response(message, chat_history)
        chat_history.append({"role": "user", "content": message})
        chat_history.append({"role": "assistant", "content": bot_message})
        return "", chat_history

    # Pressing Enter in the textbox and clicking Send both run the same handler
    msg.submit(respond, [msg, chatbot], [msg, chatbot])
    submit_btn.click(respond, [msg, chatbot], [msg, chatbot])

# Start the web app. The guard is true when this cell is run in a notebook or
# when the file is executed as a script.
if __name__ == "__main__":
    print("Launching Mental Health Assistant...")
    # share=True requests a public gradio.live link; server_name="0.0.0.0"
    # binds all network interfaces; show_error=True surfaces handler
    # exceptions in the UI.
    demo.launch(share=True, server_name="0.0.0.0", show_error=True)

Initializing Mental Health Assistant...
Loading base Llama-2 model...


config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Base model loaded successfully!
Status: Base model loaded successfully
Launching Mental Health Assistant...


  chatbot = gr.Chatbot(


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8937b10899b6501f70.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
