In [12]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    AutoConfig,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
from datasets import Dataset
import pandas as pd
import logging
import os

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
MODEL_SAVE_PATH = "./qwen2.5-0.5B_finetuned_mentalhealth"
DATA_FILE = "cleaned_data27.csv"
EPOCHS = 3
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
MAX_LENGTH = 384  
GRADIENT_ACCUMULATION_STEPS = 4

# LoRA config
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"]

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# Load model (base or already fine-tuned)
if os.path.exists(MODEL_SAVE_PATH):
    logger.info("Loading fine-tuned model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_SAVE_PATH,
        trust_remote_code=True,
        device_map="auto"
    )
else:
    logger.info("Loading base model with 4-bit quantization...")
    config = AutoConfig.from_pretrained(MODEL_NAME, trust_remote_code=True)
    if hasattr(config, "quantization_config"):
        del config.quantization_config

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        config=config,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )

    # Optional: Try flash attention if supported (uncomment to test)
    # model.config.attn_implementation = "flash_attention_2"

    model = prepare_model_for_kbit_training(model)

    # Inject LoRA adapters
    peft_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=TARGET_MODULES,
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

# Format dataset
def format_conversation(row):
    return f"User: {row['Context']}\nTherapist: {row['Response']}{tokenizer.eos_token}"

df = pd.read_csv(DATA_FILE)
df['text'] = df.apply(format_conversation, axis=1)
# Optional: Use subset for faster testing
# df = df.sample(n=1000)

dataset = Dataset.from_pandas(df[['text']])

# Tokenization (don't return tensors)
def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length"
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(['text'])

# Data collator (handles dynamic padding + labels)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Training arguments
training_args = TrainingArguments(
    output_dir=MODEL_SAVE_PATH,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    bf16=True,  # Use bf16 if supported, otherwise switch to fp16=True
    save_total_limit=2,
    logging_steps=100,
    evaluation_strategy="no",
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    report_to="none"
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

# Train
logger.info("🚀 Starting fine-tuning...")
trainer.train()
logger.info("✅ Fine-tuning completed!")

# Save model
model.save_pretrained(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)

print("\n✅ Fine-tuned Qwen2.5 mental health chatbot is ready!")


INFO:__main__:Loading base model with 4-bit quantization...
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


trainable params: 2,162,688 || all params: 496,195,456 || trainable%: 0.4359


Map:   0%|          | 0/34772 [00:00<?, ? examples/s]

INFO:__main__:🚀 Starting fine-tuning...
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
100,2.3045
200,1.9982
300,1.8948
400,1.8446
500,1.8186
600,1.7882
700,1.7645
800,1.7658
900,1.7514
1000,1.7444


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
INFO:__main__:✅ Fine-tuning completed!



✅ Fine-tuned Qwen2.5 mental health chatbot is ready!


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load your fine-tuned model and tokenizer
model_name = "qwen2.5-0.5B_finetuned_mentalhealth"  # Update this with your actual model path
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.to("cuda" if torch.cuda.is_available() else "cpu")
device = model.device

# Generation parameters
TEMPERATURE = 0.7
TOP_P = 0.9
TOP_K = 50
MAX_NEW_TOKENS = 150

# Base system prompt
SYSTEM_PROMPT = "You are a compassionate mental health support chatbot. Respond empathetically to user messages."

# Optional greeting responses
GREETING_RESPONSES = [
    "Hello! I'm here to listen. How are you feeling today?",
    "Hi there! I'm here if you need someone to talk to.",
    "Hey! I'm glad you're here. How are things going for you?"
]

# Chat loop
while True:
    try:
        user_input = input("You: ").strip()
        
        if user_input.lower() in ['quit', 'exit', 'bye']:
            print("Bot: Goodbye! Take care.")
            break

        # Handle greetings manually
        if user_input.lower() in ["hi", "hello", "hey"]:
            print(f"Bot: {GREETING_RESPONSES[0]}")
            continue

        # Build full input prompt
        prompt = f"{SYSTEM_PROMPT}\nUser: {user_input}\nBot:"
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

        # Generate response
        with torch.no_grad():
            output = model.generate(
                input_ids,
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=True,
                temperature=TEMPERATURE,
                top_p=TOP_P,
                top_k=TOP_K,
                pad_token_id=tokenizer.eos_token_id,
                no_repeat_ngram_size=3
            )

        # Decode and postprocess
        decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
        
        # Extract response part only
        if "Bot:" in decoded_output:
            response = decoded_output.split("Bot:")[-1].strip()
        else:
            response = decoded_output.strip()

        print(f"Bot: {response}")

    except KeyboardInterrupt:
        print("\nBot: Session ended. Wishing you well.")
        break
    except Exception as e:
        print(f"Bot: I encountered an error. Let's try again. ({str(e)})")


You: hi
Bot: Hello! I'm here to listen. How are you feeling today?
You: i am going through a rough time . i am not able to concentrate on my work
Bot: it sounds like you might be experiencing some challenges with your focus and productivity at home. can we try talking about any recent changes or developments in your life that could be contributing to these difficulties? also do you have any specific concerns about your work environment or responsibilities that need more attention regarding concentration and efficiency. if so please let me know. as a mental health professional, I would be happy to help analyze the situation further and provide suggestions for improving your productivity and focus at home while also considering possible external factors such as stress levels or other personal issues that may be impacting your ability to concentrate. remember though it is important to approach this conversation with empathy and understanding as well as an open mind to exploring potential 

### test

In [7]:
!pip install -q evaluate bert_score transformers

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import evaluate
import pandas as pd
from tqdm import tqdm

model_path = "./qwen2.5-0.5B_finetuned_mentalhealth"  # or "./falcon3b-lora-checkpoint"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")

# Load test dataset
df = pd.read_csv("cleaned_data27.csv").dropna().sample(n=10, random_state=4)
contexts = df["Context"].tolist()
references = df["Response"].tolist()

generated_responses = []
for context in tqdm(contexts, desc="Generating"):
    input_text = f"User: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=128,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generated_responses.append(decoded.split("Bot:")[-1].strip() if "Bot:" in decoded else decoded)

# Evaluate
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

rouge_results = rouge.compute(predictions=generated_responses, references=references)
bert_results = bertscore.compute(predictions=generated_responses, references=references, lang="en")

print(f"\n✅ ROUGE Scores:")
for k, v in rouge_results.items():
    print(f"  {k}: {v:.4f}")

print(f"\n✅ BERTScore:")
print(f"  Precision: {sum(bert_results['precision']) / len(bert_results['precision']):.4f}")
print(f"  Recall:    {sum(bert_results['recall']) / len(bert_results['recall']):.4f}")
print(f"  F1:        {sum(bert_results['f1']) / len(bert_results['f1']):.4f}")



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Generating: 100%|███████████████████████████| 10/10 [00:41<00:00,  4.15s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



✅ ROUGE Scores:
  rouge1: 0.3355
  rouge2: 0.0818
  rougeL: 0.1565
  rougeLsum: 0.1962

✅ BERTScore:
  Precision: 0.8532
  Recall:    0.8589
  F1:        0.8558
