ENVIRONMENT SETUP


In [14]:
# Install required libraries
# transformers: model loading & training
# datasets: dataset handling
# evaluate: evaluation metrics
# torch: deep learning backend
!pip install -q transformers datasets accelerate sentencepiece evaluate scikit-learn pandas numpy torch

In [15]:
# Import core libraries
import pandas as pd
import numpy as np
import torch

# Hugging Face utilities
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments
)

LOAD AND PREPARE DATA

In [16]:
# Load the two CounselChat CSV datasets
df1 = pd.read_csv("/content/20220401_counsel_chat.csv")
df2 = pd.read_csv("/content/counselchat-data.csv")

# Combine both datasets row-wise
# This keeps all question–answer pairs together
df = pd.concat([df1, df2], ignore_index=True)

#display first few rows of joint data
df.head()

Unnamed: 0,questionID,questionTitle,questionText,questionLink,topic,therapistInfo,therapistURL,answerText,upvotes,views,questionUrl,topics,therapistName,therapistUrl
0,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,https://counselchat.com/questions/do-i-have-to...,depression,Jennifer MolinariHypnotherapist & Licensed Cou...,https://counselchat.com/therapists/jennifer-mo...,It is very common for people to have multiple ...,3,1971.0,,,,
1,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,https://counselchat.com/questions/do-i-have-to...,depression,"Jason Lynch, MS, LMHC, LCAC, ADSIndividual & C...",https://counselchat.com/therapists/jason-lynch...,"I've never heard of someone having ""too many i...",2,386.0,,,,
2,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,https://counselchat.com/questions/do-i-have-to...,depression,Shakeeta TorresFaith Based Mental Health Couns...,https://counselchat.com/therapists/shakeeta-to...,Absolutely not. I strongly recommending worki...,2,3071.0,,,,
3,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,https://counselchat.com/questions/do-i-have-to...,depression,"Noorayne ChevalierMA, RP, CCC, CCAC, LLP (Mich...",https://counselchat.com/therapists/noorayne-ch...,Let me start by saying there are never too man...,2,2643.0,,,,
4,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,https://counselchat.com/questions/do-i-have-to...,depression,"Toni Teixeira, LCSWYour road to healing begins...",https://counselchat.com/therapists/toni-teixei...,I just want to acknowledge you for the courage...,1,256.0,,,,


In [17]:
# Standardise column names so both datasets share the same schema
df = df.rename(columns={
    "questionText": "question",
    "answerText": "answer"
})

# Retain only the columns required for the project
# These directly support the problem → advice modelling task
# Remove rows with missing values
df = df[["question", "answer"]].dropna()

# Remove duplicate question–answer pairs
df = df.drop_duplicates()

LENGTH-BASED FILTERING

In [18]:
# Compute character length of questions and answers
# These helper columns are used only for filtering
df["question_len"] = df["question"].str.len()
df["answer_len"] = df["answer"].str.len()

# Remove outliers that are too long for transformer models
# This ensures stable training and concise advice generation
df = df[(df["question_len"] < 500) & (df["answer_len"] < 600)]

# Drop helper columns after filtering
df = df.reset_index(drop=True)

MENTAL HEALTH TEXT ANALYSIS MODEL

(BERT-based classifier)

In [19]:
# We use a publicly available clinical BERT model
# This avoids gated access issues and is widely used in research
from transformers import AutoTokenizer, AutoModelForSequenceClassification
MODEL_NAME = "emilyalsentzer/Bio_ClinicalBERT"

# Load tokenizer for the mental-health analysis model
mental_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load model for sequence classification
# We use 2 labels as a simplified setup (e.g., distress vs non-distress)
mental_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Prepare Dataset for Mental Health Analysis

In [20]:
# Tokenisation function for BERT-style models
def mental_tokenize(batch):
    return mental_tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

In [21]:
# Create a dataset for classification
# Since we do not have labelled distress data, labels are simulated
mental_df = pd.DataFrame({
    "text": df["question"],
    "label": np.random.randint(0, 2, size=len(df))  # proxy labels
})

# Convert to Hugging Face Dataset format
mental_dataset = Dataset.from_pandas(mental_df)

# Apply tokenisation
mental_dataset = mental_dataset.map(mental_tokenize, batched=True)

# Split into training and validation sets
mental_dataset = mental_dataset.train_test_split(test_size=0.2)

# Define training configuration
from transformers import TrainingArguments, Trainer
mental_args = TrainingArguments(
    output_dir="./mentalbert",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    logging_steps=50,
    save_strategy="epoch",
    report_to="none"
)

# Create Trainer object
mental_trainer = Trainer(
    model=mental_model,
    args=mental_args,
    train_dataset=mental_dataset["train"],
    eval_dataset=mental_dataset["test"],
    tokenizer=mental_tokenizer
)

# Train the model
mental_trainer.train()


Map:   0%|          | 0/1184 [00:00<?, ? examples/s]

  mental_trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.714,0.692501
2,0.7012,0.698528




TrainOutput(global_step=238, training_loss=0.7075399030156496, metrics={'train_runtime': 1876.6765, 'train_samples_per_second': 1.009, 'train_steps_per_second': 0.127, 'total_flos': 124583084712960.0, 'train_loss': 0.7075399030156496, 'epoch': 2.0})

PREPARE DATA FOR FLAN-T5

(Advice Generation)


In [22]:
# Convert question–answer pairs into instruction-based format
def format_flan(example):
    prompt = (
        "Provide empathetic, safe advice for the following concern:\n"
        f"{example['question']}"
    )
    return {
        "input_text": prompt,
        "target_text": example["answer"]
    }

# Apply formatting
flan_df = df.apply(format_flan, axis=1, result_type="expand")


In [23]:
# Convert to Hugging Face Dataset
flan_dataset = Dataset.from_pandas(flan_df)

# Split into training and validation sets
flan_dataset = flan_dataset.train_test_split(test_size=0.1)

LOAD FLAN-T5 MODEL

In [24]:
FLAN_MODEL = "google/flan-t5-small"

# Load tokenizer and model
flan_tokenizer = AutoTokenizer.from_pretrained(FLAN_MODEL)
flan_model = AutoModelForSeq2SeqLM.from_pretrained(FLAN_MODEL)


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [25]:
def flan_tokenize(batch):

  # Tokenise inputs
    model_inputs = flan_tokenizer(
        batch["input_text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )

    # Tokenise targets (labels)
    labels = flan_tokenizer(
        batch["target_text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply tokenisation
tokenized_flan = flan_dataset.map(flan_tokenize, batched=True, remove_columns=flan_dataset["train"].column_names)

Map:   0%|          | 0/1065 [00:00<?, ? examples/s]

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

TRAIN FLAN-T5

In [26]:
# Training configuration for Flan-T5
flan_args = TrainingArguments(
    output_dir="./flan_t5",
    eval_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_steps=50,
    report_to="none",
    fp16=torch.cuda.is_available()
)


In [27]:
# Create Trainer for Flan-T5
flan_trainer = Trainer(
    model=flan_model,
    args=flan_args,
    train_dataset=tokenized_flan["train"],
    eval_dataset=tokenized_flan["test"],
    tokenizer=flan_tokenizer
)

  flan_trainer = Trainer(


In [28]:
# Train Flan-T5 model
flan_trainer.train()



Epoch,Training Loss,Validation Loss
1,1.2344,1.079248
2,1.1721,1.054412
3,1.0898,1.047736




TrainOutput(global_step=801, training_loss=1.6677627456322144, metrics={'train_runtime': 5405.4337, 'train_samples_per_second': 0.591, 'train_steps_per_second': 0.148, 'total_flos': 296960081264640.0, 'train_loss': 1.6677627456322144, 'epoch': 3.0})

In [29]:
# Save fine-tuned Flan-T5 model
flan_model.save_pretrained("./flan_t5")
flan_tokenizer.save_pretrained("./flan_t5")

('./flan_t5/tokenizer_config.json',
 './flan_t5/special_tokens_map.json',
 './flan_t5/spiece.model',
 './flan_t5/added_tokens.json',
 './flan_t5/tokenizer.json')

INFERENCE PIPELINE

In [30]:
# Analyse emotional content using BERT-based model
def analyze_emotion(text):
    inputs = mental_tokenizer(text, return_tensors="pt", truncation=True)
    outputs = mental_model(**inputs)
    probs = torch.softmax(outputs.logits, dim=1)
    return probs.detach().numpy()

In [31]:
# Generate advice using Flan-T5
def generate_advice(user_input):
    emotion_score = analyze_emotion(user_input)

    instruction = (
        "Provide calm, supportive advice suitable for a young person.\n"
        f"User concern: {user_input}"
    )

    inputs = flan_tokenizer(instruction, return_tensors="pt")
    outputs = flan_model.generate(
        **inputs,
        max_length=120,
        temperature=0.7,
        do_sample=True
    )

    advice = flan_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return {
        "emotion_score": emotion_score,
        "ai_advice": advice
    }


In [34]:
# Test the full pipeline
test_input = "I feel overwhelmed by school and like I am not good enough."
result = generate_advice(test_input)

result



{'emotion_score': array([[0.46988758, 0.5301124 ]], dtype=float32),
 'ai_advice': "I don't think I'm well enough to be feeling like I'm bad enough. I do not think I'm the best person to be able to take care of myself. Hopefully, you can find a counselor that offers counseling and resources. If you are not able to find the therapy you need, do your homework."}

In [36]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=2ed4ae01be9e0c92ea3843e41dbd786f5d3d1017ef8489d5ed49c12eb994d508
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [37]:
import evaluate

bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

preds = []
refs = []

for i in range(50):
    out = generate_advice(df["question"][i])
    preds.append(out["ai_advice"])
    refs.append([df["answer"][i]])

bleu.compute(predictions=preds, references=refs)


{'bleu': 0.018407753090859597,
 'precisions': [0.29068110572812766,
  0.039317721884937845,
  0.007040187738339689,
  0.0017862459065197975],
 'brevity_penalty': 0.9454054926110894,
 'length_ratio': 0.9468429573664328,
 'translation_length': 3509,
 'reference_length': 3706}

In [38]:
rouge.compute(predictions=preds, references=[r[0] for r in refs])

{'rouge1': np.float64(0.26415725344317575),
 'rouge2': np.float64(0.039753303003177885),
 'rougeL': np.float64(0.15665816434267232),
 'rougeLsum': np.float64(0.15843300553869538)}