In [1]:
# Core Libraries
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

# Transformers
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_linear_schedule_with_warmup

# Sklearn Utilities
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Utilities
from tqdm import tqdm
import numpy as np
import os

In [2]:
# DATA PREPREOCESSING AND GROUPING

In [3]:
# Load dataset
df = pd.read_csv('counselchat-data.csv')

# Keep necessary columns and drop missing values
df = df[['questionText', 'topics']].dropna()

# Ensure 'topics' is properly formatted
def parse_topics(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        if ',' in x:
            return [topic.strip() for topic in x.split(',')]
        else:
            return [x.strip()]
    return []

df['topics'] = df['topics'].apply(parse_topics)

# Define topic groups
topic_groups = {
    'Addiction & Abuse': ['Addiction', 'Substance Abuse', 'Self-harm', 'Eating Disorders'],
    'Anxiety & Stress': ['Anxiety', 'Stress', 'Sleep Improvement', 'Trauma', 'Self-esteem'],
    'Family & Parenting': ['Family Conflict', 'Parenting', 'Children & Adolescents', 'Marriage'],
    'Relationships': ['Relationships', 'Intimacy', 'Relationship Dissolution', 'Social Relationships'],
    'Mental Health Disorders': ['Depression', "Alzheimer's", 'Diagnosis', 'Grief and Loss'],
    'Violence & Safety': ['Domestic Violence', 'Anger Management', 'Military Issues'],
    'Professional & Legal': ['Career Counseling', 'Workplace Relationships', 'Legal & Regulatory', 'Professional Ethics'],
    'Identity & Spirituality': ['Human Sexuality', 'LGBTQ', 'Spirituality'],
    'Behavioral Changes': ['Behavioral Change', 'Counseling Fundamentals']
}

# Map topics to groups
def map_to_group(topics):
    for group, keywords in topic_groups.items():
        if any(topic in keywords for topic in topics):
            return group
    return 'Other'

df['topic_group'] = df['topics'].apply(map_to_group)

# Encode topic groups
le = LabelEncoder()
df['topic_group_encoded'] = le.fit_transform(df['topic_group'])

# Display class distribution
print("Class Distribution:")
print(df['topic_group'].value_counts())

Class Distribution:
topic_group
Anxiety & Stress           402
Relationships              361
Family & Parenting         281
Mental Health Disorders     83
Addiction & Abuse           81
Behavioral Changes          68
Professional & Legal        39
Identity & Spirituality     37
Violence & Safety           24
Name: count, dtype: int64


In [4]:
# Tokenization and Dataset Preparation

In [5]:
# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Define Dataset class
class CounselChatDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.encodings = tokenizer(list(texts), padding=True, truncation=True, max_length=max_length, return_tensors='pt')
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }

# Create dataset
dataset = CounselChatDataset(df['questionText'], df['topic_group_encoded'], tokenizer)

# Split dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [6]:
# Define Model, Optimizer, and Scheduler

In [7]:
# Load pre-trained model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(le.classes_))
model.to(device)

# Optimizer and Scheduler
optimizer = optim.AdamW(model.parameters(), lr=3e-5)
total_steps = len(train_loader) * 3  
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Loss Function
criterion = nn.CrossEntropyLoss()

# Mixed Precision Training (AMP)
from torch.cuda.amp import GradScaler, autocast
scaler = GradScaler()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


In [8]:
# Training Loop with GPU & Mixed Precision  

In [9]:
# Training Loop
epochs = 10
gradient_accumulation_steps = 2

for epoch in range(epochs):
    model.train()
    total_loss = 0
    
    for step, batch in enumerate(tqdm(train_loader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        
        with autocast():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss / gradient_accumulation_steps
        
        scaler.scale(loss).backward()
        
        if (step + 1) % gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

  with autocast():
100%|██████████████████████████████████████████████████████████████████████████████████| 69/69 [00:24<00:00,  2.80it/s]


Epoch 1/10, Loss: 0.9137


100%|██████████████████████████████████████████████████████████████████████████████████| 69/69 [00:23<00:00,  2.89it/s]


Epoch 2/10, Loss: 0.6808


100%|██████████████████████████████████████████████████████████████████████████████████| 69/69 [00:24<00:00,  2.87it/s]


Epoch 3/10, Loss: 0.4692


100%|██████████████████████████████████████████████████████████████████████████████████| 69/69 [00:24<00:00,  2.76it/s]


Epoch 4/10, Loss: 0.3246


100%|██████████████████████████████████████████████████████████████████████████████████| 69/69 [00:26<00:00,  2.57it/s]


Epoch 5/10, Loss: 0.2480


100%|██████████████████████████████████████████████████████████████████████████████████| 69/69 [00:25<00:00,  2.72it/s]


Epoch 6/10, Loss: 0.2041


100%|██████████████████████████████████████████████████████████████████████████████████| 69/69 [00:23<00:00,  2.95it/s]


Epoch 7/10, Loss: 0.1954


100%|██████████████████████████████████████████████████████████████████████████████████| 69/69 [00:23<00:00,  2.97it/s]


Epoch 8/10, Loss: 0.1921


100%|██████████████████████████████████████████████████████████████████████████████████| 69/69 [00:23<00:00,  2.96it/s]


Epoch 9/10, Loss: 0.1924


100%|██████████████████████████████████████████████████████████████████████████████████| 69/69 [00:23<00:00,  2.96it/s]

Epoch 10/10, Loss: 0.1918





In [10]:
# Validation Loop

In [11]:
# Validation Loop
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for batch in tqdm(val_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        
        correct += (preds == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"Validation Accuracy: {accuracy:.4f}")
torch.save(model.state_dict(), 'roberta_gpu_classifier.pth')
print("✅ Model Saved Successfully!")

100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [00:04<00:00,  3.63it/s]


Validation Accuracy: 0.8080
✅ Model Saved Successfully!


In [12]:
# Testing with New Input

In [13]:
# Load Model for Inference
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(le.classes_))
model.load_state_dict(torch.load('roberta_gpu_classifier.pth'))
model.to(device)
model.eval()

# Prediction Function
def predict_topic(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return le.inverse_transform([predicted_class])[0]

# Test Predictions
test_texts = [
    "I feel anxious all the time, and I don't know how to cope.",
    "My partner and I are having relationship problems.",
    "I am struggling with substance abuse and need help quitting."
]

for text in test_texts:
    print(f"📝 Input: {text}\n🔮 Predicted Topic: {predict_topic(text)}\n")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load('roberta_gpu_classifier.pth'))


📝 Input: I feel anxious all the time, and I don't know how to cope.
🔮 Predicted Topic: Anxiety & Stress

📝 Input: My partner and I are having relationship problems.
🔮 Predicted Topic: Relationships

📝 Input: I am struggling with substance abuse and need help quitting.
🔮 Predicted Topic: Addiction & Abuse



In [14]:
# Load Model for Inference
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(le.classes_))
model.load_state_dict(torch.load('roberta_gpu_classifier.pth'))
model.to(device)
model.eval()

# Prediction Function
def predict_topic(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return le.inverse_transform([predicted_class])[0]

# Test Predictions
test_texts = [
        "I want to quit smoking but can't seem to stop.",
        "My self-esteem is so low that I avoid social situations.",
        "I feel hopeless and have no motivation to get out of bed.",
         "My sibling and I are always in conflict, and it's exhausting.",
        "I'm struggling to come to terms with my sexual orientation.",
        "I feel disconnected from my faith and purpose.",
    "I'm experiencing burnout from my high-pressure job.",
    "My workplace has become toxic, and I can't handle it anymore."
]

for text in test_texts:
    print(f"📝 Input: {text}\n🔮 Predicted Topic: {predict_topic(text)}\n")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load('roberta_gpu_classifier.pth'))


📝 Input: I want to quit smoking but can't seem to stop.
🔮 Predicted Topic: Addiction & Abuse

📝 Input: My self-esteem is so low that I avoid social situations.
🔮 Predicted Topic: Anxiety & Stress

📝 Input: I feel hopeless and have no motivation to get out of bed.
🔮 Predicted Topic: Mental Health Disorders

📝 Input: My sibling and I are always in conflict, and it's exhausting.
🔮 Predicted Topic: Family & Parenting

📝 Input: I'm struggling to come to terms with my sexual orientation.
🔮 Predicted Topic: Identity & Spirituality

📝 Input: I feel disconnected from my faith and purpose.
🔮 Predicted Topic: Anxiety & Stress

📝 Input: I'm experiencing burnout from my high-pressure job.
🔮 Predicted Topic: Anxiety & Stress

📝 Input: My workplace has become toxic, and I can't handle it anymore.
🔮 Predicted Topic: Anxiety & Stress



In [15]:
import torch
import joblib
from transformers import RobertaTokenizer

# Save Model, Tokenizer, and LabelEncoder Together
checkpoint = {
    'model_state_dict': model.state_dict(),  # Model weights
    'tokenizer': tokenizer,                 # Tokenizer
    'label_encoder': le                     # Label Encoder
}

torch.save(checkpoint, 'rag_model_checkpoint.pth')
print("✅ Model, Tokenizer, and LabelEncoder saved successfully in 'rag_model_checkpoint.pth'")

✅ Model, Tokenizer, and LabelEncoder saved successfully in 'rag_model_checkpoint.pth'
