# Classification for Advice Types

## Import data and understand costs for classification using OpenAI

In [4]:
import pandas as pd
from transformers import pipeline
import numpy as np
import csv
import os


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the counsel-chat CSV; later cells read its "answerText" column.
df = pd.read_csv("data/20200325_counsel_chat.csv")

In [6]:
# Back-of-the-envelope cost of labelling every unique answer through the API.
CHARS_PER_TOKEN = 4             # rule-of-thumb characters per token
INPUT_USD_PER_MTOK = 1.100      # assumed input price, USD per million tokens
OUTPUT_USD_PER_MTOK = 4.4       # assumed output price, USD per million tokens
OUTPUT_TOKENS_PER_ANSWER = 20   # assumed size of each returned label list
TOKENS_PER_MILLION = 1000000

answers = df["answerText"].dropna().unique().tolist()
super_string = ' '.join(answers)

# Input side: all answer text concatenated, converted from characters to tokens.
num_tokens = len(super_string) / CHARS_PER_TOKEN
input_price = INPUT_USD_PER_MTOK * num_tokens / TOKENS_PER_MILLION

# Output side: a fixed token budget for every answer.
output_price = OUTPUT_USD_PER_MTOK * len(answers) * OUTPUT_TOKENS_PER_ANSWER / TOKENS_PER_MILLION

total_price = input_price + output_price
print(input_price, output_price)
print(f'Minimum cost of classification using openAI o4-mini ${total_price}')

0.542162775 0.176704
Minimum cost of classification using openAI o4-mini $0.718866775


## Zero-shot Classification Approach

In [7]:
# Zero-shot classification pipeline backed by the facebook/bart-large-mnli checkpoint.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# Candidate advice types that every example below is scored against.
labels = ["Practical Life Advice", "Emotional Support and Validation", "Resource suggestion", "Psychoeducation"]

Device set to use mps:0


In [8]:
# Hand-written answer meant to exemplify practical life advice.
sleep_routine_text = "I recommend establishing a consistent sleep schedule and limiting screen time an hour before bed. Try using a sleep diary to track your patterns and notice what improves your rest."
practical_life_advice = classifier(
    sleep_routine_text,
    candidate_labels=labels,
    multi_label=True,
)
print(practical_life_advice["labels"], practical_life_advice["scores"])

['Practical Life Advice', 'Resource suggestion', 'Emotional Support and Validation', 'Psychoeducation'] [0.8553107976913452, 0.7835465669631958, 0.3436373174190521, 0.25082144141197205]


In [62]:
# Hand-written answer meant to exemplify emotional support and validation.
validation_text = "It sounds like you’ve been carrying a lot on your own lately. I want you to know that it’s completely valid to feel overwhelmed, and I’m here to support you through this."
empathetic_response = classifier(
    validation_text,
    candidate_labels=labels,
    multi_label=True,
)
print(empathetic_response["labels"], empathetic_response["scores"])

['Empathetic response', 'Resource suggestion', ' Practical Life Advice'] [0.9629988074302673, 0.6677781939506531, 0.46740734577178955]


In [63]:
# Hand-written answer meant to exemplify a resource suggestion.
resource_text = "You might find it helpful to try the Headspace app for guided meditations, or explore 'The Feeling Good Handbook' by Dr. David Burns — it’s a great resource for managing negative thoughts."
resource_suggestion = classifier(
    resource_text,
    candidate_labels=labels,
    multi_label=True,
)
print(resource_suggestion["labels"], resource_suggestion["scores"])

['Resource suggestion', 'Empathetic response', ' Practical Life Advice'] [0.962764322757721, 0.5002696514129639, 0.014397886581718922]


In [11]:
# Hand-written answer meant to exemplify psychoeducation.
anxiety_explainer_text = "Anxiety is your body’s natural response to stress or danger. It’s part of our ‘fight or flight’ system, which helps protect us in threatening situations. When you feel anxious, your brain is sending signals to release stress hormones like adrenaline. These can cause symptoms like a racing heart, shortness of breath, or dizziness—similar to what you’ve described during your panic attacks."
psychoeducation = classifier(
    anxiety_explainer_text,
    candidate_labels=labels,
    multi_label=True,
)
print(psychoeducation["labels"], psychoeducation["scores"])

['Resource suggestion', 'Psychoeducation', 'Practical Life Advice', 'Emotional Support and Validation'] [0.7540246248245239, 0.35327744483947754, 0.23505909740924835, 0.03809049725532532]


The zero-shot classification approach used above is free. However, it may not be the most accurate. From the examples above, it seems like it's good at classifying the first label but not that great with the second or third, which leaves room for false positives on the second and third labels. OpenAI will most likely be a lot more accurate, but it is not free. Given the time constraint and the low cost of classification using OpenAI, the most reasonable move seems to be to label a reasonable number of rows with OpenAI and then use this synthetically labelled data to train my own classifier. There is a chance that OpenAI might hallucinate or give undesirable results, but overall it is the least time-consuming option and the trade-off is worth it. Additionally, the labels could be useful for training a supervised machine learning algorithm in the future and using that to classify instead of making API calls to OpenAI.

## Classification using OpenAI

In [66]:
import time
from openai import OpenAI

# Read the API key from the environment so the secret is never stored in the
# notebook. Export OPENAI_API_KEY before starting the kernel.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Every non-missing answer as a plain list (duplicates kept).
answers = df["answerText"].dropna().tolist()

# Empty accumulator for classification results.
classified = []

# Instructions sent as the system message with every answer. The category
# names listed here are the ones the reply is expected to contain.
system_prompt = """You are a classifier that assigns therapist answers into one or more of the following categories:
   Practical Life Advice: Actionable suggestions to improve sleep, routine, self-care, gratitude, or navigating social environments.
    Emotional Support and Validation: Suggest ways the therapist can validate the patient’s feelings and build rapport.,
    Resource Suggestion: Recommend tools, exercises, or materials the therapist can offer the patient..,
    Psychoeducation: Diagnose whether the patient's symptoms align with a mental health condition, based on their description.


Return only the most relevant categories as a Comma separated list. Do not include anything else"""



In [72]:
# File to save results
output_file = "data/combined_results.csv"

# Number of API attempts per row and the pause between failed attempts.
MAX_ATTEMPTS = 3
RETRY_DELAY_SECONDS = 5

# Create output file and write header if it doesn't exist
if not os.path.exists(output_file):
    with open(output_file, mode="w", newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["index", "answerText", "adviceType"])

# Collect the row indices that already carry a real label so that re-running
# this cell resumes where it stopped instead of billing for, and appending,
# the same rows again. Rows previously recorded as "error" are retried.
with open(output_file, mode="r", newline='', encoding='utf-8') as f:
    already_labelled = {
        record.get("index")
        for record in csv.DictReader(f)
        if record.get("adviceType") != "error"
    }

# Open the results file once in append mode; each row is flushed as soon as
# it is written so an interrupted run keeps everything processed so far.
with open(output_file, mode="a", newline='', encoding='utf-8') as f:
    writer = csv.writer(f)

    # Iterate over the dataframe
    for i, row in df.head(5000).iterrows():
        if str(i) in already_labelled:
            continue

        # Missing answers are read as NaN (a float), which has no .strip();
        # skip those and blank answers instead of crashing mid-run.
        raw_answer = row['answerText']
        if not isinstance(raw_answer, str) or not raw_answer.strip():
            continue

        answer = raw_answer.strip()
        label = "error"  # Default to "error" in case of failure

        # Try a few times before moving to the next row
        for attempt in range(MAX_ATTEMPTS):
            try:
                response = client.chat.completions.create(
                    model="gpt-4.1-mini",
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": answer}
                    ],
                    temperature=0,
                )
                label = response.choices[0].message.content.strip()
                break  # Exit the retry loop if successful
            except Exception as e:
                # Best effort: report the failure and retry; the row keeps
                # the "error" label if every attempt fails.
                print(f"Error on row {i}, attempt {attempt + 1}: {e}")
                if attempt < MAX_ATTEMPTS - 1:
                    time.sleep(RETRY_DELAY_SECONDS)

        # Write the result to CSV immediately after processing the row
        writer.writerow([i, answer, label])
        f.flush()

## Training my own multi-class, multi-label classifier using OpenAI generated labels

### Selecting LinearSVC for the model. Implementing Multi-Label, Multi-Class Classification

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import  hamming_loss
from sklearn.multioutput import MultiOutputClassifier
import pickle

# Format Dataframe for training
def get_training_df(training_dataset='data/combined_results.csv'):
    """Load the labelled answers and expand the label string into boolean columns.

    Parameters
    ----------
    training_dataset : str, optional
        Path of the CSV to read. It must contain an ``answerText`` column and
        an ``adviceType`` column holding the comma separated label string.
        Defaults to the file written by the labelling step.

    Returns
    -------
    pandas.DataFrame
        One row per usable answer with a ``document`` column (the answer text)
        followed by one boolean column per advice type. A column is True when
        the advice type's name occurs in that row's ``adviceType`` string.

    Notes
    -----
    Rows with no answer text, no label, or the placeholder label ``"error"``
    (written when every API attempt failed) are dropped. Keeping them would
    train the model on answers marked as belonging to no category at all.
    """
    advice_types = [
        'Practical Life Advice',
        'Emotional Support and Validation',
        'Resource Suggestion',
        'Psychoeducation',
    ]

    data = pd.read_csv(training_dataset)

    # Keep only rows that carry both text and a genuine label.
    label_text = data['adviceType']
    has_label = label_text.notna() & (label_text.astype(str).str.strip() != 'error')
    has_text = data['answerText'].notna()
    data = data.loc[has_label & has_text].reset_index(drop=True)

    data['document'] = data['answerText']

    # Create a column for each advice type (literal substring match).
    label_text = data['adviceType'].astype(str)
    for advice in advice_types:
        data[advice] = label_text.str.contains(advice, regex=False)

    return data[['document'] + advice_types]

# Get training and target dataframes
def get_training_and_target_df(data):
    """Split the formatted frame into model inputs and label targets.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``document`` column and the four advice-type columns.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the ``document`` column and ``y`` is the
        sub-frame holding the four advice-type columns, in a fixed order.
    """
    target_columns = [
        'Practical Life Advice',
        'Emotional Support and Validation',
        'Resource Suggestion',
        'Psychoeducation',
    ]
    documents = data['document']
    targets = data[target_columns]
    return documents, targets

# Model Training
def train_model(X, y, save=False):
    """Fit the TF-IDF + LinearSVC multi-label pipeline and optionally persist it.

    Parameters
    ----------
    X : sequence of str
        Documents to train on.
    y : array-like
        One column per label, one row per document.
    save : bool, optional
        When True, pickle the fitted pipeline to
        ``classifier/therapal_classifier.sav``. Defaults to False.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline.
    """
    model = make_pipeline(
        TfidfVectorizer(ngram_range=(1, 3), stop_words='english'),
        MultiOutputClassifier(LinearSVC())
    )
    model.fit(X, y)
    if save:
        model_path = 'classifier/therapal_classifier.sav'
        # Create the target folder on first use so saving does not fail on a
        # fresh checkout.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        # Context manager guarantees the file is flushed and closed.
        with open(model_path, 'wb') as model_file:
            pickle.dump(model, model_file)
    return model

# K-Fold Cross Validation
def cross_validate_model(x, y, pipe, test_size=10):
    """Score ``pipe`` with shuffled K-fold cross validation.

    Parameters
    ----------
    x : pandas.Series
        Documents; rows are selected positionally with ``.iloc``.
    y : pandas.DataFrame
        Label columns aligned with ``x``.
    pipe : estimator
        Object exposing ``fit`` and ``predict``. It is refit on every fold,
        so it is left fitted on the last training split.
    test_size : int, optional
        Passed to ``KFold`` as ``n_splits`` (the number of folds). Defaults to 10.

    Returns
    -------
    tuple
        ``(scores, mean_score)`` where each score is ``1 - hamming_loss`` on
        that fold's held-out rows.
    """
    splitter = KFold(n_splits=test_size, shuffle=True, random_state=42)
    fold_scores = []
    for train_rows, held_out_rows in splitter.split(x, y):
        pipe.fit(x.iloc[train_rows], y.iloc[train_rows])
        predicted = pipe.predict(x.iloc[held_out_rows])
        fold_scores.append(1 - hamming_loss(y.iloc[held_out_rows], predicted))
    cross_val_results = np.array(fold_scores)
    return cross_val_results, cross_val_results.mean()


# Function to predict new documents
def get_prediction(document, model):
    """Return the advice-type names that ``model`` assigns to one document.

    Parameters
    ----------
    document : str
        Text to classify; it is passed to ``model.predict`` as a one-item list.
    model : object
        Anything with a ``predict`` method whose first output row holds one
        truthy/falsy flag per advice type, in the order used below.

    Returns
    -------
    list of str
        Names of the advice types whose flag is truthy, in label order.
    """
    labels = ['Practical Life Advice', 'Emotional Support and Validation', 'Resource Suggestion', 'Psychoeducation']
    predicted_row = model.predict([document])[0]
    result = []
    for label, is_present in zip(labels, predicted_row):
        if is_present:
            result.append(label)
    return result



In [27]:
# Show the formatted training frame as this cell's output.
get_training_df()

Unnamed: 0,document,Practical Life Advice,Emotional Support and Validation,Resource Suggestion,Psychoeducation
0,"If everyone thinks you're worthless, then mayb...",True,True,False,False
1,"Hello, and thank you for your question and see...",False,True,True,False
2,First thing I'd suggest is getting the sleep y...,True,True,False,False
3,Therapy is essential for those that are feelin...,False,False,True,True
4,I first want to let you know that you are not ...,True,True,False,False
...,...,...,...,...,...
2366,There are probably no two therapists alike bec...,False,True,False,False
2367,"Each counselor may have a different process, s...",True,True,False,False
2368,"After meeting a client, many Counselors will a...",False,False,True,True
2369,A good therapist will discuss what brought you...,True,True,False,False


### Testing model accuracy using k-fold cross validation

In [28]:
# Unfitted pipeline used for cross validation: TF-IDF features over 1- to
# 3-grams (English stop words removed) feeding a multi-output LinearSVC.
tfidf_features = TfidfVectorizer(ngram_range=(1, 3), stop_words='english')
multi_label_svc = MultiOutputClassifier(LinearSVC())
pipe = make_pipeline(tfidf_features, multi_label_svc)

In [29]:
# Split the formatted frame into the text column (X) and the four label columns (y).
X, y = get_training_and_target_df(get_training_df())

In [30]:
# 10-fold cross validation; the output is (per-fold scores, mean score),
# where each score is 1 - hamming loss.
cross_validate_model(X, y, pipe, test_size=10)

(array([0.83928571, 0.84810127, 0.84599156, 0.85443038, 0.82805907,
        0.8407173 , 0.84599156, 0.85021097, 0.84388186, 0.83438819]),
 np.float64(0.8431057866184449))

In [10]:
# Fit on the full labelled set; save=True also pickles the fitted pipeline to disk.
model = train_model(X, y, save=True)

### Test model functionality using a test_string

In [32]:
# Smoke test: classify one hand-written answer with the trained model.
# NOTE(review): the text begins with a double quote that is never closed;
# it is part of the string passed to the model.
test_document = """
"It's completely normal to feel overwhelmed at times. Try to take things one step at a time, and don’t hesitate to lean on those around you. Have you considered making a daily routine or journaling your thoughts? It might help bring some structure and emotional clarity. If these feelings persist, speaking to a mental health professional could offer you additional support.
"""

get_prediction(test_document, model)

['Practical Life Advice',
 'Emotional Support and Validation',
 'Resource Suggestion']