In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import transformers

In [66]:
import pandas as pd

# Read the two source CSV files from the local ./database folder.
synthetic_csv_path = "./database/Synthetic_Data_10K.csv"
interview_csv_path = "./database/Interview_Data_6K.csv"

df1 = pd.read_csv(synthetic_csv_path)
df2 = pd.read_csv(interview_csv_path)

In [67]:
# (rows, columns) of the frame loaded from Synthetic_Data_10K.csv
df1.shape

(9774, 3)

In [68]:
# (rows, columns) of the frame loaded from Interview_Data_6K.csv
df2.shape

(6310, 3)

In [69]:
# Show the column index of each source frame, in load order.
for source_frame in (df1, df2):
    print(source_frame.columns)

Index(['instruction', 'input', 'output'], dtype='object')
Index(['instruction', 'input', 'output'], dtype='object')


In [70]:
# Columns present in both frames; checked before the frames are concatenated below.
common_cols = df1.columns.intersection(df2.columns)
print("Common columns:", common_cols)

Common columns: Index(['instruction', 'input', 'output'], dtype='object')


In [71]:
# Stack both frames row-wise; ignore_index=True renumbers the rows 0..n-1
# instead of keeping each frame's own index.
frames_to_stack = [df1, df2]
df_combined = pd.concat(frames_to_stack, ignore_index=True)

In [72]:
# (rows, columns) of the concatenated frame
df_combined.shape

(16084, 3)

In [73]:
import pandas as pd
# Plain-text preview of the first five rows.
# The earlier bare `df_combined.head()` was not the cell's last statement, so
# its value was silently discarded; only this print produces output.
print(df_combined.head())

                                         instruction  \
0  You are a helpful mental health counselling as...   
1  You are a helpful mental health counselling as...   
2  You are a helpful mental health counselling as...   
3  You are a helpful mental health counselling as...   
4  You are a helpful mental health counselling as...   

                                               input  \
0  I think I might be developing a substance abus...   
1  Parenting has become such a challenge for me. ...   
2  Intimacy has always been a struggle for me. I ...   
3  I've been struggling with substance abuse for ...   
4  Being a parent is overwhelming and exhausting....   

                                              output  
0  I'm really glad that you reached out and share...  
1  I can understand how challenging parenting can...  
2  I can understand how challenging it must be fo...  
3  I'm really glad that you reached out and share...  
4  Parenting can definitely be overwhelming and e..

In [74]:
# Fill missing messages BEFORE casting to str. Casting first turns NaN into the
# literal text "nan", after which fillna has nothing left to replace and the
# placeholder is never applied.
df_combined['input'] = df_combined['input'].fillna("I'm fine").astype(str)


In [75]:
# Keyword regexes used to derive one binary label per condition.
# Each pattern is a plain alternation that is applied with
# `Series.str.contains(..., case=False)`, so matching is case-insensitive and by
# substring (there are no word boundaries: e.g. "rage" also hits "average").
#
# Apostrophes are written as ['’] so that both the straight (') and the
# typographic (’) form match; the messages in the data use the straight form.
anxiety_keywords = r"anxiety|anxious|panic|nervous|worried|racing thoughts|chest tightness|heart pounding|restless"
depression_keywords = r"depressed|hopeless|empty|sad|numb|lost interest|no energy"
eating_disorder_keywords = r"eating disorder|binge|purge|bulimia|anorexia|restricting|body image"
family_issues_keywords = r"family problems|siblings fight|step siblings fight|parents fights|childhood abuse|parents fighting|scared as a child|early trauma"
# Fixed: "insomania" typo, stray leading space before "disturbed sleep".
sleeping_disorder_keywords = r"troubles sleeping|insomnia|sleeping disorders|can['’]t sleep|nightmares|disturbed sleep|tired|waking up"
# Fixed: "overwhelmed" was listed twice.
chronic_stress_keywords = r"burned out|exhausted|too much pressure|overwhelmed|workplace stress|chronic stress|stressed|very stressed|really stressed"
worry_keywords = r"worries|hopeless|helpless|lost|uncertain|uncertainty|fear"
bipolar_disorder_keywords = r"mood swings|mania|depressive episodes|energetic|impulsive|irritable|high energy"
# Fixed: "obesessive compulsive" typo.
ocd_keywords = r"obsessive|compulsive|obsessive compulsive|rituals|checking|cleaning|intrusive thoughts|repeat actions"
body_dysmorphic_disorder_keywords = r"hate my appearance|ugly|look wrong|body image|physical flaw|obsessed with looks"
borderline_personality_keywords = r"fear of abandonment|unstable relationships|impulsive|mood swings|intense emotions"
ptsd_keywords = r"trauma|flashbacks|nightmares|triggered|assault|abuse|military|fear|avoidance"
social_anxiety_keywords = r"fear of people|shy|avoid crowds|panic in public|speaking anxiety|judged"
schizophrenia_keywords = r"hearing voices|delusions|hallucinations|paranoia|disorganized|psychosis"
autism_disorder_keywords = r"overwhelmed|sensory|routines|socially awkward|communication issues|meltdowns"
dissociative_disorder_keywords = r"zoning out|out of body|disconnected|memory loss|dissociate|numb"
adhd_keywords = r"can['’]t focus|distracted|hyper|impulsive|forgetful|restless"
grief_keywords = r"loss|passed away|grieving|can['’]t move on|sadness after death"
suicide_keywords = r"cut myself|suicidal|want to die|no reason to live|hurting myself"
addiction_keywords = r"addicted|alcohol|drugs|relapse|rehab|can['’]t stop|withdrawal|cravings"
lse_keywords = r"not good enough|worthless|no confidence|self-doubt|insecure"
anger_keywords = r"rage|can['’]t control anger|lash out|violent|temper|irritability"
relationship_issues_keywords = r"toxic|break up|cheated|trust issues|abandoned|emotionally unavailable"
caregiver_fatigue_keywords = r"caring for|overwhelmed by responsibility|parent with illness|burnout from caregiving"
phobia_keywords = r"phobia|fear of heights|claustrophobic|terrified of insects|irrational fears"
academic_pressure_keywords = r"school stress|academic anxiety|exams|failing|overwhelmed by studies"

In [76]:
# Label column -> keyword regex. Insertion order is the order in which the
# columns are appended to df_combined.
label_patterns = {
    'has_anxiety': anxiety_keywords,
    'has_depression': depression_keywords,
    'has_eating_disorder': eating_disorder_keywords,
    'family_issues': family_issues_keywords,
    'sleeping_disorder': sleeping_disorder_keywords,
    'chronic_stress': chronic_stress_keywords,
    'worry': worry_keywords,
    'bipolar': bipolar_disorder_keywords,
    'ocd': ocd_keywords,
    'dysmorphic': body_dysmorphic_disorder_keywords,
    'borderline_personality': borderline_personality_keywords,
    'PTSD': ptsd_keywords,
    'social_anxiety': social_anxiety_keywords,
    'schizophrenia': schizophrenia_keywords,
    'autism_disorder': autism_disorder_keywords,
    'dissociative_disorder': dissociative_disorder_keywords,
    'ADHD': adhd_keywords,
    'grief': grief_keywords,
    'suicide': suicide_keywords,
    'addiction': addiction_keywords,
    'low_self_esteem': lse_keywords,
    'anger_issues': anger_keywords,
    'relationship_issues': relationship_issues_keywords,
    'caregiver_fatigue': caregiver_fatigue_keywords,
    'phobias': phobia_keywords,
    'academic_pressure': academic_pressure_keywords,
}

# A label is 1 when the message matches its regex (case-insensitive), else 0.
for label_column, keyword_pattern in label_patterns.items():
    keyword_hit = df_combined['input'].str.contains(keyword_pattern, case=False)
    df_combined[label_column] = keyword_hit.astype(int)


In [77]:
# Count missing values per column (text columns plus the derived 0/1 labels).
df_combined.isnull().sum()

instruction               0
input                     0
output                    0
has_anxiety               0
has_depression            0
has_eating_disorder       0
family_issues             0
sleeping_disorder         0
chronic_stress            0
worry                     0
bipolar                   0
ocd                       0
dysmorphic                0
borderline_personality    0
PTSD                      0
social_anxiety            0
schizophrenia             0
autism_disorder           0
dissociative_disorder     0
ADHD                      0
grief                     0
suicide                   0
addiction                 0
low_self_esteem           0
anger_issues              0
relationship_issues       0
caregiver_fatigue         0
phobias                   0
academic_pressure         0
dtype: int64

In [78]:
# Feature: the raw user message.
X = df_combined['input']

# Targets: every remaining column except the text columns and the label
# columns listed here, which are excluded from training.
y = df_combined.drop(columns=[
    'input',
    'output', 'instruction', 'family_issues', 'dysmorphic', 'suicide',
    'schizophrenia', 'has_eating_disorder', 'phobias',
])

In [79]:
# Display the message Series (pandas truncates the middle rows).
X

0        I think I might be developing a substance abus...
1        Parenting has become such a challenge for me. ...
2        Intimacy has always been a struggle for me. I ...
3        I've been struggling with substance abuse for ...
4        Being a parent is overwhelming and exhausting....
                               ...                        
16079    I'm feeling overwhelmed and guilty about my un...
16080    I've been feeling overwhelmed lately, and I've...
16081    I understand what you're saying, but I can't s...
16082    I've been thinking about my relationship with ...
16083    I've been feeling stressed out lately because ...
Name: input, Length: 16084, dtype: object

In [80]:
# Number of messages available before any train/test split.
len(X)

16084

In [81]:
# NOTE(review): y_train is first assigned by the train_test_split call further
# down, so on a fresh kernel this cell raises NameError; it only works after
# that later cell has been run.
len(y_train)

12867

In [82]:
# Display the target frame: one 0/1 column per retained label.
y

Unnamed: 0,has_anxiety,has_depression,sleeping_disorder,chronic_stress,worry,bipolar,ocd,borderline_personality,PTSD,social_anxiety,autism_disorder,dissociative_disorder,ADHD,grief,addiction,low_self_esteem,anger_issues,relationship_issues,caregiver_fatigue,academic_pressure
0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
2,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16079,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
16080,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
16081,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
16082,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [83]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split

In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the raw text first, then fit TF-IDF on the training messages only.
# The split previously consumed X_vectorized before the cell that creates it,
# which fails on a fresh kernel, and the vectorizer was fitted on every row, so
# test-set vocabulary and IDF statistics leaked into training.
# Row assignment depends only on the number of rows and random_state, so
# y_train / y_test contain the same rows as before.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Unigrams and bigrams, capped at 5000 features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train_text)
X_test_vectorized = vectorizer.transform(X_test_text)

# Full matrix under the training-fitted vocabulary; kept so that any later cell
# that still refers to X_vectorized continues to run.
X_vectorized = vectorizer.transform(X)

In [None]:
# One random forest per label column. The LogisticRegression wrapper that used
# to be built on the preceding line was overwritten before it was ever fitted,
# so it has been dropped. random_state pins the forests for reproducible runs
# (same seed as the train/test split).
model = MultiOutputClassifier(RandomForestClassifier(random_state=42))

model.fit(X_train_vectorized, y_train)


In [None]:
import numpy as np

# Probability above which a label is predicted as present. The old comment
# said 0.3 while the code used 0.4; the value actually used is kept here.
DECISION_THRESHOLD = 0.4

# Get the list of prediction probabilities (one array per label)
proba_list = model.predict_proba(X_test_vectorized)

# Take column 1 of each array and stack them as (rows, labels).
# Assumes every per-label estimator saw both classes 0 and 1 during fit, so
# that column 1 is the probability of the positive class - TODO confirm.
positive_scores = np.array([score[:, 1] for score in proba_list]).T
y_test_pred = (positive_scores > DECISION_THRESHOLD).astype(int)


In [None]:
# Show the thresholded 0/1 predictions (rows = test messages, columns = labels).
print(y_test_pred)

In [None]:
from sklearn.metrics import accuracy_score

# NOTE(review): with multi-column targets this is presumably exact-match
# (subset) accuracy - confirm that is the intended metric.
exact_match_ratio = accuracy_score(y_test, y_test_pred)
print("Accuracy:", exact_match_ratio)

In [None]:
from sklearn.metrics import hamming_loss

# Hamming loss between the true and the thresholded label matrices.
label_error_rate = hamming_loss(y_test, y_test_pred)
print("Hamming Loss:", label_error_rate)

In [None]:
from sklearn.metrics import f1_score

# Report F1 under both averaging modes, micro first, then macro.
for heading, averaging in (("F1 (micro):", 'micro'), ("F1 (macro):", 'macro')):
    print(heading, f1_score(y_test, y_test_pred, average=averaging))

In [None]:
from sklearn.metrics import classification_report

# Print the per-label report once. The cell used to print it twice: first
# without zero_division (which raises the undefined-metric warning) and then
# again with it. zero_division=0 silences that warning.
print(classification_report(
    y_test, y_test_pred,
    target_names=y.columns,
    zero_division=0
))




In [98]:
# Number of label columns in y (one binary target per column).
num_classes = y.shape[1]

In [151]:
import joblib

# Save model
# Writes the fitted multi-output classifier to the current working directory.
joblib.dump(model, "psychai_model.pkl")

['psychai_model.pkl']

In [152]:
# Save the vectorizer (e.g., TfidfVectorizer)
# The model was trained on this vectorizer's features, so the two files must be
# loaded together to score new text.
joblib.dump(vectorizer, "psychai_vectorizer.pkl")

['psychai_vectorizer.pkl']

In [148]:
# Reload the model file written by the joblib.dump call earlier in this notebook.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
loaded_model = joblib.load("psychai_model.pkl")  # Load the trained model

In [149]:
# Predict on the test matrix with the reloaded model. This uses the estimator's
# own predict(), not the 0.4 probability cut-off applied to y_test_pred.
preds = loaded_model.predict(X_test_vectorized)

In [47]:
# Label names, taken from the columns of the target frame y.
labels = y.columns

In [48]:
# Number of labels the model predicts.
len(labels)

20

In [60]:
# Report the installed transformers version.
# NOTE(review): transformers is imported but not otherwise used in the cells shown here.
import transformers
print(transformers.__version__)

4.53.0
