In [5]:
import pandas as pd
import numpy as np

In [6]:
df = pd.read_csv('mental_health_dataset.csv',engine='python')

In [7]:
df['Gender'] = df['Gender'].astype('category')
df.drop(['Mood_Description', 'Sentiment_Score'], axis=1, inplace=True)

In [8]:
from nrclex import NRCLex
import nltk
from nltk.corpus import stopwords
import plotly.express as px


In [10]:
from collections import defaultdict

def get_emotion_vector(text):
    if pd.isna(text) or not str(text).strip():
        # Handle empty or NaN reflections
        return defaultdict(float)
    
    doc = NRCLex(text)
    emotion_scores = doc.raw_emotion_scores
    total = sum(emotion_scores.values()) or 1  # avoid division by zero
    
    # Normalize scores
    normalized = {emotion: count / total for emotion, count in emotion_scores.items()}
    
    # Make sure all emotions are represented
    all_emotions = ['fear', 'anger', 'anticipation', 'trust', 'surprise',
                    'sadness', 'joy', 'disgust', 'negative', 'positive']
    full_vector = {emotion: normalized.get(emotion, 0.0) for emotion in all_emotions}
    
    return full_vector

# Apply the function to each row
emotion_vectors = df['Daily_Reflections'].apply(get_emotion_vector)

# Convert list of dicts to a DataFrame
emotion_df = pd.DataFrame(emotion_vectors.tolist())

# Concatenate with the original df
df_with_emotions = pd.concat([df, emotion_df], axis=1)

print(df_with_emotions)


     Student_ID  Age  Gender   GPA  Stress_Level  Anxiety_Score  \
0             1   23   Other  2.52             5             20   
1             2   19    Male  2.74             5              3   
2             3   21  Female  3.53             5             11   
3             4   18    Male  2.04             4             15   
4             5   19   Other  2.87             1              2   
..          ...  ...     ...   ...           ...            ...   
495         496   20   Other  3.34             4              0   
496         497   18  Female  3.22             2              7   
497         498   23   Other  2.86             4             17   
498         499   18  Female  2.45             4             14   
499         500   18   Other  3.52             1              0   

     Depression_Score                                  Daily_Reflections  \
0                   6  Onto foreign do environmental anyone every nea...   
1                   7  Party but others vis

In [11]:
numerical_cols = ['Age','GPA','Stress_Level', 'Sleep_Hours', 'Anxiety_Score', 'Depression_Score', 'Steps_Per_Day', 'joy', 'sadness', 'fear', 'anger', 'trust', 'anticipation', 'surprise', 'positive', 'negative']

In [12]:
# standardize:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_with_emotions[numerical_cols] = scaler.fit_transform(df_with_emotions[numerical_cols])


In [56]:
df_with_emotions

Unnamed: 0,Student_ID,Age,Gender,GPA,Stress_Level,Anxiety_Score,Depression_Score,Daily_Reflections,Sleep_Hours,Steps_Per_Day,...,fear,anger,anticipation,trust,surprise,sadness,joy,disgust,negative,positive
0,1,1.159999,Other,-0.566791,1.354829,1.607159,-0.916966,Onto foreign do environmental anyone every nea...,0.483148,-0.963883,...,-0.589006,-0.403906,-0.754115,-0.959785,-0.409189,-0.401565,-0.739343,0.000000,5.918226,-1.297918
1,2,-0.601579,Male,-0.138201,1.354829,-1.074594,-0.796598,Party but others visit admit industry country ...,-0.633426,-0.566872,...,-0.589006,-0.403906,-0.754115,-0.959785,-0.409189,-0.401565,-0.739343,0.000000,-0.558704,2.905716
2,3,0.279210,Female,1.400826,1.354829,0.187407,1.249664,Religious sure wait do chance decade according...,1.468360,0.793510,...,-0.589006,-0.403906,0.977859,-0.959785,2.005550,-0.401565,-0.739343,0.000000,1.060529,-0.247009
3,4,-1.041973,Male,-1.501896,0.655743,0.818408,0.045981,A task effect entire coach join series.,1.402679,-0.263157,...,-0.589006,-0.403906,-0.754115,1.588991,-0.409189,-0.401565,-0.739343,0.000000,-0.558704,0.803899
4,5,-0.601579,Other,0.115057,-1.441515,-1.232344,-1.157703,Knowledge several camera wait week write quali...,-0.107979,-1.201176,...,-0.589006,0.271686,0.311715,-0.175546,0.333807,-0.401565,0.895866,0.076923,0.437747,-0.651205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,496,-0.161184,Other,1.030681,0.655743,-1.547845,0.888559,Land floor page trade social away animal cut e...,-0.436383,1.347702,...,-0.589006,-0.403906,-0.754115,4.137768,-0.409189,-0.401565,-0.739343,0.000000,-0.558704,-1.297918
496,497,-1.041973,Female,0.796904,-0.742429,-0.443594,-1.278071,Almost wide majority technology positive parti...,-1.027510,-1.204219,...,-0.589006,-0.403906,-0.754115,0.314603,0.798180,-0.401565,1.917872,0.000000,-0.558704,0.278445
497,498,1.159999,Other,0.095576,0.655743,1.133908,-1.518808,Property answer method call law dream maybe mo...,1.402679,-0.033976,...,-0.589006,-0.403906,-0.754115,1.588991,-0.409189,-0.401565,-0.739343,0.000000,-0.558704,0.803899
498,499,-1.041973,Female,-0.703160,0.655743,0.660658,-1.639176,Care can now outside real rest that perform.,-0.042298,-0.534422,...,-0.589006,-0.403906,-0.754115,0.739399,-0.409189,-0.401565,-0.739343,0.000000,-0.558704,1.504504


In [13]:
import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nrclex import NRCLex


In [17]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess(text):
    if pd.isnull(text):  # handle missing values
        return ""
    tokens = nltk.word_tokenize(text.lower())
    tokens = [t for t in tokens if t.isalpha()]  # keep words only
    tokens = [t for t in tokens if t not in stop_words]
    lemmas = [lemmatizer.lemmatize(t) for t in tokens]
    return " ".join(lemmas)

# Apply to the column
df['Daily_Reflections_Cleaned'] = df['Daily_Reflections'].apply(preprocess)

print(df[['Daily_Reflections', 'Daily_Reflections_Cleaned']])


                                     Daily_Reflections  \
0    Onto foreign do environmental anyone every nea...   
1    Party but others visit admit industry country ...   
2    Religious sure wait do chance decade according...   
3              A task effect entire coach join series.   
4    Knowledge several camera wait week write quali...   
..                                                 ...   
495  Land floor page trade social away animal cut e...   
496  Almost wide majority technology positive parti...   
497  Property answer method call law dream maybe mo...   
498       Care can now outside real rest that perform.   
499  Get turn Congress list mouth city decision eas...   

                             Daily_Reflections_Cleaned  
0    onto foreign environmental anyone every nearly...  
1            party others visit admit industry country  
2    religious sure wait chance decade according wa...  
3                 task effect entire coach join series  
4    knowledge sev

In [19]:
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(processed_texts)

print(pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out()))


     ability  able  about  above  accept  according  account  across  \
0        0.0   0.0    0.0    0.0     0.0   0.000000      0.0     0.0   
1        0.0   0.0    0.0    0.0     0.0   0.000000      0.0     0.0   
2        0.0   0.0    0.0    0.0     0.0   0.276562      0.0     0.0   
3        0.0   0.0    0.0    0.0     0.0   0.000000      0.0     0.0   
4        0.0   0.0    0.0    0.0     0.0   0.000000      0.0     0.0   
..       ...   ...    ...    ...     ...        ...      ...     ...   
495      0.0   0.0    0.0    0.0     0.0   0.000000      0.0     0.0   
496      0.0   0.0    0.0    0.0     0.0   0.000000      0.0     0.0   
497      0.0   0.0    0.0    0.0     0.0   0.000000      0.0     0.0   
498      0.0   0.0    0.0    0.0     0.0   0.000000      0.0     0.0   
499      0.0   0.0    0.0    0.0     0.0   0.000000      0.0     0.0   

          act  action  ...  wrong  yard  yeah  year  yes  yet  you  young  \
0    0.000000     0.0  ...    0.0   0.0   0.0   0.0  0.0  