In [20]:
import pandas as pd
import numpy as np

In [21]:
# Load the dataset from the working directory, explicitly requesting pandas'
# pure-Python parser engine.
df = pd.read_csv(
    'mental_health_dataset.csv',
    engine='python',
)

In [22]:
from sklearn.preprocessing import LabelEncoder

# Replace the categorical 'Gender' values with integer codes. The fitted
# encoder stays available afterwards as le_gender.
le_gender = LabelEncoder()
le_gender.fit(df['Gender'])
df['Gender'] = le_gender.transform(df['Gender'])

# Remove the columns listed below from df in place.
df.drop(
    columns=[
        'Mood_Description', 'Sentiment_Score', 'Student_ID',
        'Stress_Level', 'Anxiety_Score', 'Depression_Score',
    ],
    inplace=True,
)



In [23]:
from nrclex import NRCLex
import nltk
from nltk.corpus import stopwords
import plotly.express as px


In [37]:
# Convert 'Daily_Reflections' into numerical emotion vectors

from collections import defaultdict

def get_emotion_vector(text):
    """Turn one reflection into a fixed-length emotion vector.

    Parameters
    ----------
    text : str or missing value
        The free-text reflection to score.

    Returns
    -------
    dict
        Maps each of the ten emotion labels below to a float. For text with
        lexicon hits, each value is that emotion's raw NRCLex score divided
        by the sum of all raw scores. Missing, empty, or whitespace-only
        text, and text with no hits, yields 0.0 for every label.
    """
    all_emotions = ['fear', 'anger', 'anticipation', 'trust', 'surprise',
                    'sadness', 'joy', 'disgust', 'negative', 'positive']

    if pd.isna(text) or not str(text).strip():
        # Return the full zero vector rather than an empty mapping: an empty
        # mapping becomes a row of NaNs once the vectors are expanded into a
        # DataFrame, whereas text without lexicon hits produces zeros.
        return {emotion: 0.0 for emotion in all_emotions}

    # Coerce to str so the scorer sees the same value the guard just checked.
    doc = NRCLex(str(text))
    emotion_scores = doc.raw_emotion_scores
    total = sum(emotion_scores.values()) or 1  # avoid division by zero

    # Normalize so that the reported scores sum to 1 when anything was found.
    normalized = {emotion: count / total for emotion, count in emotion_scores.items()}

    # Emit every label in a fixed order, defaulting absent ones to 0.0.
    return {emotion: normalized.get(emotion, 0.0) for emotion in all_emotions}

# Score every reflection; the result is a Series holding one emotion dict per row.
emotion_vectors = df['Daily_Reflections'].apply(get_emotion_vector)

# Expand the dicts into one column per emotion. Reuse df's index so that the
# column-wise concat below (which aligns on index labels) pairs each vector
# with its own row even if df does not carry a default 0..n-1 index.
emotion_df = pd.DataFrame(emotion_vectors.tolist(), index=df.index)

# Append the emotion columns to the original columns.
df_with_emotions = pd.concat([df, emotion_df], axis=1)

print(df_with_emotions)


     Age  Gender   GPA                                  Daily_Reflections  \
0     23       2  2.52  Onto foreign do environmental anyone every nea...   
1     19       1  2.74  Party but others visit admit industry country ...   
2     21       0  3.53  Religious sure wait do chance decade according...   
3     18       1  2.04            A task effect entire coach join series.   
4     19       2  2.87  Knowledge several camera wait week write quali...   
..   ...     ...   ...                                                ...   
495   20       2  3.34  Land floor page trade social away animal cut e...   
496   18       0  3.22  Almost wide majority technology positive parti...   
497   23       2  2.86  Property answer method call law dream maybe mo...   
498   18       0  2.45       Care can now outside real rest that perform.   
499   18       2  3.52  Get turn Congress list mouth city decision eas...   

     Sleep_Hours  Steps_Per_Day  Mental_Health_Status  \
0            6.8  

In [28]:
# Columns passed to the scaler in a later cell.
# NOTE(review): get_emotion_vector also emits 'disgust', 'negative' and
# 'positive', which are not listed here, and 'Gender' holds label-encoded
# codes rather than a continuous measure — confirm both are intentional.
numerical_cols = [
    'Age', 'Gender', 'GPA', 'Sleep_Hours', 'Steps_Per_Day',
    'fear', 'anger', 'anticipation', 'trust',
    'surprise', 'sadness', 'joy',
]

In [29]:
# Bare last expression: the notebook renders the combined frame as this cell's output.
df_with_emotions

Unnamed: 0,Age,Gender,GPA,Daily_Reflections,Sleep_Hours,Steps_Per_Day,Mental_Health_Status,fear,anger,anticipation,trust,surprise,sadness,joy,disgust,negative,positive
0,23,2,2.52,Onto foreign do environmental anyone every nea...,6.8,4166,2,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1.000000,0.000000
1,19,1,2.74,Party but others visit admit industry country ...,5.1,4949,2,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,1.000000
2,21,0,3.53,Religious sure wait do chance decade according...,8.3,7632,2,0.0,0.000000,0.250000,0.000000,0.250000,0.0,0.000000,0.000000,0.250000,0.250000
3,18,1,2.04,A task effect entire coach join series.,8.2,5548,2,0.0,0.000000,0.000000,0.500000,0.000000,0.0,0.000000,0.000000,0.000000,0.500000
4,19,2,2.87,Knowledge several camera wait week write quali...,5.9,3698,0,0.0,0.076923,0.153846,0.153846,0.076923,0.0,0.153846,0.076923,0.153846,0.153846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,20,2,3.34,Land floor page trade social away animal cut e...,5.4,8725,2,0.0,0.000000,0.000000,1.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
496,18,0,3.22,Almost wide majority technology positive parti...,4.5,3692,0,0.0,0.000000,0.000000,0.250000,0.125000,0.0,0.250000,0.000000,0.000000,0.375000
497,23,2,2.86,Property answer method call law dream maybe mo...,8.2,6000,2,0.0,0.000000,0.000000,0.500000,0.000000,0.0,0.000000,0.000000,0.000000,0.500000
498,18,0,2.45,Care can now outside real rest that perform.,6.0,5013,1,0.0,0.000000,0.000000,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.666667


In [30]:
# Standardize the columns named in numerical_cols and write the scaled values
# back over the originals in df_with_emotions.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_with_emotions[numerical_cols])
df_with_emotions[numerical_cols] = scaled_values


In [32]:
# Write the frame to a CSV file in the working directory, omitting the index.
df_with_emotions.to_csv(
    "df_with_emotions.csv",
    index=False,
)

### Optional: Alternative methods for converting text data into numerical features

In [33]:
import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nrclex import NRCLex


In [34]:
# Shared resources used by preprocess(): the English stop-word set and a
# WordNet lemmatizer.
# NOTE(review): this presumably needs the NLTK 'stopwords' corpus (and
# 'wordnet' / tokenizer data for the calls made later) to be downloaded
# already; no download step is visible in this notebook — confirm.
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Return `text` as a space-joined string of lowercase, lemmatized words.

    Missing values yield an empty string. Otherwise the text is lowercased and
    tokenized with NLTK; tokens that are not purely alphabetic or that appear
    in `stop_words` are discarded, and the remaining tokens are lemmatized.
    """
    if pd.isnull(text):
        return ""

    kept = []
    for token in nltk.word_tokenize(text.lower()):
        if not token.isalpha():
            continue  # drop punctuation, numbers and mixed tokens
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Clean every reflection and store the result next to the raw text.
cleaned_reflections = df['Daily_Reflections'].apply(preprocess)
df['Daily_Reflections_Cleaned'] = cleaned_reflections

# Show raw and cleaned text side by side.
comparison_columns = ['Daily_Reflections', 'Daily_Reflections_Cleaned']
print(df[comparison_columns])


                                     Daily_Reflections  \
0    Onto foreign do environmental anyone every nea...   
1    Party but others visit admit industry country ...   
2    Religious sure wait do chance decade according...   
3              A task effect entire coach join series.   
4    Knowledge several camera wait week write quali...   
..                                                 ...   
495  Land floor page trade social away animal cut e...   
496  Almost wide majority technology positive parti...   
497  Property answer method call law dream maybe mo...   
498       Care can now outside real rest that perform.   
499  Get turn Congress list mouth city decision eas...   

                             Daily_Reflections_Cleaned  
0    onto foreign environmental anyone every nearly...  
1            party others visit admit industry country  
2    religious sure wait chance decade according wa...  
3                 task effect entire coach join series  
4    knowledge sev

In [36]:
# Fit a TF-IDF vectorizer on the cleaned reflections and keep the resulting matrix.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['Daily_Reflections_Cleaned'])

# Print the matrix as a dense frame with one column per feature name.
dense_weights = X_tfidf.toarray()
feature_names = tfidf.get_feature_names_out()
tfidf_frame = pd.DataFrame(dense_weights, columns=feature_names)
print(tfidf_frame)


     ability  able  accept  according  account  across       act  action  \
0        0.0   0.0     0.0   0.000000      0.0     0.0  0.000000     0.0   
1        0.0   0.0     0.0   0.000000      0.0     0.0  0.000000     0.0   
2        0.0   0.0     0.0   0.288305      0.0     0.0  0.309231     0.0   
3        0.0   0.0     0.0   0.000000      0.0     0.0  0.000000     0.0   
4        0.0   0.0     0.0   0.000000      0.0     0.0  0.000000     0.0   
..       ...   ...     ...        ...      ...     ...       ...     ...   
495      0.0   0.0     0.0   0.000000      0.0     0.0  0.000000     0.0   
496      0.0   0.0     0.0   0.000000      0.0     0.0  0.000000     0.0   
497      0.0   0.0     0.0   0.000000      0.0     0.0  0.000000     0.0   
498      0.0   0.0     0.0   0.000000      0.0     0.0  0.000000     0.0   
499      0.0   0.0     0.0   0.000000      0.0     0.0  0.000000     0.0   

     activity  actually  ...  would     write  writer  wrong  yard  yeah  \
0         0