### Extract

In [1]:
import os

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

# SQLAlchemy engine for the MySQL database.
# The connection URL is taken from the MENTAL_HEALTH_DB_URL environment
# variable so that real credentials never have to be written into the
# notebook; the placeholder URL below is only a fallback when it is unset.
DB_URL = os.environ.get(
    "MENTAL_HEALTH_DB_URL",
    "mysql+mysqlconnector://user:password@localhost:3306/your_db",
)
engine = create_engine(DB_URL)

# Read the six source datasets (paths are relative to the working directory).
# 1. Indicators of Anxiety or Depression Based on Reported Frequency of Symptoms During Last 7 Days
df1 = pd.read_json("datasets/cdc-indicators-of-anxiety-or-depression.json")
# 2. Student Depression Dataset
df2 = pd.read_csv("datasets/kaggle-student-depression-dataset.csv")
# 3. Student Mental Health Crisis After COVID-19
df3 = pd.read_excel("datasets/kaggle-student-mental-health-crisis-after-covid19-final.xlsx")
# 4. Student Performance and Behavior Dataset
df4 = pd.read_json("datasets/kaggle-student-performance-and-behavior-dataset.json")
# 5. Students Social Media Addiction Dataset
df5 = pd.read_csv("datasets/kaggle-students-social-media-addiction.csv")
# 6. PHQ-9 Student Depression Dataset
df6 = pd.read_csv("datasets/mendeley-phq9-student-depression-dataset.csv")

Rename columns for consistency

In [2]:
# Overwrite each frame's header with the names below. Assignment is
# positional, so every list must match the frame's column count and order.

# CDC indicators
df1.columns = [
    "indicator", "grp", "state", "subgroup", "phase",
    "time_period", "time_period_label",
    "time_period_start_date", "time_period_end_date",
    "value", "low_CI", "high_CI", "confidence_interval", "quartile_range",
]

# df2 and df3 use the same names; df2 has one extra leading "id" column.
student_survey_columns = [
    "gender", "age", "city", "profession",
    "academic_pressure", "work_pressure", "cgpa",
    "study_satisfaction", "job_satisfaction",
    "sleep_duration", "dietary_habits", "degree",
    "has_suicidal_thoughts", "work_study_Hours", "financial_stress",
    "has_mental_illness_family_history", "has_depression",
]
df2.columns = ["id", *student_survey_columns]
df3.columns = student_survey_columns

# Student performance and behavior
df4.columns = [
    "student_id", "first_name", "last_name", "email",
    "gender", "age", "department", "attendance",
    "midterm_score", "final_score", "assignments_ave", "quizzes_ave",
    "participation_score", "projects_score", "total_score", "grade",
    "study_hours_per_week", "has_extracurricular_activities",
    "has_internet_access", "parent_education_level",
    "family_income_level", "stress_level", "sleep_hours",
]

# Social media addiction
df5.columns = [
    "student_id", "age", "gender", "academic_level", "country",
    "ave_daily_usage_hours", "most_used_platform",
    "affects_academic_performance", "sleep_hours", "mental_health_score",
    "relationship_status", "conflicts_over_social_media", "addicted_score",
]

# PHQ-9 questionnaire
df6.columns = [
    "age", "gender",
    "interest_loss", "depressed_mood", "sleep_trouble", "fatigue",
    "appetite_change", "guilt_failure", "concentration",
    "fidgety_restless", "suicidal_thoughts",
    "phq9_score", "depression_level",
]

Split and extract the age and gender subgroups from the CDC dataset.

In [3]:
# Keep only the CDC rows whose "grp" value is "By Age" or "By Sex", then
# build one frame per breakdown with "subgroup" renamed to match it.
df1 = df1.loc[df1["grp"].isin(["By Age", "By Sex"])]

by_age = df1["grp"] == "By Age"
by_sex = df1["grp"] == "By Sex"

df1_age = df1.loc[by_age].rename(columns={"subgroup": "age"})
df1_sex = df1.loc[by_sex].rename(columns={"subgroup": "gender"})

In [4]:
# Show the distinct age values of every dataset.
# The CDC frame holds age-range labels, so it is printed unsorted.
print("DF1: ", df1_age["age"].unique())

# The remaining frames are printed with their unique ages sorted ascending.
labelled_frames = (
    ("DF2: ", df2),
    ("DF3: ", df3),
    ("DF4: ", df4),
    ("DF5: ", df5),
    ("DF6: ", df6),
)
for label, frame in labelled_frames:
    print(label, np.sort(frame["age"].unique()))

DF1:  ['18 - 29 years' '30 - 39 years' '40 - 49 years' '50 - 59 years'
 '60 - 69 years' '70 - 79 years' '80 years and above']
DF2:  [18. 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35.
 36. 37. 38. 39. 41. 42. 43. 44. 46. 48. 49. 51. 54. 56. 58. 59.]
DF3:  [18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34]
DF4:  [18 19 20 21 22 23 24]
DF5:  [18 19 20 21 22 23 24]
DF6:  [18 19 20 21 22 23 24 25 26]


### Transform

In [5]:
# Drop unnecessary columns
# Both CDC frames lose the same set of columns.
cdc_dropped_columns = [
    "grp", "state", "phase",
    "time_period", "time_period_label",
    "time_period_start_date", "time_period_end_date",
    "quartile_range",
]
df1_age = df1_age.drop(columns=cdc_dropped_columns)
df1_sex = df1_sex.drop(columns=cdc_dropped_columns)

# df2 and df3 lose the same columns; df2 additionally loses its "id" column.
survey_dropped_columns = ["city", "profession", "work_pressure", "job_satisfaction"]
df2 = df2.drop(columns=["id", *survey_dropped_columns])
df3 = df3.drop(columns=survey_dropped_columns)

# Remove the identifier / personal columns and the grade from df4,
# and the identifier and country from df5.
df4 = df4.drop(columns=["student_id", "first_name", "last_name", "email", "grade"])
df5 = df5.drop(columns=["student_id", "country"])

In [6]:
# Missing values:
# - df2: fill gaps in "financial_stress" with the column mean.
financial_stress_mean = df2["financial_stress"].mean()
df2 = df2.assign(
    financial_stress=df2["financial_stress"].fillna(financial_stress_mean)
)

# - df4: discard rows that lack "attendance" or "assignments_ave".
required_df4_columns = ["attendance", "assignments_ave"]
df4 = df4.dropna(subset=required_df4_columns)

In [7]:
# Standardize
# df2 stores age as float: fill any missing ages with the median age,
# then cast the column to int.
median_age = df2["age"].median()
df2["age"] = df2["age"].fillna(median_age).astype(int)

In [8]:
# Encode the questionnaire answers in df6 as numeric scores (0-3).
survey_cols = [
    'interest_loss',
    'depressed_mood',
    'sleep_trouble',
    'fatigue',
    'appetite_change',
    'guilt_failure',
    'concentration',
    'fidgety_restless',
    'suicidal_thoughts'
]

# Answer label -> score.
mapping = {
    'Not at all': 0,
    'Several days': 1,
    'More than half the days': 2,
    'Nearly every day': 3
}

# Series.map turns every value that is not a key of `mapping` into NaN.
# Running the cell a second time would therefore replace the already encoded
# scores with NaN, so columns that are already numeric are left untouched.
for col in survey_cols:
    if not pd.api.types.is_numeric_dtype(df6[col]):
        df6[col] = df6[col].map(mapping)


In [9]:
# Severity label -> ordinal code (0 = Minimal ... 4 = Severe).
depression_mapping = {
    'Minimal': 0,
    'Mild': 1,
    'Moderate': 2,
    'Moderately Severe': 3,
    'Severe': 4
}

# Series.map yields NaN for any value that is not a key of the mapping, so
# re-running this cell on the already encoded column would erase it.
# Encode only while the column is not numeric yet.
if not pd.api.types.is_numeric_dtype(df6['depression_level']):
    df6['depression_level'] = df6['depression_level'].map(depression_mapping)