# ETL Script

## 1. Extract

In [1]:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

In [2]:
# SQLAlchemy engine for the MySQL database (connection URL is hard-coded here).
engine = create_engine(
    "mysql+mysqlconnector://user:password@localhost:3306/your_db"
)

# Source datasets are loaded into df1..df6, grouped below by file format.

# --- JSON sources ---
# df1: Indicators of Anxiety or Depression Based on Reported Frequency of
#      Symptoms During Last 7 Days
df1 = pd.read_json(
    "datasets/cdc-indicators-of-anxiety-or-depression.json"
)
# df4: Student Performance and Behavior Dataset
df4 = pd.read_json(
    "datasets/kaggle-student-performance-and-behavior-dataset.json"
)

# --- CSV sources ---
# df2: Student Depression Dataset
df2 = pd.read_csv(
    "datasets/kaggle-student-depression-dataset.csv"
)
# df5: Students Social Media Addiction Dataset
df5 = pd.read_csv(
    "datasets/kaggle-students-social-media-addiction.csv"
)
# df6: PHQ-9 Student Depression Dataset
df6 = pd.read_csv(
    "datasets/mendeley-phq9-student-depression-dataset.csv"
)

# --- Excel source ---
# df3: Student Mental Health Crisis After COVID-19
df3 = pd.read_excel(
    "datasets/kaggle-student-mental-health-crisis-after-covid19-final.xlsx"
)

## 2. Transform

### Data Preprocessing and Cleaning

In [3]:
# Give every DataFrame a consistent snake_case header.
# NOTE: names are assigned positionally, so each list must match the column
# order (and count) of the corresponding source file.

df1.columns = [
    "indicator",
    "grp",
    "state",
    "subgroup",
    "phase",
    "time_period",
    "time_period_label",
    "time_period_start_date",
    "time_period_end_date",
    "value",
    "low_CI",
    "high_CI",
    "confidence_interval",
    "quartile_range",
]

# df2 and df3 share the same columns; df2 additionally starts with an "id".
student_survey_columns = [
    "gender",
    "age",
    "city",
    "profession",
    "academic_pressure",
    "work_pressure",
    "cgpa",
    "study_satisfaction",
    "job_satisfaction",
    "sleep_duration",
    "dietary_habits",
    "degree",
    "has_suicidal_thoughts",
    "work_study_hours",
    "financial_stress",
    "has_family_mental_illness",
    "has_depression",
]
df2.columns = ["id", *student_survey_columns]
df3.columns = list(student_survey_columns)

df4.columns = [
    "student_id",
    "first_name",
    "last_name",
    "email",
    "gender",
    "age",
    "department",
    "attendance",
    "midterm_score",
    "final_score",
    "assignments_ave",
    "quizzes_ave",
    "participation_score",
    "projects_score",
    "total_score",
    "grade",
    "study_hours_per_week",
    "has_extracurricular",
    "has_internet_access",
    "parent_education_level",
    "family_income_level",
    "stress_level",
    "sleep_hours",
]

df5.columns = [
    "student_id",
    "age",
    "gender",
    "academic_level",
    "country",
    "ave_daily_usage_hours",
    "most_used_platform",
    "affects_academic_performance",
    "sleep_hours",
    "mental_health_score",
    "relationship_status",
    "conflicts_over_social_media",
    "addicted_score",
]

df6.columns = [
    "age",
    "gender",
    "interest_loss",
    "depressed_mood",
    "sleep_trouble",
    "fatigue",
    "appetite_change",
    "guilt_failure",
    "concentration",
    "fidgety_restless",
    "suicidal_thoughts",
    "phq9_score",
    "depression_level",
]

In [4]:
# Keep only the DF1 rows grouped "By Age" or "By Sex", then derive one
# DataFrame per grouping with "subgroup" renamed to what it represents.
df1 = df1[df1["grp"].isin(["By Age", "By Sex"])]

age_rows = df1["grp"] == "By Age"
sex_rows = df1["grp"] == "By Sex"

df1_age = df1[age_rows].copy().rename(columns={"subgroup": "age"})
df1_sex = df1[sex_rows].copy().rename(columns={"subgroup": "gender"})

In [5]:
# Remove the columns that are not carried forward.

# Both DF1 splits drop the same set of columns.
df1_unused_columns = [
    "grp",
    "state",
    "phase",
    "time_period",
    "time_period_label",
    "time_period_start_date",
    "time_period_end_date",
    "quartile_range",
]
df1_age = df1_age.drop(columns=df1_unused_columns)
df1_sex = df1_sex.drop(columns=df1_unused_columns)

# DF2 and DF3 drop the same columns; DF2 additionally drops its "id".
survey_unused_columns = ["city", "profession", "work_pressure", "job_satisfaction"]
df2 = df2.drop(columns=["id", *survey_unused_columns])
df3 = df3.drop(columns=survey_unused_columns)

# Identifier / personal columns and the letter grade in DF4.
df4 = df4.drop(
    columns=["student_id", "first_name", "last_name", "email", "grade"]
)

# Identifier and country in DF5.
df5 = df5.drop(columns=["student_id", "country"])

In [6]:
# Exclude the age groups of 60 years and above from the age split.
excluded_age_groups = ['60 - 69 years', '70 - 79 years', '80 years and above']
in_excluded_group = df1_age["age"].isin(excluded_age_groups)
df1_age = df1_age[~in_excluded_group]

# DF4 rows without an attendance or assignment average are discarded.
df4 = df4.dropna(subset=["attendance", "assignments_ave"])

# Missing financial stress values in DF2 are replaced by the column mean.
mean_financial_stress = df2["financial_stress"].mean()
df2["financial_stress"] = df2["financial_stress"].fillna(mean_financial_stress)

# Missing parent education levels in DF4 default to the string "None".
parent_education = df4["parent_education_level"]
df4["parent_education_level"] = parent_education.fillna("None")

### Converting Data Types and Standardization

In [7]:
# Make DF2 ages integers; missing ages are filled with the median first,
# because NaN cannot be cast to int.
median_age = df2["age"].median()
df2["age"] = df2["age"].fillna(median_age).astype(int)

In [8]:
# Convert boolean columns from 'Yes'/'No' to 1/0.
# Keys are the DataFrame variable names; values are the columns to convert.
bool_columns = {
    'df2': ["has_suicidal_thoughts", "has_family_mental_illness"],
    'df3': ["has_suicidal_thoughts", "has_family_mental_illness", "has_depression"],
    'df4': ["has_extracurricular", "has_internet_access"],
    'df5': ["affects_academic_performance"],
}

# Explicit name -> DataFrame lookup. This replaces a globals() lookup, which
# only works while this code runs at module/notebook top level and fails far
# from the cause if a key in bool_columns is mistyped.
bool_frames = {'df2': df2, 'df3': df3, 'df4': df4, 'df5': df5}
yes_no_map = {'Yes': 1, 'No': 0}

for df_name, columns in bool_columns.items():
    df = bool_frames[df_name]
    for col in columns:
        # Series.map leaves NaN for any value that is not exactly 'Yes'/'No'.
        df[col] = df[col].map(yes_no_map)

In [9]:
# Replace the sleep-duration categories in DF2 and DF3 with a representative
# number of hours per category. 'Others' has no numeric equivalent and is
# mapped to a missing value.
sleep_map = {
    'Less than 5 hours': 4.5,
    '5-6 hours': 5.5,
    '7-8 hours': 7.5,
    'More than 8 hours': 9,
    'Others': None
}
for survey_df in (df2, df3):
    survey_df['sleep_duration'] = survey_df['sleep_duration'].map(sleep_map)

In [10]:
# Encode DF5 academic level as an ordinal: the position in this list is the
# numeric code (High School -> 0, Undergraduate -> 1, Graduate -> 2).
academic_levels = ['High School', 'Undergraduate', 'Graduate']
academic_map = {level: code for code, level in enumerate(academic_levels)}
df5['academic_level'] = df5['academic_level'].map(academic_map)

In [11]:
# Encode the DF6 survey answers and the depression level as integers.

# Survey item columns that hold a frequency answer.
survey_cols = [
    'interest_loss',
    'depressed_mood',
    'sleep_trouble',
    'fatigue',
    'appetite_change',
    'guilt_failure',
    'concentration',
    'fidgety_restless',
    'suicidal_thoughts'
]

# Frequency answer -> score (0-3).
mapping = {
    'Not at all': 0,
    'Several days': 1,
    'More than half the days': 2,
    'Nearly every day': 3
}

# Depression level label -> ordinal (0-4).
depression_mapping = {
    'Minimal': 0,
    'Mild': 1,
    'Moderate': 2,
    'Moderately Severe': 3,
    'Severe': 4
}

# Answers not present in the mapping become NaN.
for item_col in survey_cols:
    df6[item_col] = df6[item_col].map(mapping)

df6['depression_level'] = df6['depression_level'].map(depression_mapping)

In [12]:
# Replace dietary_habits in DF2 and DF3 with one indicator column per
# category (named "diet_<category>"), appended after the remaining columns.
df2_diet_dummies = pd.get_dummies(df2['dietary_habits'], prefix='diet')
df2 = pd.concat(
    [df2.drop(columns=['dietary_habits']), df2_diet_dummies],
    axis=1,
)

df3_diet_dummies = pd.get_dummies(df3['dietary_habits'], prefix='diet')
df3 = pd.concat(
    [df3.drop(columns=['dietary_habits']), df3_diet_dummies],
    axis=1,
)

In [13]:
# Replace most_used_platform and relationship_status in DF5 with indicator
# columns ("platform_<value>" and "relationship_<value>"), appended in that
# order after the remaining columns.
platform_dummies = pd.get_dummies(df5['most_used_platform'], prefix='platform')
relationship_dummies = pd.get_dummies(df5['relationship_status'], prefix='relationship')
df5 = pd.concat(
    [
        df5.drop(columns=['most_used_platform', 'relationship_status']),
        platform_dummies,
        relationship_dummies,
    ],
    axis=1,
)

### Aggregate to Fact Table