In [252]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder ## coding the categorical variables
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
import warnings
from sklearn.exceptions import ConvergenceWarning


In [253]:
## cargar datos
data = '../data/raw/Student Depression Dataset.csv'
datos = pd.read_csv(data)

In [254]:
## chequear datos
datos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27901 non-null  object 
 11  Di

In [255]:
## estadisticas descriptivas de los datos
datos.describe()

Unnamed: 0,id,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Work/Study Hours,Financial Stress,Depression
count,27901.0,27901.0,27901.0,27901.0,27901.0,27901.0,27901.0,27901.0,27898.0,27901.0
mean,70442.149421,25.8223,3.141214,0.00043,7.656104,2.943837,0.000681,7.156984,3.139867,0.585499
std,40641.175216,4.905687,1.381465,0.043992,1.470707,1.361148,0.044394,3.707642,1.437347,0.492645
min,2.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,35039.0,21.0,2.0,0.0,6.29,2.0,0.0,4.0,2.0,0.0
50%,70684.0,25.0,3.0,0.0,7.77,3.0,0.0,8.0,3.0,1.0
75%,105818.0,30.0,4.0,0.0,8.92,4.0,0.0,10.0,4.0,1.0
max,140699.0,59.0,5.0,5.0,10.0,5.0,4.0,12.0,5.0,1.0


In [256]:
## mostrar los primeros 10 datos
datos.head(10)

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0
5,33,Male,29.0,Pune,Student,2.0,0.0,5.7,3.0,0.0,Less than 5 hours,Healthy,PhD,No,4.0,1.0,No,0
6,52,Male,30.0,Thane,Student,3.0,0.0,9.54,4.0,0.0,7-8 hours,Healthy,BSc,No,1.0,2.0,No,0
7,56,Female,30.0,Chennai,Student,2.0,0.0,8.04,4.0,0.0,Less than 5 hours,Unhealthy,Class 12,No,0.0,1.0,Yes,0
8,59,Male,28.0,Nagpur,Student,3.0,0.0,9.79,1.0,0.0,7-8 hours,Moderate,B.Ed,Yes,12.0,3.0,No,1
9,62,Male,31.0,Nashik,Student,2.0,0.0,8.38,3.0,0.0,Less than 5 hours,Moderate,LLB,Yes,2.0,5.0,No,1


In [257]:
## Agrupacion de datos si esta o no deprimido
agrupados_depresion = datos.groupby('Depression')
print(agrupados_depresion.size())

Depression
0    11565
1    16336
dtype: int64


In [258]:
##verificar si hay datos nulos
print(datos.isnull().sum())

id                                       0
Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         3
Family History of Mental Illness         0
Depression                               0
dtype: int64


In [259]:
## Rellenar los datos nulos con la media
datos.loc[:, 'Financial Stress'] = datos['Financial Stress'].fillna(datos['Financial Stress'].mean())
print(datos.isnull().sum())

id                                       0
Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64


In [260]:
##codificar las variables categoricas
print(datos['Dietary Habits'].value_counts())


Dietary Habits
Unhealthy    10317
Moderate      9921
Healthy       7651
Others          12
Name: count, dtype: int64


In [261]:
## columnas que tienen datos numericos
numerical_columns = ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress']

In [262]:
# Definicion de columnas nominales y ordinales
nominal_columns = ['Gender', 'City', 'Profession', 'Degree']
ordinal_columns = {
    'Sleep Duration': ['Less than 5 hours', '5-6 hours',  '7-8 hours', 'More than 8 hours', 'Others'],  # Example levels
    'Have you ever had suicidal thoughts ?': ['No', 'Yes'],  # Example levels
    'Family History of Mental Illness': ['No', 'Yes'],  # Example levels
    'Dietary Habits': ['Unhealthy', 'Moderate', 'Healthy', 'Others']  # Example levels
}

In [263]:
# Codificacion de columnas nominales utilizando LabelEncoder
label_encoders = {}
for column in nominal_columns:
    le = LabelEncoder()
    datos[column] = le.fit_transform(datos[column])
    label_encoders[column] = le

In [264]:
# Codificacion de columnas ordinales utilizando OrdinalEncoder
ordinal_encoder = OrdinalEncoder(categories=[ordinal_columns[col] for col in ordinal_columns])
datos[list(ordinal_columns.keys())] = ordinal_encoder.fit_transform(datos[list(ordinal_columns.keys())])

In [265]:
## ver los datos codificados
datos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  int64  
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  int64  
 4   Profession                             27901 non-null  int64  
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27901 non-null  float64
 11  Di

In [266]:
# limitar los datos atipicos
for column in numerical_columns:
    q1 = datos[column].quantile(0.25)
    q3 = datos[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    datos[column] = datos[column].clip(lower_bound, upper_bound)

In [267]:
# Separar datos en X y y
x = datos.drop(['Depression', 'id'], axis=1)
y = datos['Depression']

In [268]:
# Entrenar el modelo Random Forest classifier
model = RandomForestClassifier(n_estimators=100, random_state=27)
model.fit(x, y)

In [269]:
# Obtener la importancia de las caracteristicas
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

In [270]:
# Crear un DataFrame para las importancias de las características
feature_importances = pd.DataFrame({
    'Feature': x.columns[indices],
    'Importance': importances[indices]
})

In [271]:
# Imprimir el ranking de las características
print("Feature ranking:")
print(feature_importances)

Feature ranking:
                                  Feature  Importance
0   Have you ever had suicidal thoughts ?    0.224825
1                       Academic Pressure    0.164808
2                        Financial Stress    0.092955
3                                    CGPA    0.087139
4                                     Age    0.084018
5                                    City    0.072452
6                        Work/Study Hours    0.069495
7                                  Degree    0.062749
8                      Study Satisfaction    0.042480
9                          Dietary Habits    0.035201
10                         Sleep Duration    0.032666
11                                 Gender    0.015772
12       Family History of Mental Illness    0.015132
13                             Profession    0.000309
14                       Job Satisfaction    0.000000
15                          Work Pressure    0.000000


In [272]:
## normalizacion de los datos
scaler = StandardScaler()
datos[numerical_columns] = scaler.fit_transform(datos[numerical_columns])

datos.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,1,1.468036,51,11,1.345543,0.0,0.89491,-0.693425,0.0,1.0,2.0,3,1.0,-1.121213,-1.488868,0.0,1
1,8,0,-0.371929,3,11,-0.826104,0.0,-1.197308,1.510636,0.0,1.0,1.0,10,0.0,-1.121213,-0.793092,1.0,0
2,26,1,1.059155,44,11,-0.102222,0.0,-0.427208,1.510636,0.0,0.0,2.0,5,0.0,0.497095,-1.488868,1.0,0
3,30,0,0.445833,49,11,-0.102222,0.0,-1.408574,-0.693425,0.0,2.0,1.0,7,1.0,-0.851495,1.294237,1.0,1
4,32,0,-0.167489,16,11,0.62166,0.0,0.322447,0.041262,0.0,1.0,1.0,17,1.0,-1.66065,-1.488868,0.0,0


In [273]:
# Seleccion de las características principales según su importancia
top_features = [x.columns[indices[i]] for i in range(11)]  # Select top 10 features
X_top = x[top_features]

In [274]:
# Eliminar la columna 'id' si está presente
if 'id' in X_top.columns:
    X_top = X_top.drop('id', axis=1)


In [275]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=27)

In [276]:
## predefinir la misma semilla para todos los modelos
random = 27

In [277]:
## definir los modelos a utilizar
models = {
    "Logistic Regression": LogisticRegression(random_state=random),
    "Stochastic Gradient Descent (SGD)": SGDClassifier(random_state=random),
    "Decision Tree": DecisionTreeClassifier(random_state=random),
    "Random Forest": RandomForestClassifier(random_state=random),
    "Gradient Boosting": GradientBoostingClassifier(random_state=random),
    "SVM": SVC(random_state=random),  # Not always necessary but included if needed
    "K-Nearest Neighbors (KNN)": KNeighborsClassifier(),  # No random_state parameter
    "Naive Bayes": GaussianNB(),  # No random_state parameter
}

In [278]:
# Lista vacia para almacenar los resultados
results = []

warnings.simplefilter("ignore", ConvergenceWarning)
# Entrenar y evaluar modelos
for name, model in models.items():
    model.fit(X_train, y_train)  # entrenar modelo
    y_pred = model.predict(X_test)  # Hacer predicciones
    
    # Calcular precisión
    accuracy = accuracy_score(y_test, y_pred)
    
    # Almacenar resultados en en la lista results
    results.append({
        "Model": name,
        "Accuracy": round(accuracy,2),
        "Precision": round(classification_report(y_test, y_pred, output_dict=True)['weighted avg']['precision'], 2),
        "Recall": round(classification_report(y_test, y_pred, output_dict=True)['weighted avg']['recall'],2),
        "F1-Score": round(classification_report(y_test, y_pred, output_dict=True)['weighted avg']['f1-score'],2)
    })

# Crear un DataFrame
results_df = pd.DataFrame(results)

In [280]:
# Ver los resultados
results_df = results_df.sort_values(by='Accuracy', ascending=False)
print(results_df)

                               Model  Accuracy  Precision  Recall  F1-Score
0                Logistic Regression      0.85       0.85    0.85      0.85
3                      Random Forest      0.85       0.85    0.85      0.85
5                                SVM      0.85       0.85    0.85      0.85
4                  Gradient Boosting      0.85       0.85    0.85      0.85
7                        Naive Bayes      0.84       0.84    0.84      0.84
2                      Decision Tree      0.77       0.77    0.77      0.77
6          K-Nearest Neighbors (KNN)      0.75       0.75    0.75      0.75
1  Stochastic Gradient Descent (SGD)      0.70       0.80    0.70      0.70
