# ***1. Conexión al Drive***

In [1]:
# Mount Google Drive at /content/drive when the notebook runs inside Google Colab.
# Later cells read and write under /content/drive, so they depend on this mount.
try:
  from google.colab import drive
  drive.mount('/content/drive')
  print("Drive montado correctamente.")
except ModuleNotFoundError:
  # Reached when a module cannot be imported; the expected case is that
  # google.colab is unavailable because the notebook is not running in Colab.
  print("No estás en Colab, omitiendo montaje de Drive.")

Mounted at /content/drive
Drive montado correctamente.


# ***2. Cargar funciones de otro notebook***

In [4]:
# Execute the shared helper notebook from Drive; whatever it prints appears below this cell.
# NOTE(review): presumably this loads helper functions into the kernel, but nothing it
# defines is visible from this notebook — confirm what later cells actually need from it.
# NOTE(review): the recorded output reports a missing kaggle.json and then a finished
# download — verify that the dataset download really succeeded.
%run "/content/drive/MyDrive/cod/Health/HealthProjectG8/a_funciones.ipynb"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive montado correctamente.
Error: El archivo kaggle.json no se encontró en /content/drive/MyDrive/UdeA/Salud/kaggle.json
Descargando dataset de Kaggle...
Descarga finalizada.


# ***3. Librerías***

In [5]:
# --- Importar Librerias ---
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os # Asegurar que os se importa si se usa internamente y no es globalmente accesible
from IPython.display import display # Asegurar que display se importa si se usa internamente
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# ***4. Cargar dataset original***

In [6]:
# Load the raw dataset from the project's data folder on the mounted Drive.
# The path is absolute under /content/drive, so it requires the Drive mount to have succeeded.
df = pd.read_csv('/content/drive/MyDrive/cod/Health/HealthProjectG8/data/mental_health_dataset.csv')
print("Archivo cargado correctamente")

Archivo cargado correctamente


## ***4.1 Crear copia del dataset***

In [7]:
# Keep an independent deep copy of the raw data before later cells modify df.
df_original = df.copy(deep=True)
print("Copia del dataset original creada.")

Copia del dataset original creada.


## ***4.2 Guardar la copia en Drive***

In [9]:
# Persist the untouched copy in the same data folder as the source CSV;
# index=False keeps the row index out of the file.
df_original.to_csv('/content/drive/MyDrive/cod/Health/HealthProjectG8/data/mental_health_dataset_original.csv', index=False)


## ***4.3 Verificar columnas del dataset***

In [10]:
# List the column names to check the schema before any transformation is applied.
print(df.columns)


Index(['age', 'gender', 'employment_status', 'work_environment',
       'mental_health_history', 'seeks_treatment', 'stress_level',
       'sleep_hours', 'physical_activity_days', 'depression_score',
       'anxiety_score', 'social_support_score', 'productivity_score',
       'mental_health_risk'],
      dtype='object')


# ***5. Crear variable objetivo binaria***

In [11]:
def crear_variable_objetivo_categorica(df, columna='mental_health_risk', nuevo_nombre='target'):
    """
    Replace a risk-level column with a binary target column, in place.

    'High' is encoded as 1 and 'Medium'/'Low' as 0. Missing values stay
    missing. The source column is removed once the target has been created.

    Args:
        df: DataFrame holding the risk column. It is modified in place
            (the notebook calls this function without assigning the result)
            and is also returned.
        columna: Name of the column with the 'High'/'Medium'/'Low' labels.
        nuevo_nombre: Name of the binary column to create.

    Returns:
        The same DataFrame object, with `nuevo_nombre` added and `columna`
        dropped.

    Raises:
        KeyError: If `columna` is not in `df`, e.g. when the function is run
            a second time on the same frame.
        ValueError: If `columna` contains a label other than 'High',
            'Medium' or 'Low'. `df` is left unmodified in that case.
    """
    mapping = {'High': 1, 'Medium': 0, 'Low': 0}

    etiquetas = df[columna]

    # Series.map turns every label that is missing from the mapping into NaN,
    # which would silently corrupt the target. Reject such labels up front,
    # before df has been touched.
    desconocidas = sorted(set(etiquetas.dropna().unique()) - set(mapping), key=str)
    if desconocidas:
        raise ValueError(
            f"Valores no reconocidos en '{columna}': {desconocidas}. "
            f"Se esperaba uno de {list(mapping)}."
        )

    df[nuevo_nombre] = etiquetas.map(mapping)

    # The raw label column is redundant once the target exists; dropping it
    # in place keeps the in-place contract callers rely on.
    df.drop(columna, axis=1, inplace=True)

    print(f"Variable objetivo '{nuevo_nombre}' creada a partir de '{columna}'.")
    print(df[nuevo_nombre].value_counts())

    return df


In [12]:
# Adds 'target' to df and drops 'mental_health_risk' in place, so running this cell a second
# time raises KeyError; reload df (section 4) before re-running.
# The returned frame is the last expression of the cell, so it is rendered as the output.
crear_variable_objetivo_categorica(df)

Variable objetivo 'target' creada a partir de 'mental_health_risk'.
target
0    7631
1    2369
Name: count, dtype: int64


Unnamed: 0,age,gender,employment_status,work_environment,mental_health_history,seeks_treatment,stress_level,sleep_hours,physical_activity_days,depression_score,anxiety_score,social_support_score,productivity_score,target
0,56,Male,Employed,On-site,Yes,Yes,6,6.2,3,28,17,54,59.7,1
1,46,Female,Student,On-site,No,Yes,10,9.0,4,30,11,85,54.9,1
2,32,Female,Employed,On-site,Yes,No,7,7.7,2,24,7,62,61.3,0
3,60,Non-binary,Self-employed,On-site,No,No,4,4.5,4,6,0,95,97.0,0
4,25,Female,Self-employed,On-site,Yes,Yes,3,5.4,0,24,12,70,69.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,34,Female,Employed,On-site,Yes,Yes,5,6.1,3,9,21,8,90.0,0
9996,47,Male,Employed,On-site,Yes,No,1,5.7,3,5,12,45,90.8,0
9997,56,Female,Employed,On-site,Yes,No,1,8.3,0,1,18,7,99.2,0
9998,24,Male,Employed,On-site,Yes,Yes,9,6.1,0,28,21,32,56.7,1


# ***6. Eliminar duplicados***

In [13]:
def eliminar_duplicados(df):
    """
    Return a copy of the DataFrame without fully duplicated rows.

    The input frame is not modified. The number of removed rows is printed.
    """
    sin_duplicados = df.drop_duplicates()
    eliminados = len(df) - len(sin_duplicados)
    print(f"Duplicados eliminados: {eliminados}")
    return sin_duplicados

In [14]:
# eliminar_duplicados returns a new frame and leaves its argument untouched, so the
# result must be assigned back to df for the de-duplication to take effect.
df = eliminar_duplicados(df)
df

Duplicados eliminados: 0


Unnamed: 0,age,gender,employment_status,work_environment,mental_health_history,seeks_treatment,stress_level,sleep_hours,physical_activity_days,depression_score,anxiety_score,social_support_score,productivity_score,target
0,56,Male,Employed,On-site,Yes,Yes,6,6.2,3,28,17,54,59.7,1
1,46,Female,Student,On-site,No,Yes,10,9.0,4,30,11,85,54.9,1
2,32,Female,Employed,On-site,Yes,No,7,7.7,2,24,7,62,61.3,0
3,60,Non-binary,Self-employed,On-site,No,No,4,4.5,4,6,0,95,97.0,0
4,25,Female,Self-employed,On-site,Yes,Yes,3,5.4,0,24,12,70,69.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,34,Female,Employed,On-site,Yes,Yes,5,6.1,3,9,21,8,90.0,0
9996,47,Male,Employed,On-site,Yes,No,1,5.7,3,5,12,45,90.8,0
9997,56,Female,Employed,On-site,Yes,No,1,8.3,0,1,18,7,99.2,0
9998,24,Male,Employed,On-site,Yes,Yes,9,6.1,0,28,21,32,56.7,1


# ***7. Imputar valores nulos***

In [15]:
def imputar_valores(df):
    """
    Fill missing values column by column, in place.

    Text-like columns (object, string or categorical dtype) receive their
    most frequent value; every other column receives its median. Columns
    without missing values are left untouched. Columns made up only of
    missing values are skipped, because no mode or median exists for them.

    Args:
        df: DataFrame to impute. It is modified in place and also returned.

    Returns:
        The same DataFrame object.
    """
    for col in df.columns:
        serie = df[col]
        n_null = serie.isnull().sum()
        if n_null == 0:
            continue
        if n_null == len(serie):
            # mode() would be empty and median() NaN: nothing to impute with.
            print(f"No se puede imputar '{col}': todos sus valores son nulos")
            continue

        # Checking only `dtype == 'object'` would send category/string columns
        # to median(), which fails on non-numeric data.
        es_categorica = (
            pd.api.types.is_object_dtype(serie)
            or pd.api.types.is_string_dtype(serie)
            or isinstance(serie.dtype, pd.CategoricalDtype)
        )
        if es_categorica:
            valor, estadistico = serie.mode()[0], "moda"
        else:
            valor, estadistico = serie.median(), "mediana"

        # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
        # that chained in-place call is deprecated and does not update df when
        # pandas Copy-on-Write is active.
        df[col] = serie.fillna(valor)
        print(f"Imputados {n_null} valores nulos en '{col}' con la {estadistico}: {valor}")
    return df

In [16]:
# imputar_valores is written to fill df in place and return the same object, so no
# assignment is needed here; the bare call lets the notebook render the returned frame.
imputar_valores(df)

Unnamed: 0,age,gender,employment_status,work_environment,mental_health_history,seeks_treatment,stress_level,sleep_hours,physical_activity_days,depression_score,anxiety_score,social_support_score,productivity_score,target
0,56,Male,Employed,On-site,Yes,Yes,6,6.2,3,28,17,54,59.7,1
1,46,Female,Student,On-site,No,Yes,10,9.0,4,30,11,85,54.9,1
2,32,Female,Employed,On-site,Yes,No,7,7.7,2,24,7,62,61.3,0
3,60,Non-binary,Self-employed,On-site,No,No,4,4.5,4,6,0,95,97.0,0
4,25,Female,Self-employed,On-site,Yes,Yes,3,5.4,0,24,12,70,69.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,34,Female,Employed,On-site,Yes,Yes,5,6.1,3,9,21,8,90.0,0
9996,47,Male,Employed,On-site,Yes,No,1,5.7,3,5,12,45,90.8,0
9997,56,Female,Employed,On-site,Yes,No,1,8.3,0,1,18,7,99.2,0
9998,24,Male,Employed,On-site,Yes,Yes,9,6.1,0,28,21,32,56.7,1


# ***8. Codificar variables categóricas***

In [17]:
def codificar_variables(df, metodo='onehot'):
    """
    Encode the object-dtype (categorical) columns of a DataFrame.

    The input frame is never modified; both methods return a new frame.

    Args:
        df: DataFrame to encode.
        metodo: 'onehot' (default) expands each categorical column into dummy
            columns with pd.get_dummies, dropping the first level of each;
            'label' replaces each categorical column with the integer codes
            produced by LabelEncoder.

    Returns:
        A new DataFrame with the categorical columns encoded.

    Raises:
        ValueError: If `metodo` is neither 'onehot' nor 'label'.
    """
    cat_cols = df.select_dtypes(include=['object']).columns.tolist()
    print(f"Columnas categóricas a codificar: {cat_cols}")
    if metodo == 'onehot':
        # drop_first=True drops one dummy per variable, which is redundant
        # given the remaining ones.
        codificado = pd.get_dummies(df, columns=cat_cols, drop_first=True)
        print("Se aplicó One-Hot Encoding.")
    elif metodo == 'label':
        # Encode on a copy so the caller's frame is left untouched, exactly
        # like the one-hot branch (assigning into df would overwrite the
        # caller's original labels).
        codificado = df.copy()
        for col in cat_cols:
            codificado[col] = LabelEncoder().fit_transform(codificado[col])
        print("Se aplicó Label Encoding.")
    else:
        raise ValueError("Método de codificación no soportado.")
    return codificado

In [18]:
# Preview only: with the default one-hot method the function returns a new frame, and the
# result is not assigned here, so df itself keeps its original categorical columns.
codificar_variables(df)

Columnas categóricas a codificar: ['gender', 'employment_status', 'work_environment', 'mental_health_history', 'seeks_treatment']
Se aplicó One-Hot Encoding.


Unnamed: 0,age,stress_level,sleep_hours,physical_activity_days,depression_score,anxiety_score,social_support_score,productivity_score,target,gender_Male,gender_Non-binary,gender_Prefer not to say,employment_status_Self-employed,employment_status_Student,employment_status_Unemployed,work_environment_On-site,work_environment_Remote,mental_health_history_Yes,seeks_treatment_Yes
0,56,6,6.2,3,28,17,54,59.7,1,True,False,False,False,False,False,True,False,True,True
1,46,10,9.0,4,30,11,85,54.9,1,False,False,False,False,True,False,True,False,False,True
2,32,7,7.7,2,24,7,62,61.3,0,False,False,False,False,False,False,True,False,True,False
3,60,4,4.5,4,6,0,95,97.0,0,False,True,False,True,False,False,True,False,False,False
4,25,3,5.4,0,24,12,70,69.0,1,False,False,False,True,False,False,True,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,34,5,6.1,3,9,21,8,90.0,0,False,False,False,False,False,False,True,False,True,True
9996,47,1,5.7,3,5,12,45,90.8,0,True,False,False,False,False,False,True,False,True,False
9997,56,1,8.3,0,1,18,7,99.2,0,False,False,False,False,False,False,True,False,True,False
9998,24,9,6.1,0,28,21,32,56.7,1,True,False,False,False,False,False,True,False,True,True


# ***9. Dividir en train/test***

In [19]:
def dividir_datos(df, target_col='target', test_size=0.3, random_state=42):
    """
    Split a DataFrame into stratified train and test sets.

    Args:
        df: DataFrame containing the features and the target column.
        target_col: Name of the target column; it is separated from the features.
        test_size: Value forwarded to train_test_split as the test share.
        random_state: Seed forwarded to train_test_split.

    Returns:
        Tuple (X_train, X_test, y_train, y_test). The split is stratified on
        the target, and the class counts of both parts are printed.
    """
    objetivo = df[target_col]
    predictores = df.drop(columns=target_col)

    particiones = train_test_split(
        predictores, objetivo, test_size=test_size, random_state=random_state, stratify=objetivo)
    X_train, X_test, y_train, y_test = particiones

    print(f"Datos divididos en entrenamiento y prueba con test_size={test_size}")
    resumen = (
        ("Distribución en entrenamiento:", y_train),
        ("Distribución en prueba:", y_test),
    )
    for encabezado, etiquetas in resumen:
        print(encabezado)
        print(etiquetas.value_counts())

    return X_train, X_test, y_train, y_test


In [20]:
# Split the current df. The earlier encoding cell did not assign its result, so at this point
# df still holds the raw categorical columns and 'productivity_score'.
# These four variables are assigned again in section 12 from the full pipeline.
X_train, X_test, y_train, y_test = dividir_datos(df, target_col='target')


Datos divididos en entrenamiento y prueba con test_size=0.3
Distribución en entrenamiento:
target
0    5342
1    1658
Name: count, dtype: int64
Distribución en prueba:
target
0    2289
1     711
Name: count, dtype: int64


# ***10. Balancear set de entrenamiento (SMOTE)***


In [21]:
def balancear_clases(X_train, y_train, metodo='smote', random_state=42):
    """
    Oversample the training data with SMOTE to balance the classes.

    Args:
        X_train: Training features passed to SMOTE.fit_resample.
        y_train: Training target passed to SMOTE.fit_resample.
        metodo: Balancing method; only 'smote' is supported.
        random_state: Seed given to SMOTE. Defaults to 42, the value that was
            previously hard-coded, so existing calls behave the same.

    Returns:
        Tuple (X_res, y_res) with the resampled data. For an unsupported
        `metodo` a message is printed and the inputs are returned unchanged.
    """
    if metodo != 'smote':
        # Best-effort fallback: report the problem and hand back the
        # unbalanced data instead of raising.
        print("Método de balanceo no soportado.")
        return X_train, y_train

    smote = SMOTE(random_state=random_state)
    X_res, y_res = smote.fit_resample(X_train, y_train)
    print("Distribución después de SMOTE:")
    print(y_res.value_counts())
    return X_res, y_res

# ***11. Ejecución del preprocesamiento***

In [22]:
# Pipeline completo de preprocesamiento
def pipeline_preprocesamiento(df, cod_metodo='onehot', test_size=0.3, random_state=42, balancear=True):
    """
    Run the full preprocessing chain and return train/test splits.

    Steps, in order: drop duplicate rows, impute missing values, drop
    'productivity_score' when present, encode categorical columns, split into
    train/test, and optionally oversample the training split with SMOTE.

    NOTE(review): a later cell in this notebook defines a function with this
    same name; when the cells run top to bottom, that later definition is the
    one in effect. Consider keeping only one of the two.

    Args:
        df: DataFrame that already contains the 'target' column.
        cod_metodo: Encoding method forwarded to codificar_variables.
        test_size: Test share forwarded to dividir_datos.
        random_state: Seed forwarded to dividir_datos. It is not passed to the
            SMOTE step.
        balancear: When True, the training split is balanced with SMOTE; the
            test split is never resampled.

    Returns:
        Tuple (X_train, X_test, y_train, y_test).
    """
    print("--- Eliminando duplicados ---")
    df = eliminar_duplicados(df)
    print("--- Imputando valores nulos ---")
    df = imputar_valores(df)
    # Guard the drop so a frame without this column does not raise KeyError.
    if 'productivity_score' in df.columns:
        print("--- Excluyendo variable 'productivity_score' para evitar multicolinealidad ---")
        df = df.drop(columns=['productivity_score'])
    print("--- Codificando variables categóricas ---")
    df = codificar_variables(df, metodo=cod_metodo)
    print("--- Dividiendo datos ---")
    X_train, X_test, y_train, y_test = dividir_datos(df, test_size=test_size, random_state=random_state)
    if balancear:
        print("--- Balanceando clases con SMOTE ---")
        X_train, y_train = balancear_clases(X_train, y_train, metodo='smote')
    return X_train, X_test, y_train, y_test

In [23]:
def pipeline_preprocesamiento(df, cod_metodo='onehot', test_size=0.3, random_state=42, balancear=True):
    """
    Run the full preprocessing chain and return train/test splits.

    Steps, in order: drop duplicate rows, impute missing values, encode
    categorical columns, drop 'productivity_score' when present, split into
    train/test, and optionally oversample the training split with SMOTE.

    Args:
        df: DataFrame that already contains the 'target' column.
        cod_metodo: Encoding method forwarded to codificar_variables.
        test_size: Test share forwarded to dividir_datos.
        random_state: Seed forwarded to dividir_datos. It is not passed to the
            SMOTE step.
        balancear: When True, the training split is balanced with SMOTE; the
            test split is never resampled.

    Returns:
        Tuple (X_train, X_test, y_train, y_test).
    """
    print("--- Eliminando duplicados ---")
    sin_duplicados = eliminar_duplicados(df)

    print("--- Imputando valores nulos ---")
    imputado = imputar_valores(sin_duplicados)

    print("--- Codificando variables categóricas ---")
    preparado = codificar_variables(imputado, metodo=cod_metodo)

    # Remove the correlated feature, if it is there, before splitting.
    if 'productivity_score' in preparado.columns:
        preparado = preparado.drop(columns=['productivity_score'])
        print("Variable 'productivity_score' eliminada para evitar multicolinealidad")

    # The split operates on the already encoded frame, so both parts share
    # the same dummy columns.
    print("--- Dividiendo datos ---")
    X_train, X_test, y_train, y_test = dividir_datos(preparado, test_size=test_size, random_state=random_state)

    if not balancear:
        return X_train, X_test, y_train, y_test

    # Only the training split is oversampled.
    print("--- Balanceando clases con SMOTE ---")
    X_balanceado, y_balanceado = balancear_clases(X_train, y_train, metodo='smote')
    return X_balanceado, X_test, y_balanceado, y_test


In [24]:
# Trial run: the returned (X_train, X_test, y_train, y_test) tuple is not assigned, so it is
# only rendered as the cell output. The splits that get saved come from the call in section 12.
pipeline_preprocesamiento(df)

--- Eliminando duplicados ---
Duplicados eliminados: 0
--- Imputando valores nulos ---
--- Codificando variables categóricas ---
Columnas categóricas a codificar: ['gender', 'employment_status', 'work_environment', 'mental_health_history', 'seeks_treatment']
Se aplicó One-Hot Encoding.
Variable 'productivity_score' eliminada para evitar multicolinealidad
--- Dividiendo datos ---
Datos divididos en entrenamiento y prueba con test_size=0.3
Distribución en entrenamiento:
target
0    5342
1    1658
Name: count, dtype: int64
Distribución en prueba:
target
0    2289
1     711
Name: count, dtype: int64
--- Balanceando clases con SMOTE ---
Distribución después de SMOTE:
target
0    5342
1    5342
Name: count, dtype: int64


(       age  stress_level  sleep_hours  physical_activity_days  \
 0       57             3     6.000000                       0   
 1       47             8     6.700000                       6   
 2       30             1     8.100000                       7   
 3       51             6     8.000000                       1   
 4       65             6     5.200000                       4   
 ...    ...           ...          ...                     ...   
 10679   41             3     6.966006                       0   
 10680   20             1     6.078406                       5   
 10681   57             2     6.651336                       0   
 10682   36             6     6.785830                       6   
 10683   51             3     6.752044                       5   
 
        depression_score  anxiety_score  social_support_score  gender_Male  \
 0                     8             12                    38         True   
 1                    11             11           

# ***12. Crear y guardar dataset en Drive***

In [26]:
# Run the full pipeline and keep the four resulting splits.
particiones = pipeline_preprocesamiento(df)
X_train, X_test, y_train, y_test = particiones

# Destination file for each split, written in this order and without the row index.
destinos = {
    '/content/drive/MyDrive/cod/Health/HealthProjectG8/data/X_train.csv': X_train,
    '/content/drive/MyDrive/cod/Health/HealthProjectG8/data/y_train.csv': y_train,
    '/content/drive/MyDrive/cod/Health/HealthProjectG8/data/X_test.csv': X_test,
    '/content/drive/MyDrive/cod/Health/HealthProjectG8/data/y_test.csv': y_test,
}
for ruta, conjunto in destinos.items():
    conjunto.to_csv(ruta, index=False)

print("Archivos guardados con variables codificadas.")


--- Eliminando duplicados ---
Duplicados eliminados: 0
--- Imputando valores nulos ---
--- Codificando variables categóricas ---
Columnas categóricas a codificar: ['gender', 'employment_status', 'work_environment', 'mental_health_history', 'seeks_treatment']
Se aplicó One-Hot Encoding.
Variable 'productivity_score' eliminada para evitar multicolinealidad
--- Dividiendo datos ---
Datos divididos en entrenamiento y prueba con test_size=0.3
Distribución en entrenamiento:
target
0    5342
1    1658
Name: count, dtype: int64
Distribución en prueba:
target
0    2289
1     711
Name: count, dtype: int64
--- Balanceando clases con SMOTE ---
Distribución después de SMOTE:
target
0    5342
1    5342
Name: count, dtype: int64
Archivos guardados con variables codificadas.
