## ***Pipe***

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder, LabelEncoder

In [5]:
# Cargar los datos
data = pd.read_csv('../Data/final_depression_dataset.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556 entries, 0 to 2555
Data columns (total 19 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Name                                   2556 non-null   object 
 1   Gender                                 2556 non-null   object 
 2   Age                                    2556 non-null   int64  
 3   City                                   2556 non-null   object 
 4   Working Professional or Student        2556 non-null   object 
 5   Profession                             1883 non-null   object 
 6   Academic Pressure                      502 non-null    float64
 7   Work Pressure                          2054 non-null   float64
 8   CGPA                                   502 non-null    float64
 9   Study Satisfaction                     502 non-null    float64
 10  Job Satisfaction                       2054 non-null   float64
 11  Slee

In [6]:
# Lista de columnas a convertir
columns_to_category = [
    'Name', 'Gender', 'City', 'Working Professional or Student', 'Profession',
    'Academic Pressure', 'Work Pressure', 'Study Satisfaction', 'Job Satisfaction',
    'Financial Stress', 'Sleep Duration', 'Dietary Habits', 'Degree',
    'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness', 'Depression'
]

# Convertir cada columna en category
for column in columns_to_category:
    data[column] = data[column].astype('category')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556 entries, 0 to 2555
Data columns (total 19 columns):
 #   Column                                 Non-Null Count  Dtype   
---  ------                                 --------------  -----   
 0   Name                                   2556 non-null   category
 1   Gender                                 2556 non-null   category
 2   Age                                    2556 non-null   int64   
 3   City                                   2556 non-null   category
 4   Working Professional or Student        2556 non-null   category
 5   Profession                             1883 non-null   category
 6   Academic Pressure                      502 non-null    category
 7   Work Pressure                          2054 non-null   category
 8   CGPA                                   502 non-null    float64 
 9   Study Satisfaction                     502 non-null    category
 10  Job Satisfaction                       2054 non-null   categ

In [7]:
# Unir columnas para crear nuevas características
data['Job/Study Satisfaction'] = data['Job Satisfaction'].fillna(data['Study Satisfaction'])
data['Work/Academic Pressure'] = data['Academic Pressure'].fillna(data['Work Pressure'])

# Eliminar columnas innecesarias
data = data.drop(columns=['CGPA', 'Name', 'Profession', 'City', 'Family History of Mental Illness', 'Gender', 'Sleep Duration', 'Job Satisfaction', 'Study Satisfaction', 'Academic Pressure', 'Work Pressure'])

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556 entries, 0 to 2555
Data columns (total 10 columns):
 #   Column                                 Non-Null Count  Dtype   
---  ------                                 --------------  -----   
 0   Age                                    2556 non-null   int64   
 1   Working Professional or Student        2556 non-null   category
 2   Dietary Habits                         2556 non-null   category
 3   Degree                                 2556 non-null   category
 4   Have you ever had suicidal thoughts ?  2556 non-null   category
 5   Work/Study Hours                       2556 non-null   int64   
 6   Financial Stress                       2556 non-null   category
 7   Depression                             2556 non-null   category
 8   Job/Study Satisfaction                 2556 non-null   category
 9   Work/Academic Pressure                 2556 non-null   category
dtypes: category(8), int64(2)
memory usage: 62.4 KB


In [8]:
# Graficación de datos
#from ydata_profiling import ProfileReport

#profile_data = ProfileReport(data, minimal=True)
#profile_data

In [9]:
# Categorical Columns
categorical_cols = ['Working Professional or Student', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Financial Stress', 'Job/Study Satisfaction', 'Work/Academic Pressure']

# Numeric columns
numeric_cols = ['Age', 'Work/Study Hours']

In [10]:
# Paso 1: Limpiar atípicos
def remove_outliers(X):
    # Se asigna valor nulo a los atípicos en variables numéricas
    X.loc[(X['Age'] < 18) | (X['Age'] > 60), 'Age'] = np.nan
    X.loc[(X['Work/Study Hours'] < 0) | (X['Work/Study Hours'] > 12), 'Work/Study Hours'] = np.nan

    # Se asigna valor nulo a los valores fuera de categorías esperadas en variables categóricas
    X['Working Professional or Student'] = X['Working Professional or Student'].apply(lambda x: x if x in ['Working Professional', 'Student'] else np.nan)
    X['Dietary Habits'] = X['Dietary Habits'].apply(lambda x: x if x in ['Unhealthy', 'Healthy', 'Moderate'] else np.nan)
    X['Have you ever had suicidal thoughts ?'] = X['Have you ever had suicidal thoughts ?'].apply(lambda x: x if x in ['Yes', 'No'] else np.nan)
    X['Financial Stress'] = X['Financial Stress'].apply(lambda x: x if x in [1, 2, 3, 4, 5] else np.nan)
    X['Job/Study Satisfaction'] = X['Job/Study Satisfaction'].apply(lambda x: x if x in [1.0, 2.0, 3.0, 4.0, 5.0] else np.nan)
    X['Work/Academic Pressure'] = X['Work/Academic Pressure'].apply(lambda x: x if x in [1.0, 2.0, 3.0, 4.0, 5.0] else np.nan)

    # Validar todas las variables
    return X

In [11]:
# Paso 2: Imputar y transformaciones
# Transformación de las variables categóricas
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('dummies', OneHotEncoder(drop='if_binary',handle_unknown='ignore', sparse_output=False))
])

# Transformación de las variables numéricas
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

# Combinación de transformaciones
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

In [12]:
# Definir el pipeline completo

pipe = Pipeline(steps=[
    ('outliers', FunctionTransformer(remove_outliers)),
    ('preprocessor', preprocessor)
])

pipe

In [13]:
X = data.drop(columns=['Depression'])
Y = data['Depression']

In [14]:
#Aplicar el pipe a los datos
X_processed = pipe.fit_transform(X)

In [15]:
# División 70-30

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_processed, Y, test_size=0.3)

In [16]:
categorical_indices = [X.columns.get_loc(col) for col in categorical_cols]
categorical_indices

[1, 2, 3, 4, 6, 7, 8]

In [17]:
from imblearn.over_sampling import SMOTENC

smote = SMOTENC(categorical_features=categorical_indices, random_state=42)

X_train_bal, Y_train_bal = smote.fit_resample(X_train, Y_train)

In [18]:
from sklearn.linear_model import LogisticRegression
model_lr = LogisticRegression(random_state=42)
model_lr.fit(X_train_bal, Y_train_bal)

In [19]:
from sklearn.model_selection import cross_validate, StratifiedKFold

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Función para realizar validación cruzada con cross_validate y múltiples métricas
def cross_validate_model(model, X, Y, cv):
    # Evaluar el modelo con cross_validate usando varias métricas
    scores = cross_validate(
        model, X, Y, cv=cv,
        scoring={'accuracy': 'accuracy',
                 'precision': 'precision_weighted',
                 'recall': 'recall_weighted',
                 'f1': 'f1_weighted'},
        return_train_score=True
    )

    # Guardar las métricas en el diccionario de resultados
    results = {
        "Train Accuracy Mean": scores['train_accuracy'].mean(),
        "Test Accuracy Mean": scores['test_accuracy'].mean(),
        "Train Precision Mean": scores['train_precision'].mean(),
        "Test Precision Mean": scores['test_precision'].mean(),
        "Train Recall Mean": scores['train_recall'].mean(),
        "Test Recall Mean": scores['test_recall'].mean(),
        "Train F1 Mean": scores['train_f1'].mean(),
        "Test F1 Mean": scores['test_f1'].mean(),
        "Fit Time Mean": scores['fit_time'].mean(),
        "Score Time Mean": scores['score_time'].mean()
    }

    # Imprimir las métricas
    print(f"Model: {model.__class__.__name__}")
    print(f"   - Fit Time Mean: {scores['fit_time'].mean():.4f} seconds")
    print(f"   - Score Time Mean: {scores['score_time'].mean():.4f} seconds\n")

    return results

In [20]:
results = cross_validate_model(model_lr, X_processed, Y, cv)
results

Model: LogisticRegression
   - Fit Time Mean: 0.0197 seconds
   - Score Time Mean: 0.0105 seconds



{'Train Accuracy Mean': 0.970657218978516,
 'Test Accuracy Mean': 0.9632245710784314,
 'Train Precision Mean': 0.9703349377605605,
 'Test Precision Mean': 0.9631017074445172,
 'Train Recall Mean': 0.970657218978516,
 'Test Recall Mean': 0.9632245710784314,
 'Train F1 Mean': 0.9702438720907711,
 'Test F1 Mean': 0.962610956748124,
 'Fit Time Mean': 0.01971304416656494,
 'Score Time Mean': 0.010499811172485352}

In [21]:
# Evaluation
from sklearn import metrics

Y_pred_logreg = model_lr.predict(X_test)

# Metrics
accuracy = metrics.accuracy_score(Y_test, Y_pred_logreg)
precision = metrics.precision_score(Y_test, Y_pred_logreg, average='weighted')
recall = metrics.recall_score(Y_test, Y_pred_logreg, average='weighted')
f1 = metrics.f1_score(Y_test, Y_pred_logreg, average='weighted')

metr = pd.DataFrame({'LogisticRegression': [accuracy, precision, recall, f1]}, index=['Accuracy', 'Precision', 'Recall', 'F1 Score'])
metr

Unnamed: 0,LogisticRegression
Accuracy,0.949153
Precision,0.954192
Recall,0.949153
F1 Score,0.950626


In [22]:
# Reentrenar el modelo con todos los datos
model_lr.fit(X_processed, Y)

In [23]:
# Crea el pipeline con el escalado, codificación y modelo

from sklearn.pipeline import Pipeline

pipeline_model = Pipeline([
    ('preprocessor', pipe),  # Aplica el preprocesamiento
    ('model', model_lr)  # Modelo de regresión logística
])
pipeline_model

In [24]:
#Se guarda el pipeline
import pickle
filename = 'pipeline_Logistic_Regression_model.pkl'
pickle.dump(pipeline_model, open(filename, 'wb'))