In [1]:
import sys
import os

# Agrega la ruta del directorio padre (donde est√° src/)
sys.path.append(os.path.abspath(".."))

# üè• Proyecto: Predicci√≥n de Readmisi√≥n Hospitalaria
Este notebook est√° dise√±ado para ejecutarse en **Google Colab**.

Contiene todo el flujo del proyecto:
- Exploraci√≥n de datos (EDA)
- Entrenamiento de modelo base (Regresi√≥n Log√≠stica)
- Entrenamiento de Red Neuronal
- Evaluaci√≥n y comparaci√≥n de modelos

üîß Los estudiantes pueden modificar el c√≥digo y los archivos como `config.yaml` o `train.py`.

In [None]:
# Clonar el repositorio (solo una vez por sesi√≥n de Colab)
!git clone https://github.com/solivare/DeepNeuralNetworkUSS.git
%cd DeepNeuralNetworkUSS/Projects/HospitalReadmission


In [None]:
# Instalar dependencias necesarias (si no est√°n ya instaladas)
!pip install -r requirements.txt

In [None]:
# Creacion del directorio de trabajo
%mkdir run
%cd run/

In [2]:
# Preprocesamiento del dataset original
!python ../src/preprocess.py

Cargando datos desde ../data/diabetic_data.csv
Reemplazando '?' por NaN...
Valores NaN por columna:
weight               98569
max_glu_serum        96420
A1Cresult            84748
medical_specialty    49949
payer_code           40256
race                  2273
diag_3                1423
diag_2                 358
diag_1                  21
patient_nbr              0
dtype: int64

Eliminando columnas irrelevantes o con muchos NaNs...
Eliminando filas con NaN en columnas cr√≠ticas (race, gender, age)...
Convirtiendo variable objetivo...
Eliminando columnas con un solo valor √∫nico...
Codificando variables categ√≥ricas...
Guardando dataset limpio en ../data/hospital_readmission_clean.csv
‚úÖ Preprocesamiento finalizado.
üì¶ Registros finales: 99493  |  Variables: 2322
üîç Distribuci√≥n del target:
readmitted
0    0.887741
1    0.112259
Name: proportion, dtype: float64


In [3]:
# Exploraci√≥n de Datos (EDA)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('../data/hospital_readmission_clean.csv')
df.head()

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,insulin_Steady,insulin_Up,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-pioglitazone_Steady,change_No,diabetesMed_Yes
0,6,25,1,1,41,0,1,0,0,0,...,False,False,True,False,False,False,False,False,True,False
1,1,1,7,3,59,0,18,0,0,0,...,False,True,True,False,False,False,False,False,False,True
2,1,1,7,2,11,5,13,2,0,1,...,False,False,True,False,False,False,False,False,True,True
3,1,1,7,2,44,1,16,0,0,0,...,False,True,True,False,False,False,False,False,False,True
4,1,1,7,1,51,0,8,0,0,0,...,True,False,True,False,False,False,False,False,False,True


In [4]:
# Modelo base: Regresi√≥n Log√≠stica
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
sys.path.append(os.path.abspath(".."))
import src.evaluate as ev
import numpy as np
import yaml

# Cargar configuraci√≥n y datos
with open('../config.yaml', 'r') as f:
    config = yaml.safe_load(f)
df = pd.read_csv('../data/hospital_readmission_clean.csv')
X = df.drop('readmitted', axis=1)
y = df['readmitted']

# Escalar y separar datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=config['test_size'],
    random_state=config['random_state'], stratify=y)

# Entrenar modelo base
base_model = LogisticRegression()
base_model.fit(X_train, y_train)
y_pred_base = base_model.predict(X_test)
y_pred_proba_base = base_model.predict_proba(X_test)[:, 1]

# Evaluar modelo base
ev.evaluate_model(y_test, y_pred_base, y_pred_proba_base, model_name="Regresi√≥n Log√≠stica")


üìä Evaluaci√≥n del modelo: Regresi√≥n Log√≠stica
              precision    recall  f1-score   support

           0       0.89      0.99      0.94     17665
           1       0.33      0.03      0.05      2234

    accuracy                           0.88     19899
   macro avg       0.61      0.51      0.49     19899
weighted avg       0.83      0.88      0.84     19899

F1 Score     : 0.0504
Precision    : 0.3297
Recall       : 0.0273
ROC AUC      : 0.6312


{'f1': 0.050434063662670524,
 'precision': 0.32972972972972975,
 'recall': 0.02730528200537153,
 'roc_auc': np.float64(0.631241617277284)}

In [None]:
# Entrenamiento de Red Neuronal
sys.path.append(os.path.abspath(".."))
import src.train as tr
model = tr.build_model(config, input_dim=X_train.shape[1])
history = tr.train_model(model, X_train, y_train, config)
y_pred_nn_proba = model.predict(X_test).flatten()
y_pred_nn = ev.predict_with_threshold(y_pred_nn_proba)

ev.evaluate_model(y_test, y_pred_nn, y_pred_nn_proba, model_name="Red Neuronal")

In [None]:
# Comparaci√≥n de ROC y m√©tricas
ev.plot_roc_comparison(y_test, y_pred_nn_proba, y_test, y_pred_proba_base,
                       label1='Red Neuronal', label2='Regresi√≥n Log√≠stica')

df_metrics = ev.compare_models_metrics(
    y_test, y_pred_nn, y_pred_nn_proba,
    y_test, y_pred_base, y_pred_proba_base,
    model_name_1='Red Neuronal', model_name_2='Regresi√≥n Log√≠stica')
import pandas as pd
from IPython.display import display
display(df_metrics)