# Modelo Base: Regresión Logística
Este notebook entrena un modelo base de clasificación binaria usando regresión logística sobre el dataset de readmisión hospitalaria.

In [1]:
# Cargar librerías
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
import matplotlib.pyplot as plt
import seaborn as sns

# Detectar si es Google Colab
IN_COLAB = 'google.colab' in str(get_ipython())
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    BASE_PATH = '/content/drive/MyDrive/Hospital_Readmission'
else:
    BASE_PATH = '.'

In [2]:
# Define BASE_PATH como el directorio actual
os.chdir(BASE_PATH)
print(f"Directorio actual: {os.getcwd()}")

Directorio actual: c:\Users\jorge\Documentos\GitHub\DeepNeuralNetworkUSS\Projects\HospitalReadmission\notebooks


In [3]:
# Cargar datos
df = pd.read_csv(os.path.join(BASE_PATH, '../data/hospital_readmission_clean.csv'))
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '.\\../data/hospital_readmission_clean.csv'

In [None]:
# Codificación LabelEncoder para variables categóricas
df_encoded = df.copy()
label_encoders = {}
for col in df_encoded.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

X = df_encoded.drop('readmitted', axis=1)
y = df_encoded['readmitted']

In [None]:
# División en entrenamiento y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Entrenar modelo de regresión logística
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [None]:
# Evaluación del modelo
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))

In [None]:
# Matriz de confusión y curva ROC
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
plt.title("Matriz de Confusión")
plt.xlabel("Predicción")
plt.ylabel("Real")
plt.show()

RocCurveDisplay.from_estimator(model, X_test, y_test)
plt.title("Curva ROC")
plt.show()

In [None]:
from sklearn.metrics import accuracy_score

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("Accuracy train:", accuracy_score(y_train, y_train_pred))
print("Accuracy test :", accuracy_score(y_test, y_test_pred))

In [None]:
# Solo variables numéricas
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 10))
sns.heatmap(df_encoded.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title("Mapa de correlación entre variables")
plt.show()

In [None]:
# Ranking por coeficiente de regresión logística
import numpy as np
importance = model.coef_[0]
feature_importance = pd.Series(np.abs(importance), index=X.columns)
feature_importance = feature_importance.sort_values(ascending=False)
print("Ranking de variables más influyentes:")
print(feature_importance)

# Gráfico
feature_importance.head(10).plot(kind='barh')
plt.title("Top 10 variables más influyentes")
plt.xlabel("Importancia (|coef|)")
plt.gca().invert_yaxis()
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print("Accuracy promedio (5-fold CV):", cv_scores.mean())
print("Accuracies individuales:", cv_scores)