In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [103]:
url = "https://raw.githubusercontent.com/shotokan/diabetes-classifier/refs/heads/main/4_classification_diabetes.csv"
df = pd.read_csv(url)
df

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,0.0,1.0,1.0,1.0,45.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,5.0,0.0,1.0,5.0,6.0,7.0
253676,2.0,1.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,11.0,2.0,4.0
253677,0.0,0.0,0.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,5.0,2.0
253678,0.0,1.0,0.0,1.0,23.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,7.0,5.0,1.0


In [104]:
# Renombrar columnas para mayor claridad
df.columns = ["Diabetes_012", "HighBP", "HighChol", "CholCheck", "BMI", "Smoker", "Stroke",
              "HeartDiseaseorAttack", "PhysActivity", "Fruits", "Veggies", "HvyAlcoholConsump",
              "AnyHealthcare", "NoDocbcCost", "GenHlth", "MentHlth", "PhysHlth", "DiffWalk",
              "Sex", "Age", "Education", "Income"]
df

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,0.0,1.0,1.0,1.0,45.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,5.0,0.0,1.0,5.0,6.0,7.0
253676,2.0,1.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,11.0,2.0,4.0
253677,0.0,0.0,0.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,5.0,2.0
253678,0.0,1.0,0.0,1.0,23.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,7.0,5.0,1.0


In [105]:
# --- 2. Selección de características  ---
X = df.drop("Diabetes_012", axis=1)
y = df["Diabetes_012"]


In [106]:
# Escalar características
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [107]:
# Selección de mejores características con chi-cuadrado
X_chi2 = X.copy()
X_chi2 = X_chi2.astype(int)
selector = SelectKBest(score_func=chi2, k=8)
X_selected = selector.fit_transform(X_chi2, y)
selected_features = X.columns[selector.get_support(indices=True)]
print("\nMejores características seleccionadas:")
print(selected_features)


Mejores características seleccionadas:
Index(['HighBP', 'BMI', 'HeartDiseaseorAttack', 'GenHlth', 'MentHlth',
       'PhysHlth', 'DiffWalk', 'Age'],
      dtype='object')


Dado que se cuenta con variables como HighBP o BMI, tienen una relación directa y demostrada con la diabetes tipo 2 podemos hacer uso de Regresión Logística

In [108]:
# --- 3. Entrenamiento del modelo de RandomForestClassifier ---
X_train, X_test, y_train, y_test = train_test_split(df[selected_features], y, test_size=0.3, random_state=42)

model = RandomForestClassifier(random_state=42, class_weight='balanced')
model.fit(X_train, y_train)


In [109]:
# Predicción en test y train
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

In [110]:
# --- 4. Evaluación del modelo ---
print("\n--- Evaluación en conjunto de entrenamiento ---")
print("Matriz de Confusión (Train):")
print(confusion_matrix(y_train, y_pred_train))
print("\nReporte de Clasificación (Train):")
print(classification_report(y_train, y_pred_train))
print(f"Accuracy (Train): {accuracy_score(y_train, y_pred_train):.4f}")

print("\n--- Evaluación en conjunto de prueba ---")
print("Matriz de Confusión (Test):")
print(confusion_matrix(y_test, y_pred_test))
print("\nReporte de Clasificación (Test):")
print(classification_report(y_test, y_pred_test))
print(f"Accuracy (Test): {accuracy_score(y_test, y_pred_test):.4f}")


--- Evaluación en conjunto de entrenamiento ---
Matriz de Confusión (Train):
[[123424  12076  14023]
 [   288   2713    205]
 [  2959   2217  19671]]

Reporte de Clasificación (Train):
              precision    recall  f1-score   support

         0.0       0.97      0.83      0.89    149523
         1.0       0.16      0.85      0.27      3206
         2.0       0.58      0.79      0.67     24847

    accuracy                           0.82    177576
   macro avg       0.57      0.82      0.61    177576
weighted avg       0.90      0.82      0.85    177576

Accuracy (Train): 0.8211

--- Evaluación en conjunto de prueba ---
Matriz de Confusión (Test):
[[50784  5302  8094]
 [  849   147   429]
 [ 5593  1120  3786]]

Reporte de Clasificación (Test):
              precision    recall  f1-score   support

         0.0       0.89      0.79      0.84     64180
         1.0       0.02      0.10      0.04      1425
         2.0       0.31      0.36      0.33     10499

    accuracy          

Con 10 variables, el modelo comete:

El modelo  comete errores en clases 1 y 2
El modelo no está aprendiendo a detectar prediabetes. Es probable que haya muy pocos datos y esté desbalanceado.
Quizás haya pocos ejemplos para clase 1 y/o falta de variables.


Dado que la prediabetes nos arrojaba 0 se hizo uso de class_weight='balanced' y mejoro en cuanto a prediabetes pero bajo un poco en las otras clases.

Se agregaron 12 variables y mejoro mucho sobre todo en la clase 1 pero en el de tests sigue bajo.

Hay pocas muestras de clases minoritarias.

Con k=8 en lugar de 12:

Mejoró la generalización: el modelo ya no memoriza tanto el entrenamiento.

Pero todavía sufre de desbalance, especialmente en la clase 1.

In [37]:
# --- 3. Entrenamiento del modelo (LogisticRegression) ---
X_train, X_test, y_train, y_test = train_test_split(df[selected_features], y, test_size=0.3, random_state=42)

model = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Predicción en test y train
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

# --- 4. Evaluación del modelo ---
print("\n--- Evaluación en conjunto de entrenamiento ---")
print("Matriz de Confusión (Train):")
print(confusion_matrix(y_train, y_pred_train))
print("\nReporte de Clasificación (Train):")
print(classification_report(y_train, y_pred_train))
print(f"Accuracy (Train): {accuracy_score(y_train, y_pred_train):.4f}")

print("\n--- Evaluación en conjunto de prueba ---")
print("Matriz de Confusión (Test):")
print(confusion_matrix(y_test, y_pred_test))
print("\nReporte de Clasificación (Test):")
print(classification_report(y_test, y_pred_test))
print(f"Accuracy (Test): {accuracy_score(y_test, y_pred_test):.4f}")



--- Evaluación en conjunto de entrenamiento ---
Matriz de Confusión (Train):
[[145968      0   3555]
 [  2929      0    277]
 [ 20660      0   4187]]

Reporte de Clasificación (Train):
              precision    recall  f1-score   support

         0.0       0.86      0.98      0.91    149523
         1.0       0.00      0.00      0.00      3206
         2.0       0.52      0.17      0.25     24847

    accuracy                           0.85    177576
   macro avg       0.46      0.38      0.39    177576
weighted avg       0.80      0.85      0.81    177576

Accuracy (Train): 0.8456

--- Evaluación en conjunto de prueba ---
Matriz de Confusión (Test):


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[62695     0  1485]
 [ 1309     0   116]
 [ 8745     0  1754]]

Reporte de Clasificación (Test):
              precision    recall  f1-score   support

         0.0       0.86      0.98      0.92     64180
         1.0       0.00      0.00      0.00      1425
         2.0       0.52      0.17      0.25     10499

    accuracy                           0.85     76104
   macro avg       0.46      0.38      0.39     76104
weighted avg       0.80      0.85      0.81     76104

Accuracy (Test): 0.8469


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


k=10
Me arroja un warning porque no está prediciendo ninguna muestra para alguna de las clases (la clase 1 – prediabetes)

In [111]:
# ---  Entrenamiento del modelo (LogisticRegression) ---
X_train, X_test, y_train, y_test = train_test_split(df[selected_features], y, test_size=0.3, random_state=42)

model = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

# Predicción en test y train
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

# ---  Evaluación del modelo ---
print("\n--- Evaluación en conjunto de entrenamiento ---")
print("Matriz de Confusión (Train):")
print(confusion_matrix(y_train, y_pred_train))
print("\nReporte de Clasificación (Train):")
print(classification_report(y_train, y_pred_train))
print(f"Accuracy (Train): {accuracy_score(y_train, y_pred_train):.4f}")

print("\n--- Evaluación en conjunto de prueba ---")
print("Matriz de Confusión (Test):")
print(confusion_matrix(y_test, y_pred_test))
print("\nReporte de Clasificación (Test):")
print(classification_report(y_test, y_pred_test))
print(f"Accuracy (Test): {accuracy_score(y_test, y_pred_test):.4f}")



--- Evaluación en conjunto de entrenamiento ---
Matriz de Confusión (Train):
[[98959 24179 26385]
 [  995   831  1380]
 [ 4483  5468 14896]]

Reporte de Clasificación (Train):
              precision    recall  f1-score   support

         0.0       0.95      0.66      0.78    149523
         1.0       0.03      0.26      0.05      3206
         2.0       0.35      0.60      0.44     24847

    accuracy                           0.65    177576
   macro avg       0.44      0.51      0.42    177576
weighted avg       0.85      0.65      0.72    177576

Accuracy (Train): 0.6458

--- Evaluación en conjunto de prueba ---
Matriz de Confusión (Test):
[[42451 10347 11382]
 [  445   387   593]
 [ 1838  2410  6251]]

Reporte de Clasificación (Test):
              precision    recall  f1-score   support

         0.0       0.95      0.66      0.78     64180
         1.0       0.03      0.27      0.05      1425
         2.0       0.34      0.60      0.44     10499

    accuracy                   

Con 12 variables no mejora tanto:

identifica más casos reales de diabetes y prediabetes, pero comete más errores de clasificación en la clase 0 (y por eso baja el accuracy).
La precisión para clase 1 sigue siendo baja (muchos falsos positivos).
