In [11]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

In [12]:
# Load dataset
# Read the raw dataset from the notebook's working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer="diabetes.csv")

In [13]:
# 1️⃣ Add BMI Category based on general medical guidelines
def classify_bmi(bmi):
    """Map a BMI value to one of four weight-category labels.

    The bands are contiguous half-open intervals, so every finite value
    lands in exactly one of them:

    * below 18.5        -> "Underweight"
    * 18.5 up to < 25   -> "Normal"
    * 25 up to < 30     -> "Overweight"
    * 30 and above      -> "Obese"

    Earlier cut-offs of 24.9 and 29.9 left gaps: a BMI such as 24.95
    matched no band and fell through to "Obese", and 29.95 was labelled
    "Obese" rather than "Overweight".

    Parameters
    ----------
    bmi : float
        Body-mass index value.

    Returns
    -------
    str
        One of "Underweight", "Normal", "Overweight", "Obese".
    """
    if bmi < 18.5:
        return "Underweight"
    if bmi < 25:
        return "Normal"
    if bmi < 30:
        return "Overweight"
    # NaN fails every comparison above, so a missing BMI also ends up here.
    return "Obese"

# Label every row with its BMI band by mapping the classifier over the BMI column
df["BMI_category"] = df["BMI"].map(classify_bmi)

In [14]:
# 2️⃣ Split data (80% train, 20% validation)
# Hold out 20% of the rows for validation. The split is stratified on the
# "Outcome" column and uses a fixed seed so it is repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["Outcome"],
    random_state=42,
)

# Persist the held-out rows so they can be reused at inference time
val_df.to_csv("val_set.csv", index=False)

# Column groups consumed by the preprocessing step
categorical_features = ["BMI_category"]
numeric_features = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]

In [15]:
# 3️⃣ Preprocessing: StandardScaler for numeric, OneHotEncoder for categorical
# Scale the numeric columns and one-hot encode the categorical ones;
# categories not seen during fitting are ignored at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# Targets for each partition
y_train = train_df["Outcome"]
y_val = val_df["Outcome"]

# Fit on the training predictors only, then reuse the fitted transformer
# for the validation predictors.
X_train = preprocessor.fit_transform(train_df.drop(columns=["Outcome"]))
X_val = preprocessor.transform(val_df.drop(columns=["Outcome"]))

# Persist the fitted preprocessor so inference applies the same transformation
joblib.dump(preprocessor, "preprocessor.pkl")

['preprocessor.pkl']

In [16]:
# 4️⃣ Train KNN Classifier (Try k = 3, 5, 7)
# Track the best-scoring KNN candidate on the validation set
best_knn_model = None
best_knn_f1 = 0
best_k = 0

for k in [3, 5, 7]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_val)
    f1 = f1_score(y_val, y_pred)

    print(f"KNN (k={k}) - F1 Score: {f1:.4f}")

    # Always accept the first candidate. Otherwise, if every candidate scored
    # an F1 of 0, no model would be selected and `best_knn_model` would stay
    # None (with `best_k` stuck at 0) for the later save step.
    if best_knn_model is None or f1 > best_knn_f1:
        best_knn_f1 = f1
        best_knn_model = knn
        best_k = k

print(f"Best KNN Model: k={best_k} with F1 Score: {best_knn_f1:.4f}")

KNN (k=3) - F1 Score: 0.5577
KNN (k=5) - F1 Score: 0.5882
KNN (k=7) - F1 Score: 0.5800
Best KNN Model: k=5 with F1 Score: 0.5882


In [17]:
# 5️⃣ Train Decision Tree (Try max_depth = 3, 5, 7)
# Track the best-scoring tree on the validation set
best_dt_model = None
best_dt_f1 = 0
best_depth = 0

for depth in [3, 5, 7]:
    # Fixed seed so the fitted tree is repeatable across runs
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_val)
    f1 = f1_score(y_val, y_pred)

    print(f"Decision Tree (max_depth={depth}) - F1 Score: {f1:.4f}")

    # Always accept the first candidate. Otherwise, if every candidate scored
    # an F1 of 0, no model would be selected and `best_dt_model` would stay
    # None (with `best_depth` stuck at 0) for the later save step.
    if best_dt_model is None or f1 > best_dt_f1:
        best_dt_f1 = f1
        best_dt_model = dt
        best_depth = depth

print(f"Best Decision Tree Model: max_depth={best_depth} with F1 Score: {best_dt_f1:.4f}")

Decision Tree (max_depth=3) - F1 Score: 0.3733
Decision Tree (max_depth=5) - F1 Score: 0.7037
Decision Tree (max_depth=7) - F1 Score: 0.5909
Best Decision Tree Model: max_depth=5 with F1 Score: 0.7037


In [18]:
# 6️⃣ Save the best model (whichever is better)
# Pick the family with the higher validation F1; a tie goes to KNN
best_model, best_model_name = (
    (best_knn_model, f"KNN_k{best_k}")
    if best_knn_f1 >= best_dt_f1
    else (best_dt_model, f"DecisionTree_depth{best_depth}")
)

# Persist the winner under a fixed filename for the inference step
joblib.dump(best_model, "best_model.pkl")
print(f"✅ Best Model Saved: {best_model_name}")

✅ Best Model Saved: DecisionTree_depth5


In [19]:
# 7️⃣ Inference Function
def inference(sample):
    """Predict the class for a single sample.

    Loads the fitted preprocessor and model from "preprocessor.pkl" and
    "best_model.pkl", converts the sample to a one-row DataFrame, applies
    the preprocessing, and returns the model's prediction.

    The "BMI_category" feature is engineered outside the saved preprocessor.
    When the sample does not provide it but does provide "BMI", it is derived
    here with `classify_bmi`, so a sample holding only the raw measurements
    can be scored. A sample that already carries "BMI_category" is used as-is.

    Parameters
    ----------
    sample : dict
        Mapping of feature name to value for one observation.

    Returns
    -------
    The first (and only) element of `model.predict` for the sample.
    """
    # Load preprocessor and model (artifacts written by earlier cells)
    preprocessor = joblib.load("preprocessor.pkl")
    model = joblib.load("best_model.pkl")

    # Wrap the dict in a list so it becomes a single-row DataFrame
    sample_df = pd.DataFrame([sample])

    # Recreate the engineered column when the caller passed raw features only
    if "BMI_category" not in sample_df.columns and "BMI" in sample_df.columns:
        sample_df["BMI_category"] = sample_df["BMI"].apply(classify_bmi)

    # Ensure categorical columns are strings before encoding
    for col in categorical_features:
        if col in sample_df.columns:
            sample_df[col] = sample_df[col].astype(str)

    # Apply the same transformation that was fitted on the training data
    sample_processed = preprocessor.transform(sample_df)

    # predict returns one value per row; there is exactly one row here
    prediction = model.predict(sample_processed)[0]
    return prediction

In [20]:
# 🔹 TEST INFERENCE ON 5 SAMPLES FROM VALIDATION SET
# Strip the label once up front instead of on every iteration
val_inputs = val_df.drop(columns=["Outcome"])

for i in range(5):
    # Feature dict for the i-th validation row, in the shape inference() expects
    sample = val_inputs.iloc[i].to_dict()
    pred_label = inference(sample)
    true_label = val_df.iloc[i]["Outcome"]
    print(f"Sample {i+1}: True Label = {true_label}, Predicted Label = {pred_label}")

Sample 1: True Label = 0, Predicted Label = 1
Sample 2: True Label = 0, Predicted Label = 0
Sample 3: True Label = 0, Predicted Label = 0
Sample 4: True Label = 1, Predicted Label = 1
Sample 5: True Label = 0, Predicted Label = 0
