In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
import joblib

In [20]:
# Load dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "Outcome"]
df = pd.read_csv(url, names=columns)
print("Dataset Loaded:\n", df.head())

Dataset Loaded:
    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [21]:
# Create BMI_category column
def categorize_bmi(bmi):
    if bmi < 18.5:
        return 'Underweight'
    elif 18.5 <= bmi < 24.9:
        return 'Normal weight'
    elif 25 <= bmi < 29.9:
        return 'Overweight'
    else:
        return 'Obese'

df["BMI_category"] = df["BMI"].apply(categorize_bmi)
print("BMI Categories:\n", df["BMI_category"].value_counts())

BMI Categories:
 BMI_category
Obese            478
Overweight       174
Normal weight    101
Underweight       15
Name: count, dtype: int64


In [22]:
# Splitting dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [23]:
# Define feature processing
num_features = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]
cat_features = ["BMI_category"]

scaler = StandardScaler()
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer([
    ('num', scaler, num_features),
    ('cat', one_hot_encoder, cat_features)
])


In [24]:
# Transform features
X_train = preprocessor.fit_transform(train_df)
X_val = preprocessor.transform(val_df)
y_train = train_df['Outcome']
y_val = val_df['Outcome']
print("Processed Training Features Shape:", X_train.shape)
print("Processed Validation Features Shape:", X_val.shape)

Processed Training Features Shape: (614, 12)
Processed Validation Features Shape: (154, 12)


In [25]:
# Train and evaluate KNN model
best_knn_f1, best_k = 0, None
for k in [3, 5, 7]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    print(f"KNN with k={k} F1 Score: {f1}")
    if f1 > best_knn_f1:
        best_knn_f1 = f1
        best_k = k
        best_knn_model = knn

KNN with k=3 F1 Score: 0.6181818181818182
KNN with k=5 F1 Score: 0.5555555555555556
KNN with k=7 F1 Score: 0.5849056603773585


In [26]:
# Train and evaluate Decision Tree model
best_dt_f1, best_depth = 0, None
for depth in [3, 5, 7]:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    print(f"Decision Tree with depth={depth} F1 Score: {f1}")
    if f1 > best_dt_f1:
        best_dt_f1 = f1
        best_depth = depth
        best_dt_model = dt

Decision Tree with depth=3 F1 Score: 0.6476190476190476
Decision Tree with depth=5 F1 Score: 0.693069306930693
Decision Tree with depth=7 F1 Score: 0.6379310344827587


In [27]:
# Save the best model and preprocessing steps
best_model = best_knn_model if best_knn_f1 > best_dt_f1 else best_dt_model
joblib.dump(preprocessor, "preprocessor.pkl")
joblib.dump(best_model, "best_model.pkl")
print("Best model saved")

Best model saved


In [28]:
# Inference function
def infer(sample):
    preprocessor = joblib.load("preprocessor.pkl")
    model = joblib.load("best_model.pkl")
    processed_sample = preprocessor.transform(sample)
    prediction = model.predict(processed_sample)
    return prediction

In [29]:
# Demonstrate inference with 5 validation samples
samples = val_df.sample(5, random_state=42)
X_samples = samples.drop(columns=["Outcome"])
predictions = infer(X_samples)
print("Predicted classes:", predictions)


Predicted classes: [0 0 0 0 1]
