In [27]:
# --- Cell 1: Import Required Libraries ---
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import warnings
warnings.filterwarnings("ignore")

In [28]:
# --- Cell 2: Load Cleaned Data ---
df = pd.read_csv("../Data/cleaned_ckd_data.csv")
print("✅ Cleaned dataset loaded.", df.shape)

✅ Cleaned dataset loaded. (3000, 7)


In [29]:
# --- Cell 3: Define Features and Target ---
X = df[['age', 'sc', 'hemo', 'al', 'bp', 'sg']]
y = df['classification']
print("✅ Features and target selected")

✅ Features and target selected


In [30]:
# --- Cell 4: Train-Test Split ---
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("✅ Data split into train and test sets")

✅ Data split into train and test sets


In [31]:
# --- Cell 5: Define Models and Hyperparameters ---
models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(probability=True),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}

param_grids = {
    "Logistic Regression": {"C": [0.1, 1, 10]},
    "SVM": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]},
    "Random Forest": {"n_estimators": [50, 100]},
    "AdaBoost": {"n_estimators": [50, 100]},
    "Gradient Boosting": {"n_estimators": [50, 100]}
}

In [32]:
# --- Cell 6: Evaluate and Tune Models ---
results = []
best_models = {}

for name in models:
    print(f"\n🔍 Tuning {name}...")
    grid = GridSearchCV(models[name], param_grids[name], cv=5, scoring='f1', n_jobs=-1)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    best_models[name] = best_model

    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)[:, 1] if hasattr(best_model, "predict_proba") else None

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob) if y_prob is not None else 0

    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1 Score": f1,
        "AUC": auc
    })

print("\n✅ All models evaluated.")


🔍 Tuning Logistic Regression...

🔍 Tuning SVM...

🔍 Tuning Random Forest...

🔍 Tuning AdaBoost...

🔍 Tuning Gradient Boosting...

✅ All models evaluated.


In [33]:
# --- Cell 7: Compare and Display Results ---
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="F1 Score", ascending=False).reset_index(drop=True)
print("\n📊 Model Comparison:")
print(results_df)


📊 Model Comparison:
                 Model  Accuracy  Precision  Recall  F1 Score  AUC
0  Logistic Regression       1.0        1.0     1.0       1.0  1.0
1                  SVM       1.0        1.0     1.0       1.0  1.0
2        Random Forest       1.0        1.0     1.0       1.0  1.0
3             AdaBoost       1.0        1.0     1.0       1.0  1.0
4    Gradient Boosting       1.0        1.0     1.0       1.0  1.0


In [34]:
# --- Cell 8: Save the Best Model ---
import joblib
best_model_name = results_df.iloc[0]['Model']
best_model = best_models[best_model_name]
joblib.dump(best_model, "../App/model/model.pkl")
print(f"\n✅ Best model '{best_model_name}' saved to ../App/model/model.pkl")


✅ Best model 'Logistic Regression' saved to ../App/model/model.pkl
