In [3]:
import pandas as pd
df = pd.read_csv("D:\ml-portfolio-project\Chronic_Kidney_disease_dataset (1).csv")
df.shape, df.columns
display(df.head())


  df = pd.read_csv("D:\ml-portfolio-project\Chronic_Kidney_disease_dataset (1).csv")


Unnamed: 0,age,gender,blood_pressure,specific_gravity,albumin,sugar,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,...,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,anemia,pedal_edema,ckd
0,62,female,50.0,1.02,4,4,normal,absent,absent,98.1,...,38.3,10132,5.8,yes,yes,yes,good,yes,no,1
1,44,female,90.4,1.01,1,0,normal,absent,absent,186.2,...,41.0,6703,6.0,yes,yes,no,good,yes,no,0
2,20,male,65.1,1.01,0,0,normal,absent,absent,201.3,...,36.5,6712,5.6,yes,no,no,good,no,no,0
3,66,female,99.1,1.025,2,0,normal,absent,absent,151.9,...,38.1,5327,5.7,no,yes,yes,good,yes,no,0
4,54,male,70.2,1.02,1,0,normal,absent,absent,163.7,...,31.4,6573,3.9,yes,yes,no,good,yes,no,0


In [15]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import joblib

# --- Path to your dataset ---
CSV_PATH = r"D:\ml-portfolio-project\Chronic_Kidney_disease_dataset (1).csv"

def load_and_clean(path=CSV_PATH):
    df = pd.read_csv(path)

    # Replace common missing markers with NaN
    df.replace(['?', ' ', 'NA', 'N/A', 'na', 'nan'], np.nan, inplace=True)

    # Convert numeric-like columns
    numeric_cols = [
        "age","blood_pressure","specific_gravity","albumin","sugar",
        "blood_glucose_random","blood_urea","serum_creatinine","sodium",
        "potassium","hemoglobin","packed_cell_volume",
        "white_blood_cell_count","red_blood_cell_count"
    ]
    for c in numeric_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce')

    # Normalize gender values
    if "gender" in df.columns:
        df['gender'] = df['gender'].str.lower().str.strip()
        df['gender'] = df['gender'].replace({
            'male': 'male', 'm': 'male',
            'female': 'female', 'f': 'female'
        })

    return df

def build_and_train(df):
    target = "ckd"
    assert target in df.columns, "Target column 'ckd' not found"

    X = df.drop(columns=[target])
    y = df[target].astype(int)

    # Identify numeric & categorical features
    numeric_features = X.select_dtypes(include=['int64','float64']).columns.tolist()
    categorical_features = X.select_dtypes(include=['object','bool']).columns.tolist()

    # Transformers
    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))


    ])

    preprocessor = ColumnTransformer(transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ])

    clf = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42, n_jobs=-1))
    ])

    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Grid search
    param_grid = {
        "classifier__n_estimators": [100, 200],
        "classifier__max_depth": [None, 10, 20],
    }
    grid = GridSearchCV(
        clf, param_grid, cv=5, n_jobs=-1,
        scoring='roc_auc', verbose=1
    )
    grid.fit(X_train, y_train)

    best = grid.best_estimator_
    y_pred = best.predict(X_test)
    y_prob = best.predict_proba(X_test)[:, 1]

    print("Best params:", grid.best_params_)
    print(classification_report(y_test, y_pred))
    print("ROC AUC:", roc_auc_score(y_test, y_prob))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

    # Save trained model next to notebook
    joblib.dump(best, "model.joblib")
    print("✅ Model saved as model.joblib")

# --- Run training ---
df = load_and_clean()
build_and_train(df)


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best params: {'classifier__max_depth': 20, 'classifier__n_estimators': 200}
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      5006
           1       0.97      0.84      0.90       994

    accuracy                           0.97      6000
   macro avg       0.97      0.92      0.94      6000
weighted avg       0.97      0.97      0.97      6000

ROC AUC: 0.9125443833596868
Confusion matrix:
 [[4979   27]
 [ 162  832]]
✅ Model saved as model.joblib
