In [14]:
import pandas as pd
import numpy as np

# preprocessing tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# For model evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [15]:
df = pd.read_csv("kidney_disease.csv")
df.head()
## binary classification problem

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [16]:

important_columns = ['age', 'bp', 'sg', 'al', 'hemo', 'sc','htn','dm','cad','appet','pc','classification']
df = df[important_columns]
df

Unnamed: 0,age,bp,sg,al,hemo,sc,htn,dm,cad,appet,pc,classification
0,48.0,80.0,1.020,1.0,15.4,1.2,yes,yes,no,good,normal,ckd
1,7.0,50.0,1.020,4.0,11.3,0.8,no,no,no,good,normal,ckd
2,62.0,80.0,1.010,2.0,9.6,1.8,no,yes,no,poor,normal,ckd
3,48.0,70.0,1.005,4.0,11.2,3.8,yes,no,no,poor,abnormal,ckd
4,51.0,80.0,1.010,2.0,11.6,1.4,no,no,no,good,normal,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,15.7,0.5,no,no,no,good,normal,notckd
396,42.0,70.0,1.025,0.0,16.5,1.2,no,no,no,good,normal,notckd
397,12.0,80.0,1.020,0.0,15.8,0.6,no,no,no,good,normal,notckd
398,17.0,60.0,1.025,0.0,14.2,1.0,no,no,no,good,normal,notckd


In [17]:
# Clean the dataset
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].str.strip().str.replace('\t', '', regex=True)

print(df['cad'].value_counts())
print(df['dm'].value_counts())
print(df['classification'].value_counts())

cad
no     364
yes     34
Name: count, dtype: int64
dm
no     261
yes    137
Name: count, dtype: int64
classification
ckd       250
notckd    150
Name: count, dtype: int64


In [18]:

df.isnull().sum()

age                9
bp                12
sg                47
al                46
hemo              52
sc                17
htn                2
dm                 2
cad                2
appet              1
pc                65
classification     0
dtype: int64

In [19]:
# Fill missing values with appropriate methods
df['age'].fillna(df['age'].median(), inplace=True)         # Numerical → median
df['bp'].fillna(df['bp'].median(), inplace=True)           # Numerical → median
df['sg'].fillna(df['sg'].mode()[0], inplace=True)          # Categorical (discrete numeric) → mode
df['al'].fillna(df['al'].mode()[0], inplace=True)          # Categorical (discrete numeric) → mode
df['hemo'].fillna(df['hemo'].median(), inplace=True)       # Numerical → median
df['sc'].fillna(df['sc'].median(), inplace=True)           # Numerical → median
df['htn'].fillna(df['htn'].mode()[0], inplace=True)        # Categorical (yes/no) → mode
df['dm'].fillna(df['dm'].mode()[0], inplace=True)          # Categorical (yes/no) → mode
df['cad'].fillna(df['cad'].mode()[0], inplace=True)        # Categorical (yes/no) → mode
df['appet'].fillna(df['appet'].mode()[0], inplace=True)    # Categorical → mode
df['pc'].fillna(df['pc'].mode()[0], inplace=True)          # Categorical → mode

df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].median(), inplace=True)         # Numerical → median
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bp'].fillna(df['bp'].median(), inplace=True)           # Numerical → median
The behavior will change in pandas 3.0. This inplace method will never w

age               0
bp                0
sg                0
al                0
hemo              0
sc                0
htn               0
dm                0
cad               0
appet             0
pc                0
classification    0
dtype: int64

In [20]:
# manullay
df['htn'] = df['htn'].map({'yes':1, "no":0})
df['dm'] = df['dm'].map({'yes':1, "no":0})
df['cad'] = df['cad'].map({'yes':1, "no":0})
df['appet'] = df['appet'].map({'good':1, "poor":0})
df['pc'] = df['pc'].map({'normal':1, "abnormal":0})
df['classification'] = df['classification'].map({'ckd':1, "notckd":0})

df

Unnamed: 0,age,bp,sg,al,hemo,sc,htn,dm,cad,appet,pc,classification
0,48.0,80.0,1.020,1.0,15.4,1.2,1,1,0,1,1,1
1,7.0,50.0,1.020,4.0,11.3,0.8,0,0,0,1,1,1
2,62.0,80.0,1.010,2.0,9.6,1.8,0,1,0,0,1,1
3,48.0,70.0,1.005,4.0,11.2,3.8,1,0,0,0,0,1
4,51.0,80.0,1.010,2.0,11.6,1.4,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,15.7,0.5,0,0,0,1,1,0
396,42.0,70.0,1.025,0.0,16.5,1.2,0,0,0,1,1,0
397,12.0,80.0,1.020,0.0,15.8,0.6,0,0,0,1,1,0
398,17.0,60.0,1.025,0.0,14.2,1.0,0,0,0,1,1,0


In [21]:
# Select numeric columns to normalize
numeric_cols = ['age', 'bp', 'sg', 'al', 'hemo', 'sc']

# Initialize scaler
scaler = MinMaxScaler()

# Fit-transform the numeric columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

df.head()

Unnamed: 0,age,bp,sg,al,hemo,sc,htn,dm,cad,appet,pc,classification
0,0.522727,0.230769,0.75,0.2,0.836735,0.010582,1,1,0,1,1,1
1,0.056818,0.0,0.75,0.8,0.557823,0.005291,0,0,0,1,1,1
2,0.681818,0.230769,0.25,0.4,0.442177,0.018519,0,1,0,0,1,1
3,0.522727,0.153846,0.0,0.8,0.55102,0.044974,1,0,0,0,0,1
4,0.556818,0.230769,0.25,0.4,0.578231,0.013228,0,0,0,1,1,1


In [22]:
from imblearn.over_sampling import SMOTE

X = df.drop('classification', axis=1)
y = df['classification']


smote = SMOTE(random_state=42)

X_balanced, y_balanced = smote.fit_resample(X, y)

y_balanced.value_counts()

classification
1    250
0    250
Name: count, dtype: int64

In [23]:
# Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

# Check the shape
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (400, 11)
Test shape: (100, 11)


In [24]:

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Classifier": SVC(),
    "Random Forest Classifier": RandomForestClassifier(),
    "K Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
}


# Train and evaluate each model
for name, model in models.items():
    print("="*50)
    print("Model:", name)
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on test set
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Print metrics
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_rep)
    print("Confusion Matrix:\n", conf_matrix)
    
    

Model: Logistic Regression
Accuracy: 0.96
Classification Report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.96        54
           1       1.00      0.91      0.95        46

    accuracy                           0.96       100
   macro avg       0.97      0.96      0.96       100
weighted avg       0.96      0.96      0.96       100

Confusion Matrix:
 [[54  0]
 [ 4 42]]
Model: Support Vector Classifier
Accuracy: 0.97
Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97        54
           1       1.00      0.93      0.97        46

    accuracy                           0.97       100
   macro avg       0.97      0.97      0.97       100
weighted avg       0.97      0.97      0.97       100

Confusion Matrix:
 [[54  0]
 [ 3 43]]
Model: Random Forest Classifier
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

       

In [25]:
model_gbc = GradientBoostingClassifier()

model_gbc.fit(X_train,y_train)

y_pred = model_gbc.predict(X_test)



print("confusion matrix \n: ", confusion_matrix(y_test,y_pred))
print("classification report \n: ", classification_report(y_test, y_pred))

confusion matrix 
:  [[54  0]
 [ 1 45]]
classification report 
:                precision    recall  f1-score   support

           0       0.98      1.00      0.99        54
           1       1.00      0.98      0.99        46

    accuracy                           0.99       100
   macro avg       0.99      0.99      0.99       100
weighted avg       0.99      0.99      0.99       100



In [26]:
import os

os.makedirs("models", exist_ok=True)


In [27]:

import pickle

pickle.dump(scaler, open("models/scaler.pkl",'wb'))
pickle.dump(model_gbc,open("models/model_gbc.pkl",'wb'))

In [28]:
# Load the encoder, scaler, and trained model from saved files
scaler = pickle.load(open("models/scaler.pkl", 'rb'))  # Load the scaler
model_gbc = pickle.load(open("models/model_gbc.pkl", 'rb'))  # Load the trained model

def predict_chronic_disease(age, bp, sg, al, hemo, sc, htn, dm, cad, appet, pc):
    # Create a DataFrame with input variables, following the correct order
    df_dict = {
        'age': [age],
        'bp': [bp],
        'sg': [sg],
        'al': [al],
        'hemo': [hemo],
        'sc': [sc],
        'htn': [htn],
        'dm': [dm],
        'cad': [cad],
        'appet': [appet],
        'pc': [pc]
    }
    df = pd.DataFrame(df_dict)

    # Encode the categorical columns
    df['htn'] = df['htn'].map({'yes':1, "no":0})
    df['dm'] = df['dm'].map({'yes':1, "no":0})
    df['cad'] = df['cad'].map({'yes':1, "no":0})
    df['appet'] = df['appet'].map({'good':1, "poor":0})
    df['pc'] = df['pc'].map({'normal':1, "abnormal":0})

    # Scale the numeric columns using the previously fitted scaler
    numeric_cols = ['age', 'bp', 'sg', 'al', 'hemo', 'sc']
    df[numeric_cols] = scaler.transform(df[numeric_cols])

    # Make the prediction
    prediction = model_gbc.predict(df)

    # Return the predicted class
    return prediction[0]

# Example usage:
result = predict_chronic_disease(age=30, bp=20, sg=1.020, al=1.0, hemo=15.4, sc=1.2, htn="no", dm="no", cad='no', appet='good', pc='normal')

if result == 1:
    print("The Patient Has CKD....")
else:
    print("The Patient Has not CKD....")

The Patient Has not CKD....


In [29]:
# Example usage with higher chance of CKD
result = predict_chronic_disease(
    age=65,    
    bp=160,     
    sg=1.030,   
    al=3.0,     
    hemo=9.0,   
    sc=2.0,     
    htn="yes",  
    dm="yes",       
    cad="yes",      
    appet="poor",   
    pc='abnormal'  
)


if result == 1:
    print("The Patient Has CKD....")
else:
    print("The Patient Has not CKD....")

The Patient Has CKD....


In [30]:
import sklearn

print(sklearn.__version__)

1.7.2


In [32]:
import os
import pickle

# Make sure models folder exists
os.makedirs("models", exist_ok=True)

# SAVE THE SCALER (this is missing in your case)
pickle.dump(scaler, open("models/scaler.pkl", "wb"))

# SAVE THE MODEL YOU JUST TRAINED
pickle.dump(model_gbc, open("models/model_gbc.pkl", "wb"))

print("✅ scaler.pkl and model_gbc.pkl saved successfully")


✅ scaler.pkl and model_gbc.pkl saved successfully
