In [10]:
# Importing the Dependencies
# Numerical / tabular libraries (np is not referenced in the cells shown here).
import numpy as np
import pandas as pd
import warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas deprecation warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

# scikit-learn: data splitting, evaluation metrics and the classifier.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Standard library: model serialisation and filesystem paths.
import pickle
import os


In [11]:
# Data Collection & Initial Info

# Load the raw records; the path is resolved relative to the working directory.
data = pd.read_csv('kidney_disease.csv')

print("Original shape:", data.shape)

# info: column names, non-null counts and dtypes (printed directly by pandas)
data.info()

# quick look
# This has to be the final expression of the cell: a bare `data.head()` in the
# middle of a cell is evaluated and then discarded, so nothing is displayed.
data.head()




Original shape: (400, 26)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc       

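`ckd` = chronic kidney disease; `notckd` = no chronic kidney disease. These are the two labels of the `classification` target column.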

In [12]:
# Cleaning 'classification' target & dropping ID column

# map target labels to numbers: ckd -> 1, notckd -> 0
label_codes = {'ckd': 1, 'notckd': 0}

labels = data['classification']
# Only encode while the column still holds raw text, so re-running this cell on
# already-encoded labels is a no-op instead of turning every value into NaN.
if not pd.api.types.is_numeric_dtype(labels):
    # Strip surrounding whitespace so variants such as 'ckd\t' collapse onto
    # the clean label, whatever the stray character happens to be.
    labels = labels.str.strip()

    # Fail loudly on labels we do not know how to encode; map() would
    # otherwise turn them into NaN without any notice.
    unexpected = set(labels.dropna().unique()) - set(label_codes)
    if unexpected:
        raise ValueError(f"Unexpected classification labels: {sorted(unexpected)}")

    # map() builds a genuinely numeric column. replace() on a text column
    # relies on implicit downcasting, which pandas has deprecated.
    data['classification'] = labels.map(label_codes)

# drop id column (errors='ignore' keeps the cell re-runnable once it is gone)
data = data.drop(columns='id', errors='ignore')

print("Unique classification values:", data['classification'].unique())


Unique classification values: [1 0]


In [13]:
# Handling missing values & resetting Index

# Discard every row that has at least one missing value, then renumber the
# remaining rows 0..n-1 so the index is contiguous again.
df = data.dropna(axis=0).reset_index(drop=True)

print(f"After dropping NaN values: {df.shape}")

df.head()


After dropping NaN values: (158, 25)


Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,1
1,53.0,90.0,1.02,2.0,0.0,abnormal,abnormal,present,notpresent,70.0,...,29,12100,3.7,yes,yes,no,poor,no,yes,1
2,63.0,70.0,1.01,3.0,0.0,abnormal,abnormal,present,notpresent,380.0,...,32,4500,3.8,yes,yes,no,poor,yes,no,1
3,68.0,80.0,1.01,3.0,2.0,normal,abnormal,present,present,157.0,...,16,11000,2.6,yes,yes,yes,poor,yes,no,1
4,61.0,80.0,1.015,2.0,0.0,abnormal,abnormal,notpresent,notpresent,173.0,...,24,9200,3.2,yes,yes,yes,poor,yes,yes,1


In [14]:
# Fixing 'wc' weird values and converting dtypes

# These columns hold numeric values stored as text, and some entries carry
# stray whitespace (e.g. '\t6200', '\t8400'). Strip it generically instead of
# listing individual bad values, then parse and cast to the intended dtype.
numeric_dtypes = {'pcv': int, 'wc': int, 'rc': float}

# assign() builds a new frame rather than writing into `df` column by column.
# astype(str) lets the cell be re-run on columns that are already numeric.
# pd.to_numeric raises on anything that is not a number, so bad entries are
# never converted silently.
df = df.assign(**{
    col: pd.to_numeric(df[col].astype(str).str.strip()).astype(dtype)
    for col, dtype in numeric_dtypes.items()
})

print("\nInfo after type conversions:")
df.info()



Info after type conversions:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             158 non-null    float64
 1   bp              158 non-null    float64
 2   sg              158 non-null    float64
 3   al              158 non-null    float64
 4   su              158 non-null    float64
 5   rbc             158 non-null    object 
 6   pc              158 non-null    object 
 7   pcc             158 non-null    object 
 8   ba              158 non-null    object 
 9   bgr             158 non-null    float64
 10  bu              158 non-null    float64
 11  sc              158 non-null    float64
 12  sod             158 non-null    float64
 13  pot             158 non-null    float64
 14  hemo            158 non-null    float64
 15  pcv             158 non-null    int64  
 16  wc              158 non-null    int64  
 17  rc   

In [15]:
# Encoding Categorical Columns

# Per-column text -> 0/1 codes.
mapping_dict = {
    "rbc":   {"abnormal": 1, "normal": 0},
    "pc":    {"abnormal": 1, "normal": 0},
    "pcc":   {"present": 1, "notpresent": 0},
    "ba":    {"present": 1, "notpresent": 0},
    "htn":   {"yes": 1, "no": 0},
    "dm":    {"yes": 1, "no": 0},
    "cad":   {"yes": 1, "no": 0},
    "appet": {"good": 1, "poor": 0},
    "pe":    {"yes": 1, "no": 0},
    "ane":   {"yes": 1, "no": 0}
}

encoded_columns = {}
for column, codes in mapping_dict.items():
    values = df[column]
    # Already encoded (e.g. the cell was re-run): leave the column untouched.
    if pd.api.types.is_numeric_dtype(values):
        continue

    # Whitespace-padded variants (' yes', '\tno', ...) should hit the mapping
    # rather than survive as unencoded strings.
    values = values.str.strip()

    # A value outside the mapping would otherwise become NaN under map();
    # surface it here instead.
    unexpected = set(values.dropna().unique()) - set(codes)
    if unexpected:
        raise ValueError(f"Unexpected values in '{column}': {sorted(unexpected)}")

    # map() yields a numeric column directly; DataFrame.replace() on text
    # columns depends on implicit downcasting, which pandas has deprecated.
    encoded_columns[column] = values.map(codes)

# Build a new frame with the encoded columns in their original positions.
df = df.assign(**encoded_columns)

print("\nHead after encoding:")
df.head()



Head after encoding:


Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,70.0,1.005,4.0,0.0,0,1,1,0,117.0,...,32,6700,3.9,1,0,0,0,1,1,1
1,53.0,90.0,1.02,2.0,0.0,1,1,1,0,70.0,...,29,12100,3.7,1,1,0,0,0,1,1
2,63.0,70.0,1.01,3.0,0.0,1,1,1,0,380.0,...,32,4500,3.8,1,1,0,0,1,0,1
3,68.0,80.0,1.01,3.0,2.0,0,1,1,1,157.0,...,16,11000,2.6,1,1,1,0,1,0,1
4,61.0,80.0,1.015,2.0,0.0,1,1,0,0,173.0,...,24,9200,3.2,1,1,1,0,1,1,1


In [16]:
# Selecting the MOST IMPORTANT & REPORT-AVAILABLE FEATURES

# The order below is the column order the model is trained on.
selected_features = (
    # age (years), blood pressure (mm Hg)
    ['age', 'bp']
    # urine specific gravity, albumin
    + ['sg', 'al']
    # random blood glucose (mg/dL), blood urea (mg/dL), serum creatinine (mg/dL),
    # hemoglobin (g/dL), white blood cell count
    + ['bgr', 'bu', 'sc', 'hemo', 'wc']
    # 0/1 flags: hypertension, diabetes, anemia
    + ['htn', 'dm', 'ane']
)

# Feature matrix and target vector.
X = df.loc[:, selected_features]
y = df.loc[:, 'classification']

print(f"X Shape: {X.shape}")
print(f"Y Shape: {y.shape}")
print(f"Features:\n {X.columns}")


X Shape: (158, 12)
Y Shape: (158,)
Features:
 Index(['age', 'bp', 'sg', 'al', 'bgr', 'bu', 'sc', 'hemo', 'wc', 'htn', 'dm',
       'ane'],
      dtype='object')


In [17]:
# Train–Test Split

# Hold out 20% of the rows for testing. The split is stratified on y and uses a
# fixed random_state so it can be repeated.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y, test_size=0.2
)

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")


Train shape: (126, 12)
Test shape: (32, 12)


In [18]:
# Model Training — Random Forest

# A 350-tree forest with a fixed seed. The tree-shape arguments are spelled out
# explicitly: no depth limit, split on >= 2 samples, leaves of >= 1 sample.
kidney_model = RandomForestClassifier(
    random_state=42,
    n_estimators=350,
    min_samples_leaf=1,
    min_samples_split=2,
    max_depth=None,
)

# Fit on the training split only; the call stays the last expression so the
# notebook shows the fitted estimator.
kidney_model.fit(X_train, y_train)


In [19]:
# Model Evaluation
# Accuracy Score & Confusion Matrix

# Training split: predictions and accuracy.
y_train_pred = kidney_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

# Test split: predictions, accuracy and confusion matrix.
y_test_pred = kidney_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
cm = confusion_matrix(y_test, y_test_pred)

print("\nConfusion Matrix:\n", cm)
print("Accuracy on Training data : {}%".format(round(train_accuracy * 100, 2)))
print("Accuracy on Test data     : {}%".format(round(test_accuracy * 100, 2)))



Confusion Matrix:
 [[23  0]
 [ 0  9]]
Accuracy on Training data : 100.0%
Accuracy on Test data     : 100.0%


In [20]:
# Saving the Trained Model

# ensure saved_models exists (go one folder up like other models)
# exist_ok=True avoids the check-then-create race of exists() + makedirs().
save_dir = os.path.join('..', 'saved_models')
os.makedirs(save_dir, exist_ok=True)

filename = os.path.join(save_dir, 'kidney_model.sav')
# The context manager closes (and therefore flushes) the file before it is
# read back below.
with open(filename, 'wb') as model_file:
    pickle.dump(kidney_model, model_file)
print("\nModel saved at:", filename)

# verify
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook or another trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
print("\nLoaded model expects features:", loaded_model.n_features_in_)
print("Feature order:")
# Prefer the names stored in the loaded model so this really checks what was
# saved; fall back to X.columns if the estimator does not expose them.
for col in getattr(loaded_model, 'feature_names_in_', X.columns):
    print(col)



Model saved at: ..\saved_models\kidney_model.sav

Loaded model expects features: 12
Feature order:
age
bp
sg
al
bgr
bu
sc
hemo
wc
htn
dm
ane
