In [1]:
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the chronic kidney disease dataset from its CSV file.
# NOTE: this is an absolute, machine-specific Windows path; adjust it when
# running the notebook elsewhere.
ckd_csv_path = r"D:\Disease prediction\Datasets\Chronic Kidney.csv"
ckd_dataset = pd.read_csv(ckd_csv_path)

In [3]:
# Preview the first rows of the loaded dataset as a quick sanity check.
ckd_dataset.head()

Unnamed: 0,serum_creatinine,gfr,bun,serum_calcium,ana,c3_c4,hematuria,oxalate_levels,urine_ph,blood_pressure,...,smoking,alcohol,painkiller_usage,family_history,weight_changes,stress_level,months,cluster,ckd_pred,ckd_stage
0,0.683683,32.946784,7.553739,10.039896,0,138.204989,0,2.878164,7.864308,115.224217,...,yes,daily,no,yes,stable,low,10,5,CKD,3
1,3.809044,32.685035,141.347494,8.330543,1,24.282343,1,4.767639,4.920015,130.1439,...,yes,daily,no,yes,loss,moderate,1,2,CKD,3
2,1.143827,2.079805,15.979104,9.419229,0,163.970666,0,1.818613,6.188115,98.026072,...,no,daily,no,no,stable,moderate,4,6,CKD,5
3,4.804657,109.871407,53.307333,7.556631,1,71.056846,1,4.051686,5.278607,142.16665,...,no,never,yes,yes,stable,high,9,2,CKD,1
4,4.920235,42.21459,134.182157,7.289379,1,23.384639,1,3.24092,4.862923,151.962572,...,no,occasionally,yes,no,gain,high,7,2,CKD,3


In [4]:
# (number of rows, number of columns) of the raw dataset.
ckd_dataset.shape

(4000, 23)

In [5]:
# Show each column's dtype and non-null count to check for missing values.
ckd_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   serum_creatinine   4000 non-null   float64
 1   gfr                4000 non-null   float64
 2   bun                4000 non-null   float64
 3   serum_calcium      4000 non-null   float64
 4   ana                4000 non-null   int64  
 5   c3_c4              4000 non-null   float64
 6   hematuria          4000 non-null   int64  
 7   oxalate_levels     4000 non-null   float64
 8   urine_ph           4000 non-null   float64
 9   blood_pressure     4000 non-null   float64
 10  physical_activity  4000 non-null   object 
 11  diet               4000 non-null   object 
 12  water_intake       4000 non-null   float64
 13  smoking            4000 non-null   object 
 14  alcohol            4000 non-null   object 
 15  painkiller_usage   4000 non-null   object 
 16  family_history     4000 

In [6]:
# Remove the 'months' and 'cluster' columns before modelling.
# errors='ignore' keeps this cell re-runnable: the result is assigned back to
# the same name, so a second run would otherwise raise KeyError because the
# columns are already gone.
ckd_dataset = ckd_dataset.drop(columns=['months','cluster'], errors='ignore')

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
ckd_dataset.describe()

Unnamed: 0,serum_creatinine,gfr,bun,serum_calcium,ana,c3_c4,hematuria,oxalate_levels,urine_ph,blood_pressure,water_intake,ckd_stage
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,1.551423,51.735449,39.346088,8.53753,0.3,109.757723,0.3,2.624439,6.394566,119.74706,2.505405,2.91425
std,1.217478,34.389322,42.495106,1.403631,0.458315,46.556155,0.458315,1.088755,1.073232,25.218642,0.581993,1.391
min,0.500273,0.021317,7.00092,5.002407,0.0,10.027574,0.0,1.000852,4.500576,90.019471,1.500627,0.0
25%,0.758924,20.263808,11.632573,7.810761,0.0,76.770105,0.0,1.735248,5.248199,100.176525,2.004723,2.0
50%,0.995209,50.054933,16.296667,8.9822,0.0,115.511599,0.0,2.432659,6.580794,111.116236,2.512182,3.0
75%,2.065796,80.258482,67.090486,9.583627,1.0,148.532716,1.0,3.417216,7.305031,138.205563,3.015095,4.0
max,4.994009,119.923482,149.999395,10.199344,1.0,179.96397,1.0,4.999965,7.999886,179.991991,3.499595,5.0


In [8]:
# List the column names currently present in the dataset.
print(ckd_dataset.columns)

Index(['serum_creatinine', 'gfr', 'bun', 'serum_calcium', 'ana', 'c3_c4',
       'hematuria', 'oxalate_levels', 'urine_ph', 'blood_pressure',
       'physical_activity', 'diet', 'water_intake', 'smoking', 'alcohol',
       'painkiller_usage', 'family_history', 'weight_changes', 'stress_level',
       'ckd_pred', 'ckd_stage'],
      dtype='object')


In [9]:
# Separating the features and the targets.
#
# X holds every predictor column; Y holds the two prediction targets
# ("ckd_pred" and "ckd_stage"), in that column order.

X = ckd_dataset.drop(columns=["ckd_pred","ckd_stage"])
# Take an explicit copy: Y's columns are overwritten later when the targets are
# label-encoded, and assigning into a bare column selection of ckd_dataset
# triggers pandas' SettingWithCopyWarning (silenced here by the global
# warnings filter) and leaves it ambiguous whether ckd_dataset is modified.
Y = ckd_dataset[["ckd_pred","ckd_stage"]].copy()

In [10]:
# Inspect the feature frame: column dtypes and non-null counts.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   serum_creatinine   4000 non-null   float64
 1   gfr                4000 non-null   float64
 2   bun                4000 non-null   float64
 3   serum_calcium      4000 non-null   float64
 4   ana                4000 non-null   int64  
 5   c3_c4              4000 non-null   float64
 6   hematuria          4000 non-null   int64  
 7   oxalate_levels     4000 non-null   float64
 8   urine_ph           4000 non-null   float64
 9   blood_pressure     4000 non-null   float64
 10  physical_activity  4000 non-null   object 
 11  diet               4000 non-null   object 
 12  water_intake       4000 non-null   float64
 13  smoking            4000 non-null   object 
 14  alcohol            4000 non-null   object 
 15  painkiller_usage   4000 non-null   object 
 16  family_history     4000 

In [11]:
# Inspect the target frame: column dtypes and non-null counts.
Y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ckd_pred   4000 non-null   object
 1   ckd_stage  4000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 62.6+ KB


In [12]:
# Label-encode the text feature columns of X and both target columns of Y.
#
# Each column gets its own fitted LabelEncoder, stored in `label_encoders`
# under the column name so the same mapping can be reused to encode new
# inputs and to decode predictions.

label_encoders = {}

# The column lists are computed up front, before any column is overwritten.
frames_and_columns = (
    (X, X.select_dtypes(include='object').columns),
    (Y, Y.columns),
)

for frame, columns in frames_and_columns:
    for column_name in columns:
        encoder = LabelEncoder()
        frame[column_name] = encoder.fit_transform(frame[column_name])
        label_encoders[column_name] = encoder

In [13]:
# Standardise the encoded feature matrix.
# The fitted scaler is kept in `scalar` so it can be applied to new samples.

scalar = StandardScaler()
scalar.fit(X)
X_scaled = scalar.transform(X)

In [14]:
# Train/test split (80% train, 20% test, fixed seed for reproducibility).
#
# The unscaled features are split first and the scaler is then re-fitted on the
# training rows only. Fitting the scaler on the full dataset before splitting
# lets test-set statistics leak into preprocessing; re-fitting here keeps the
# held-out rows unseen. The same `scalar` object is reused, so later cells that
# scale new samples or save the scaler get the training-only fit.

X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

scalar.fit(X_train_raw)
X_train = scalar.transform(X_train_raw)
X_test = scalar.transform(X_test_raw)

In [15]:
# Train one random forest per target column (ckd_pred and ckd_stage) by
# wrapping the base estimator in MultiOutputClassifier.
base_forest = RandomForestClassifier(random_state=42)
Ckd_model = MultiOutputClassifier(base_forest)

# Last expression of the cell: fits the model and displays the fitted estimator.
Ckd_model.fit(X_train, Y_train)

0,1,2
,estimator,RandomForestC...ndom_state=42)
,n_jobs,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [16]:
# Smoke-test the trained model on one hand-written patient record.
# Every value is wrapped in a one-element list so the frame has exactly one row.
sample_values = {
    'serum_creatinine': [0.68],
    'gfr': [32.94678],
    'bun': [7.553739],
    'serum_calcium': [10.0399],
    'ana': [0],
    'c3_c4': [138.205],
    'hematuria': [0],
    'oxalate_levels': [2.878164],
    'urine_ph': [7.864308],
    'blood_pressure': [115.2242],
    'physical_activity': ['weekly'],
    'diet': ['high protein'],
    'water_intake': [2.314979],
    'smoking': ['yes'],
    'alcohol': ['daily'],
    'painkiller_usage': ['no'],
    'family_history': ['yes'],
    'weight_changes': ['stable'],
    'stress_level': ['low']
}
sample_input = pd.DataFrame(sample_values)

# Replace each text column with the integer codes produced by the encoder
# that was fitted on the same column of the training data.
categorical_cols = sample_input.select_dtypes(include='object').columns
encoded_columns = {
    name: label_encoders[name].transform(sample_input[name])
    for name in categorical_cols
}
sample_input = sample_input.assign(**encoded_columns)

# Apply the fitted scaler to the whole encoded row.
sample_scaled = scalar.transform(sample_input)

# Predict both targets; the single output row holds (ckd_pred code, ckd_stage code).
prediction = Ckd_model.predict(sample_scaled)
ckd_code, stage_code = prediction[0]

# Map the integer codes back to the original labels.
ckd_result = label_encoders["ckd_pred"].inverse_transform([ckd_code])[0]
ckd_stage = label_encoders["ckd_stage"].inverse_transform([stage_code])[0]

# Note: sample_input is printed after encoding, so text columns show as codes.
print("Sample Input:",sample_input)
print("\nCKD Prediction:", ckd_result)
print("Predicted CKD Stage:", ckd_stage)

Sample Input:    serum_creatinine       gfr       bun  serum_calcium  ana    c3_c4  \
0              0.68  32.94678  7.553739        10.0399    0  138.205   

   hematuria  oxalate_levels  urine_ph  blood_pressure  physical_activity  \
0          0        2.878164  7.864308        115.2242                  2   

   diet  water_intake  smoking  alcohol  painkiller_usage  family_history  \
0     1      2.314979        1        0                 0               1   

   weight_changes  stress_level  
0               2             1  

CKD Prediction: CKD
Predicted CKD Stage: 3


In [18]:
# Persist everything needed to run inference outside this notebook.

# Trained multi-output classifier.
with open("CKD_model.sav", "wb") as f:
    pickle.dump(Ckd_model, f)

# Fitted feature scaler.
with open("Ckd_scalar.sav", "wb") as f:
    pickle.dump(scalar, f)

# Fitted label encoders (dict keyed by column name). Inference needs them to
# encode the categorical inputs and to decode the predicted codes back into
# labels; without this file the saved model cannot be used from a fresh process.
with open("Ckd_label_encoders.sav", "wb") as f:
    pickle.dump(label_encoders, f)


In [None]:
# Reload the pickled classifier from disk to check that the saved file works.
# A `with` block guarantees the file handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("CKD_model.sav", "rb") as f:
    model = pickle.load(f)

In [24]:
# Repeat the single-record prediction, this time with the model reloaded from
# disk (`model`), to confirm the pickled classifier gives a usable result.
raw_sample = {
    'serum_creatinine': [0.68],
    'gfr': [32.94678],
    'bun': [7.553739],
    'serum_calcium': [10.0399],
    'ana': [0],
    'c3_c4': [138.205],
    'hematuria': [0],
    'oxalate_levels': [2.878164],
    'urine_ph': [7.864308],
    'blood_pressure': [115.2242],
    'physical_activity': ['weekly'],
    'diet': ['high protein'],
    'water_intake': [2.314979],
    'smoking': ['yes'],
    'alcohol': ['daily'],
    'painkiller_usage': ['no'],
    'family_history': ['yes'],
    'weight_changes': ['stable'],
    'stress_level': ['low']
}
sample_input = pd.DataFrame(raw_sample)

# Encode the text columns with the encoders fitted during training.
categorical_cols = sample_input.select_dtypes(include='object').columns
for column_name in categorical_cols:
    column_encoder = label_encoders[column_name]
    sample_input[column_name] = column_encoder.transform(sample_input[column_name])

# Apply the fitted scaler to the whole encoded row.
sample_scaled = scalar.transform(sample_input)

# Predict with the reloaded model; row 0 holds (ckd_pred code, ckd_stage code).
prediction = model.predict(sample_scaled)
first_row = prediction[0]

# Translate the integer codes back into the original labels.
ckd_encoder = label_encoders["ckd_pred"]
stage_encoder = label_encoders["ckd_stage"]
ckd_result = ckd_encoder.inverse_transform([first_row[0]])[0]
ckd_stage = stage_encoder.inverse_transform([first_row[1]])[0]

# Note: sample_input is printed after encoding, so text columns show as codes.
print("🔍 Sample Input:",sample_input)
print("\nCKD Prediction:", ckd_result)
print("Predicted CKD Stage:", ckd_stage)

🔍 Sample Input:    serum_creatinine       gfr       bun  serum_calcium  ana    c3_c4  \
0              0.68  32.94678  7.553739        10.0399    0  138.205   

   hematuria  oxalate_levels  urine_ph  blood_pressure  physical_activity  \
0          0        2.878164  7.864308        115.2242                  2   

   diet  water_intake  smoking  alcohol  painkiller_usage  family_history  \
0     1      2.314979        1        0                 0               1   

   weight_changes  stress_level  
0               2             1  

CKD Prediction: CKD
Predicted CKD Stage: 3
