In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier


In [2]:
#Reading the dataset
data = pd.read_csv("kidney_disease.csv")
data.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [3]:
#getting the shape of the dataset
data.shape

(400, 26)

In [5]:
data.columns

Index(['id', 'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
       'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [4]:
#getting the information about the dataset contents
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

# Data Preprocessing


In [6]:
categorial_cols = [col for col in data.columns if data[col].dtype=="object"]
categorial_cols

['rbc',
 'pc',
 'pcc',
 'ba',
 'pcv',
 'wc',
 'rc',
 'htn',
 'dm',
 'cad',
 'appet',
 'pe',
 'ane',
 'classification']

In [7]:
numerical_cols = [x for x in data.columns if not x in categorial_cols]
numerical_cols

['id', 'age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo']

In [8]:
for i in ['rc','wc','pcv']:
    data[i] = data[i].str.extract('(\d+)').astype(float)

## Simple Imputing

In [9]:
#filling the null values with the mean values 
for i in ['age','bp','sg','al','su','bgr','bu','sc','sod','pot','hemo','rc','wc','pcv']:
    data[i].fillna(data[i].mean(),inplace=True)

## OneHot Encoding

In [10]:
#converting the categorial data by using oneHot Encoding
rbc = pd.get_dummies(data[["rbc"]],drop_first=True)
rbc.head()

Unnamed: 0,rbc_normal
0,0
1,0
2,1
3,1
4,1


In [11]:
pc = pd.get_dummies(data[["pc"]],drop_first=True)
pc.head()

Unnamed: 0,pc_normal
0,1
1,1
2,1
3,0
4,1


In [12]:
pcc = pd.get_dummies(data[["pcc"]],drop_first=True)
pcc.head()

Unnamed: 0,pcc_present
0,0
1,0
2,0
3,1
4,0


In [13]:
ba = pd.get_dummies(data[["ba"]],drop_first=True)
ba.head()

Unnamed: 0,ba_present
0,0
1,0
2,0
3,0
4,0


In [14]:
#dropping the categorial data columns
data.drop(["rbc","pc","pcc","ba"],axis=1,inplace=True)

In [15]:
data.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,137.528754,...,44.0,7800.0,5.0,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,148.036517,18.0,0.8,137.528754,...,38.0,6000.0,4.241636,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,137.528754,...,31.0,7500.0,4.241636,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,...,32.0,6700.0,3.0,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,137.528754,...,35.0,7300.0,4.0,no,no,no,good,no,no,ckd


In [16]:
#concating the data columns
data = pd.concat([data,rbc,pc,pcc,ba],axis=1)

In [17]:
data.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,dm,cad,appet,pe,ane,classification,rbc_normal,pc_normal,pcc_present,ba_present
0,0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,137.528754,...,yes,no,good,no,no,ckd,0,1,0,0
1,1,7.0,50.0,1.02,4.0,0.0,148.036517,18.0,0.8,137.528754,...,no,no,good,no,no,ckd,0,1,0,0
2,2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,137.528754,...,yes,no,poor,no,yes,ckd,1,1,0,0
3,3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,...,no,no,poor,yes,yes,ckd,1,0,1,0
4,4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,137.528754,...,no,no,good,no,no,ckd,1,1,0,0


In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             400 non-null    float64
 2   bp              400 non-null    float64
 3   sg              400 non-null    float64
 4   al              400 non-null    float64
 5   su              400 non-null    float64
 6   bgr             400 non-null    float64
 7   bu              400 non-null    float64
 8   sc              400 non-null    float64
 9   sod             400 non-null    float64
 10  pot             400 non-null    float64
 11  hemo            400 non-null    float64
 12  pcv             400 non-null    float64
 13  wc              400 non-null    float64
 14  rc              400 non-null    float64
 15  htn             398 non-null    object 
 16  dm              398 non-null    object 
 17  cad             398 non-null    obj

In [19]:
data["classification"].value_counts()

ckd       248
notckd    150
ckd\t       2
Name: classification, dtype: int64

In [20]:
#replacing the values of notckd, ckd and ckd/t in the dataset
data.replace({"notckd":0,"ckd":1,"ckd\t":1},inplace=True)

In [21]:
data["appet"].value_counts()

good    317
poor     82
Name: appet, dtype: int64

In [22]:
data.replace({"good":1,"poor":0},inplace=True)

In [23]:
data["ane"].value_counts()

no     339
yes     60
Name: ane, dtype: int64

In [24]:
#replacing the values of no, yes to 0,1 respectively
data.replace({"no":0,"yes":1,"\tno":0,"\tyes":1," yes":1},inplace=True)

In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             400 non-null    float64
 2   bp              400 non-null    float64
 3   sg              400 non-null    float64
 4   al              400 non-null    float64
 5   su              400 non-null    float64
 6   bgr             400 non-null    float64
 7   bu              400 non-null    float64
 8   sc              400 non-null    float64
 9   sod             400 non-null    float64
 10  pot             400 non-null    float64
 11  hemo            400 non-null    float64
 12  pcv             400 non-null    float64
 13  wc              400 non-null    float64
 14  rc              400 non-null    float64
 15  htn             398 non-null    float64
 16  dm              398 non-null    float64
 17  cad             398 non-null    flo

In [26]:
#if still null values present then replacing the null value with the most frequent value in the column
data=data.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [27]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             400 non-null    float64
 2   bp              400 non-null    float64
 3   sg              400 non-null    float64
 4   al              400 non-null    float64
 5   su              400 non-null    float64
 6   bgr             400 non-null    float64
 7   bu              400 non-null    float64
 8   sc              400 non-null    float64
 9   sod             400 non-null    float64
 10  pot             400 non-null    float64
 11  hemo            400 non-null    float64
 12  pcv             400 non-null    float64
 13  wc              400 non-null    float64
 14  rc              400 non-null    float64
 15  htn             400 non-null    float64
 16  dm              400 non-null    float64
 17  cad             400 non-null    flo

# Splitting Train Data and Test Data

In [28]:
#getting the columns in the dataset
data.columns

Index(['id', 'age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot',
       'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane',
       'classification', 'rbc_normal', 'pc_normal', 'pcc_present',
       'ba_present'],
      dtype='object')

In [30]:
#seperating the data for the model as X contains the data which feed to the model and y contains the target column 
X = data.loc[:,['age', 'bp', 'rc','wc','appet','pc_normal','htn','hemo','bgr','dm','ane']]
y = data["classification"]

In [31]:
#splitting the train data and test Data
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state = 0)
X_train.head()

Unnamed: 0,age,bp,rc,wc,appet,pc_normal,htn,hemo,bgr,dm,ane
336,25.0,60.0,5.0,9200.0,1.0,1,0.0,15.2,119.0,0.0,0.0
64,55.0,80.0,4.241636,8406.122449,1.0,1,0.0,9.8,146.0,0.0,0.0
55,35.0,80.0,4.241636,8406.122449,1.0,1,0.0,9.5,148.036517,0.0,0.0
106,50.0,90.0,4.241636,6500.0,1.0,0,1.0,6.0,89.0,1.0,1.0
300,45.0,60.0,5.0,9200.0,1.0,1,0.0,15.0,114.0,0.0,0.0


# MODELS:

## Random Forest

In [32]:
#using the random forest classifier
model = RandomForestClassifier()
model.fit(X_train,y_train)

RandomForestClassifier()

In [33]:
#getting the predictions using the trained model
predictions = model.predict(X_test)

In [34]:
#checking the mean absolute error between the predicted values and test data
print("the mean absolute error by using the RandomForest is",mean_absolute_error(y_test,predictions))

the mean absolute error by using the RandomForest is 0.025


In [35]:
#printing the accuracy of the train data
print("the accuracy of the train data is ",model.score(X_train,y_train)*100)

the accuracy of the train data is  100.0


In [36]:
#printing the accuracy of the test data
print("the accuracy of the test data is",model.score(X_test,y_test)*100)

the accuracy of the test data is 97.5


## XGradient Boost

In [37]:
#using the XGradient Boosting algorithm
mod = XGBClassifier()
mod.fit(X_train,y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [38]:
#checking the mean absolute error between the predicted values and test data
print(mean_absolute_error(y_test,mod.predict(X_test)))

0.0375


  "memory consumption")


In [39]:
#printing the accuracy of the train data
print("the accuracy of the train data is ",mod.score(X_train,y_train)*100)

the accuracy of the train data is  100.0


  "memory consumption")


In [40]:
#printing the accuracy of the test data
print("the accuracy of the test data is",mod.score(X_test,y_test)*100)

the accuracy of the test data is 96.25


  "memory consumption")


## Support Vector Machine

In [41]:
model2 = SVC()
model2.fit(X_train,y_train)

SVC()

In [42]:
print("the mean absolute error is",mean_absolute_error(y_test,model2.predict(X_test)))

the mean absolute error is 0.35


In [43]:
#printing the accuracy of the train data
print("the accuracy of the train data is ",model2.score(X_train,y_train)*100)

the accuracy of the train data is  61.875


In [45]:
#printing the accuracy of the test data
print("the accuracy of the test data is",model2.score(X_test,y_test)*100)

the accuracy of the test data is 65.0


# Dumping the best model into the pickle
            from the above three models we get to know that Random Forest gives the best accuracy as compared with remaining two models. So, we use Random Forest for this project.

In [48]:
import pickle
file = open("mainBookpickle.pkl","wb")
pickle.dump(model,file)

In [50]:
#checking the test accuracy with the model in the pickle file
mod1 = pickle.load(open("mainBookpickle.pkl","rb"))
print(mod1.score(X_test,y_test)*100)

97.5
