# Chronic Kidney Disease Analysis And Prediction Model


**The task involves _analysis_ of key factors (attributes) influencing the presence of chronic kidney disease and _building a prediction model_ to predict the presence of chronic kidney disease based on provided attributes in new patients.**

#### Given information:
    1. Data set containing 24 attributes required to determine presence of chronic kindney disease- kidneyChronic.csv.

## Importing required libraries

In [424]:
import pandas as pd
import numpy as np

## Importing data set

In [425]:
df = pd.read_csv('kidneyChronic.csv')

## Imputing 'NaN' values in place of '?'

In [426]:
df.replace('?', np.nan, inplace=True)
df.isnull().sum()

age        9
bp        12
sg        47
al        46
su        49
rbc      152
pc        65
pcc        4
ba         4
bgr       44
bu        19
sc        17
sod       87
pot       88
hemo      52
pcv       70
wbcc     105
rbcc     130
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
class      0
dtype: int64

## Handling 'NaN' and 'Bad' values (Numerical attributes)

In [427]:
# Numerical columns
num_int_cols = ['age', 'bp', 'al', 'su', 'bgr', 'bu', 'sod', 'pcv', 'wbcc']
num_float_cols = ['sg', 'sc', 'pot', 'hemo', 'rbcc']

In [428]:
# Convert from 'object' dtype and assign 'NaN' for invalid parsed values
df[num_int_cols] = df[num_int_cols].apply(pd.to_numeric, errors='coerce')
df[num_float_cols] = df[num_float_cols].apply(pd.to_numeric, errors='coerce')

# Replacing 'NaN' values with arithmetic mean and converting back to 'int' dtype
for col in num_int_cols:
    df[col]=df[col].fillna(np.mean(pd.factorize(col)[0]))
    df[col]=df[col].astype(int)

# Replacing 'NaN' values with arithmetic mean and keeping values as-is for 'float' dtype
for col in num_float_cols:
    df[col]=df[col].fillna(np.mean(pd.factorize(col)[0]))

In [429]:
# Finding 'NaN' values in data frame
df.isnull().sum()

age        0
bp         0
sg         0
al         0
su         0
rbc      152
pc        65
pcc        4
ba         4
bgr        0
bu         0
sc         0
sod        0
pot        0
hemo       0
pcv        0
wbcc       0
rbcc       0
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
class      0
dtype: int64

## Handling 'NaN' values (Categrorical attributes)

In [430]:
# Print the value counts for 'rbc'
print(df['rbc'].value_counts(dropna=False))

normal      201
NaN         152
abnormal     47
Name: rbc, dtype: int64


In [431]:
# Impute 'NaN' values for 'rcb' column with most frequent value, i.e. 'normal'
df['rbc']=df['rbc'].fillna('normal')
# Print value counts for 'rbc' again
print(df['rbc'].value_counts(dropna=False))

normal      353
abnormal     47
Name: rbc, dtype: int64


In [432]:
# Print the value counts for 'pc'
print(df['pc'].value_counts(dropna=False))

normal      259
abnormal     76
NaN          65
Name: pc, dtype: int64


In [433]:
# Impute 'NaN' values for 'pc' column with most frequent value, i.e. 'normal'
df['pc']=df['pc'].fillna('normal')
# Print value counts for 'pc' again
print(df['pc'].value_counts(dropna=False))

normal      324
abnormal     76
Name: pc, dtype: int64


In [434]:
# Print the value counts for 'pcc'
print(df['pcc'].value_counts(dropna=False))

notpresent    354
present        42
NaN             4
Name: pcc, dtype: int64


In [435]:
# Impute 'NaN' values for 'pcc' column with most frequent value, i.e. 'notpresent'
df['pcc']=df['pcc'].fillna('notpresent')
# Print value counts for 'pcc' again
print(df['pcc'].value_counts(dropna=False))

notpresent    358
present        42
Name: pcc, dtype: int64


In [436]:
# Print the value counts for 'ba'
print(df['ba'].value_counts(dropna=False))

notpresent    374
present        22
NaN             4
Name: ba, dtype: int64


In [437]:
# Impute 'NaN' values for 'ba' column with most frequent value, i.e. 'notpresent'
df['ba']=df['ba'].fillna('notpresent')
# Print value counts for 'ba' again
print(df['ba'].value_counts(dropna=False))

notpresent    378
present        22
Name: ba, dtype: int64


In [438]:
# Print the value counts for 'htn'
print(df['htn'].value_counts(dropna=False))

no     251
yes    147
NaN      2
Name: htn, dtype: int64


In [439]:
# Impute 'NaN' values for 'htn' column with most frequent value, i.e. 'no'
df['htn']=df['htn'].fillna('no')
# Print value counts for 'htn' again
print(df['htn'].value_counts(dropna=False))

no     253
yes    147
Name: htn, dtype: int64


In [440]:
# Print the value counts for 'dm'
print(df['dm'].value_counts(dropna=False))

no       258
yes      134
\tno       3
\tyes      2
NaN        2
 yes       1
Name: dm, dtype: int64


In [441]:
# Impute 'NaN' values for 'dm' column with most frequent value, i.e. 'no'
df['dm']=df['dm'].fillna('no')
# Replacing '\t' and ' ' with empty string value ''
df['dm'].replace(['\t',' '], '', regex=True, inplace= True)
# Print value counts for 'dm' again
print(df['dm'].value_counts(dropna=False))

no     263
yes    137
Name: dm, dtype: int64


In [442]:
# Print the value counts for 'cad'
print(df['cad'].value_counts(dropna=False))

no      362
yes      34
\tno      2
NaN       2
Name: cad, dtype: int64


In [443]:
# Impute 'NaN' values for 'cad' column with most frequent value, i.e. 'no'
df['cad']=df['cad'].fillna('no')
# Replacing '\t' and ' ' with empty string value ''
df['cad'].replace(['\t'], '', regex=True, inplace= True)
# Print value counts for 'cad' again
print(df['cad'].value_counts(dropna=False))

no     366
yes     34
Name: cad, dtype: int64


In [444]:
# Print the value counts for 'appet'
print(df['appet'].value_counts(dropna=False))

good    317
poor     82
NaN       1
Name: appet, dtype: int64


In [445]:
# Impute 'NaN' values for 'appet' column with most frequent value, i.e. 'good'
df['appet']=df['appet'].fillna('good')
# Print value counts for 'appet' again
print(df['appet'].value_counts(dropna=False))

good    318
poor     82
Name: appet, dtype: int64


In [446]:
# Print the value counts for 'pe'
print(df['pe'].value_counts(dropna=False))

no     323
yes     76
NaN      1
Name: pe, dtype: int64


In [447]:
# Impute 'NaN' values for 'pe' column with most frequent value, i.e. 'no'
df['pe']=df['pe'].fillna('no')
# Print value counts for 'pe' again
print(df['pe'].value_counts(dropna=False))

no     324
yes     76
Name: pe, dtype: int64


In [448]:
# Print the value counts for 'ane'
print(df['ane'].value_counts(dropna=False))

no     339
yes     60
NaN      1
Name: ane, dtype: int64


In [449]:
# Impute 'NaN' values for 'ane' column with most frequent value, i.e. 'no'
df['ane']=df['ane'].fillna('no')
# Print value counts for 'ane' again
print(df['ane'].value_counts(dropna=False))

no     340
yes     60
Name: ane, dtype: int64


In [450]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
age      400 non-null int32
bp       400 non-null int32
sg       400 non-null float64
al       400 non-null int32
su       400 non-null int32
rbc      400 non-null object
pc       400 non-null object
pcc      400 non-null object
ba       400 non-null object
bgr      400 non-null int32
bu       400 non-null int32
sc       400 non-null float64
sod      400 non-null int32
pot      400 non-null float64
hemo     400 non-null float64
pcv      400 non-null int32
wbcc     400 non-null int32
rbcc     400 non-null float64
htn      400 non-null object
dm       400 non-null object
cad      400 non-null object
appet    400 non-null object
pe       400 non-null object
ane      400 non-null object
class    400 non-null object
dtypes: float64(5), int32(9), object(11)
memory usage: 46.9+ KB
None


In [451]:
# Check for presence of null values
df.isnull().sum()

age      0
bp       0
sg       0
al       0
su       0
rbc      0
pc       0
pcc      0
ba       0
bgr      0
bu       0
sc       0
sod      0
pot      0
hemo     0
pcv      0
wbcc     0
rbcc     0
htn      0
dm       0
cad      0
appet    0
pe       0
ane      0
class    0
dtype: int64

## Separating feature variables and target variable

In [452]:
X=df.drop('class',axis='columns')
Y=df['class']

## Handling categorical columns in feature variable

Many machine learning model not accept categorical column in feature variable so we will convert categorical column in feature variable in numerical variable by creating dummy variables

In [453]:
X=pd.get_dummies(X,drop_first=True)

In [454]:
print(X.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 24 columns):
age            400 non-null int32
bp             400 non-null int32
sg             400 non-null float64
al             400 non-null int32
su             400 non-null int32
bgr            400 non-null int32
bu             400 non-null int32
sc             400 non-null float64
sod            400 non-null int32
pot            400 non-null float64
hemo           400 non-null float64
pcv            400 non-null int32
wbcc           400 non-null int32
rbcc           400 non-null float64
rbc_normal     400 non-null uint8
pc_normal      400 non-null uint8
pcc_present    400 non-null uint8
ba_present     400 non-null uint8
htn_yes        400 non-null uint8
dm_yes         400 non-null uint8
cad_yes        400 non-null uint8
appet_poor     400 non-null uint8
pe_yes         400 non-null uint8
ane_yes        400 non-null uint8
dtypes: float64(5), int32(9), uint8(10)
memory usage: 33.7 KB
None


## Checking for bad/missing values in target variable

In [455]:
print(Y.value_counts(dropna=False))

ckd       250
notckd    150
Name: class, dtype: int64


## Random Forest Classifier

In [456]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.80,random_state=42)
rf=RandomForestClassifier(n_estimators=50,min_samples_leaf=0.20,random_state=42)
rf.fit(X_train,y_train)
pred=rf.predict(X_test)
print(classification_report(y_test,pred))
print("Accuracy of the given model is : {}".format(accuracy_score(y_test,pred)))

              precision    recall  f1-score   support

         ckd       1.00      0.97      0.99       205
      notckd       0.95      1.00      0.97       115

    accuracy                           0.98       320
   macro avg       0.98      0.99      0.98       320
weighted avg       0.98      0.98      0.98       320

Accuracy of the given model is : 0.98125
