In [4]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.impute import KNNImputer

In [5]:
# Load the raw dataset. The relative path uses forward slashes so it resolves on Windows,
# Linux and macOS alike; with a backslash the whole string is treated as one file name on
# POSIX systems and the read fails.
df = pd.read_csv('data/raw_ckd.csv')
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


In [6]:
# Count the missing (NaN) entries in each column.
print(df.isna().sum())

age        9
bp        12
sg        47
al        46
su        49
rbc      152
pc        65
pcc        4
ba         4
bgr       44
bu        19
sc        17
sod       87
pot       88
hemo      52
pcv       71
wbcc     106
rbcc     131
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
class      0
dtype: int64


It is observed that there is a large number of missing values in almost every column (only `class` is complete). These need to be handled with an appropriate missing-value handling technique.

In [7]:
# Inspect the dtype of every column to see which ones are numeric (float64) and which
# were loaded as strings (object).
df.dtypes

age      float64
bp       float64
sg       float64
al       float64
su       float64
rbc       object
pc        object
pcc       object
ba        object
bgr      float64
bu       float64
sc       float64
sod      float64
pot      float64
hemo     float64
pcv      float64
wbcc     float64
rbcc     float64
htn       object
dm        object
cad       object
appet     object
pe        object
ane       object
class     object
dtype: object

All the string/object dtype columns need to be encoded with a suitable encoding. In this case we will use label encoding, as these are nominal (no order/rank) categorical features.

In [8]:
# Categorical (string-valued) features: every column that pandas loaded with dtype 'object'.
df_cat = df.select_dtypes(include=['object'])
df_cat

Unnamed: 0,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class
0,,normal,notpresent,notpresent,yes,yes,no,good,no,no,ckd
1,,normal,notpresent,notpresent,no,no,no,good,no,no,ckd
2,normal,normal,notpresent,notpresent,no,yes,no,poor,no,yes,ckd
3,normal,abnormal,present,notpresent,yes,no,no,poor,yes,yes,ckd
4,normal,normal,notpresent,notpresent,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...
395,normal,normal,notpresent,notpresent,no,no,no,good,no,no,notckd
396,normal,normal,notpresent,notpresent,no,no,no,good,no,no,notckd
397,normal,normal,notpresent,notpresent,no,no,no,good,no,no,notckd
398,normal,normal,notpresent,notpresent,no,no,no,good,no,no,notckd


In [9]:
# Numeric features: every column whose dtype is not 'object'.
df_num = df.select_dtypes(exclude=['object'])
df_num

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
0,48.0,80.0,1.020,1.0,0.0,121.0,36.0,1.2,,,15.4,44.0,7800.0,5.2
1,7.0,50.0,1.020,4.0,0.0,,18.0,0.8,,,11.3,38.0,6000.0,
2,62.0,80.0,1.010,2.0,3.0,423.0,53.0,1.8,,,9.6,31.0,7500.0,
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9
4,51.0,80.0,1.010,2.0,0.0,106.0,26.0,1.4,,,11.6,35.0,7300.0,4.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,140.0,49.0,0.5,150.0,4.9,15.7,47.0,6700.0,4.9
396,42.0,70.0,1.025,0.0,0.0,75.0,31.0,1.2,141.0,3.5,16.5,54.0,7800.0,6.2
397,12.0,80.0,1.020,0.0,0.0,100.0,26.0,0.6,137.0,4.4,15.8,49.0,6600.0,5.4
398,17.0,60.0,1.025,0.0,0.0,114.0,50.0,1.0,135.0,4.9,14.2,51.0,7200.0,5.9


In [10]:
cat_cols = list(df_cat.columns)

# Print the distinct values (NaN included) found in each categorical column.
for col in cat_cols:
    print("Unique values in feature:", col, df[col].unique(), '\n')

Unique values in feature: rbc [nan 'normal' 'abnormal'] 

Unique values in feature: pc ['normal' 'abnormal' nan] 

Unique values in feature: pcc ['notpresent' 'present' nan] 

Unique values in feature: ba ['notpresent' 'present' nan] 

Unique values in feature: htn ['yes' 'no' nan] 

Unique values in feature: dm ['yes' 'no' nan] 

Unique values in feature: cad ['no' 'yes' nan] 

Unique values in feature: appet ['good' 'poor' nan] 

Unique values in feature: pe ['no' 'yes' nan] 

Unique values in feature: ane ['no' 'yes' nan] 

Unique values in feature: class ['ckd' 'notckd'] 



In [11]:
# LabelEncoder is deliberately NOT applied directly: the categorical columns still contain
# NaN (see the unique-values listing above), and the concern is that NaN would be encoded
# as one more class instead of staying missing for the later imputation step.
# The rejected approach is kept below as a bare string literal (it is never executed),
# which is why this cell echoes the string as its output.
'''
le = LabelEncoder()
df_cat = df_cat.apply(le.fit_transform)
df_cat
'''

'\nle = LabelEncoder()\ndf_cat = df_cat.apply(le.fit_transform)\ndf_cat\n'

In [12]:
# (column, label encoded as 1, label encoded as 0) for every categorical column.
_label_pairs = [
    ("rbc", "normal", "abnormal"),
    ("pc", "normal", "abnormal"),
    ("pcc", "present", "notpresent"),
    ("ba", "present", "notpresent"),
    ("htn", "yes", "no"),
    ("dm", "yes", "no"),
    ("cad", "yes", "no"),
    ("appet", "good", "poor"),
    ("pe", "yes", "no"),
    ("ane", "yes", "no"),
    ("class", "ckd", "notckd"),
]

# Nested {column: {label: code}} mapping, built in the same column order as listed above.
cleanup = {
    column: {one_label: 1, zero_label: 0}
    for column, one_label, zero_label in _label_pairs
}

In [13]:
# Encode the categorical labels as 0/1 using the 'cleanup' mapping.
# The result is rebound instead of using inplace=True, so the frame is not mutated in place.
# infer_objects() then converts the resulting object columns to numeric dtypes explicitly,
# rather than relying on replace()'s implicit downcasting, which pandas has deprecated.
# NaN entries are not in the mapping and therefore stay NaN for the imputation step.
df_cat = df_cat.replace(cleanup).infer_objects()
df_cat

Unnamed: 0,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class
0,,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1
1,,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
2,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1
3,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
395,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
396,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
397,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
398,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [14]:
# Put the encoded categorical columns and the numeric columns side by side
# (categorical columns first), aligned on the row index.
frames_to_join = [df_cat, df_num]
df_final = pd.concat(frames_to_join, axis="columns")
df_final

Unnamed: 0,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane,...,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
0,,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,121.0,36.0,1.2,,,15.4,44.0,7800.0,5.2
1,,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,,18.0,0.8,,,11.3,38.0,6000.0,
2,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,3.0,423.0,53.0,1.8,,,9.6,31.0,7500.0,
3,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,106.0,26.0,1.4,,,11.6,35.0,7300.0,4.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,140.0,49.0,0.5,150.0,4.9,15.7,47.0,6700.0,4.9
396,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,75.0,31.0,1.2,141.0,3.5,16.5,54.0,7800.0,6.2
397,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,100.0,26.0,0.6,137.0,4.4,15.8,49.0,6600.0,5.4
398,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,114.0,50.0,1.0,135.0,4.9,14.2,51.0,7200.0,5.9


In [15]:
# Check the dtypes of the combined frame; after encoding, every column should be
# numeric (float64 / int64) ahead of the imputation step.
df_final.dtypes

rbc      float64
pc       float64
pcc      float64
ba       float64
htn      float64
dm       float64
cad      float64
appet    float64
pe       float64
ane      float64
class      int64
age      float64
bp       float64
sg       float64
al       float64
su       float64
bgr      float64
bu       float64
sc       float64
sod      float64
pot      float64
hemo     float64
pcv      float64
wbcc     float64
rbcc     float64
dtype: object

In [16]:
def knnImputerDatasetGenerator(df, neighbors, weights='uniform'):
    """Return a copy of ``df`` with missing values filled by k-nearest-neighbour imputation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is handed to ``KNNImputer`` unchanged, so it is expected to
        be numeric with NaN marking the missing entries. No scaling is applied here.
    neighbors : int
        Number of neighbouring rows used for each imputed value (``n_neighbors``).
    weights : str or callable, default 'uniform'
        Neighbour weighting forwarded to ``KNNImputer``. The default reproduces the
        previously hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        The imputed values, carrying the same column labels and row index as ``df``.
    """
    imputer = KNNImputer(n_neighbors=neighbors, weights=weights)
    imputed_values = imputer.fit_transform(df)

    # Re-attach both the column labels and the row index, so the result stays aligned
    # with ``df`` even when ``df`` does not use the default 0..n-1 index.
    return pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

In [17]:
# Fill the missing values with KNN imputation, once for each neighbourhood size
# (n_neighbors = 3, 5, 7, 9, 11), and additionally with plain column-mean imputation.
(
    df_knn_imputed_uniform_3,
    df_knn_imputed_uniform_5,
    df_knn_imputed_uniform_7,
    df_knn_imputed_uniform_9,
    df_knn_imputed_uniform_11,
) = [knnImputerDatasetGenerator(df_final, k) for k in (3, 5, 7, 9, 11)]

column_means = df_final.mean()
df_mean_imputed = df_final.fillna(column_means)

In [18]:
# Write every imputed dataset to '<path>/<variable name>.csv', leaving out the index column.
path = 'data'
imputed_frames = {
    "df_knn_imputed_uniform_3": df_knn_imputed_uniform_3,
    "df_knn_imputed_uniform_5": df_knn_imputed_uniform_5,
    "df_knn_imputed_uniform_7": df_knn_imputed_uniform_7,
    "df_knn_imputed_uniform_9": df_knn_imputed_uniform_9,
    "df_knn_imputed_uniform_11": df_knn_imputed_uniform_11,
    "df_mean_imputed": df_mean_imputed,
}
for file_stem, frame in imputed_frames.items():
    frame.to_csv(path + "/" + file_stem + ".csv", index=False)

### Here we generate 6 different imputed datasets, which will be helpful for the next prediction process.