In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer

In [2]:
df = pd.read_csv('kidney_disease.csv')
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [4]:
df.columns

Index(['id', 'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
       'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [5]:
d1 = df.iloc[:, 1:6]
d2 = df.iloc[:, 10:16]
d3 = pd.concat([d1,d2])
d3.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo'], dtype='object')

In [6]:
np.setdiff1d(df.columns, d3.columns)

array(['ane', 'appet', 'ba', 'cad', 'classification', 'dm', 'htn', 'id',
       'pc', 'pcc', 'pcv', 'pe', 'rbc', 'rc', 'wc'], dtype=object)

In [16]:
column_names = d3.columns
column_names

Index(['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo'], dtype='object')

In [7]:
from sklearn.impute import SimpleImputer

In [17]:
imp_mean = SimpleImputer(missing_values=np.nan,strategy='mean')


In [21]:
d4 = imp_mean.fit_transform(df[column_names])
d4

array([[ 48.        ,  80.        ,   1.02      , ..., 137.52875399,
          4.62724359,  15.4       ],
       [  7.        ,  50.        ,   1.02      , ..., 137.52875399,
          4.62724359,  11.3       ],
       [ 62.        ,  80.        ,   1.01      , ..., 137.52875399,
          4.62724359,   9.6       ],
       ...,
       [ 12.        ,  80.        ,   1.02      , ..., 137.        ,
          4.4       ,  15.8       ],
       [ 17.        ,  60.        ,   1.025     , ..., 135.        ,
          4.9       ,  14.2       ],
       [ 58.        ,  80.        ,   1.025     , ..., 141.        ,
          3.5       ,  15.8       ]])

In [22]:
d4.shape

(400, 11)

In [77]:
df[column_names] = d4
df

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,121.000000,36.0,1.2,137.528754,4.627244,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,148.036517,18.0,0.8,137.528754,4.627244,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,423.000000,53.0,1.8,137.528754,4.627244,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.000000,56.0,3.8,111.000000,2.500000,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,106.000000,26.0,1.4,137.528754,4.627244,11.6,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,140.000000,49.0,0.5,150.000000,4.900000,15.7,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,75.000000,31.0,1.2,141.000000,3.500000,16.5,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,100.000000,26.0,0.6,137.000000,4.400000,15.8,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,114.000000,50.0,1.0,135.000000,4.900000,14.2,51,7200,5.9,no,no,no,good,no,no,notckd


In [78]:
df_copy = df.copy()
df_copy

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,121.000000,36.0,1.2,137.528754,4.627244,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,148.036517,18.0,0.8,137.528754,4.627244,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,423.000000,53.0,1.8,137.528754,4.627244,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.000000,56.0,3.8,111.000000,2.500000,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,106.000000,26.0,1.4,137.528754,4.627244,11.6,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,140.000000,49.0,0.5,150.000000,4.900000,15.7,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,75.000000,31.0,1.2,141.000000,3.500000,16.5,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,100.000000,26.0,0.6,137.000000,4.400000,15.8,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,114.000000,50.0,1.0,135.000000,4.900000,14.2,51,7200,5.9,no,no,no,good,no,no,notckd


In [83]:
 print(pd.unique(df_copy['classification']))

['ckd' 'ckd\t' 'notckd']


In [93]:
f1=lambda x: 'ckd' if x == 'ckd\t' else x

In [90]:
df_copy=df_copy.applymap(lambda x: 'ckd' if x == 'ckd\t' else x)

In [91]:
 print(pd.unique(df_copy['classification']))

['ckd' 'notckd']


In [97]:
df_with_dummies = pd.get_dummies(df_copy)
df_with_dummies.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,rbc_abnormal,rbc_normal,pc_abnormal,pc_normal,pcc_notpresent,pcc_present,ba_notpresent,ba_present,pcv_\t43,pcv_\t?,pcv_14,pcv_15,pcv_16,pcv_17,pcv_18,pcv_19,pcv_20,pcv_21,pcv_22,pcv_23,pcv_24,pcv_25,pcv_26,pcv_27,pcv_28,pcv_29,pcv_30,pcv_31,...,rc_4.6,rc_4.7,rc_4.8,rc_4.9,rc_5,rc_5.0,rc_5.1,rc_5.2,rc_5.3,rc_5.4,rc_5.5,rc_5.6,rc_5.7,rc_5.8,rc_5.9,rc_6.0,rc_6.1,rc_6.2,rc_6.3,rc_6.4,rc_6.5,rc_8.0,htn_no,htn_yes,dm_\tno,dm_\tyes,dm_ yes,dm_no,dm_yes,cad_\tno,cad_no,cad_yes,appet_good,appet_poor,pe_no,pe_yes,ane_no,ane_yes,classification_ckd,classification_notckd
0,0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,137.528754,4.627244,15.4,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0
1,1,7.0,50.0,1.02,4.0,0.0,148.036517,18.0,0.8,137.528754,4.627244,11.3,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,1,0,1,0,1,0
2,2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,137.528754,4.627244,9.6,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,1,0,0,1,1,0
3,3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,1,0,1,1,0
4,4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,137.528754,4.627244,11.6,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,1,0,1,0,1,0
