In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

In [2]:
# Load the raw dataset (semicolon-separated file).
def _to_numeric_if_possible(column):
    """Return ``column`` parsed as numbers when at least one value parses.

    Stand-in for the removed ``DataFrame.convert_objects(convert_numeric=True)``:
    text columns are coerced with ``pd.to_numeric`` (unparseable entries become
    NaN), but a column in which *nothing* parses is returned unchanged so that
    purely categorical columns keep their labels.

    Parameters
    ----------
    column : pandas.Series
        One column of the freshly read frame.

    Returns
    -------
    pandas.Series
        The numeric version of the column, or the original column.
    """
    is_text = (pd.api.types.is_object_dtype(column)
               or pd.api.types.is_string_dtype(column))
    if not is_text:
        # Already typed by read_csv (int/float/bool): nothing to convert.
        return column
    converted = pd.to_numeric(column, errors="coerce")
    # All-NaN result means the column holds labels, not numbers: keep it.
    return column if converted.isna().all() else converted


dataset = pd.read_csv("kidney_disease.csv", delimiter=";").apply(_to_numeric_if_possible)
# Missing values per column before cleaning.
dataset.isnull().sum()

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  


id                  0
age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                71
wc                106
rc                131
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [3]:
def _encode_labels(column, codes):
    """Replace text labels by their integer codes.

    Parameters
    ----------
    column : pandas.Series
        Column holding text labels.
    codes : dict
        Maps each text label to its integer code.

    Returns
    -------
    pandas.Series
        Column with every label found in ``codes`` replaced by its code;
        any other value (including NaN) is passed through unchanged.
    """
    return column.map(lambda label: codes.get(label, label))


def _fill_and_encode(column, codes):
    """Fill missing labels from neighbouring rows, then recode them.

    Parameters
    ----------
    column : pandas.Series
        Categorical column holding text labels and possibly NaN.
    codes : dict
        Maps each text label to its integer code.

    Returns
    -------
    pandas.Series
        Filled column with labels replaced by their integer codes.
    """
    # A backward fill leaves gaps when the last rows are missing, so a
    # forward fill closes whatever remains.
    return _encode_labels(column.bfill().ffill(), codes)


# Numeric columns: missing values are replaced by the column mean.
columns1 = ["age", "bp", "sg", "al", "su", "bgr", "bu", "sc", "sod", "pot", "hemo", "rc", "pcv", "wc"]
dataset[columns1] = dataset[columns1].fillna(dataset[columns1].mean())

# Columns holding normal / abnormal values.
columns3 = ["rbc", "pc"]
# Columns holding present / notpresent values.
columns4 = ["pcc", "ba"]
# Columns holding yes / no values.
columns5 = ["htn", "dm", "cad", "pe", "ane"]

# NOTE(review): the polarity is not uniform across groups ('present' -> 0 but
# 'yes' -> 1). It is kept exactly as authored because the recorded results
# depend on it; confirm that this is intended.
label_encodings = [
    (columns3, {'normal': 0, 'abnormal': 1}),
    (columns4, {'present': 0, 'notpresent': 1}),
    (columns5, {'yes': 1, 'no': 0}),
    (["appet"], {'good': 1, 'poor': 0}),
]
for column_group, codes in label_encodings:
    for x in column_group:
        dataset[x] = _fill_and_encode(dataset[x], codes)

# Target column: ckd -> 1, notckd -> 0 (recoded only, no fill is applied).
dataset["classification"] = _encode_labels(dataset["classification"], {'ckd': 1, 'notckd': 0})

In [5]:
# Save the cleaned dataset to disk and read it back in.
# The row index is written as well, so the reloaded frame starts with an
# "Unnamed: 0" column. to_csv returns None when it is given a path.
export_CKD = dataset.to_csv(r'export_CKD.csv')
dataset2 = pd.read_csv("export_CKD.csv", sep=",")
# Missing values per column after cleaning.
dataset2.isna().sum()


Unnamed: 0        0
id                0
age               0
bp                0
sg                0
al                0
su                0
rbc               0
pc                0
pcc               0
ba                0
bgr               0
bu                0
sc                0
sod               0
pot               0
hemo              0
pcv               0
wc                0
rc                0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
classification    0
dtype: int64

In [6]:
# Split the reloaded frame into sample vectors and class labels.
# Columns are picked by name rather than by hard-coded position, so the split
# does not depend on how many index columns the CSV round-trip prepended.
target_column = "classification"
feature_names = [
    name for name in dataset2.columns
    # Skip the saved row index ("Unnamed: ..."), the record id and the label.
    if name not in ("id", target_column) and not str(name).startswith("Unnamed:")
]

array = dataset2.values
# Sample vectors
X = array[:, dataset2.columns.get_indexer(feature_names)]
# Target vector (class labels)
y = array[:, dataset2.columns.get_loc(target_column)]
# Show the shape of both arrays
X.shape, y.shape

((400, 24), (400,))

In [7]:
# Score every feature against the class label with the chi-squared statistic
# and keep the k best-scoring columns (k = number of features to select).
# NOTE(review): chi2 is documented for non-negative, count-like features and
# the inputs here are unscaled measurements; confirm this scorer is appropriate.
selector = SelectKBest(score_func=chi2, k=3)
X_clf_new = selector.fit_transform(X, y)
# First five rows of the reduced matrix, followed by the full frame for comparison.
print(X_clf_new[:5])
dataset2.head()

[[ 121.           36.         7800.        ]
 [ 148.03651685   18.         6000.        ]
 [ 423.           53.         7500.        ]
 [ 117.           56.         6700.        ]
 [ 106.           26.         7300.        ]]


Unnamed: 0.1,Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,0,48.0,80.0,1.02,1.0,0.0,0,0,1,...,44.0,7800.0,5.2,1,1,0,1,0,0,1
1,1,1,7.0,50.0,1.02,4.0,0.0,0,0,1,...,38.0,6000.0,4.707435,0,0,0,1,0,0,1
2,2,2,62.0,80.0,1.01,2.0,3.0,0,0,1,...,31.0,7500.0,4.707435,0,1,0,0,0,1,1
3,3,3,48.0,70.0,1.005,4.0,0.0,0,1,0,...,32.0,6700.0,3.9,1,0,0,0,1,1,1
4,4,4,51.0,80.0,1.01,2.0,0.0,0,0,1,...,35.0,7300.0,4.6,0,0,0,1,0,0,1


In [9]:
from ReliefF import ReliefF
# ReliefF feature selection: keep 3 features, using 20 neighbours.
fs = ReliefF(n_features_to_keep=3, n_neighbors=20)
X_train = fs.fit_transform(X, y)

# Report the frame shape before selection and the matrix shape after it.
shape_report = "".join([
    "(No. of tuples, No. of Columns before ReliefF) : ", str(dataset2.shape),
    "\n(No. of tuples, No. of Columns after ReliefF) : ", str(X_train.shape),
])
print(shape_report)
print(X_train)
# Author's note from the recorded run: the kept columns are htn, pcc, dm

(No. of tuples, No. of Columns before ReliefF) : (400, 27)
(No. of tuples, No. of Columns after ReliefF) : (400, 3)
[[1. 1. 1.]
 [0. 0. 1.]
 [0. 1. 1.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]
