In [1]:
import pandas as pd
import numpy as np

In [2]:
df =  pd.read_csv('kidney_disease.csv')

In [3]:
#
### Converting the categorical data to numerical data for the imputer 
#
df['rbc'] = df['rbc'].map({  'normal' : 1 , 'abnormal' : 0}) 
df['htn'] = df['htn'].map({  'yes' : 1 , 'no' : 0})  
df['dm'] = df['dm'].map({  'yes' : 1 , 'no' : 0, '\tyes' : 1, '\tno' : 0, ' yes' : 1})  
df['cad'] = df['cad'].map({  'yes' : 1 , 'no' : 0, '\tno' : 0 }) 
df['appet'] = df['appet'].map({  'good' : 1 , 'poor' : 0, '\tno' : 0 })  
df['pe'] = df['pe'].map({  'yes' : 1 , 'no' : 0})  
df['ane'] = df['ane'].map({  'yes' : 1 , 'no' : 0})   
df['ba'] = df['ba'].map({  'present' : 1 , 'notpresent' : 0})   
df['pc'] = df['pc'].map({  'normal' : 1 , 'abnormal' : 0})  
df['pcc'] = df['pcc'].map({  'present' : 1 , 'notpresent' : 0})  

# Mapping the feature column to numerical values.
df['classification'] = df['classification'].map({  'notckd' : 0 , 'ckd' : 1, 'ckd\t' :  1 })  


In [4]:
# Dummy vars for categorical data.
#df = pd.get_dummies(data=df, columns=[ 'rbc', 'htn', 'dm', 'cad', 'appet','pe','ane','ba', 'pc', 'pcc' ]) 


""" 
    Positioning the dependent column as the last column.
"""
# Getting a copy of series of target column
temp_target = df['classification']   

# Removing the target column at the present position
df.pop('classification')  

# Adding back  the column target as the last column
df.insert(df.shape[1], 'classification', temp_target)

In [5]:
# FIXING THE SAMPLE 
# Some values were '\t?' 
df[(df.values.ravel() == '\t?').reshape(df.shape).any(1)]  
df.iloc[66,16] = np.NAN
df.iloc[162,18] = np.NAN  
df.iloc[185,17] = np.NAN

## Splitting the data set

In [6]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values 

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state = 42) 

# Importing and assigning

In [8]:
from sklearn.impute import SimpleImputer
s_imputer_mean = SimpleImputer( strategy='mean')  
s_imputer_median =  SimpleImputer( strategy='median')  
s_imputer_mf = SimpleImputer( strategy='most_frequent')  
s_imputer_c = SimpleImputer( strategy='constant', fill_value = 0 )  

In [9]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer 
from sklearn.linear_model import LinearRegression 
it_imputer = IterativeImputer(estimator=LinearRegression())

In [10]:
from sklearn.impute import KNNImputer  
knn_imputer = KNNImputer()

# Assigning training features data for different imputers

In [11]:
X_train_fill_list = list() 
# 0 
X_train_fill_list.append(s_imputer_mean.fit_transform(X_train))   
# 1
X_train_fill_list.append(s_imputer_median.fit_transform(X_train))   
# 2 
X_train_fill_list.append(s_imputer_mf.fit_transform(X_train))   
# 3 
X_train_fill_list.append(s_imputer_c.fit_transform(X_train)) 
# 4 
X_train_fill_list.append(it_imputer.fit_transform(X_train))  
# 5 
X_train_fill_list.append(knn_imputer.fit_transform(X_train)) 



# Assigning testing features data for different imputers

In [12]:
X_test_fill_list = list() 
# 0
X_test_fill_list.append(s_imputer_mean.transform(X_test))  
#1
X_test_fill_list.append(s_imputer_median.transform(X_test))  
#2
X_test_fill_list.append(s_imputer_mf.transform(X_test))  
#3
X_test_fill_list.append(s_imputer_c.transform(X_test))  
#4
X_test_fill_list.append(it_imputer.transform(X_test))  
#5
X_test_fill_list.append(knn_imputer.transform(X_test)) 

# Standart scaler for X_train and X_test  and dummies vars


In [13]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

###### Scaling and assigning the dummies vars


In [14]:
# Scaling the X_test and X_train. Also getting the dummies for the samples.

for i in range( len(X_train_fill_list)):  
    
    # Scaling
    X_train_fill_list[i] = sc.fit_transform(X_train_fill_list[i])  
    
    # Stroing in a temporary dataframe
    temp_df = pd.DataFrame(X_train_fill_list[i], columns = df.columns[:-1] )   
    # Getting dummies for catgorical data
    temp_df = pd.get_dummies(data=temp_df, columns=[ 'rbc', 'htn', 'dm', 'cad', 'appet','pe','ane','ba', 'pc', 'pcc' ]) 
    # Assigning back the values of the modified dataframe
    X_train_fill_list[i] = temp_df.values[:,:42]
    
    
    # Scaling the X_test
    X_test_fill_list[i] = sc.transform(X_test_fill_list[i]) 
    
    # Stroing in a temporary dataframe
    temp_df = pd.DataFrame(X_test_fill_list[i], columns = df.columns[:-1] )   
    # Getting dummies for catgorical data
    temp_df = pd.get_dummies(data=temp_df, columns=[ 'rbc', 'htn', 'dm', 'cad', 'appet','pe','ane','ba', 'pc', 'pcc' ]) 
    # Assigning back the values of the modified dataframe
    X_test_fill_list[i] = temp_df.values[:,:42]

In [15]:
# Just a list with Imputer's names
method_name= ['Mean Simple Imputer','Median Simple Imputer', 'Most Freq Simple Imputer','Constant Simple Imputer', 'Iterative Imputer','knn Imputer']

# Importing the models and creating the dataframe

In [16]:
from sklearn.linear_model import LogisticRegression
log_cls = LogisticRegression(random_state = 0)

from sklearn.neighbors import KNeighborsClassifier
knn_cls = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)   

from sklearn.naive_bayes import GaussianNB
gauss_cls = GaussianNB()

In [17]:
# Initializing the column's names

df = pd.DataFrame(columns =['ML model', 'Missing value handling technique', 'Accuracy'])

In [18]:
# Importing  accuracy utils
from sklearn.metrics import accuracy_score 

In [19]:
# Lopping and predicting the results
for i in range (len(method_name)):  
    
    # Logistic Regression 
    log_cls.fit( X_train_fill_list[i],y_train) 
    y_pred = log_cls.predict(X_test_fill_list[i]) 
    a = accuracy_score(y_test, y_pred) 
    new_row = {  'ML model' : 'LogisticReg',  'Missing value handling technique' : f'{method_name[i]}', 'Accuracy' : a } 
    df = df.append( new_row, ignore_index=True  )  
    
    # KNN
    knn_cls.fit( X_train_fill_list[i],y_train)  
    y_pred = knn_cls.predict(X_test_fill_list[i]) 
    a = accuracy_score(y_test, y_pred) 
    new_row = {  'ML model' : 'KNN',  'Missing value handling technique' : f'{method_name[i]}', 'Accuracy' : a } 
    df = df.append( new_row, ignore_index=True  ) 
    
    # NB
    gauss_cls.fit( X_train_fill_list[i],y_train)  
    y_pred = gauss_cls.predict(X_test_fill_list[i]) 
    a = accuracy_score(y_test, y_pred) 
    new_row = {  'ML model' : 'Naive Bayes',  'Missing value handling technique' : f'{method_name[i]}', 'Accuracy' : a } 
    df = df.append( new_row, ignore_index=True  ) 
    
    
    

# Results

In [20]:

df

Unnamed: 0,ML model,Missing value handling technique,Accuracy
0,LogisticReg,Mean Simple Imputer,0.99
1,KNN,Mean Simple Imputer,0.99
2,Naive Bayes,Mean Simple Imputer,0.64
3,LogisticReg,Median Simple Imputer,1.0
4,KNN,Median Simple Imputer,0.97
5,Naive Bayes,Median Simple Imputer,0.96
6,LogisticReg,Most Freq Simple Imputer,0.995
7,KNN,Most Freq Simple Imputer,0.96
8,Naive Bayes,Most Freq Simple Imputer,0.96
9,LogisticReg,Constant Simple Imputer,1.0


# I reduced the X_train and X_test to 42 features because of an erorr of the LogisticRegression(). 

 # Still there are  really surprisingly  results for predicting just 1/2 of the sample.   
 
 #  We can observe just a bad model on NaiveBayes, a 0.64 on Mean Imputer. 
 
 # It is either good results because of replacing correctly the NaN values, either overfitting, the whole sample has not been reduced.