In [33]:
import os

import numpy as np
import pandas as pd
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import roc_auc_score

In [34]:
# Location of the raw chronic-kidney-disease CSV. Set the CKD_DATA_PATH
# environment variable to run the notebook on another machine; when it is not
# set, the original local path is used so existing setups behave as before.
DATA_PATH = os.environ.get(
    "CKD_DATA_PATH",
    "/Users/saitejatangudu/Desktop/DATASETS/kidney_disease.csv",
)
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [35]:
# Count the missing values in every column of the raw frame.
data.isna().sum()

id                  0
age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [22]:
# Show which columns were parsed as numbers and which were left as object dtype.
data.dtypes

id                  int64
age               float64
bp                float64
sg                float64
al                float64
su                float64
rbc                object
pc                 object
pcc                object
ba                 object
bgr               float64
bu                float64
sc                float64
sod               float64
pot               float64
hemo              float64
pcv                object
wc                 object
rc                 object
htn                object
dm                 object
cad                object
appet              object
pe                 object
ane                object
classification     object
dtype: object

In [36]:
# Rename the terse column names of the raw file to descriptive ones.
# An explicit old -> new mapping is used instead of assigning a positional list
# to `data.columns`, so a change in column order cannot silently mislabel data.
COLUMN_RENAMES = {
    'age': 'age',
    'bp': 'blood_pressure',
    'sg': 'specific_gravity',
    'al': 'albumin',
    'su': 'sugar',
    'rbc': 'red_blood_cells',
    'pc': 'pus_cell',
    'pcc': 'pus_cell_clumps',
    'ba': 'bacteria',
    'bgr': 'blood_glucose_random',
    'bu': 'blood_urea',
    'sc': 'serum_creatinine',
    'sod': 'sodium',
    'pot': 'potassium',
    'hemo': 'haemoglobin',
    'pcv': 'packed_cell_volume',
    'wc': 'white_blood_cell_count',
    'rc': 'red_blood_cell_count',
    'htn': 'hypertension',
    'dm': 'diabetes_mellitus',
    'cad': 'coronary_artery_disease',
    'appet': 'appetite',
    'pe': 'peda_edema',
    'ane': 'aanemia',
    'classification': 'class',
}

# Drop the row identifier and rename without mutating in place. With
# errors='ignore' the cell can be re-run after 'id' is already gone, whereas
# the in-place drop raised KeyError on a second run.
data = data.drop(columns='id', errors='ignore').rename(columns=COLUMN_RENAMES)

# Fail loudly if the input did not carry the expected columns; rename() itself
# silently skips keys that are absent.
missing_columns = sorted(set(COLUMN_RENAMES.values()) - set(data.columns))
if missing_columns:
    raise KeyError(f"expected columns not found after renaming: {missing_columns}")

In [37]:
# These three measurements are object dtype in the raw frame; parse them as
# numbers, turning anything unparsable into NaN.
for _numeric_like in ('packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count'):
    data[_numeric_like] = pd.to_numeric(data[_numeric_like], errors='coerce')

# Partition the column names by dtype: object columns are treated as
# categorical, everything else as numerical.
cat_cols = [name for name, dtype in data.dtypes.items() if dtype == 'object']
num_cols = [name for name, dtype in data.dtypes.items() if dtype != 'object']

In [38]:
# Normalise label values that carry stray tabs / spaces in the raw file.
# Each result is assigned back to the frame: calling
# `data[col].replace(..., inplace=True)` operates on an intermediate Series and
# does not update `data` when pandas Copy-on-Write is active.
data['diabetes_mellitus'] = data['diabetes_mellitus'].replace(
    to_replace={'\tno': 'no', '\tyes': 'yes', ' yes': 'yes'}
)
data['coronary_artery_disease'] = data['coronary_artery_disease'].replace(to_replace='\tno', value='no')
data['class'] = data['class'].replace(to_replace={'ckd\t': 'ckd', 'notckd': 'not ckd'})

# Encode the target: 'ckd' -> 0 and 'not ckd' -> 1. Any label outside this
# mapping becomes NaN.
data['class'] = data['class'].map({'ckd': 0, 'not ckd': 1})

# Make sure the target column has a numeric dtype.
data['class'] = pd.to_numeric(data['class'], errors='coerce')

In [39]:
# Preview the frame after the label cleanup and target mapping.
data.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,...,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,aanemia,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,0
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,0
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,0
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,0
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,0


In [40]:
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

def encode(col):
    """Ordinal-encode the non-null entries of a column, leaving nulls in place.

    The non-null values are fitted and transformed with the module-level
    ``encoder`` (looked up when the function is called), and the resulting
    codes are written back at the positions of the original non-null values.

    Parameters
    ----------
    col : pandas.Series
        Column to encode. It is not modified; a copy is returned.

    Returns
    -------
    pandas.Series
        Copy of ``col`` with every non-null value replaced by its ordinal code.
        A column with no non-null values is returned as an unchanged copy.
    """
    col = col.copy()
    present = col.notnull()
    # Nothing to encode: fitting the encoder on zero samples would raise.
    if not present.any():
        return col
    observed = col[present].to_numpy().reshape(-1, 1)
    codes = encoder.fit_transform(observed)
    # ravel() keeps a 1-D array even when only one value is present, whereas
    # squeeze() would collapse it to a 0-d array.
    col.loc[present] = codes.ravel()
    return col

# Create an instance of OrdinalEncoder
# `encode` reads this module-level name when it is called, so it has to be
# defined before the loop below runs.
encoder = OrdinalEncoder()

# Encode each column listed in cat_cols; the encoder is refitted per column.
# NOTE(review): cat_cols was built while `class` still had object dtype, so it
# presumably still lists `class`, which is then passed through the encoder as
# well — confirm this is intended.
for i in cat_cols:
    data[i] = encode(data[i])

In [41]:
# Preview the frame after ordinal encoding of the categorical columns.
data.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,...,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,aanemia,class
0,48.0,80.0,1.02,1.0,0.0,,1.0,0.0,0.0,121.0,...,44.0,7800.0,5.2,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,7.0,50.0,1.02,4.0,0.0,,1.0,0.0,0.0,,...,38.0,6000.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62.0,80.0,1.01,2.0,3.0,1.0,1.0,0.0,0.0,423.0,...,31.0,7500.0,,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,48.0,70.0,1.005,4.0,0.0,1.0,0.0,1.0,0.0,117.0,...,32.0,6700.0,3.9,1.0,0.0,0.0,1.0,1.0,1.0,0.0
4,51.0,80.0,1.01,2.0,0.0,1.0,1.0,0.0,0.0,106.0,...,35.0,7300.0,4.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
from sklearn.impute import KNNImputer 

In [43]:
# Fill the remaining missing values with a 5-nearest-neighbour imputer.
# NOTE(review): the imputer is fitted on every row and on the `class` column
# too, before any train/test split is made — confirm this is acceptable for
# the evaluation that follows.
Imputer = KNNImputer(n_neighbors=5)

# Rebuild the frame with the original column labels and index in one step.
# Copying column by column from an unlabeled frame only works while `data` has
# a default 0..n-1 index; any other index would misalign and yield NaN.
Impute_data = pd.DataFrame(
    Imputer.fit_transform(data),
    columns=data.columns,
    index=data.index,
)
data = Impute_data.copy()
data.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,...,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,aanemia,class
0,48.0,80.0,1.02,1.0,0.0,0.8,1.0,0.0,0.0,121.0,...,44.0,7800.0,5.2,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,7.0,50.0,1.02,4.0,0.0,0.6,1.0,0.0,0.0,113.0,...,38.0,6000.0,4.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62.0,80.0,1.01,2.0,3.0,1.0,1.0,0.0,0.0,423.0,...,31.0,7500.0,3.8,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,48.0,70.0,1.005,4.0,0.0,1.0,0.0,1.0,0.0,117.0,...,32.0,6700.0,3.9,1.0,0.0,0.0,1.0,1.0,1.0,0.0
4,51.0,80.0,1.01,2.0,0.0,1.0,1.0,0.0,0.0,106.0,...,35.0,7300.0,4.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Count missing values per column again, after imputation.
data.isna().sum()

age                        0
blood_pressure             0
specific_gravity           0
albumin                    0
sugar                      0
red_blood_cells            0
pus_cell                   0
pus_cell_clumps            0
bacteria                   0
blood_glucose_random       0
blood_urea                 0
serum_creatinine           0
sodium                     0
potassium                  0
haemoglobin                0
packed_cell_volume         0
white_blood_cell_count     0
red_blood_cell_count       0
hypertension               0
diabetes_mellitus          0
coronary_artery_disease    0
appetite                   0
peda_edema                 0
aanemia                    0
class                      0
dtype: int64

In [45]:
# import module
# Min-max scale every column of `data`.
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the whole frame and transform that same frame.
# NOTE(review): the frame passed to fit() still contains the `class` column and
# all rows (the train/test split happens later) — confirm this is intended.
scaler = MinMaxScaler()
model=scaler.fit(data)
Impute_data=model.transform(data)

# Wrap the scaled array back into a DataFrame with the original column names
# and preview it.
data = pd.DataFrame(Impute_data,columns = data.columns)
data.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,...,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,aanemia,class
0,0.522727,0.230769,0.75,0.2,0.0,0.8,1.0,0.0,0.0,0.211538,...,0.777778,0.231405,0.525424,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.056818,0.0,0.75,0.8,0.0,0.6,1.0,0.0,0.0,0.194444,...,0.644444,0.157025,0.484746,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.681818,0.230769,0.25,0.4,0.6,1.0,1.0,0.0,0.0,0.856838,...,0.488889,0.219008,0.288136,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.522727,0.153846,0.0,0.8,0.0,1.0,0.0,1.0,0.0,0.202991,...,0.511111,0.18595,0.305085,1.0,0.0,0.0,1.0,1.0,1.0,0.0
4,0.556818,0.230769,0.25,0.4,0.0,1.0,1.0,0.0,0.0,0.179487,...,0.577778,0.210744,0.423729,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
from collections import Counter
# Number of rows per value of the `class` column in the full dataset.
Counter(data["class"])

Counter({0.0: 250, 1.0: 150})

### DATA RESAMPLING

### GENERATING TRAIN AND TEST DATA

In [47]:
# Separate the target column from the feature columns.
y = data["class"]
X = data.drop(columns="class")

In [48]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the rows as a test set, with a fixed seed.
# NOTE(review): no `stratify=` argument is passed, so the class ratio of the
# two splits is not forced to match the full dataset — consider stratify=y.
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state=42)

In [49]:
# Class counts per split, printed in the order:
# train class 1, train class 0, test class 1, test class 0.
train_pos = int((y_train == 1).sum())
train_neg = int((y_train == 0).sum())
test_pos = int((y_test == 1).sum())
test_neg = int((y_test == 0).sum())
print(train_pos, train_neg, test_pos, test_neg)

131 209 19 41


In [50]:
# Ratio of class-1 rows to class-0 rows over the train and test splits combined.
(sum(y_train==1)+sum(y_test==1))/(sum(y_train==0)+sum(y_test==0))
     

0.6

In [52]:
# Write the held-out test features and labels to CSV in the working directory,
# without the index column.
x_test.to_csv("ckd_test_data.csv",index=False)
y_test.to_csv("ckd_test_label.csv",index=False)

In [51]:
# Write the training features and labels to CSV in the working directory,
# without the index column.
x_train.to_csv("ckd_train_data.csv",index=False)
y_train.to_csv("ckd_train_label.csv",index=False)

### GENERATING EXT_IMB DATASET(05:95)

In [21]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import ADASYN 
from imblearn.under_sampling import RandomUnderSampler

In [22]:
# Work on the training split only. This rebinds X and y, which earlier held
# the features and target of the full dataset.
X, y = x_train,y_train
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({0.0: 209, 1.0: 131})


In [23]:
# Build a roughly 5:95 (class 1 : class 0) training set in two steps. Target
# sizes are fractions of the original training-set size.
class_counts = Counter(y)
n_total = class_counts[1] + class_counts[0]
no_minority, no_majority = n_total*0.05, n_total*0.95

# Step 1: ADASYN adds synthetic class-0 rows towards the 95% target; class 1 is
# pinned at its current count so it is left untouched. Class 0 is already the
# larger class in the training split, so the message says "majority". ADASYN
# may stop slightly short of the requested count.
ada = ADASYN(random_state=42,sampling_strategy={1:class_counts[1],0:round(no_majority)})
X_res, y_res = ada.fit_resample(X, y)
print('Resampled dataset shape After oversampling majority class (0) %s' % Counter(y_res))

# Step 2: randomly drop class-1 rows down to the 5% target; class 0 keeps the
# count produced by step 1.
rnd = RandomUnderSampler(random_state=42,sampling_strategy={1:round(no_minority),0:Counter(y_res)[0]})
X_final, y_final = rnd.fit_resample(X_res, y_res)
print('Resampled dataset shape with 5:95 ratio %s' % Counter(y_final))



Resampled dataset shape After oversampling minority class Counter({0.0: 317, 1.0: 131})
Resampled dataset shape with 5:95 ratioCounter({0.0: 317, 1.0: 17})


In [24]:
# Write the 5:95 resampled training features and labels to CSV, without the
# index column.
X_final.to_csv("ckd_ext_imb_data.csv",index=False)
y_final.to_csv("ckd_ext_imb_label.csv",index=False)

### GENERATING 15285_IMB DATASET(15:85)

In [25]:
# Restart from the untouched training split before building the 15:85 dataset.
X, y = x_train,y_train
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({0.0: 209, 1.0: 131})


In [26]:
# Build a roughly 15:85 (class 1 : class 0) training set in two steps. Target
# sizes are fractions of the original training-set size.
class_counts = Counter(y)
n_total = class_counts[1] + class_counts[0]
no_minority, no_majority = n_total*0.15, n_total*0.85

# Step 1: ADASYN adds synthetic class-0 rows towards the 85% target; class 1 is
# pinned at its current count so it is left untouched. Class 0 is already the
# larger class in the training split, so the message says "majority". ADASYN
# may stop slightly short of the requested count.
ada = ADASYN(random_state=42,sampling_strategy={1:class_counts[1],0:round(no_majority)})
X_res, y_res = ada.fit_resample(X, y)
print('Resampled dataset shape After oversampling majority class (0) %s' % Counter(y_res))

# Step 2: randomly drop class-1 rows down to the 15% target; class 0 keeps the
# count produced by step 1.
rnd = RandomUnderSampler(random_state=42,sampling_strategy={1:round(no_minority),0:Counter(y_res)[0]})
X_final, y_final = rnd.fit_resample(X_res, y_res)
# The message previously reported "5:95", copied from the 5:95 cell.
print('Resampled dataset shape with 15:85 ratio %s' % Counter(y_final))



Resampled dataset shape After oversampling minority class Counter({0.0: 285, 1.0: 131})
Resampled dataset shape with 5:95 ratioCounter({0.0: 285, 1.0: 51})


In [27]:
# Write the 15:85 resampled training features and labels to CSV, without the
# index column.
X_final.to_csv("ckd_15285_imb_data.csv",index=False)
y_final.to_csv("ckd_15285_imb_label.csv",index=False)

### GENERATING MOD_IMB DATASET(30:70)

In [28]:
# Restart from the untouched training split before building the 30:70 dataset.
X, y = x_train,y_train
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({0.0: 209, 1.0: 131})


In [29]:
# Build a roughly 30:70 (class 1 : class 0) training set in two steps. Target
# sizes are fractions of the original training-set size.
class_counts = Counter(y)
n_total = class_counts[1] + class_counts[0]
no_minority, no_majority = n_total*0.30, n_total*0.70

# Step 1: ADASYN adds synthetic class-0 rows towards the 70% target; class 1 is
# pinned at its current count so it is left untouched. Class 0 is already the
# larger class in the training split, so the message says "majority". ADASYN
# may stop slightly short of the requested count.
ada = ADASYN(random_state=42,sampling_strategy={1:class_counts[1],0:round(no_majority)})
X_res, y_res = ada.fit_resample(X, y)
print('Resampled dataset shape After oversampling majority class (0) %s' % Counter(y_res))

# Step 2: randomly drop class-1 rows down to the 30% target; class 0 keeps the
# count produced by step 1.
rnd = RandomUnderSampler(random_state=42,sampling_strategy={1:round(no_minority),0:Counter(y_res)[0]})
X_final, y_final = rnd.fit_resample(X_res, y_res)
# The message previously reported "5:95", copied from the 5:95 cell.
print('Resampled dataset shape with 30:70 ratio %s' % Counter(y_final))



Resampled dataset shape After oversampling minority class Counter({0.0: 241, 1.0: 131})
Resampled dataset shape with 5:95 ratioCounter({0.0: 241, 1.0: 102})


In [30]:
# Write the 30:70 resampled training features and labels to CSV, without the
# index column.
X_final.to_csv("ckd_mod_imb_data.csv",index=False)
y_final.to_csv("ckd_mod_imb_label.csv",index=False)

### GENERATING NO_IMB DATASET(50:50)

In [31]:
# Restart from the untouched training split before building the balanced dataset.
X, y = x_train,y_train
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({0.0: 209, 1.0: 131})


In [32]:
# Balance the training split with ADASYN, using its default sampling strategy
# and a fixed seed.
# NOTE(review): the result is only approximately 50:50 — ADASYN does not
# guarantee an exact class count.
ada = ADASYN(random_state=42)
X_res, y_res = ada.fit_resample(X, y)
print('Resampled dataset shape with 50:50 %s' % Counter(y_res))

Resampled dataset shape with 50:50 Counter({1.0: 210, 0.0: 209})


In [33]:
# Write the balanced training features and labels to CSV, without the index
# column.
X_res.to_csv("ckd_no_imb_data.csv",index=False)
y_res.to_csv("ckd_no_imb_label.csv",index=False)