In [1]:
import numpy as np
import pandas as pd
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import roc_auc_score

In [2]:
data = pd.read_csv("/Users/saitejatangudu/Desktop/DATASETS/kidney_disease.csv")
data.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [3]:
data.dtypes

id                  int64
age               float64
bp                float64
sg                float64
al                float64
su                float64
rbc                object
pc                 object
pcc                object
ba                 object
bgr               float64
bu                float64
sc                float64
sod               float64
pot               float64
hemo              float64
pcv                object
wc                 object
rc                 object
htn                object
dm                 object
cad                object
appet              object
pe                 object
ane                object
classification     object
dtype: object

In [4]:
data.drop('id', axis = 1, inplace = True)
# rename column names to make it more user-friendly
data.columns = ['age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar', 'red_blood_cells', 'pus_cell',
              'pus_cell_clumps', 'bacteria', 'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium',
              'potassium', 'haemoglobin', 'packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count',
              'hypertension', 'diabetes_mellitus', 'coronary_artery_disease', 'appetite', 'peda_edema',
              'aanemia', 'class']

In [5]:
# converting necessary columns to numerical type
data['packed_cell_volume'] = pd.to_numeric(data['packed_cell_volume'], errors='coerce')
data['white_blood_cell_count'] = pd.to_numeric(data['white_blood_cell_count'], errors='coerce')
data['red_blood_cell_count'] = pd.to_numeric(data['red_blood_cell_count'], errors='coerce')
# Extracting categorical and numerical columns
cat_cols = [col for col in data.columns if data[col].dtype == 'object']
num_cols = [col for col in data.columns if data[col].dtype != 'object']

In [6]:
# replace incorrect values
data['diabetes_mellitus'].replace(to_replace = {'\tno':'no','\tyes':'yes',' yes':'yes'},inplace=True)
data['coronary_artery_disease'] = data['coronary_artery_disease'].replace(to_replace = '\tno', value='no')
data['class'] = data['class'].replace(to_replace = {'ckd\t': 'ckd', 'notckd': 'not ckd'})

# replacing 'ckd' with 0 and 'not ckd' with 1
data['class'] = data['class'].map({'ckd': 0, 'not ckd': 1})

# making 'class' column into a numerical column
data['class'] = pd.to_numeric(data['class'], errors='coerce')

In [7]:
data.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,...,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,aanemia,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,0
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,0
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,0
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,0
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,0


In [8]:
from sklearn.impute import KNNImputer 

In [9]:
def encode(col):
    '''function to encode non-null col and replace it in the original col'''
    #retains only non-null values
    nonulls = np.array(col.dropna())
    #reshapes the col for encoding
    impute_reshape = nonulls.reshape(-1,1)
    #encode date
    impute_ordinal = encoder.fit_transform(impute_reshape)
    #Assign back encoded values to non-null values
    col.loc[col.notnull()] = np.squeeze(impute_ordinal)
    return col

In [10]:
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
for i in cat_cols:
    encode(data[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_

In [11]:
data.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,...,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,aanemia,class
0,48.0,80.0,1.02,1.0,0.0,,1.0,0.0,0.0,121.0,...,44.0,7800.0,5.2,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,7.0,50.0,1.02,4.0,0.0,,1.0,0.0,0.0,,...,38.0,6000.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62.0,80.0,1.01,2.0,3.0,1.0,1.0,0.0,0.0,423.0,...,31.0,7500.0,,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,48.0,70.0,1.005,4.0,0.0,1.0,0.0,1.0,0.0,117.0,...,32.0,6700.0,3.9,1.0,0.0,0.0,1.0,1.0,1.0,0.0
4,51.0,80.0,1.01,2.0,0.0,1.0,1.0,0.0,0.0,106.0,...,35.0,7300.0,4.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
Imputer = KNNImputer(n_neighbors=5)
Impute_data= pd.DataFrame(Imputer.fit_transform(data))
for i,j in enumerate(data.columns):
    data[j] = Impute_data[i] 
data.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,...,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,aanemia,class
0,48.0,80.0,1.02,1.0,0.0,0.8,1.0,0.0,0.0,121.0,...,44.0,7800.0,5.2,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,7.0,50.0,1.02,4.0,0.0,0.6,1.0,0.0,0.0,113.0,...,38.0,6000.0,4.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62.0,80.0,1.01,2.0,3.0,1.0,1.0,0.0,0.0,423.0,...,31.0,7500.0,3.8,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,48.0,70.0,1.005,4.0,0.0,1.0,0.0,1.0,0.0,117.0,...,32.0,6700.0,3.9,1.0,0.0,0.0,1.0,1.0,1.0,0.0
4,51.0,80.0,1.01,2.0,0.0,1.0,1.0,0.0,0.0,106.0,...,35.0,7300.0,4.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
data.isna().sum()

age                        0
blood_pressure             0
specific_gravity           0
albumin                    0
sugar                      0
red_blood_cells            0
pus_cell                   0
pus_cell_clumps            0
bacteria                   0
blood_glucose_random       0
blood_urea                 0
serum_creatinine           0
sodium                     0
potassium                  0
haemoglobin                0
packed_cell_volume         0
white_blood_cell_count     0
red_blood_cell_count       0
hypertension               0
diabetes_mellitus          0
coronary_artery_disease    0
appetite                   0
peda_edema                 0
aanemia                    0
class                      0
dtype: int64

In [14]:
# import module
from sklearn.preprocessing import MinMaxScaler
# scale features
scaler = MinMaxScaler()
model=scaler.fit(data)
Impute_data=model.transform(data)

# print scaled features
data = pd.DataFrame(Impute_data,columns = data.columns)
data.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,...,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,aanemia,class
0,0.522727,0.230769,0.75,0.2,0.0,0.8,1.0,0.0,0.0,0.211538,...,0.777778,0.231405,0.525424,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.056818,0.0,0.75,0.8,0.0,0.6,1.0,0.0,0.0,0.194444,...,0.644444,0.157025,0.484746,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.681818,0.230769,0.25,0.4,0.6,1.0,1.0,0.0,0.0,0.856838,...,0.488889,0.219008,0.288136,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.522727,0.153846,0.0,0.8,0.0,1.0,0.0,1.0,0.0,0.202991,...,0.511111,0.18595,0.305085,1.0,0.0,0.0,1.0,1.0,1.0,0.0
4,0.556818,0.230769,0.25,0.4,0.0,1.0,1.0,0.0,0.0,0.179487,...,0.577778,0.210744,0.423729,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
from collections import Counter
Counter(data["class"])

Counter({0.0: 250, 1.0: 150})

### DATA RESAMPLING

### GENERATING TEST DATA

In [16]:
X = data.drop(['class'],axis=1)
y = data["class"]

In [17]:
from sklearn.utils import Bunch
from src.measures import tlcm, degIR, degOver, imbalance_ratio, n_1_imb_mean, n_3_imb_mean

dataset = Bunch(data=np.array(X),target=np.array(y))
TLCM = tlcm(dataset)
print("TLCM",TLCM)

TLCM 0.006666666666666667


In [17]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state=42)

In [18]:
print(sum(y_train==1)
,sum(y_train==0)
,sum(y_test==1)
,sum(y_test==0)
     )

131 209 19 41


In [19]:
(sum(y_train==1)+sum(y_test==1))/(sum(y_train==0)+sum(y_test==0))
     

0.6

In [20]:
train_data = x_train
train_label = y_train
test_data = x_test
test_label=y_test

In [21]:
def cost_weights(indices,y_train,k):
    from sklearn.preprocessing import MinMaxScaler
    class_0 = []
    class_1 = []
    data = pd.DataFrame(indices)
    data = data.set_index(data.columns[0])
    for i in data.index:
        dict1 = {}
        for j in data.iloc[i]:
            dict1[y_train.iloc[j]] = dict1.get(y_train.iloc[j],0)+1
        class_0.append(dict1.get(0,0))
        class_1.append(dict1.get(1,0))
    data["class_0"] = class_0
    data["class_1"] = class_1
    data["Class"] = y_train
    data["cost"] = np.where(data["Class"]==0,((data["class_1"]+1)/k)*sum(data["Class"]==1),((data["class_0"]+1)/k)*sum(data["Class"]==0))
    return data["cost"]

def AdaWeight(x_train,y_train,k):
    from sklearn.neighbors import NearestNeighbors
    nbrs = NearestNeighbors(n_neighbors=k).fit(x_train)
    dist, indices = nbrs.kneighbors(x_train)
    return cost_weights(indices,y_train,k)

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=LogisticRegression(random_state=42)
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.9833333333333333
Precision score 0.9743589743589743
Recall score 1.0
F1 score 0.95
Kappa score 0.9620253164556962
Gmean weighted score 0.9746794344808963
Roc auc score 0.9878048780487805
Precision -Recall score 0.9666666666666667


In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=LogisticRegression(random_state=42,class_weight={0:sum(np.array(train_label)==1),1:sum(np.array(train_label)==0)})
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 1.0
Precision score 1.0
Recall score 1.0
F1 score 1.0
Kappa score 1.0
Gmean weighted score 1.0
Roc auc score 1.0
Precision -Recall score 1.0


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=LogisticRegression(random_state=42,class_weight="balanced")
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.9833333333333333
Precision score 0.9743589743589743
Recall score 1.0
F1 score 0.95
Kappa score 0.9620253164556962
Gmean weighted score 0.9746794344808963
Roc auc score 0.9878048780487805
Precision -Recall score 0.9666666666666667


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=LogisticRegression(random_state=42)
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel(),AdaWeight(train_data,train_label,k=5))
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 1.0
Precision score 1.0
Recall score 1.0
F1 score 1.0
Kappa score 1.0
Gmean weighted score 1.0
Roc auc score 1.0
Precision -Recall score 1.0


In [26]:
from sklearn.utils import Bunch
from src.measures import tlcm, degIR, degOver, imbalance_ratio, n_1_imb_mean, n_3_imb_mean

dataset = Bunch(data=np.array(X),target=np.array(y))
TLCM = tlcm(dataset)
IR   = imbalance_ratio(dataset)
N1   = n_1_imb_mean(dataset)
N3   = n_3_imb_mean(dataset)

In [27]:
def cal_class_weight(measure,value):
    C_W = {}
    if measure=="IR":
        C_W[0] = round((1/value)*100,2)
        C_W[1] = 100
    else:
        C_W[0] = round(value*100,2)
        C_W[1] = 100
    return C_W

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=LogisticRegression(random_state=42,class_weight=cal_class_weight('IR',imbalance_ratio(dataset)))
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 1.0
Precision score 1.0
Recall score 1.0
F1 score 1.0
Kappa score 1.0
Gmean weighted score 1.0
Roc auc score 1.0
Precision -Recall score 1.0


In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=LogisticRegression(random_state=42,class_weight=cal_class_weight('TLCM',tlcm(dataset)))
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.95
Precision score 0.9268292682926829
Recall score 1.0
F1 score 0.8636363636363636
Kappa score 0.8891625615763546
Gmean weighted score 0.9293203772845852
Roc auc score 0.9634146341463415
Precision -Recall score 0.9136363636363637


In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=LogisticRegression(random_state=42,class_weight=cal_class_weight('N1',n_1_imb_mean(dataset)))
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.9666666666666667
Precision score 0.9500000000000001
Recall score 1.0
F1 score 0.9047619047619048
Kappa score 0.9250936329588015
Gmean weighted score 0.9511897312113419
Roc auc score 0.975609756097561
Precision -Recall score 0.9380952380952381


In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=LogisticRegression(random_state=42,class_weight=cal_class_weight('N3',n_3_imb_mean(dataset)))
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.9666666666666667
Precision score 0.9500000000000001
Recall score 1.0
F1 score 0.9047619047619048
Kappa score 0.9250936329588015
Gmean weighted score 0.9511897312113419
Roc auc score 0.975609756097561
Precision -Recall score 0.9380952380952381
