# <span style='color:#F52887'> Import Libraries </span>

In [17]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases dropped the old names, so apply whichever spelling this
# installation actually ships (the default style is kept if neither exists).
for _style_name in ('seaborn-v0_8-deep', 'seaborn-deep'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
import matplotlib
# Notebook-wide figure defaults: larger axis/tick labels and 'm' text colour.
matplotlib.rcParams['axes.labelsize'] = 18
matplotlib.rcParams['xtick.labelsize'] = 18
matplotlib.rcParams['ytick.labelsize'] = 18
matplotlib.rcParams['text.color'] = 'm'

import seaborn as sns
import plotly.express as px
# NOTE(review): the original alias `px2ww` looks like stray keystrokes; it is
# kept as a second name so any existing reference to it still resolves.
px2ww = px
import plotly.figure_factory as ff
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# <span style='color:#F52887'> Read Data </span>

In [18]:
# Load the raw records from the CSV in the working directory and preview
# the first five rows.
kid = pd.read_csv(filepath_or_buffer="Kidney.csv")
kid.head(5)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48,80,1.02,1,0,?,normal,notpresent,notpresent,121,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7,50,1.02,4,0,?,normal,notpresent,notpresent,?,...,38,6000,?,no,no,no,good,no,no,ckd
2,62,80,1.01,2,3,normal,normal,notpresent,notpresent,423,...,31,7500,?,no,yes,no,poor,no,yes,ckd
3,48,70,1.005,4,0,normal,abnormal,present,notpresent,117,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51,80,1.01,2,0,normal,normal,notpresent,notpresent,106,...,35,7300,4.6,no,no,no,good,no,no,ckd


## <span style='color:#FF00FF'> Shuffle Dataframe </span>

In [19]:
# Shuffle all rows. A fixed seed keeps the row order -- and therefore every
# downstream output -- identical across "Restart Kernel -> Run All".
SEED = 42
kid = kid.sample(frac=1, random_state=SEED)
# Keep a handle on the label column of the shuffled frame.
cls = kid['class']
kid.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
64,55,80,1.010,0,0,?,normal,notpresent,notpresent,146,...,?,?,?,no,no,no,good,no,no,ckd
336,25,60,1.020,0,0,normal,normal,notpresent,notpresent,119,...,40,9200,5.2,no,no,no,good,no,no,notckd
378,71,60,1.025,0,0,normal,normal,notpresent,notpresent,?,...,42,7700,5.5,no,no,no,good,no,no,notckd
202,78,60,?,?,?,?,?,notpresent,notpresent,114,...,24,?,?,no,yes,no,good,no,yes,ckd
364,73,80,1.025,0,0,normal,normal,notpresent,notpresent,118,...,45,9300,4.7,no,no,no,good,no,no,notckd


## <span style='color:#FF00FF'> Data Shape </span>

In [20]:
# Report the frame's dimensions between two horizontal rules.
n_rows, n_cols = kid.shape
divider = "______________________________________________________"
print(divider)
print("\tData has {} Rows and {} Columns.".format(n_rows, n_cols))
print(divider)

______________________________________________________
	Data has 400 Rows and 25 Columns.
______________________________________________________


## <span style='color:#FF00FF'> Data Information </span>

In [21]:
# Column names, non-null counts and dtypes of the raw frame.
kid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 64 to 279
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   age     400 non-null    object
 1   bp      400 non-null    object
 2   sg      400 non-null    object
 3   al      400 non-null    object
 4   su      400 non-null    object
 5   rbc     400 non-null    object
 6   pc      400 non-null    object
 7   pcc     400 non-null    object
 8   ba      400 non-null    object
 9   bgr     400 non-null    object
 10  bu      400 non-null    object
 11  sc      400 non-null    object
 12  sod     400 non-null    object
 13  pot     400 non-null    object
 14  hemo    400 non-null    object
 15  pcv     400 non-null    object
 16  wbcc    400 non-null    object
 17  rbcc    400 non-null    object
 18  htn     400 non-null    object
 19  dm      400 non-null    object
 20  cad     400 non-null    object
 21  appet   400 non-null    object
 22  pe      400 non-null    o

## <span style='color:#FF00FF'> Summary Statistics </span>

In [22]:
# Per-column summary; while the columns are still object dtype this reports
# count / unique / top / freq rather than numeric statistics.
kid.describe()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
count,400,400,400.0,400,400,400,400,400,400,400,...,400,400,400,400,400,400,400,400,400,400
unique,77,11,6.0,7,7,3,3,3,3,147,...,43,90,46,3,3,3,3,3,3,2
top,60,80,1.02,0,0,normal,normal,notpresent,notpresent,?,...,?,?,?,no,no,no,good,no,no,ckd
freq,19,116,106.0,199,290,201,259,354,374,44,...,71,106,131,251,261,364,317,323,339,250


# <span style='color:#F52887'> Data Preprocessing </span>

In [23]:
uchar = "?"            # placeholder the raw file uses for a missing value
# Swap every cell that is exactly "?" for NaN in one vectorised pass.
# The earlier cell-by-cell loop assigned through kid.iloc[i][j], i.e. chained
# indexing, which can write to a temporary copy instead of the frame.
kid = kid.replace(uchar, np.nan)

## <span style='color:#FF00FF'> Check NAN values </span>

In [24]:
# Number of missing (NaN) entries in each column.
kid.isna().sum()

age        9
bp        12
sg        47
al        46
su        49
rbc      152
pc        65
pcc        4
ba         4
bgr       44
bu        19
sc        17
sod       87
pot       88
hemo      52
pcv       71
wbcc     106
rbcc     131
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
class      0
dtype: int64

## <span style='color:#FF00FF'> Data Cleaning - Filling NaN with 0 </span>

In [25]:
# Replace every NaN with 0, in numeric and categorical columns alike.
# NOTE(review): 0 is only a placeholder here. The categorical zeros are
# swapped for the most frequent category in a later cell, but zeros in the
# numeric columns (e.g. sg, pcv) stay in the data -- confirm this is intended.
kid=kid.fillna(0)

In [26]:
# Confirm that no NaN entries remain after the fill.
kid.isna().sum()

age      0
bp       0
sg       0
al       0
su       0
rbc      0
pc       0
pcc      0
ba       0
bgr      0
bu       0
sc       0
sod      0
pot      0
hemo     0
pcv      0
wbcc     0
rbcc     0
htn      0
dm       0
cad      0
appet    0
pe       0
ane      0
class    0
dtype: int64

In [10]:
# Peek at the first two rows after the fill.
kid.head(2)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
282,20,70,1.02,0,0,normal,normal,notpresent,notpresent,123,...,44,5500,4.8,no,no,no,good,no,no,notckd
212,40,70,1.015,3,4,normal,normal,notpresent,notpresent,253,...,31,8800,3.4,yes,yes,no,poor,yes,no,ckd


## <span style='color:#FF00FF'> Data Reconstruction with variable types </span>

### <span style='color:#A74AC7'> Reconstruction of Numerical data </span>

In [11]:
# Cast each measurement column from object to its numeric type.
numeric_casts = {
    'age': float, 'bp': float, 'sg': float,
    'al': int, 'su': int, 'bgr': int,
    'bu': float, 'sc': float, 'sod': float, 'pot': float,
    'hemo': float, 'pcv': float,
    'wbcc': int, 'rbcc': float,
}
for column, target_type in numeric_casts.items():
    kid[column] = np.array(kid[column], target_type)
kid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 282 to 191
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     400 non-null    float64
 1   bp      400 non-null    float64
 2   sg      400 non-null    float64
 3   al      400 non-null    int32  
 4   su      400 non-null    int32  
 5   rbc     400 non-null    object 
 6   pc      400 non-null    object 
 7   pcc     400 non-null    object 
 8   ba      400 non-null    object 
 9   bgr     400 non-null    int32  
 10  bu      400 non-null    float64
 11  sc      400 non-null    float64
 12  sod     400 non-null    float64
 13  pot     400 non-null    float64
 14  hemo    400 non-null    float64
 15  pcv     400 non-null    float64
 16  wbcc    400 non-null    int32  
 17  rbcc    400 non-null    float64
 18  htn     400 non-null    object 
 19  dm      400 non-null    object 
 20  cad     400 non-null    object 
 21  appet   400 non-null    object 
 22  

### <span style='color:#A74AC7'> Reconstruction of Object data </span>

In [12]:
# Names of the columns whose dtype is still 'object' (the categorical fields).
objtypes = [name for name, dtype in kid.dtypes.items() if dtype == 'object']
objtypes

['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'class']

### <span style='color:#571B7E'> Before Reconstruction </span>

In [13]:
# Show the category frequencies of every object column before imputation.
for column in objtypes:
    print(kid[column].value_counts())

normal      201
0           152
abnormal     47
Name: rbc, dtype: int64
normal      259
abnormal     76
0            65
Name: pc, dtype: int64
notpresent    354
present        42
0               4
Name: pcc, dtype: int64
notpresent    374
present        22
0               4
Name: ba, dtype: int64
no     251
yes    147
0        2
Name: htn, dtype: int64
no     261
yes    137
0        2
Name: dm, dtype: int64
no     364
yes     34
0        2
Name: cad, dtype: int64
good    317
poor     82
0         1
Name: appet, dtype: int64
no     323
yes     76
0        1
Name: pe, dtype: int64
no     339
yes     60
0        1
Name: ane, dtype: int64
ckd       250
notckd    150
Name: class, dtype: int64


### <span style='color:#571B7E'> After Reconstruction </span>

In [14]:
# Replace the 0 placeholders in each categorical column with that column's
# most frequent real category (mode imputation), then print the new counts.
for col in objtypes:
    is_placeholder = kid[col] == 0
    # Frequencies of the genuine categories only, most common first.
    observed_counts = kid.loc[~is_placeholder, col].value_counts()
    # Skip columns with nothing to fill, or with no real category to fill from.
    if is_placeholder.any() and not observed_counts.empty:
        # A single .loc assignment writes into the frame itself; the previous
        # kid[col][i] = ... form was chained indexing and could modify a copy.
        kid.loc[is_placeholder, col] = observed_counts.idxmax()
    print(kid[col].value_counts())

## <span style='color:#FF00FF'> Printing Cleaned Data </span>

normal      353
abnormal     47
Name: rbc, dtype: int64
normal      324
abnormal     76
Name: pc, dtype: int64
notpresent    358
present        42
Name: pcc, dtype: int64
notpresent    378
present        22
Name: ba, dtype: int64
no     253
yes    147
Name: htn, dtype: int64
no     263
yes    137
Name: dm, dtype: int64
no     366
yes     34
Name: cad, dtype: int64
good    318
poor     82
Name: appet, dtype: int64
no     324
yes     76
Name: pe, dtype: int64
no     340
yes     60
Name: ane, dtype: int64
ckd       250
notckd    150
Name: class, dtype: int64


## <span style='color:#FF00FF'> Printing Cleaned Data </span>

In [15]:
# First five rows of the cleaned frame.
kid.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
282,20.0,70.0,1.02,0,0,normal,normal,notpresent,notpresent,123,...,44.0,5500,4.8,no,no,no,good,no,no,notckd
212,40.0,70.0,1.015,3,4,normal,normal,notpresent,notpresent,253,...,31.0,8800,3.4,yes,yes,no,poor,yes,no,ckd
346,33.0,60.0,0.0,0,0,normal,normal,notpresent,notpresent,130,...,52.0,4300,5.8,no,no,no,good,no,no,notckd
175,60.0,50.0,1.01,0,0,normal,normal,notpresent,notpresent,261,...,0.0,4200,3.4,yes,no,no,good,no,no,ckd
246,48.0,110.0,1.015,3,0,abnormal,normal,present,notpresent,106,...,26.0,5000,2.5,yes,no,yes,good,no,yes,ckd


In [16]:
# Dtypes and non-null counts after cleaning and type conversion.
kid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 282 to 191
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     400 non-null    float64
 1   bp      400 non-null    float64
 2   sg      400 non-null    float64
 3   al      400 non-null    int32  
 4   su      400 non-null    int32  
 5   rbc     400 non-null    object 
 6   pc      400 non-null    object 
 7   pcc     400 non-null    object 
 8   ba      400 non-null    object 
 9   bgr     400 non-null    int32  
 10  bu      400 non-null    float64
 11  sc      400 non-null    float64
 12  sod     400 non-null    float64
 13  pot     400 non-null    float64
 14  hemo    400 non-null    float64
 15  pcv     400 non-null    float64
 16  wbcc    400 non-null    int32  
 17  rbcc    400 non-null    float64
 18  htn     400 non-null    object 
 19  dm      400 non-null    object 
 20  cad     400 non-null    object 
 21  appet   400 non-null    object 
 22  

## <span style='color:#FF00FF'> Data Attributes </span>

Data Attributes

    age - age
    bp - blood pressure
    sg - specific gravity
    al - albumin
    su - sugar
    rbc - red blood cells
    pc - pus cell
    pcc - pus cell clumps
    ba - bacteria
    bgr - blood glucose random
    bu - blood urea
    sc - serum creatinine
    sod - sodium
    pot - potassium
    hemo - hemoglobin
    pcv - packed cell volume
    wbcc - white blood cell count
    rbcc - red blood cell count
    htn - hypertension
    dm - diabetes mellitus
    cad - coronary artery disease
    appet - appetite
    pe - pedal edema
    ane - anemia
    class - class