\newpage

## Question

1) We consider the dataset from the Early Stage of Indians Chronic Kidney Disease (CKD) project for the following analysis. 

In [7]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, TruncatedSVD, FactorAnalysis
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import scale

In [8]:
# Location of the raw CKD dataset. Set the KIDNEY_DISEASE_CSV environment
# variable to point at the file on another machine; when it is unset, the
# original author-local path is used, so existing runs behave as before.
DATA_PATH = os.environ.get(
    "KIDNEY_DISEASE_CSV",
    r"C:\Users\Wjy\Desktop\Assignment-6-Chronic-Kidney-Disease-Classification-Challenge\kidney_disease.csv",
)
kd = pd.read_csv(DATA_PATH)

In [9]:
# Remove the 'id' column (presumably a row identifier — confirm) before the
# analysis. errors='ignore' keeps this cell re-runnable: without it, a second
# execution raises KeyError because 'id' has already been dropped from kd.
kd = kd.drop(columns='id', errors='ignore')

In [10]:
# Summary statistics for the float64 columns; the differing values in the
# count row show that the columns have different numbers of non-missing entries.
kd.describe()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
count,391.0,388.0,353.0,354.0,351.0,356.0,381.0,383.0,313.0,312.0,348.0
mean,51.483376,76.469072,1.017408,1.016949,0.450142,148.036517,57.425722,3.072454,137.528754,4.627244,12.526437
std,17.169714,13.683637,0.005717,1.352679,1.099191,79.281714,50.503006,5.741126,10.408752,3.193904,2.912587
min,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1
25%,42.0,70.0,1.01,0.0,0.0,99.0,27.0,0.9,135.0,3.8,10.3
50%,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65
75%,64.5,80.0,1.02,2.0,0.0,163.0,66.0,2.8,142.0,4.9,15.0
max,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8


In [11]:
# Inspect the storage dtype of every column.
# NOTE(review): pcv, wc and rc are reported as object although the rows shown
# in this notebook contain numeric-looking values for them, so they presumably
# hold some non-numeric entries — TODO confirm and convert to numeric.
kd.dtypes

age               float64
bp                float64
sg                float64
al                float64
su                float64
rbc                object
pc                 object
pcc                object
ba                 object
bgr               float64
bu                float64
sc                float64
sod               float64
pot               float64
hemo              float64
pcv                object
wc                 object
rc                 object
htn                object
dm                 object
cad                object
appet              object
pe                 object
ane                object
classification     object
dtype: object

In [12]:
# Display the frame; pandas truncates the rendering to the first and last rows.
kd

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,,...,38,6000,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,140.0,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,75.0,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,100.0,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,114.0,...,51,7200,5.9,no,no,no,good,no,no,notckd


The classification problem is to predict whether a patient has chronic kidney disease (CKD) based on various medical indicators. According to a literature review by Sanmarchi et al. (2023), CKD is a state of progressive loss of kidney function that ultimately results in the need for renal replacement therapy. CKD prevalence is growing worldwide, so it is important to assess how to diagnose and treat patients with CKD efficiently. Our objective is therefore to use machine learning (ML) to distinguish healthy individuals from those with CKD.

2.

In [13]:
# Partition the column labels of kd by storage dtype: one index for the
# float64 columns and one for the object columns.
float_col = kd.select_dtypes(include=["float64"]).columns
object_col = kd.select_dtypes(include=["object"]).columns

In [14]:
def _flag_map(true_label, false_label):
    """Return a lookup sending ``true_label`` to True and ``false_label`` to False."""
    return {true_label: True, false_label: False}


# Per-column lookup tables (column name -> {raw label: bool}). Columns that
# share a vocabulary get equal but separate dictionaries.
mappings = {
    **{col: _flag_map('normal', 'abnormal') for col in ('rbc', 'pc')},
    **{col: _flag_map('present', 'notpresent') for col in ('pcc', 'ba')},
    **{col: _flag_map('yes', 'no') for col in ('htn', 'dm', 'cad', 'pe', 'ane')},
    'appet': _flag_map('good', 'poor'),
}


In [15]:
# Recode each listed column of kd using its lookup table. Values that are not
# keys of the table (including missing values) are left as they are.
for col_name in mappings:
    kd[col_name] = kd[col_name].replace(mappings[col_name])

In [16]:
# Preview the first 10 rows now that the mapped columns hold True/False;
# entries that were missing remain missing.
kd.head(10)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,,True,False,False,121.0,...,44,7800.0,5.2,True,True,False,True,False,False,ckd
1,7.0,50.0,1.02,4.0,0.0,,True,False,False,,...,38,6000.0,,False,False,False,True,False,False,ckd
2,62.0,80.0,1.01,2.0,3.0,True,True,False,False,423.0,...,31,7500.0,,False,True,False,False,False,True,ckd
3,48.0,70.0,1.005,4.0,0.0,True,False,True,False,117.0,...,32,6700.0,3.9,True,False,False,False,True,True,ckd
4,51.0,80.0,1.01,2.0,0.0,True,True,False,False,106.0,...,35,7300.0,4.6,False,False,False,True,False,False,ckd
5,60.0,90.0,1.015,3.0,0.0,,,False,False,74.0,...,39,7800.0,4.4,True,True,False,True,True,False,ckd
6,68.0,70.0,1.01,0.0,0.0,,True,False,False,100.0,...,36,,,False,False,False,True,False,False,ckd
7,24.0,,1.015,2.0,4.0,True,False,False,False,410.0,...,44,6900.0,5.0,False,True,False,True,True,False,ckd
8,52.0,100.0,1.015,3.0,0.0,True,False,True,False,138.0,...,33,9600.0,4.0,True,True,False,True,False,True,ckd
9,53.0,90.0,1.02,2.0,0.0,False,False,True,False,70.0,...,29,12100.0,3.7,True,True,False,False,False,True,ckd


In [17]:
# Standardise the float64 columns and write the scaled values back into kd.
# NOTE(review): the scaler is fitted on every row of kd; if a train/test split
# is performed later, it should be fitted on the training rows only to avoid
# leaking test-set statistics — confirm against the rest of the notebook.
scaler = StandardScaler().fit(kd[float_col])
kd[float_col] = scaler.transform(kd[float_col])