# Importing The Required Python Packages

In [1]:
# Suppress every warning for the rest of the kernel session (the "ignore"
# action is installed with no category/module filter, so it applies globally).
import warnings
warnings.filterwarnings("ignore")
# Numeric and tabular stack.
import numpy as np
import pandas as pd
# Plotting: matplotlib defaults are overridden below so that axis labels and
# tick labels use size 15 and all figure text is drawn in colour '#FF1493'.
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['axes.labelsize']=15
matplotlib.rcParams['xtick.labelsize']=15
matplotlib.rcParams['ytick.labelsize']=15
matplotlib.rcParams['text.color']='#FF1493'
import seaborn as sns
import plotly.express as px
# Modelling utilities: train/test splitting, logistic regression, and
# classification metrics.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score,recall_score,f1_score,classification_report,accuracy_score

## Read the data in Python through pandas

In [1]:
# Relative location of the kidney-disease CSV on disk.
kidney_disease_PATH1 = "../../DATASETS/ML_Training/kidney_disease.csv"
# Parse the CSV into the working DataFrame that the following cells inspect.
data = pd.read_csv(filepath_or_buffer=kidney_disease_PATH1)

NameError: name 'pd' is not defined

In [4]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [5]:
# List the column labels as they come from the CSV (abbreviated names).
data.columns

Index(['id', 'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
       'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

The data was collected over a 2-month period in India and has 25 features (e.g., red blood cell count, white blood cell count, etc.). The target is 'classification', which is either 'ckd' or 'notckd' (ckd = chronic kidney disease). There are 400 rows.

The data needs cleaning: it contains NaNs, and the numeric features need to be coerced to floats. We were instructed to drop ALL rows with NaNs, with no threshold — any row that has even one NaN gets deleted.

Part 1: We are asked to choose 3 features (bgr, rc, wc), visualize them, and then run PCA with n_components=2.
The PCA is to be run twice: once without scaling and once WITH scaling. This is where my issue starts: after scaling I can hardly see any difference!

## ATTRIBUTE DETAILS

```
age - age
bp - blood pressure
sg - specific gravity
al - albumin
su - sugar
rbc - red blood cells
pc - pus cell
pcc - pus cell clumps
ba - bacteria
bgr - blood glucose random
bu - blood urea
sc - serum creatinine
sod - sodium
pot - potassium
hemo - hemoglobin
pcv - packed cell volume
wc - white blood cell count
rc - red blood cell count
htn - hypertension
dm - diabetes mellitus
cad - coronary artery disease
appet - appetite
pe - pedal edema
ane - anemia
classification - class (ckd / notckd)
```

In [6]:
# Swap the terse dataset abbreviations for human-readable column labels.
# The frame is modified in place; 'id' is not in the mapping and keeps its name.
data.rename(
    columns={
        'age': 'Age',
        'bp': 'Blood pressure',
        'sg': 'Specific gravity',
        'al': 'Albumin',
        'su': 'Sugar',
        'rbc': 'Red blood cells',
        'pc': 'Pus cell',
        'pcc': 'pus cell clumps',
        'ba': 'Bacteria',
        'bgr': 'Blood glucose random',
        'bu': 'Blood urea',
        'sc': 'Serum creatinine',
        'sod': 'Sodium',
        'pot': 'Potassium',
        'hemo': 'Hemoglobin',
        'pcv': 'Packed cell volume',
        'wc': 'White blood cell count',
        'rc': 'Red blood cell count',
        'htn': 'Hypertension',
        'dm': 'Diabetes mellitus',
        'cad': 'Coronary artery disease',
        'appet': 'Appetite',
        'pe': 'Pedal edema',
        'ane': 'Anemia',
        'classification': 'Class',
    },
    inplace=True,
)

In [7]:
# List the column labels again to confirm the descriptive names are in place.
data.columns

Index(['id', 'Age', 'Blood pressure', 'Specific gravity', 'Albumin', 'Sugar',
       'Red blood cells', 'Pus cell', 'pus cell clumps', 'Bacteria',
       'Blood glucose random', 'Blood urea', 'Serum creatinine', 'Sodium',
       'Potassium', 'Hemoglobin', 'Packed cell volume',
       'White blood cell count', 'Red blood cell count', 'Hypertension',
       'Diabetes mellitus', 'Coronary artery disease', 'Appetite',
       'Pedal edema', 'Anemia', 'Class'],
      dtype='object')

#### We use 24 features + class = 25 attributes (11 numeric, 14 nominal)

```
1.Age(numerical) age in years
2.Blood Pressure(numerical) bp in mm/Hg
3.Specific Gravity(nominal) sg - (1.005,1.010,1.015,1.020,1.025)
4.Albumin(nominal) al - (0,1,2,3,4,5)
5.Sugar(nominal) su - (0,1,2,3,4,5)
6.Red Blood Cells(nominal) rbc - (normal,abnormal)
7.Pus Cell (nominal) pc - (normal,abnormal)
8.Pus Cell clumps(nominal) pcc - (present,notpresent)
9.Bacteria(nominal) ba - (present,notpresent)
10.Blood Glucose Random(numerical) bgr in mgs/dl
11.Blood Urea(numerical) bu in mgs/dl
12.Serum Creatinine(numerical) sc in mgs/dl
13.Sodium(numerical) sod in mEq/L
14.Potassium(numerical) pot in mEq/L
15.Hemoglobin(numerical) hemo in gms
16.Packed Cell Volume(numerical) pcv
17.White Blood Cell Count(numerical) wc in cells/cumm
18.Red Blood Cell Count(numerical) rc in millions/cmm
19.Hypertension(nominal) htn - (yes,no)
20.Diabetes Mellitus(nominal) dm - (yes,no)
21.Coronary Artery Disease(nominal) cad - (yes,no)
22.Appetite(nominal) appet - (good,poor)
23.Pedal Edema(nominal) pe - (yes,no)
24.Anemia(nominal) ane - (yes,no)
25.Class (nominal) class - (ckd,notckd)
```

In [8]:
# (rows, columns) of the frame.
data.shape

(400, 26)

In [9]:
# Per-column non-null counts and dtypes; shows which columns have missing
# values and which were parsed as object rather than numeric.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       400 non-null    int64  
 1   Age                      391 non-null    float64
 2   Blood pressure           388 non-null    float64
 3   Specific gravity         353 non-null    float64
 4   Albumin                  354 non-null    float64
 5   Sugar                    351 non-null    float64
 6   Red blood cells          248 non-null    object 
 7   Pus cell                 335 non-null    object 
 8   pus cell clumps          396 non-null    object 
 9   Bacteria                 396 non-null    object 
 10  Blood glucose random     356 non-null    float64
 11  Blood urea               381 non-null    float64
 12  Serum creatinine         383 non-null    float64
 13  Sodium                   313 non-null    float64
 14  Potassium                3

In [13]:
# NOTE: despite its name, `datacorr` holds the describe() summary statistics
# (count, mean, std, quartiles, ...) of the numeric columns, not a correlation
# matrix.
datacorr=data.describe()

In [None]:
# Pairwise correlation (pandas default: Pearson) of the numeric columns only.
# The frame still holds object-dtype columns (e.g. 'Red blood cells',
# 'Pus cell'), which recent pandas versions refuse to correlate, so restrict
# to numeric dtypes explicitly instead of relying on implicit column dropping.
data.select_dtypes(include="number").corr()

In [12]:
# Display the describe() summary stored in `datacorr`.
datacorr

Unnamed: 0,id,Age,Blood pressure,Specific gravity,Albumin,Sugar,Blood glucose random,Blood urea,Serum creatinine,Sodium,Potassium,Hemoglobin
count,400.0,391.0,388.0,353.0,354.0,351.0,356.0,381.0,383.0,313.0,312.0,348.0
mean,199.5,51.483376,76.469072,1.017408,1.016949,0.450142,148.036517,57.425722,3.072454,137.528754,4.627244,12.526437
std,115.614301,17.169714,13.683637,0.005717,1.352679,1.099191,79.281714,50.503006,5.741126,10.408752,3.193904,2.912587
min,0.0,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1
25%,99.75,42.0,70.0,1.01,0.0,0.0,99.0,27.0,0.9,135.0,3.8,10.3
50%,199.5,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65
75%,299.25,64.5,80.0,1.02,2.0,0.0,163.0,66.0,2.8,142.0,4.9,15.0
max,399.0,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8


In [20]:
# Per-column non-null counts and dtypes of the frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       400 non-null    int64  
 1   Age                      391 non-null    float64
 2   Blood pressure           388 non-null    float64
 3   Specific gravity         353 non-null    float64
 4   Albumin                  354 non-null    float64
 5   Sugar                    351 non-null    float64
 6   Red blood cells          248 non-null    object 
 7   Pus cell                 335 non-null    object 
 8   pus cell clumps          396 non-null    object 
 9   Bacteria                 396 non-null    object 
 10  Blood glucose random     356 non-null    float64
 11  Blood urea               381 non-null    float64
 12  Serum creatinine         383 non-null    float64
 13  Sodium                   313 non-null    float64
 14  Potassium                3

In [21]:
# Print the frequency table of each listed column so that stray labels
# (whitespace-padded or placeholder entries) become visible.
# Note: 'Class' is listed at both ends, so its table is printed twice, and the
# 'Pus cell' table is followed by one extra blank line.
tally_order = [
    'Class',
    'Red blood cells',
    'Pus cell',
    'pus cell clumps',
    'Bacteria',
    'Packed cell volume',
    'White blood cell count',
    'Red blood cell count',
    'Hypertension',
    'Diabetes mellitus',
    'Coronary artery disease',
    'Appetite',
    'Pedal edema',
    'Anemia',
    'Class',
]
for column_label in tally_order:
    separator = "\n\n" if column_label == 'Pus cell' else "\n"
    print(data[column_label].value_counts(), separator)

ckd       248
notckd    150
ckd\t       2
Name: Class, dtype: int64 

normal      201
abnormal     47
Name: Red blood cells, dtype: int64 

normal      259
abnormal     76
Name: Pus cell, dtype: int64 


notpresent    354
present        42
Name: pus cell clumps, dtype: int64 

notpresent    374
present        22
Name: Bacteria, dtype: int64 

41      21
52      21
44      19
48      19
40      16
43      14
42      13
45      13
32      12
36      12
33      12
50      12
28      12
34      11
37      11
30       9
29       9
35       9
46       9
31       8
24       7
39       7
26       6
38       5
53       4
51       4
49       4
47       4
54       4
25       3
27       3
22       3
19       2
23       2
15       1
21       1
17       1
20       1
\t43     1
18       1
9        1
14       1
\t?      1
16       1
Name: Packed cell volume, dtype: int64 

9800     11
6700     10
9200      9
9600      9
7200      9
         ..
19100     1
\t?       1
12300     1
14900     1
12700     