In [17]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the kidney data set from the CSV file sitting next to the notebook.
data = pd.read_csv(filepath_or_buffer='kidneydata.csv')
# Preview the first five rows.
data.head(n=5)

In [3]:
# (number of rows, number of columns) of the frame as loaded.
data.shape

(400, 26)

In [4]:
# Number of missing entries in each column (`isna` is the same method as `isnull`).
data.isna().sum()

id                  0
age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [None]:
# Keep only the rows in which every column has a value.
# axis=0 / how='any' are the defaults, written out to make the rule explicit.
data = data.dropna(axis=0, how='any')
data

In [11]:
# A notebook cell only displays its last expression, so a bare null count
# placed before `data.head()` is computed and then thrown away.
# State the invariant as an assertion so the check actually does something.
assert not data.isnull().values.any(), "missing values remain in `data`"
data.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
9,9,53.0,90.0,1.02,2.0,0.0,abnormal,abnormal,present,notpresent,...,29,12100,3.7,yes,yes,no,poor,no,yes,ckd
11,11,63.0,70.0,1.01,3.0,0.0,abnormal,abnormal,present,notpresent,...,32,4500,3.8,yes,yes,no,poor,yes,no,ckd
14,14,68.0,80.0,1.01,3.0,2.0,normal,abnormal,present,present,...,16,11000,2.6,yes,yes,yes,poor,yes,no,ckd
20,20,61.0,80.0,1.015,2.0,0.0,abnormal,abnormal,notpresent,notpresent,...,24,9200,3.2,yes,yes,yes,poor,yes,yes,ckd


Convert categorical data to numerical data

In [12]:
# Integer code for every label of every categorical column.
# Note the direction of the codes: for 'pcc' and 'ba' the value 1 means
# 'notpresent', and for 'classification' 0 means 'ckd' and 1 means 'notckd'.
category_codes = {
    'rbc': {'normal': 0, 'abnormal': 1},
    'pc': {'normal': 0, 'abnormal': 1},
    'pcc': {'present': 0, 'notpresent': 1},
    'ba': {'present': 0, 'notpresent': 1},
    'htn': {'yes': 1, 'no': 0},
    'dm': {'no': 0, 'yes': 1},
    'cad': {'no': 0, 'yes': 1},
    'appet': {'poor': 0, 'good': 1},
    'pe': {'no': 0, 'yes': 1},
    'ane': {'no': 0, 'yes': 1},
    'classification': {'ckd': 0, 'notckd': 1},
}

# Rebind `data` rather than mutating it with inplace=True, then cast the
# encoded columns to integers explicitly: the resulting dtype then does not
# depend on how `replace` infers it, and a label that is missing from the
# mapping above fails loudly here instead of further down the pipeline.
data = data.replace(category_codes).astype(
    {column: 'int64' for column in category_codes}
)

In [13]:
# First five rows again, to confirm the categorical columns now hold 0/1 codes.
data.head(n=5)

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
3,3,48.0,70.0,1.005,4.0,0.0,0,1,0,1,...,32,6700,3.9,1,0,0,0,1,1,0
9,9,53.0,90.0,1.02,2.0,0.0,1,1,0,1,...,29,12100,3.7,1,1,0,0,0,1,0
11,11,63.0,70.0,1.01,3.0,0.0,1,1,0,1,...,32,4500,3.8,1,1,0,0,1,0,0
14,14,68.0,80.0,1.01,3.0,2.0,0,1,0,0,...,16,11000,2.6,1,1,1,0,1,0,0
20,20,61.0,80.0,1.015,2.0,0.0,1,1,1,1,...,24,9200,3.2,1,1,1,0,1,1,0


# 0 --> ckd, meaning the patient has chronic kidney disease


# 1 --> notckd, meaning the patient does not have chronic kidney disease

In [14]:
# Features: every column except the target and the 'id' row identifier.
# 'id' is a record number rather than a measurement; leaving it in lets the
# model key on row order (NOTE(review): if the file is sorted by class, 'id'
# alone separates the classes -- confirm against the raw data).
x = data.drop(columns=['id', 'classification'])
# Target: the encoded class label (0 = ckd, 1 = notckd).
y = data['classification']

In [16]:
# StandardScaler is imported in the first cell together with the other dependencies.
# The scaler is only created here; it is fitted later, on the training split.
sc = StandardScaler()

In [18]:
# Hold out 20% of the rows for testing.
# random_state pins the shuffle so a fresh "Restart & Run All" reproduces the
# same split (and therefore the same scores); stratify=y keeps the
# ckd / notckd proportion the same in the train and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [19]:
# Learn the per-feature mean and standard deviation from the training rows only ...
x_train = sc.fit_transform(x_train)
# ... and apply those same statistics to the test rows. Calling fit_transform
# here would re-fit the scaler on the test set, so the two sets would be scaled
# differently and test information would leak into the preprocessing.
x_test = sc.transform(x_test)

In [20]:
# Reduce the scaled features to two principal components.
pca = PCA(n_components=2)
# The projection is learned from the training rows only.
X_train = pca.fit_transform(x_train)
# The test rows are projected onto those same components. Fitting a second PCA
# on the test set would produce different axes, so the model would be scored
# on coordinates that do not correspond to the ones it was trained on.
X_test = pca.transform(x_test)

In [None]:
# Logistic regression with default settings, trained on the PCA-projected training rows.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [23]:
# Predicted class labels for the held-out test rows
# (despite the `x_` prefix, these are predicted targets, not features).
x_pred = model.predict(X=X_test)

In [24]:
# Fraction of test rows classified correctly.
# accuracy_score expects (y_true, y_pred); the arguments are named so the
# ground truth and the predictions cannot be swapped by accident.
accuracy_score(y_true=y_test, y_pred=x_pred)

1.0

In [27]:
# Predicted class labels for the rows the model was trained on
# (used below to compare training accuracy with test accuracy).
x_prediction = model.predict(X=X_train)

In [31]:
# Display the predicted labels for the training rows.
x_prediction

array([1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1])

In [32]:
# Display the true labels of the training rows, for comparison with the predictions.
y_train

342    1
366    1
304    1
392    1
196    0
      ..
229    0
356    1
380    1
360    1
397    1
Name: classification, Length: 126, dtype: int64

In [28]:
# Fraction of training rows classified correctly.
# accuracy_score expects (y_true, y_pred); the arguments are named so the
# ground truth and the predictions cannot be swapped by accident.
accuracy_score(y_true=y_train, y_pred=x_prediction)

1.0