<h1>PROJECT ON ESTIMATION OF DIALYSIS</h1> 

# Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import patsy

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.utils import resample
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Importing the dataset

In [3]:
df = pd.read_csv('chronic_kidney_disease_full.csv')

In [4]:
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


In [5]:
df.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'class'],
      dtype='object')

In [6]:
df.isnull().sum()

age        9
bp        12
sg        47
al        46
su        49
rbc      152
pc        65
pcc        4
ba         4
bgr       44
bu        19
sc        17
sod       87
pot       88
hemo      52
pcv       71
wbcc     106
rbcc     131
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
class      0
dtype: int64

In [7]:
df_copy = df.dropna()

In [8]:
df_copy.shape

(158, 25)

In [9]:
df.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'class'],
      dtype='object')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     391 non-null    float64
 1   bp      388 non-null    float64
 2   sg      353 non-null    float64
 3   al      354 non-null    float64
 4   su      351 non-null    float64
 5   rbc     248 non-null    object 
 6   pc      335 non-null    object 
 7   pcc     396 non-null    object 
 8   ba      396 non-null    object 
 9   bgr     356 non-null    float64
 10  bu      381 non-null    float64
 11  sc      383 non-null    float64
 12  sod     313 non-null    float64
 13  pot     312 non-null    float64
 14  hemo    348 non-null    float64
 15  pcv     329 non-null    float64
 16  wbcc    294 non-null    float64
 17  rbcc    269 non-null    float64
 18  htn     398 non-null    object 
 19  dm      398 non-null    object 
 20  cad     398 non-null    object 
 21  appet   399 non-null    object 
 22  pe

# droping some features

In [11]:
df.drop(['rbc','pc','sod','pot','pcv','wbcc','rbcc'], axis=1, inplace=True)

# data preprocessing

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     391 non-null    float64
 1   bp      388 non-null    float64
 2   sg      353 non-null    float64
 3   al      354 non-null    float64
 4   su      351 non-null    float64
 5   pcc     396 non-null    object 
 6   ba      396 non-null    object 
 7   bgr     356 non-null    float64
 8   bu      381 non-null    float64
 9   sc      383 non-null    float64
 10  hemo    348 non-null    float64
 11  htn     398 non-null    object 
 12  dm      398 non-null    object 
 13  cad     398 non-null    object 
 14  appet   399 non-null    object 
 15  pe      399 non-null    object 
 16  ane     399 non-null    object 
 17  class   400 non-null    object 
dtypes: float64(9), object(9)
memory usage: 56.4+ KB


In [13]:
df_v1 = df

In [14]:
df_v1.replace(np.nan, 0, inplace=True)

In [15]:
df_v1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     400 non-null    float64
 1   bp      400 non-null    float64
 2   sg      400 non-null    float64
 3   al      400 non-null    float64
 4   su      400 non-null    float64
 5   pcc     400 non-null    object 
 6   ba      400 non-null    object 
 7   bgr     400 non-null    float64
 8   bu      400 non-null    float64
 9   sc      400 non-null    float64
 10  hemo    400 non-null    float64
 11  htn     400 non-null    object 
 12  dm      400 non-null    object 
 13  cad     400 non-null    object 
 14  appet   400 non-null    object 
 15  pe      400 non-null    object 
 16  ane     400 non-null    object 
 17  class   400 non-null    object 
dtypes: float64(9), object(9)
memory usage: 56.4+ KB


In [16]:
df_v1['pcc'].value_counts()

notpresent    354
present        42
0               4
Name: pcc, dtype: int64

In [17]:
df_v1['pcc'].replace(0, 'present', inplace=True)

In [18]:
df_v1['pcc'].value_counts()

notpresent    354
present        46
Name: pcc, dtype: int64

In [19]:
df_v1['ba'].value_counts()

notpresent    374
present        22
0               4
Name: ba, dtype: int64

In [20]:
df_v1['ba'].replace(0, 'present', inplace=True)

In [21]:
df_v1['ba'].value_counts()

notpresent    374
present        26
Name: ba, dtype: int64

In [22]:
df_v1['htn'].value_counts()

no     251
yes    147
0        2
Name: htn, dtype: int64

In [23]:
df_v1['htn'].replace(0, 'yes', inplace=True)

In [24]:
df_v1['htn'].value_counts()

no     251
yes    149
Name: htn, dtype: int64

In [25]:
df_v1['dm'].value_counts()

no     261
yes    137
0        2
Name: dm, dtype: int64

In [26]:
df_v1['dm'].replace(0, 'yes', inplace=True)

In [27]:
df_v1['dm'].value_counts()

no     261
yes    139
Name: dm, dtype: int64

In [28]:
df_v1['cad'].value_counts()

no     364
yes     34
0        2
Name: cad, dtype: int64

In [29]:
df_v1['cad'].replace(0, 'yes', inplace=True)

In [30]:
df_v1['cad'].value_counts()

no     364
yes     36
Name: cad, dtype: int64

In [31]:
df_v1['appet'].value_counts()

good    317
poor     82
0         1
Name: appet, dtype: int64

In [32]:
df_v1['appet'].replace(0, 'poor', inplace=True)

In [33]:
df_v1['appet'].value_counts()

good    317
poor     83
Name: appet, dtype: int64

In [34]:
df_v1['pe'].value_counts()

no     323
yes     76
0        1
Name: pe, dtype: int64

In [35]:
df_v1['pe'].replace(0, 'yes', inplace=True)

In [36]:
df_v1['pe'].value_counts()

no     323
yes     77
Name: pe, dtype: int64

In [37]:
df_v1['ane'].value_counts()

no     339
yes     60
0        1
Name: ane, dtype: int64

In [38]:
df_v1['ane'].replace(0, 'yes', inplace=True)

In [39]:
df_v1['ane'].value_counts()

no     339
yes     61
Name: ane, dtype: int64

In [40]:
df_v1['class'].value_counts()

ckd       250
notckd    150
Name: class, dtype: int64

In [41]:
df_v1 = pd.get_dummies(data=df_v1, drop_first=True)

In [42]:
df_v1.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'hemo', 'pcc_present',
       'ba_present', 'htn_yes', 'dm_yes', 'cad_yes', 'appet_poor', 'pe_yes',
       'ane_yes', 'class_notckd'],
      dtype='object')

In [43]:
df_v1.drop('class_notckd', axis=1, inplace=True)

In [44]:
df_v1 ['class'] = df['class']

In [45]:
df_v1.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'hemo', 'pcc_present',
       'ba_present', 'htn_yes', 'dm_yes', 'cad_yes', 'appet_poor', 'pe_yes',
       'ane_yes', 'class'],
      dtype='object')

In [46]:
df_v1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          400 non-null    float64
 1   bp           400 non-null    float64
 2   sg           400 non-null    float64
 3   al           400 non-null    float64
 4   su           400 non-null    float64
 5   bgr          400 non-null    float64
 6   bu           400 non-null    float64
 7   sc           400 non-null    float64
 8   hemo         400 non-null    float64
 9   pcc_present  400 non-null    uint8  
 10  ba_present   400 non-null    uint8  
 11  htn_yes      400 non-null    uint8  
 12  dm_yes       400 non-null    uint8  
 13  cad_yes      400 non-null    uint8  
 14  appet_poor   400 non-null    uint8  
 15  pe_yes       400 non-null    uint8  
 16  ane_yes      400 non-null    uint8  
 17  class        400 non-null    object 
dtypes: float64(9), object(1), uint8(8)
memory usage: 3

In [47]:
v1col_list = list(df_v1.columns)

# extraction of feature

In [48]:
v1_features = []

In [49]:
[v1_features.append(col) for col in v1col_list if col != 'class']

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [50]:
v1_features

['age',
 'bp',
 'sg',
 'al',
 'su',
 'bgr',
 'bu',
 'sc',
 'hemo',
 'pcc_present',
 'ba_present',
 'htn_yes',
 'dm_yes',
 'cad_yes',
 'appet_poor',
 'pe_yes',
 'ane_yes']

In [51]:
X = df_v1[v1_features]
y = df_v1['class']

In [52]:
poly = PolynomialFeatures(include_bias=False, degree=2)

In [53]:
X_poly = poly.fit_transform(X)

# Splitting the dataset into the Training set and Test set


In [54]:
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, random_state = 42)

# Feature Scaling

In [55]:
ss = StandardScaler()

In [56]:
ss.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [57]:
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

# Logistic Regression

In [58]:
logreg = LogisticRegression()

In [59]:
logreg.fit(X_train_sc, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [60]:
logreg.score(X_train_sc, y_train)

0.9733333333333334

In [61]:
logreg.score(X_test_sc, y_test)

0.96

In [62]:
logreg.coef_

array([[ 5.73839248e-01,  5.28753418e-01,  2.33552286e-01,
        -7.24282751e-01, -1.01199699e-01,  1.20174962e-01,
         8.22751787e-02, -5.15625591e-01, -5.30668582e-01,
        -4.84321989e-02,  1.32495185e-01, -2.69897361e-01,
        -2.41763633e-01,  8.87425240e-02, -3.06138462e-01,
        -2.72142042e-01, -2.16516273e-01, -1.05703724e+00,
        -4.42929388e-01,  2.74137793e-01, -5.45720885e-01,
        -6.49994850e-02, -1.22988368e-01,  3.55085624e-01,
        -2.59207363e-01,  1.01323148e+00, -1.24680697e-01,
         6.77792641e-02, -1.83879005e-01, -2.71072225e-01,
         3.10300532e-02, -1.89667454e-01, -2.02389002e-01,
        -1.76155256e-01, -7.14246159e-01,  3.82998426e-02,
        -5.32630030e-01, -8.82016275e-02,  5.15098103e-02,
         4.60309770e-01, -3.48993151e-01, -2.81446125e-01,
        -5.97058243e-02,  1.00511449e-01, -3.27943564e-01,
        -2.61781900e-01,  4.91909136e-02, -2.58789344e-01,
        -2.31625122e-01, -2.03195493e-01,  4.33960552e-0

In [63]:
np.exp(0.11018577)

1.1164854606988666

In [64]:
predictions = logreg.predict(X_test_sc)

# Making the Confusion Matrix and Accuracy_score

In [65]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [66]:
cm = confusion_matrix(y_test, predictions)

In [67]:
cm

array([[62,  3],
       [ 1, 34]], dtype=int64)

In [68]:
cm = pd.DataFrame(cm, columns=['Predicted Negative','Predicted Positive'], index=['Actual Negative','Actual Positive'])

In [69]:
cm

Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,62,3
Actual Positive,1,34


In [70]:
accuracy_score(y_test, predictions)

0.96

In [71]:
df['class'].value_counts()

ckd       250
notckd    150
Name: class, dtype: int64

In [72]:
df_v1['class'].value_counts()

ckd       250
notckd    150
Name: class, dtype: int64

In [73]:
df_v1.shape

(400, 18)

In [74]:
df_v1['class'].value_counts()

ckd       250
notckd    150
Name: class, dtype: int64

In [75]:
df_v1_maj = df_v1[ df_v1['class'] == 'ckd' ]
df_v1_min = df_v1[ df_v1['class'] == 'notckd' ]

In [76]:
df_upsample = resample(df_v1_maj, replace = True, n_samples = 4850, random_state = 42)
df_upsample = pd.concat([df_upsample, df_v1_min])

In [77]:
df_upsample['class'].value_counts()

ckd       4850
notckd     150
Name: class, dtype: int64

In [78]:
X = df_upsample[v1_features]
y = df_upsample['class']

In [79]:
X_poly = poly.fit_transform(X)

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, random_state = 42)

In [81]:
ss.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [82]:
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [83]:
logreg.fit(X_train_sc, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [84]:
# Earlier score was 0.96666666666

In [85]:
logreg.score(X_train_sc, y_train)

0.9968

In [86]:
# Earlier score was 0.96

In [87]:
logreg.score(X_test_sc, y_test)

0.9976

In [88]:
predictions = logreg.predict(X_test_sc)

# Making the Confusion Matrix and Accuracy_score

In [89]:
from sklearn import metrics
metrics.accuracy_score(predictions,y_test)

0.9976

In [90]:
cm = confusion_matrix(y_test, predictions)

In [91]:
cm

array([[1208,    0],
       [   3,   39]], dtype=int64)

In [92]:
cm = pd.DataFrame(cm, columns=['Predicted Negative','Predicted Positive'], index=['Actual Negative','Actual Positive'])

In [93]:
cm

Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,1208,0
Actual Positive,3,39


In [94]:
accuracy_score(y_test, predictions)

1250

# KNeighborsClassifier

In [95]:
from sklearn.neighbors import KNeighborsClassifier
model_knn = KNeighborsClassifier(n_neighbors=4)
model_knn.fit(X_train,y_train)
Prediction = model_knn.predict(X_test)
metrics.accuracy_score(Prediction,y_test)

0.9832

# DecisionTreeClassifier

In [96]:
from sklearn.tree import DecisionTreeClassifier
model_dec = DecisionTreeClassifier()
model_dec.fit(X_train,y_train)
Prediction = model_dec.predict(X_test)
metrics.accuracy_score(Prediction,y_test)

1.0

# RandomForestClassifier

In [97]:
from sklearn.ensemble import RandomForestClassifier

In [98]:
model_random = RandomForestClassifier()
model_random.fit(X_train,y_train)
Prediction = model_random.predict(X_test)
metrics.accuracy_score(Prediction,y_test)

1.0

---