In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Load Modules and helper functions

In [None]:
#importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix 

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report,accuracy_score

In [None]:
# Load the raw kidney-disease table from the Kaggle input directory.
DATA_PATH = '../input/ckdisease/kidney_disease.csv'
dataset = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the freshly loaded table.
dataset.head()

In [None]:
# (rows, columns) of the raw table.
dataset.shape

In [None]:
# Column dtypes as inferred by read_csv; object columns still hold text and need encoding.
dataset.dtypes

### Cleaning and preprocessing of data for training

In [None]:
# Encode the two-valued categorical columns as 1/0 so they can be treated numerically.
yes_no_cols = ['htn', 'dm', 'cad', 'pe', 'ane']
dataset[yes_no_cols] = dataset[yes_no_cols].replace(to_replace={'yes':1,'no':0})

normal_abnormal_cols = ['rbc', 'pc']
dataset[normal_abnormal_cols] = dataset[normal_abnormal_cols].replace(to_replace={'abnormal':1,'normal':0})

present_cols = ['pcc', 'ba']
dataset[present_cols] = dataset[present_cols].replace(to_replace={'present':1,'notpresent':0})

# For appetite, a stray 'no' label is treated as missing rather than as a category.
dataset[['appet']] = dataset[['appet']].replace(to_replace={'good':1,'poor':0,'no':np.nan})

# Rename the target first and encode it under its new name: encoding
# dataset['classification'] and renaming in place afterwards makes a second run
# of this cell fail with a KeyError on the already-renamed column.
dataset = dataset.rename(columns={'classification':'class'})
dataset['class'] = dataset['class'].replace(to_replace={'ckd':1.0,'ckd\t':1.0,'notckd':0.0,'no':0.0})

In [None]:
# Further cleaning: handle malformed labels (leading tab / space, wrong word)
# that the basic yes/no mapping does not cover.
dataset['pe'] = dataset['pe'].replace(to_replace='good', value=0)  # Not having pedal edema is good
# NOTE(review): an earlier cell already maps appet 'no' to NaN, so this line
# presumably never matches — confirm which of the two encodings is intended.
dataset['appet'] = dataset['appet'].replace(to_replace='no', value=0)
dataset['cad'] = dataset['cad'].replace(to_replace='\tno', value=0)
dataset['dm'] = dataset['dm'].replace(to_replace={'\tno':0,'\tyes':1,' yes':1, '':np.nan})
# Drop the row identifier. errors='ignore' (instead of an in-place drop) keeps
# the cell re-runnable once 'id' has already been removed.
dataset = dataset.drop(columns='id', errors='ignore')

In [None]:
# Preview the table after the categorical encoding and the removal of 'id'.
dataset.head()

In [None]:
# rc / wc / pcv contain non-numeric entries such as '?', so they are parsed from text.
# Pull the numeric token out of each cell, keeping any decimal part: a bare
# '(\d+)' pattern would turn a reading such as '5.2' into 5.0. Cells with no
# digits (e.g. '?') become NaN. astype(str) lets the cell be re-run after the
# columns have already been converted to float.
for col in ['rc', 'wc', 'pcv']:
    dataset[col] = (
        dataset[col]
        .astype(str)
        .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
        .astype(float)
    )

In [None]:
# Filling missing numeric data in the dataset with the column mean.
# The result is assigned back instead of calling fillna(inplace=True) on
# dataset[col]: that form acts on a temporary selection, which pandas warns
# about and which leaves the frame unchanged under copy-on-write.
numeric_cols = ['age','bp','sg','al','su','bgr','bu','sc','sod','pot','hemo','rc','wc','pcv']
dataset[numeric_cols] = dataset[numeric_cols].fillna(dataset[numeric_cols].mean())

In [None]:
# Count the missing values remaining in each column after mean imputation.
dataset.isnull().sum()

In [None]:
# Drop every COLUMN (not row) that still has at least one missing value.
# NOTE(review): the numeric columns were mean-filled in an earlier cell, so what
# is removed here is presumably the encoded categorical features — confirm that
# discarding them (rather than imputing them) is intended.
dataset = dataset.dropna(axis='columns')

In [None]:
# (rows, columns) after dropping the columns that still contained missing values.
dataset.shape

In [None]:
# Sanity check: every remaining column should now report zero missing values.
dataset.isnull().sum()

In [None]:
# Preview the cleaned table that is used for modelling.
dataset.head()

In [None]:
#Data preprocessing
# Select the target by name rather than by position, so a change in column
# order fails loudly instead of silently turning a feature into the label.
X = dataset.drop(columns='class').values
y = dataset['class'].values

In [None]:
#Splitting the dataset in to training and testing set
# Split BEFORE scaling: the scaler must learn its mean / standard deviation from
# the training rows only, otherwise test-set statistics leak into training and
# the reported test scores are optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Feature Scaling: fit on the training split, then apply the same transform to the test split
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Logistic Regression


In [None]:
# Fit a logistic-regression classifier on the training split (seed pinned for repeatable fits).
lg = LogisticRegression(random_state=0)
lg.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test rows.
y_pred_lg = lg.predict(X_test)

In [None]:
# Test-set accuracy; scikit-learn metrics take the ground truth first: (y_true, y_pred).
score_lg = accuracy_score(y_test, y_pred_lg)
score_lg

In [None]:
# Mean accuracy on the training split and on the held-out split, side by side.
print(f"train score - {lg.score(X_train, y_train)}")
print(f"test score - {lg.score(X_test, y_test)}")

In [None]:
#Making the Confusion Matrix
# Rows of the matrix are the true classes, columns the predicted classes.
cm_lg = confusion_matrix(y_test, y_pred_lg)
sns.set(font_scale=1.4)  # for label size
# fmt='d' prints the counts as plain integers (the default '.2g' switches to
# scientific notation from 100 upwards).
ax = sns.heatmap(cm_lg, annot=True, fmt='d', annot_kws={"size": 16})  # font size
ax.set(xlabel='Predicted label', ylabel='True label', title='Logistic Regression')

plt.show()

In [None]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
report_lg = classification_report(y_test, y_pred_lg)
print(report_lg)

## Decision Tree Classifier

In [None]:
# Fit a decision tree that uses the entropy split criterion (seed pinned for repeatable fits).
dtc = DecisionTreeClassifier(criterion='entropy', random_state=0)
dtc.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test rows.
y_pred_dtc = dtc.predict(X_test)

In [None]:
# Test-set accuracy; scikit-learn metrics take the ground truth first: (y_true, y_pred).
score_dtc = accuracy_score(y_test, y_pred_dtc)
score_dtc

In [None]:
# Mean accuracy on the training split and on the held-out split, side by side.
print(f"train score - {dtc.score(X_train, y_train)}")
print(f"test score - {dtc.score(X_test, y_test)}")

In [None]:
#Making the Confusion Matrix
# Rows of the matrix are the true classes, columns the predicted classes.
cm_dtc = confusion_matrix(y_test, y_pred_dtc)

sns.set(font_scale=1.4)  # for label size
# fmt='d' prints the counts as plain integers (the default '.2g' switches to
# scientific notation from 100 upwards).
ax = sns.heatmap(cm_dtc, annot=True, fmt='d', annot_kws={"size": 16})  # font size
ax.set(xlabel='Predicted label', ylabel='True label', title='Decision Tree')

plt.show()

In [None]:
# Per-class precision / recall / F1 for the decision-tree predictions.
report_dtc = classification_report(y_test, y_pred_dtc)
print(report_dtc)

## K Nearest Neighbors Classifier

In [None]:
# Fit a 5-nearest-neighbours classifier; Minkowski distance with p=2 is the Euclidean distance.
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test rows.
y_pred_knn = knn.predict(X_test)

In [None]:
# Test-set accuracy of the KNN model. Stored under its own name: writing it to
# score_dtc would overwrite the decision-tree accuracy computed earlier.
# scikit-learn metrics take the ground truth first: (y_true, y_pred).
score_knn = accuracy_score(y_test, y_pred_knn)
score_knn

In [None]:
# Mean accuracy on the training split and on the held-out split, side by side.
print(f"train score - {knn.score(X_train, y_train)}")
print(f"test score - {knn.score(X_test, y_test)}")

In [None]:
#Making the Confusion Matrix
# Rows of the matrix are the true classes, columns the predicted classes.
cm_knn = confusion_matrix(y_test, y_pred_knn)

sns.set(font_scale=1.4)  # for label size
# fmt='d' prints the counts as plain integers (the default '.2g' switches to
# scientific notation from 100 upwards).
ax = sns.heatmap(cm_knn, annot=True, fmt='d', annot_kws={"size": 16})  # font size
ax.set(xlabel='Predicted label', ylabel='True label', title='K Nearest Neighbors')

plt.show()

In [None]:
# Per-class precision / recall / F1 for the KNN predictions.
report_knn = classification_report(y_test, y_pred_knn)
print(report_knn)

## Support Vector Machine

In [None]:
# Fit a support-vector classifier with a linear kernel (seed pinned for repeatable fits).
svm = SVC(kernel='linear', random_state=0)
svm.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test rows.
y_pred_svm = svm.predict(X_test)

In [None]:
# Test-set accuracy; scikit-learn metrics take the ground truth first: (y_true, y_pred).
score_svm = accuracy_score(y_test, y_pred_svm)
score_svm

In [None]:
# Mean accuracy on the training split and on the held-out split, side by side.
print(f"train score - {svm.score(X_train, y_train)}")
print(f"test score - {svm.score(X_test, y_test)}")

In [None]:
#Making the Confusion Matrix
# Rows of the matrix are the true classes, columns the predicted classes.
cm_svm = confusion_matrix(y_test, y_pred_svm)

sns.set(font_scale=1.4)  # for label size
# fmt='d' prints the counts as plain integers (the default '.2g' switches to
# scientific notation from 100 upwards).
ax = sns.heatmap(cm_svm, annot=True, fmt='d', annot_kws={"size": 16})  # font size
ax.set(xlabel='Predicted label', ylabel='True label', title='Linear SVM')

plt.show()

In [None]:
# Per-class precision / recall / F1 for the linear-SVM predictions.
report_svm = classification_report(y_test, y_pred_svm)
print(report_svm)

## Kernel SVM

In [None]:
# Fit a support-vector classifier with an RBF kernel (seed pinned for repeatable fits).
ksvm = SVC(kernel='rbf', random_state=0)
ksvm.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test rows.
y_pred_ksvm = ksvm.predict(X_test)

In [None]:
# Test-set accuracy; scikit-learn metrics take the ground truth first: (y_true, y_pred).
score_ksvm = accuracy_score(y_test, y_pred_ksvm)
score_ksvm

In [None]:
# Mean accuracy on the training split and on the held-out split, side by side.
print(f"train score - {ksvm.score(X_train, y_train)}")
print(f"test score - {ksvm.score(X_test, y_test)}")

In [None]:
#Making the Confusion Matrix
# Rows of the matrix are the true classes, columns the predicted classes.
cm_ksvm = confusion_matrix(y_test, y_pred_ksvm)
sns.set(font_scale=1.4)  # for label size
# fmt='d' prints the counts as plain integers (the default '.2g' switches to
# scientific notation from 100 upwards).
ax = sns.heatmap(cm_ksvm, annot=True, fmt='d', annot_kws={"size": 16})  # font size
ax.set(xlabel='Predicted label', ylabel='True label', title='Kernel SVM (RBF)')

plt.show()

In [None]:
# Per-class precision / recall / F1 for the RBF-kernel SVM predictions.
report_ksvm = classification_report(y_test, y_pred_ksvm)
print(report_ksvm)

## Random Forest classification

In [None]:
# Fit a random forest of 10 entropy-criterion trees (seed pinned for repeatable fits).
rfc = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
rfc.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test rows.
y_pred_rfc = rfc.predict(X_test)

In [None]:
# Test-set accuracy; scikit-learn metrics take the ground truth first: (y_true, y_pred).
score_rfc = accuracy_score(y_test, y_pred_rfc)
score_rfc

In [None]:
# Mean accuracy on the training split and on the held-out split, side by side.
print(f"train score - {rfc.score(X_train, y_train)}")
print(f"test score - {rfc.score(X_test, y_test)}")

In [None]:
#Making the Confusion Matrix
# Rows of the matrix are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred_rfc)

sns.set(font_scale=1.4)  # for label size
# fmt='d' prints the counts as plain integers (the default '.2g' switches to
# scientific notation from 100 upwards).
ax = sns.heatmap(cm, annot=True, fmt='d', annot_kws={"size": 16})  # font size
ax.set(xlabel='Predicted label', ylabel='True label', title='Random Forest')

plt.show()

In [None]:
# Per-class precision / recall / F1 for the random-forest predictions.
report_rfc = classification_report(y_test, y_pred_rfc)
print(report_rfc)

## Gaussian Naive Bayes

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training split.
knb = GaussianNB()
knb.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test rows.
y_pred_knb = knb.predict(X_test)

In [None]:
# Test-set accuracy; scikit-learn metrics take the ground truth first: (y_true, y_pred).
score_knb = accuracy_score(y_test, y_pred_knb)
score_knb

In [None]:
# Mean accuracy on the training split and on the held-out split, side by side.
print(f"train score - {knb.score(X_train, y_train)}")
print(f"test score - {knb.score(X_test, y_test)}")

In [None]:
#Making the Confusion Matrix
# Rows of the matrix are the true classes, columns the predicted classes.
cm_knb = confusion_matrix(y_test, y_pred_knb)

sns.set(font_scale=1.4)  # for label size
# fmt='d' prints the counts as plain integers (the default '.2g' switches to
# scientific notation from 100 upwards).
ax = sns.heatmap(cm_knb, annot=True, fmt='d', annot_kws={"size": 16})  # font size
ax.set(xlabel='Predicted label', ylabel='True label', title='Gaussian Naive Bayes')

plt.show()

In [None]:
# Per-class precision / recall / F1 for the Naive Bayes predictions.
report_knb = classification_report(y_test, y_pred_knb)
print(report_knb)