# Data description

The data were collected over a 2-month period in India and contain 25 attributes.

The target is the 'classification', which is either 'ckd' or 'notckd'

We use 24 features + the class label = 25 attributes (11 numeric, 14 nominal):

- Age(numerical) - age in years
- Blood Pressure(numerical) - bp in mmHg
- Specific Gravity(nominal) - sg - (1.005,1.010,1.015,1.020,1.025)
- Albumin(nominal) - al - (0,1,2,3,4,5)
- Sugar(nominal) - su - (0,1,2,3,4,5)
- Red Blood Cells(nominal) - rbc - (normal,abnormal)
- Pus Cell (nominal) - pc - (normal,abnormal)
- Pus Cell clumps(nominal) - pcc - (present,notpresent)
- Bacteria(nominal) - ba - (present,notpresent)
- Blood Glucose Random(numerical) - bgr in mgs/dl
- Blood Urea(numerical) - bu in mgs/dl
- Serum Creatinine(numerical) - sc in mgs/dl
- Sodium(numerical) - sod in mEq/L
- Potassium(numerical) - pot in mEq/L
- Hemoglobin(numerical) - hemo in gms
- Packed Cell Volume(numerical)
- White Blood Cell Count(numerical) - wc in cells/cumm
- Red Blood Cell Count(numerical) - rc in millions/cmm
- Hypertension(nominal) - htn - (yes,no)
- Diabetes Mellitus(nominal) - dm - (yes,no)
- Coronary Artery Disease(nominal) - cad - (yes,no)
- Appetite(nominal) - appet - (good,poor)
- Pedal Edema(nominal) - pe - (yes,no)
- Anemia(nominal) - ane - (yes,no)
- Class(nominal) - classification - (ckd,notckd)

Acknowledgements

https://archive.ics.uci.edu/ml/datasets/Chronic_Kidney_Disease

# Loading the dataset and first look

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries used below; consider narrowing the filter when debugging.
warnings.filterwarnings("ignore")

import os
# Print the full path of every file found under the Kaggle input directory.
input_root = '/kaggle/input'
for folder, _, files in os.walk(input_root):
    for file_path in (os.path.join(folder, entry) for entry in files):
        print(file_path)

In [None]:
# Read the raw kidney-disease CSV from the Kaggle input mount, then display it.
csv_path = '/kaggle/input/ckdisease/kidney_disease.csv'
df = pd.read_csv(csv_path)
df

In [None]:
# Show the column labels of the freshly loaded frame.
df.columns

In [None]:
# Print dtypes and non-null counts per column.
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

# Data cleaner

In [None]:
def data_cleaner(data):
    """Fill missing values with 0 and one-hot encode the categorical columns.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains at least the columns 'htn', 'dm', 'cad', 'pc',
        'pcc', 'ba', 'rbc', 'appet', 'pe' and 'ane'.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with every missing value replaced by 0 and the ten
        categorical columns replaced by boolean dummy columns named
        ``<column>_<value>``.
    """
    # Non-inplace fillna so the caller's DataFrame is not modified as a side
    # effect of cleaning.
    filled = data.fillna(0)

    categorical_columns = ['htn', 'dm', 'cad', 'pc', 'pcc', 'ba', 'rbc', 'appet', 'pe', 'ane']

    # Because missing values were replaced beforehand, the placeholder 0 is one
    # of the levels seen by get_dummies. drop_first=True removes one level per
    # column; NOTE(review): presumably the 0 placeholder, since it sorts ahead
    # of the string levels -- confirm against the resulting column names.
    return pd.get_dummies(filled, columns=categorical_columns, dtype=bool, drop_first=True)

In [None]:
# Replace the raw frame with its cleaned, one-hot encoded version and display it.
df = data_cleaner(df)
df

In [None]:
# Column labels after encoding; the feature list used by the model cell must
# match the dummy names shown here.
df.columns

# The model

This section will be updated over time.

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Target column and the feature columns fed to every classifier.
# NOTE(review): 'dm_\tno' and 'cad_\tno' look like dummy columns created from
# raw values that carry a stray leading tab; they are kept so the selection
# matches the columns produced by data_cleaner.
target = ['classification']
features = ['htn_no', 'dm_\tno','dm_no', 'cad_\tno', 'cad_no', 'pc_abnormal', 'pcc_notpresent', 'ba_notpresent', 'sg', 'al', 'su', 
            'age', 'bp', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'rbc_abnormal', 'appet_poor', 'pe_no', 'ane_no']

X = df[features]
# Select the target as a 1-D Series: estimators expect a 1-D label vector, and
# a single-column DataFrame triggers column-vector conversion warnings.
y = df[target[0]]

# Hold-out split used by the final evaluation cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, shuffle=True)

# Candidate classifiers, all with default hyper-parameters.
models = [
    ('LogisticRegression', LogisticRegression()),
    ('DecisionTreeClassifier', DecisionTreeClassifier()),
    ('XGBClassifier', XGBClassifier()),
    ('GradientBoostingClassifier', GradientBoostingClassifier()),
    ('KNeighborsClassifier', KNeighborsClassifier()),
    ('RandomForestClassifier', RandomForestClassifier()),
]

# random_state only has an effect when shuffle=True; recent scikit-learn
# versions raise a ValueError if a seed is given without shuffling. The
# splitter does not depend on the model, so it is built once and every model
# is scored on the same seeded folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)

# Per-model arrays of fold accuracies and the matching model names.
results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # Report mean accuracy and its standard deviation across the folds.
    print(f"{name}: {cv_results.mean():f} ({cv_results.std():f})")

In [None]:
from sklearn import metrics

# Fit on the training split only. Fitting on the full X, y would let the model
# see the rows in X_test during training and inflate every score reported below.
model = XGBClassifier(random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out accuracy, computed once and reused in the plot title.
accuracy = metrics.accuracy_score(y_test, y_pred)
cm = metrics.confusion_matrix(y_test, y_pred)

plt.figure(figsize=(9, 9))
# The confusion matrix holds sample counts, so annotate the cells as integers.
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Confusion Matrix - score:' + str(accuracy)
plt.title(all_sample_title, size=15)
plt.show()

# Per-class precision, recall and F1 on the hold-out split.
print(metrics.classification_report(y_test, y_pred))