## Load Modules and helper functions

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier

## Load files

In [None]:
# Read the raw kidney-disease table. The first row supplies the column
# names, and "?" entries are parsed as missing values.
CKD_dataset = pd.read_csv(
    "../input/kidney-disease/kidney_disease.csv",
    header=0,
    na_values="?",
)

# Preview the first ten records (rendered as the cell's last expression).
CKD_dataset.head(10)

## Inspecting the data before training a classifier

In [None]:
# Structural overview: column dtypes and non-null counts.
# info() prints its report itself, so no print() is needed.
CKD_dataset.info()

# A bare expression is only rendered when it is the last line of a notebook
# cell, so the intermediate summaries are printed explicitly; otherwise their
# values are computed and silently thrown away.
print(CKD_dataset.columns)
print(CKD_dataset.describe())
print(CKD_dataset.shape)

# Transposed preview of the first rows: one line per column.
CKD_dataset.head().T

## Cleaning of data for training a classifier

In [None]:
# Map the abbreviated column headers of the raw file to descriptive names.
cols_names = {"bp": "blood_pressure",
              "sg": "specific_gravity",
              "al": "albumin",
              "su": "sugar",
              "rbc": "red_blood_cells",
              "pc": "pus_cell",
              "pcc": "pus_cell_clumps",
              "ba": "bacteria",
              "bgr": "blood_glucose_random",
              "bu": "blood_urea",
              "sc": "serum_creatinine",
              "sod": "sodium",
              "pot": "potassium",
              "hemo": "haemoglobin",
              "pcv": "packed_cell_volume",
              "wc": "white_blood_cell_count",
              "rc": "red_blood_cell_count",
              "htn": "hypertension",
              "dm": "diabetes_mellitus",
              "cad": "coronary_artery_disease",
              "appet": "appetite",
              "pe": "pedal_edema",
              "ane": "anemia"}

CKD_dataset.rename(columns=cols_names, inplace=True)
print(f"\nSo we have {CKD_dataset.shape[1]} columns and {CKD_dataset.shape[0]} instances")

# Change to numerical dtype. Entries that cannot be parsed as numbers become
# NaN (errors='coerce') and are imputed together with the other gaps below.
for numeric_col in ("red_blood_cell_count", "packed_cell_volume", "white_blood_cell_count"):
    CKD_dataset[numeric_col] = pd.to_numeric(CKD_dataset[numeric_col], errors='coerce')

# Drop the id column: it is a unique row identifier, not a feature.
CKD_dataset.drop(["id"], axis=1, inplace=True)

# Checking missing values. Printed explicitly because a bare expression in
# the middle of a cell is evaluated and then discarded.
print(CKD_dataset.isnull().sum().sort_values(ascending=False))

# "?" is the file's missing-value marker. np.nan is used because the np.NaN
# alias was removed in NumPy 2.0.
CKD_dataset.replace("?", np.nan, inplace=True)

# Convert nominal values to binary values
conv_value = {"red_blood_cells": {"normal": 1, "abnormal": 0},
              "pus_cell": {"normal": 1, "abnormal": 0},
              "pus_cell_clumps": {"present": 1, "notpresent": 0},
              "bacteria": {"present": 1, "notpresent": 0},
              "hypertension": {"yes": 1, "no": 0},
              "diabetes_mellitus": {"yes": 1, "no": 0},
              "coronary_artery_disease": {"yes": 1, "no": 0},
              "appetite": {"good": 1, "poor": 0},
              "pedal_edema": {"yes": 1, "no": 0},
              "anemia": {"yes": 1, "no": 0},
              "classification": {"ckd": 1, "notckd": 0}}


def _encode_label(value, codes):
    """Return the numeric code for one raw label of a nominal column.

    Non-string values (NaN, numbers) are returned untouched. Strings are
    stripped of surrounding whitespace first, which repairs entries such as
    '\\tno', ' yes' or 'ckd\\t'. A stripped "?" is treated as missing. A label
    that is not in ``codes`` is returned as-is so that the numeric conversion
    performed by the caller fails loudly instead of hiding it.
    """
    if not isinstance(value, str):
        return value
    label = value.strip()
    if label == "?":
        return np.nan
    return codes.get(label, label)


# Replace incorrect values and encode in one pass per column. The explicit
# pd.to_numeric guarantees a numeric dtype for the mean imputation below
# instead of relying on DataFrame.replace to downcast object columns.
for column, codes in conv_value.items():
    CKD_dataset[column] = pd.to_numeric(
        CKD_dataset[column].map(lambda value, codes=codes: _encode_label(value, codes)))

# Fill null values with the mean value of the respective column (rounded to
# two decimals). Every column is numeric at this point.
CKD_dataset.fillna(round(CKD_dataset.mean(), 2), inplace=True)

# Save the final cleaned data
CKD_dataset.to_csv("kidney_disease_final.csv", sep=',', index=False)

## Classifiers

In [None]:
# Classifiers


def import_data():
    """Load the cleaned kidney-disease table from the working directory.

    Reads ``kidney_disease_final.csv`` (comma separated, header in the first
    row), prints the number of rows and the shape, and returns the table.

    Returns:
        pandas.DataFrame: the contents of the CSV file.
    """
    frame = pd.read_csv('kidney_disease_final.csv', header=0, sep=',')

    # Report the size of what was loaded.
    row_count = len(frame)
    print("Dataset Lenght: ", row_count)
    print("Dataset Shape: ", frame.shape)

    return frame


# Split Training/Testing Data


def split_dataset(kidney_disease_dataset):
    """Separate features from the target and create a 70/30 train/test split.

    The last column of the table is the target; every other column is a
    feature.

    Args:
        kidney_disease_dataset: DataFrame whose last column holds the label.

    Returns:
        tuple: ``(X, Y, X_train, X_test, y_train, y_test)`` where ``X``/``Y``
        are the full feature matrix and target vector and the remaining four
        entries are the partitions produced by ``train_test_split``.
    """
    values = kidney_disease_dataset.values

    # Slice relative to the END of the row for both features and target.
    # A hard-coded 0:24 only matches a table with exactly 25 columns; with
    # fewer columns it would pull the target into X (label leakage), with
    # more it would silently drop features.
    X = values[:, :-1]
    Y = values[:, -1]

    # Fixed random_state keeps the partition reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.3, random_state=100)

    return X, Y, X_train, X_test, y_train, y_test


def train_using_gini(X_train, X_test, y_train):
    """Fit a decision tree that splits on the Gini criterion.

    Args:
        X_train: training feature matrix.
        X_test: accepted for signature parity with the other trainers; unused.
        y_train: training labels.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    tree_settings = {
        "criterion": "gini",
        "random_state": 100,
        "max_depth": 3,
        "min_samples_leaf": 5,
    }
    # fit() returns the estimator itself, so it can be returned directly.
    return DecisionTreeClassifier(**tree_settings).fit(X_train, y_train)


def train_using_rfc(X_train, X_test, y_train):
    """Fit a random forest with a fixed seed on the training data.

    Args:
        X_train: training feature matrix.
        X_test: accepted for signature parity with the other trainers; unused.
        y_train: training labels.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    forest = RandomForestClassifier(random_state=100)
    # fit() returns the estimator itself.
    return forest.fit(X_train, y_train)


def train_using_knn(X_train, X_test, y_train):
    """Fit a single-nearest-neighbour classifier on the training data.

    Args:
        X_train: training feature matrix.
        X_test: accepted for signature parity with the other trainers; unused.
        y_train: training labels.

    Returns:
        The fitted ``KNeighborsClassifier``.
    """
    neighbour_count = 1
    model = KNeighborsClassifier(n_neighbors=neighbour_count)
    model.fit(X_train, y_train)
    return model


# Predictions
def prediction(X_test, clf_object):
    """Return the predictions of ``clf_object`` for ``X_test``.

    Args:
        X_test: samples to classify.
        clf_object: any object exposing a ``predict(X)`` method.

    Returns:
        Whatever ``clf_object.predict(X_test)`` returns.
    """
    return clf_object.predict(X_test)


# Accuracy
def cal_accuracy(y_test, y_pred):
    """Print the confusion matrix, accuracy (in percent) and class report.

    Args:
        y_test: true labels.
        y_pred: predicted labels, aligned with ``y_test``.

    Returns:
        None. The three metrics are written to standard output.
    """
    # Each metric is computed immediately before it is printed, so the
    # output appears in the same order as the computations.
    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix: \n", matrix)

    accuracy_percent = accuracy_score(y_test, y_pred) * 100
    print("Accuracy : ", accuracy_percent)

    report = classification_report(y_test, y_pred)
    print("Report : \n", report)


# Main code
def main():
    """Load the cleaned data, then train and evaluate each classifier in turn.

    The decision tree, the random forest and the nearest-neighbour model are
    all fitted on the same train partition and scored on the same test
    partition; the metrics of each are printed under its own banner.
    """
    # Building phase: load the table and partition it once for all models.
    dataset = import_data()
    _, _, X_train, X_test, y_train, y_test = split_dataset(dataset)

    # Operational phase: one (banner, trainer) pair per classifier.
    experiments = (
        ("\nResults Using Gini Index:", train_using_gini),
        ("Results Using RandomForestClassifier:", train_using_rfc),
        ("Results Using KNeighborsClassifier:", train_using_knn),
    )
    for banner, trainer in experiments:
        print(banner)
        fitted_model = trainer(X_train, X_test, y_train)
        predicted = prediction(X_test, fitted_model)
        cal_accuracy(y_test, predicted)


# Entry point: run the train/evaluate pipeline only when this code executes
# as the top-level program (module name "__main__"), not when it is imported.
if __name__ == "__main__":
    main()