# Problem Statement: 
Create a predictive model to determine whether or not a patient is suffering from chronic kidney disease.

# Target Variable: 
'classification' ('ckd' or 'notckd', ckd - chronic kidney disease)

# Predictors: 
'id', 'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane'



# Predictor feature details:
age - age<br>
bp - blood pressure<br>
sg - specific gravity<br>
al - albumin<br>
su - sugar<br>
rbc - red blood cells<br>
pc - pus cell<br>
pcc - pus cell clumps<br>
ba - bacteria<br>
bgr - blood glucose random<br>
bu - blood urea<br>
sc - serum creatinine<br>
sod - sodium<br>
pot - potassium<br>
hemo - hemoglobin<br>
pcv - packed cell volume<br>
wc - white blood cell count<br>
rc - red blood cell count<br>
htn - hypertension<br>
dm - diabetes mellitus<br>
cad - coronary artery disease<br>
appet - appetite<br>
pe - pedal edema<br>
ane - anemia<br>
classification - target class ('ckd' or 'notckd')<br>

# STEP 1 : Importing requisite libraries


In [None]:
import warnings

warnings.simplefilter('ignore')

import matplotlib as mpl
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
def display_confusion_matrix(y_test, y_pred):
    """Draw an annotated confusion-matrix heatmap and print a classification report.

    Parameters
    ----------
    y_test : array-like
        True labels of the evaluated samples.
    y_pred : array-like
        Predicted labels for the same samples, in the same order.

    Notes
    -----
    The cell annotations are reshaped to 2x2, so a binary problem in which
    both classes occur is expected.
    """
    # Build the matrix from the predictions that were passed in. The previous
    # version read the notebook-global `y_pred_lr`, so every model's heatmap
    # showed the logistic-regression confusion matrix.
    cm = confusion_matrix(y_test, y_pred)

    # Row-major order of a binary confusion matrix: TN, FP, FN, TP.
    group_names = ["True Neg", "False Pos", "False Neg", "True Pos"]
    flat_counts = cm.flatten()
    group_counts = ["{0:0.0f}".format(value) for value in flat_counts]
    group_percentages = ["{0:.2%}".format(value) for value in flat_counts / np.sum(cm)]

    # One three-line label per cell: name, absolute count, share of all samples.
    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
              zip(group_names, group_counts, group_percentages)]
    labels = np.asarray(labels).reshape(2, 2)

    # fmt="" because the annotations are preformatted strings, not numbers.
    sns.heatmap(cm, annot=labels, fmt="", cmap="Blues")
    print(classification_report(y_test, y_pred))


# STEP 2 : Importing training dataset

In [None]:
# Reading the dataset
# Loads the CSV (relative path) into `ckd_df`, the frame the following cells operate on.
ckd_df = pd.read_csv('../input/ckdisease/kidney_disease.csv')

In [None]:
# Check the columns
# Shows the original abbreviated column names, before the rename in the next cell.
ckd_df.columns

In [None]:
## Rename the abbreviated columns to descriptive names.
## Columns missing from the mapping (e.g. 'id', 'age', 'classification') keep their names.
col_dict = {
    "bp": "blood_pressure",
    "sg": "specific_gravity",
    "al": "albumin",
    "su": "sugar",
    "rbc": "red_blood_cells",
    "pc": "pus_cell",
    "pcc": "pus_cell_clumps",
    "ba": "bacteria",
    "bgr": "blood_glucose_random",
    "bu": "blood_urea",
    "sc": "serum_creatinine",
    "sod": "sodium",
    "pot": "potassium",
    "hemo": "hemoglobin",
    "pcv": "packed_cell_volume",
    "wc": "white_blood_cell_count",
    "rc": "red_blood_cell_count",
    "htn": "hypertension",
    "dm": "diabetes_mellitus",
    "cad": "coronary_artery_disease",
    "appet": "appetite",
    "pe": "pedal_edema",
    "ane": "anemia",
}

# Rebind instead of mutating in place; the resulting frame is the same.
ckd_df = ckd_df.rename(columns=col_dict)

In [None]:
# Check the column names again
# Confirms that the rename above produced the descriptive column names.
ckd_df.columns

# STEP 3 : Exploratory Data Analysis


In [None]:
# Check the shape
# Prints the (rows, columns) tuple of the dataset.
print(ckd_df.shape)

Inference - The dataset has 400 rows and 26 columns (25 predictors, including the `id` identifier, plus the target). This is a small dataset for training.

In [None]:
# Check the data first
# Preview the first five rows.
ckd_df.head(5)

In [None]:
# Observing the summarized information of data
# info() reports each column's dtype and non-null count.
ckd_df.info()

Inference - Most of the features are float. The object (string) features need to have their categorical values verified.

In [None]:
# Check the number of NULL values in the train dataset
# isnull().sum() gives the count of missing entries per column.
print('Null values in Train Data: \n', ckd_df.isnull().sum())

Inference - There are missing values in several features (for example red_blood_cells and white_blood_cell_count). Given the small size of the training dataset, imputation is required.

In [None]:
# Check the target value counts
# value_counts() lists how many rows carry each distinct label of the target column.
print('Total Count of the Prediction Output Column Classification values: \n', ckd_df['classification'].value_counts())

Inference - Distribution is balanced and good. Target values need no correction.

In [None]:
# Print the distinct values of every column to spot malformed entries.
# The column name is printed once per line, matching the later re-check loops
# (the extra bare `print(c)` only duplicated the name).
for c in ckd_df.columns:
  print(c,ckd_df[c].unique())

Inference - There are incorrect values for diabetes_mellitus, coronary_artery_disease and white_blood_cell_count which need to be corrected.



Inference - Values look good now except for missing values "\t?" and NaN. These will be treated in imputation

In [None]:
# Replace incorrect values: fold whitespace-polluted variants into their clean form.
value_fixes = {
    'diabetes_mellitus': {'\tno': 'no', '\tyes': 'yes', ' yes': 'yes'},
    'coronary_artery_disease': {'\tno': 'no'},
    'white_blood_cell_count': {'\t8400': '8400'},
}

for column_name, mapping in value_fixes.items():
    ckd_df[column_name] = ckd_df[column_name].replace(to_replace=mapping)

In [None]:
# Re-inspect the target labels; a stray variant such as a tab-suffixed "ckd" would appear as an extra class.
ckd_df["classification"].value_counts()

In [None]:
# Fold the tab-suffixed label "ckd\t" into the regular "ckd" label.
ckd_df["classification"]=ckd_df["classification"].replace("ckd\t", "ckd")

In [None]:
# Looking at the statistical distribution of the data, including categorical variables.
# include='all' adds the non-numeric columns; .T transposes so each feature is one row.
ckd_df.describe(include='all').T

Inferences:

1. All features, including categorical are listed.
2. The count value varies from 187 to 280. This means there are few features with missing values.
3. The feature **id** is a running number and seems to be an unique identifier for each row. So, it is not an influencing feature.
4. **age** is between 2 and 90. It is normally distributed.
5. **blood_pressure** , **sodium** and **hemoglobin** are normally distributed.
6. **blood_glucose_random** and **potassium** are right skewed
7. **blood_urea** and **serum_creatinine** are left skewed.
8. Categorical target value is well balanced.



# STEP 4 : Distribution of all features


If the target variable's distribution is too skewed, predictive modelling becomes difficult. For a continuous target a bell curve is desirable, although a slightly positive or negative skew is also fine. For a categorical target such as this one, the classes should be reasonably balanced; otherwise the machine-learning algorithm's ability to learn all the classes is impaired.

In [None]:
# Bar chart of the number of rows per target class.
count_ax = sns.countplot(data=ckd_df, x='classification')
count_ax.set_xlabel("classification")
count_ax.set_ylabel("Count")
count_ax.set_title("target classification Distribution")
plt.show()

Inference - no imbalance in the target feature

In [None]:
## Share of each target class, as a percentage of all rows
print("Percent of distribution below:")
class_counts = ckd_df["classification"].value_counts()
class_counts / len(ckd_df) * 100

In [None]:
# Histograms for all continuous variables.
# Each feature is listed exactly once; the previous list repeated
# 'blood_glucose_random' and 'packed_cell_volume' and left out other
# continuous features.
continuous_columns = ['age', 'blood_pressure', 'blood_glucose_random', 'blood_urea',
                      'serum_creatinine', 'sodium', 'potassium', 'hemoglobin',
                      'packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count']

# Some of these columns may still be stored as text at this point (the
# "\t?" placeholders are only removed later), and DataFrame.hist ignores
# non-numeric columns. Coerce a temporary copy for plotting; unparseable
# entries become NaN and ckd_df itself is left untouched.
ckd_df[continuous_columns].apply(pd.to_numeric, errors="coerce").hist(figsize=(30, 15));

In [None]:
# Check the joint distribution of age and CKD presence.
# KDE is used to study the PDF of a continuous random variable.

# Encode the target as 1 ("ckd") / 0 (anything else). The dtype guard makes the
# cell safe to re-run: without it a second execution compares the already
# encoded 0/1 values against "ckd" and turns the whole column into zeros.
if not pd.api.types.is_numeric_dtype(ckd_df["classification"]):
    ckd_df["classification"] = (ckd_df["classification"] == "ckd").astype(int)

# x/y are passed by keyword and the figure size via `height`; newer seaborn
# releases reject positional data vectors and the old `size` argument.
sns.jointplot(x=ckd_df.age, y=ckd_df.classification, kind="kde", height=7)

Inferences 
1. notckd - concentrated more around age = 40
2. ckd - concentrated more around age = 65

In [None]:
# Check distribution of age for each target class
# FacetGrid creates one panel per value of "classification"; map() draws the
# age distribution (10 bins) into each panel.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm
# the installed version before upgrading.
g = sns.FacetGrid(ckd_df,col="classification")
g.map(sns.distplot,"age", bins=10)
plt.show()

Inferences
1. For class notckd, the data is approximately normally distributed.
2. For class ckd, the data is left (negatively) skewed: a larger number of values is concentrated on the right side of the plot, with the tail extending to the left.

# STEP 5 : Check correlation between features

In [None]:
# Pairwise Pearson correlation between the numeric columns.
# The frame still contains text columns at this stage; selecting the numeric
# ones explicitly keeps this working on pandas >= 2.0, where DataFrame.corr()
# no longer drops non-numeric columns silently and raises instead.
corr_df = ckd_df.select_dtypes(include="number").corr()

In [None]:
# Annotated heatmap of the correlation matrix computed in the previous cell.
f,ax=plt.subplots(figsize=(15,15))
# annot/fmt print each coefficient with two decimals inside its cell.
sns.heatmap(corr_df,annot=True,fmt=".2f",ax=ax,linewidths=0.5,linecolor="orange")
# Rotate the tick labels so the long feature names stay readable.
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.title('Correlations between different predictors')
plt.show()

Inferences
1. Correlation coefficients close to +1 or -1 imply that the two features are highly correlated.
2. hemo and pcv have a correlation coefficient of 0.9, i.e. they are highly correlated.
3. Any feature whose correlation with the target is close to 0 can be removed. Here **potassium** can be removed from the features.

# STEP 6 : Data preprocessing

In [None]:
## Find missing values and impute them
# Visualize missing values as a matrix (missingno draws one column per feature).
msno.matrix(ckd_df) 

Inference - Many columns have missing values. Needs to be fixed

In [None]:
## Visualize the missing values per column as a bar chart
msno.bar(ckd_df)

In [None]:
## Check if the values are good
# Prints every column name followed by its distinct values.
for c in ckd_df.columns:
  print(c,ckd_df[c].unique())

Inference - There are \t? values, which need to be replaced

In [None]:
# Fold whitespace-polluted categorical values into their clean form.
ckd_df['diabetes_mellitus'] = ckd_df['diabetes_mellitus'].replace(to_replace={'\tno':'no','\tyes':'yes',' yes':'yes'})
ckd_df['coronary_artery_disease'] = ckd_df['coronary_artery_disease'].replace(to_replace='\tno',value='no')

# These three measurements are numbers that may be stored as text, with
# placeholders such as "\t?" and tab-prefixed values such as "\t8400".
# Stripping whitespace and coercing handles every such variant at once (not
# only the literals listed by hand before): unparseable entries become NaN for
# the imputer, and the columns end up with a real numeric dtype.
for numeric_column in ["packed_cell_volume", "white_blood_cell_count", "red_blood_cell_count"]:
    stripped_values = ckd_df[numeric_column].astype(str).str.strip()
    ckd_df[numeric_column] = pd.to_numeric(stripped_values, errors="coerce")

In [None]:
## Re-check if the values are good
# Same inspection as before the clean-up: column name followed by its distinct values.
for c in ckd_df.columns:
  print(c,ckd_df[c].unique())

In [None]:
# Cast the categorical columns to strings while keeping missing entries missing.
# A plain astype(str) turns NaN into the literal string 'nan', which
# isnull() no longer recognises, so the NaN percentage report further down
# would show these columns as complete.
for string_column in ["red_blood_cells","pus_cell","pus_cell_clumps","bacteria","hypertension","diabetes_mellitus","coronary_artery_disease","pedal_edema","anemia","appetite"]:
  column_values = ckd_df[string_column]
  ckd_df[string_column] = column_values.astype(str).where(column_values.notna())


In [None]:
## Encode the two-valued categorical columns as 0/1 so that KNNImputer can use them
binary_encodings = {
    'red_blood_cells': {'normal': 1, 'abnormal': 0},
    'pus_cell': {'normal': 1, 'abnormal': 0},
    'pus_cell_clumps': {'notpresent': 0, 'present': 1},
    'bacteria': {'notpresent': 0, 'present': 1},
    'hypertension': {'no': 0, 'yes': 1},
    'diabetes_mellitus': {'no': 0, 'yes': 1},
    'coronary_artery_disease': {'no': 0, 'yes': 1},
    'pedal_edema': {'no': 0, 'yes': 1},
    'anemia': {'no': 0, 'yes': 1},
    'appetite': {'poor': 0, 'good': 1},
}

for column_name, codes in binary_encodings.items():
    ckd_df[column_name] = ckd_df[column_name].replace(codes)

In [None]:
## List every column with its percentage of NaNs, highest first
nan_percent = ckd_df.isnull().sum() * 100 / len(ckd_df)
print(round(nan_percent, 2).sort_values(ascending=False))

In [None]:
# define imputer
# Each missing value is filled from the 5 nearest rows with uniform weighting;
# 'nan_euclidean' measures distance while skipping coordinates that are missing.
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

In [None]:
# Every column except the target takes part in the imputation.
# A list comprehension keeps the DataFrame's column order. The previous
# set difference produced an arbitrary order that can change between kernel
# sessions, so the feature order of X (and anything sensitive to it, such as
# seeded tree models) was not reproducible.
# NOTE(review): 'id' is still included because later cells filter on
# X_trans["id"]; as a row identifier it also enters the KNN distance
# computation — consider excluding it from the imputer's inputs.
impute_columns = [column for column in ckd_df.columns if column != "classification"]
print(impute_columns)

In [None]:
# Fit the KNN imputer on all non-target columns of the full dataset.
imputer.fit(ckd_df[impute_columns])

In [None]:
# transform() returns a plain array, so it is wrapped back into a DataFrame with the same column names.
X_trans=pd.DataFrame(imputer.transform(ckd_df[impute_columns]), columns=impute_columns)

In [None]:
# Preview the first three imputed rows.
X_trans.head(3)

In [None]:
## List every column of the imputed frame with its percentage of NaNs, highest first
imputed_nan_percent = X_trans.isnull().sum() * 100 / len(X_trans)
print(round(imputed_nan_percent, 2).sort_values(ascending=False))

Inference - all NaNs have been imputed

In [None]:
# Feature matrix: imputed rows whose id occurs in ckd_df, without the id column.
known_id_mask = X_trans["id"].isin(ckd_df["id"])
X = X_trans.loc[known_id_mask].drop(columns=["id"])

In [None]:
# Target vector: the "classification" column of ckd_df.
y=ckd_df["classification"]

In [None]:
# Built with the same filter as X: imputed rows whose id occurs in ckd_df, without the id column.
prod_id_mask = X_trans["id"].isin(ckd_df["id"])
X_prod = X_trans.loc[prod_id_mask].drop(columns=["id"])

In [None]:
# Shapes of the feature matrix, the target vector and the X_prod frame, in that order.
for frame in (X, y, X_prod):
    print(frame.shape)

In [None]:
# Column dtypes and non-null counts of the final feature matrix.
X.info()

# STEP 7 : Predictive models with hyperparameter tuning

In [None]:
# Names of the two-valued categorical features (the ones encoded as 0/1 earlier).
# NOTE(review): this list is not referenced by any of the cells visible here — confirm whether it is still needed.
cat_columns=["red_blood_cells",
  "pus_cell",
  "pus_cell_clumps",
  "bacteria",
  "hypertension",
  "diabetes_mellitus",
  "coronary_artery_disease",
  "appetite",
  "pedal_edema",
  "anemia"]

In [None]:
## Split train and test (80 % / 20 %)
# train_test_split is already imported in the imports cell at the top.
# stratify=y keeps the ckd / notckd proportions the same in both parts; with
# a dataset this small an unstratified split can noticeably skew the class mix
# of the test set. random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=4658, stratify=y)

In [None]:
# Shapes of the four split parts: X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

In [None]:
# Logistic Regression hyperparameter tuning
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# 15 candidate values for the inverse regularisation strength C, log-spaced from 1e-5 to 1e8.
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space}

# The features are not scaled, so the solver may stop at the default limit of
# 100 iterations before converging, and the resulting ConvergenceWarning is
# hidden by the global warnings filter. Allow more iterations.
lr_estimator = LogisticRegression(max_iter=5000)

# `lr` is the fitted grid search that the evaluation cell calls predict() on.
lr = GridSearchCV(lr_estimator, param_grid, cv=5)
lr.fit(X_train, y_train)

# Print the tuned parameters and score
print("Tuned Logistic Regression Parameters: {}".format(lr.best_params_))
print("Best score is {}".format(lr.best_score_))

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
# Evaluate the tuned logistic regression on the held-out test split.
y_pred_lr = lr.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
display_confusion_matrix(y_test, y_pred_lr)
print("Accuracy of Logistic Regression is :", accuracy_lr)

In [None]:
# Decision Tree hyperparameter tuning
# (RandomizedSearchCV and DecisionTreeClassifier come from the imports cell at the top.)
hyperparam_combs = {
    'max_depth': [4, 6, 8, 10, 12],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 10, 20, 30, 40],
    # Floats are fractions of the features. The last entry must be 1.0: the
    # integer 1 is read as "consider exactly one feature per split".
    'max_features': [0.2, 0.4, 0.6, 0.8, 1.0],
    'max_leaf_nodes': [8, 16, 32, 64, 128],
    # Increasing weight on the positive class (1 = ckd).
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 2}, {0: 1, 1: 3}, {0: 1, 1: 4}, {0: 1, 1: 5}]
}

# The tree is seeded as well; seeding only the search still leaves the
# feature sub-sampling inside each tree random between runs.
clf = RandomizedSearchCV(DecisionTreeClassifier(random_state=1),
                         hyperparam_combs,
                         scoring='f1',
                         random_state=1,
                         n_iter=20)

dt_model = clf.fit(X_train, y_train)

# Print the tuned parameters and score
print("Tuned Decision Tree Parameters: {}".format(dt_model.best_params_))
print("Best score is {}".format(dt_model.best_score_))

In [None]:
# Evaluate the tuned decision tree on the held-out test split.
y_pred_dt = dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
display_confusion_matrix(y_test, y_pred_dt)
print("Accuracy of Decision Tree is :", accuracy_dt)

In [None]:
# Parameters for Random Forest hyperparameter tuning
param_grid = {"n_estimators": np.arange(2, 300, 2),
              "max_depth": np.arange(1, 28, 1),
              # An integer min_samples_split must be at least 2; starting at 1
              # produced candidates that fail to fit and score NaN.
              "min_samples_split": np.arange(2, 150, 1),
              "min_samples_leaf": np.arange(1, 60, 1),
              "max_leaf_nodes": np.arange(2, 60, 1),
              # Written out explicitly: np.arange with a float step can
              # include or skip the end point because of rounding.
              "min_weight_fraction_leaf": [0.1, 0.2, 0.3]}

# The forest is seeded as well, so that the search result is reproducible.
rf = RandomizedSearchCV(RandomForestClassifier(random_state=4658),
                        param_grid,
                        scoring='f1',
                        random_state=4658,
                        n_iter=20)

rf_model = rf.fit(X_train, y_train)

# Print the tuned parameters and score
print("Tuned Random Forest Parameters: {}".format(rf_model.best_params_))
print("Best score is {}".format(rf_model.best_score_))

In [None]:
# Evaluate the tuned random forest on the held-out test split.
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
display_confusion_matrix(y_test, y_pred_rf)
print("Accuracy of Random Forests model is :", accuracy_rf)

In [None]:
# CatBoost hyperparameter tuning
# NOTE(review): the l2_leaf_reg candidates lie between 1e-20 and 1e-19, which
# is effectively no L2 regularisation — confirm that this range is intended.
params = {'depth': [2, 3, 4],
          'loss_function': ['Logloss', 'CrossEntropy'],
          'l2_leaf_reg': np.logspace(-20, -19, 3)
}

# The grid holds only 3 * 2 * 3 = 18 combinations, fewer than the 20
# iterations requested before. Cap n_iter at the grid size so the search is
# explicitly exhaustive instead of depending on a (silenced) fallback warning.
n_param_combinations = int(np.prod([len(values) for values in params.values()]))

# verbose=0 stops CatBoost from logging every boosting iteration of every CV
# fit; random_seed makes the fitted models reproducible.
cb = RandomizedSearchCV(CatBoostClassifier(verbose=0, random_seed=4658),
                        params,
                        scoring='f1',
                        random_state=4658,
                        n_iter=min(20, n_param_combinations))

cb_model = cb.fit(X_train, y_train)

# Print the tuned parameters and score
print("Tuned Catboost Parameters: {}".format(cb_model.best_params_))
print("Best score is {}".format(cb_model.best_score_))

In [None]:
# Evaluate the tuned CatBoost model on the held-out test split.
y_pred_cb = cb_model.predict(X_test)
accuracy_cb = accuracy_score(y_test, y_pred_cb)
display_confusion_matrix(y_test, y_pred_cb)
print("Accuracy of CatBoost model is :", accuracy_cb)

In [None]:
# Search space for the LightGBM randomized search.
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

param_test = {'num_leaves': sp_randint(6, 50),
              # Minimum number of samples per leaf. The previous range
              # [100, 500) is meant for large datasets: with 400 rows, an 80 %
              # training split and 3-fold CV, each fit sees roughly 213 rows,
              # so most candidates could not make a single split.
              'min_child_samples': sp_randint(5, 50),
              'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
              # uniform(loc, scale) samples from [loc, loc + scale].
              'subsample': sp_uniform(loc=0.2, scale=0.8),
              'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
              'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
              'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

In [None]:
# LightGBM hyperparameter tuning
import lightgbm as lgb

# verbose=-1 silences LightGBM's logging; the former `silent=True` constructor
# argument no longer exists in current LightGBM releases.
# The base estimator gets its own name so that `lgbm_model` only ever refers
# to the fitted search object.
# NOTE(review): n_estimators=5000 without early stopping is expensive for a
# dataset of this size — confirm that it is intended.
lgbm_estimator = lgb.LGBMClassifier(max_depth=-1, random_state=314, verbose=-1,
                                    metric='None', n_jobs=4, n_estimators=5000)

lgbm = RandomizedSearchCV(
    estimator=lgbm_estimator,
    param_distributions=param_test,
    n_iter=20,
    scoring='roc_auc',
    cv=3,
    refit=True,
    random_state=4658,
    verbose=True)

lgbm_model = lgbm.fit(X_train, y_train)

# Print the tuned parameters and score
print("Tuned LGBM Parameters: {}".format(lgbm_model.best_params_))
print("Best score is {}".format(lgbm_model.best_score_))

In [None]:
# Evaluate the tuned LightGBM model on the held-out test split.
y_pred_lgbm = lgbm_model.predict(X_test)
accuracy_lgbm = accuracy_score(y_test, y_pred_lgbm)
display_confusion_matrix(y_test, y_pred_lgbm)
print("Accuracy of LGBM model is :", accuracy_lgbm)

In [None]:
# Horizontal bar chart comparing the test accuracy of all tuned models.
models = ['LogisticRegression', 'DecisionTrees', 'RandomForests', 'CatBoost', 'LGBM']
accuracies = [accuracy_lr, accuracy_dt, accuracy_rf, accuracy_cb, accuracy_lgbm]

# Figure size
fig, ax = plt.subplots(figsize=(16, 9))

# Horizontal bar plot
ax.barh(models, accuracies)

# Remove axes spines
for s in ['top', 'bottom', 'left', 'right']:
    ax.spines[s].set_visible(False)

# Remove x, y ticks
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')

# Add padding between axes and labels
ax.xaxis.set_tick_params(pad=5)
ax.yaxis.set_tick_params(pad=10)

# Add x, y gridlines. The on/off flag is passed positionally: its keyword was
# renamed from `b` to `visible`, and `b=` is rejected by current Matplotlib.
ax.grid(True, color='grey',
        linestyle='-.', linewidth=0.5,
        alpha=0.2)

# Show the first model at the top
ax.invert_yaxis()

# Annotate each bar with its value. Accuracies lie in [0, 1], so the label is
# placed just past the bar end (a fixed offset of 0.2 would push it far away)
# and is centred vertically on the bar.
for bar in ax.patches:
    ax.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height() / 2,
            str(round(bar.get_width(), 2)),
            va='center',
            fontsize=10, fontweight='bold',
            color='grey')

# Add plot title
ax.set_title('Accuracies of different models',
             loc='left')

# Show plot
plt.show()

INFERENCE - It can be seen that the Decision Tree, Random Forest and CatBoost models give the best accuracies; their best parameters were determined using RandomizedSearchCV.