In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [None]:
ckd_data = pd.read_csv("/kaggle/input/ckdisease/kidney_disease.csv")

In [None]:
ckd_data.info()

In [None]:
# Rows 66, 162 and 185 get repaired in the next cell; show row 66 first.
print(ckd_data.loc[[66], :])

In [None]:
# Replace the tab-prefixed placeholder entries with real missing values.
print(ckd_data.loc[[185]].wc)

# Use a genuine NaN, not the string 'NaN': a string is still counted as a
# valid (non-null) entry by isnull() / info().
ckd_data.at[185, 'wc'] = np.nan
ckd_data.at[162, 'rc'] = np.nan
ckd_data.at[66, 'pcv'] = np.nan

# These three columns hold numbers but contained text placeholders, so make
# them properly numeric. errors='coerce' turns any remaining unparsable entry
# into NaN, which keeps this cell safe to re-run.
for numeric_col in ['pcv', 'wc', 'rc']:
    ckd_data[numeric_col] = pd.to_numeric(ckd_data[numeric_col], errors='coerce')

# The same stray tab/space characters may also be attached to category strings
# (e.g. 'ckd\t' vs 'ckd' -- TODO confirm against the raw file). Strip them so
# that equal categories compare equal downstream.
for text_col in ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'classification']:
    ckd_data[text_col] = ckd_data[text_col].str.strip()

In [None]:
from sklearn.model_selection import train_test_split
ckd_train_data, ckd_test_data = train_test_split(ckd_data, test_size=0.2, random_state=42)

In [None]:
# Separate out the labels
ckd_train_data_label = ckd_train_data["classification"].copy()
ckd_test_data_label = ckd_test_data["classification"].copy()
ckd_train_data_label.head()

In [None]:
# List of Features only dataset. Drop the labels
ckd_train_data = ckd_train_data.drop("classification", axis=1)
ckd_test_data = ckd_test_data.drop("classification", axis=1)
ckd_train_data.head()

In [None]:
def impute_nan_add_variable(DataFrame,ColName):
    """Mode-impute a column in place and record which rows were missing.

    Adds an integer column named ``ColName + "_Imputed"`` (1 where the value
    was null before imputation, 0 otherwise), then replaces the nulls in
    ``ColName`` with that column's most frequent value.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame to modify in place.
    ColName : str
        Name of the column to impute.

    Returns
    -------
    None
        The frame is modified in place. If the column has no non-null values
        there is no mode to fill with, so only the indicator column is added.
    """
    # 1. Indicator column: 1 where the category was missing, else 0.
    missing_mask = DataFrame[ColName].isnull()
    DataFrame[ColName + "_Imputed"] = np.where(missing_mask, 1, 0)

    # 2. Most frequent category; mode() is empty when every value is null.
    modes = DataFrame[ColName].mode()
    if modes.empty:
        return

    # 3. Assign the filled column back instead of calling fillna(inplace=True)
    #    on DataFrame[ColName]: that chained in-place form warns in recent
    #    pandas and does not update the frame under Copy-on-Write.
    DataFrame[ColName] = DataFrame[ColName].fillna(modes[0])

In [None]:
# Call function to impute NAN values for categorical variables and add new importance feature
for Columns in ['rbc','pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']:
    impute_nan_add_variable(ckd_train_data,Columns)

In [None]:
ckd_train_data.head()

In [None]:
# Numeric preprocessing: fill gaps with the column median, then apply StandardScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scalar', StandardScaler()),
]
num_pipeline = Pipeline(numeric_steps)

In [None]:
ckd_train_data_numeric = ckd_train_data.drop(['rbc','pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane'], axis=1)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

num_attribs = list(ckd_train_data_numeric)
cat_attribs = ['rbc','pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OrdinalEncoder(), cat_attribs),
])

In [None]:
ckd_train_data_prepared = full_pipeline.fit_transform(ckd_train_data)

In [None]:
# Binary target: True for chronic kidney disease. Whitespace is stripped first
# so a label carrying a stray tab/space (the raw file has tab-prefixed entries
# elsewhere) is not silently counted as the negative class.
ckd_train_data_binary_labels = (ckd_train_data_label.str.strip() == 'ckd')

In [None]:
from sklearn.svm import LinearSVC
svm_clf1 = LinearSVC(C=1, loss="hinge", random_state=42)
svm_clf1.fit(ckd_train_data_prepared, ckd_train_data_binary_labels)

In [None]:
from sklearn.svm import SVC
rbf_kernel_svm_clf = SVC(kernel="rbf", gamma=5, C=0.001)
rbf_kernel_svm_clf.fit(ckd_train_data_prepared, ckd_train_data_binary_labels)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest (same seed as the split and the linear SVC) so the fitted
# trees, and every metric derived from them, are identical on each re-run.
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)
rnd_clf.fit(ckd_train_data_prepared, ckd_train_data_binary_labels)

In [None]:
# Compare the three models on this binary classification problem using
# 3-fold cross-validated predictions on the training set.
from sklearn.model_selection import cross_val_predict

cv_inputs = (ckd_train_data_prepared, ckd_train_data_binary_labels)
ckd_train_data_LSVC_predictions = cross_val_predict(svm_clf1, *cv_inputs, cv=3)
ckd_train_data_rbf_kernel_SVM_predictions = cross_val_predict(rbf_kernel_svm_clf, *cv_inputs, cv=3)
ckd_train_data_RF_predictions = cross_val_predict(rnd_clf, *cv_inputs, cv=3)

In [None]:
# Comparing the confusion matrix of the 3 algorithm options
from sklearn.metrics import confusion_matrix
confusion_matrix(ckd_train_data_binary_labels, ckd_train_data_LSVC_predictions)


In [None]:
confusion_matrix(ckd_train_data_binary_labels, ckd_train_data_rbf_kernel_SVM_predictions)

In [None]:
confusion_matrix(ckd_train_data_binary_labels, ckd_train_data_RF_predictions)

In [None]:
# The random forest performed best when comparing the confusion matrices,
# so use it to predict on the test set.

# Prepare the test data. Missing categories are filled with the most frequent
# value of the TRAINING column, so no statistic computed from the test set
# leaks into preprocessing.
for Columns in ['rbc','pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']:
    # Indicator feature: 1 where the test value was missing, else 0.
    ckd_test_data[Columns + "_Imputed"] = np.where(ckd_test_data[Columns].isnull(), 1, 0)
    train_mode = ckd_train_data[Columns].mode()[0]
    ckd_test_data[Columns] = ckd_test_data[Columns].fillna(train_mode)

In [None]:
# Execute the pipeline on the test data.
# Use transform(), not fit_transform(): the medians, scaling statistics and
# category codes must be the ones learned from the training set. Refitting on
# the test set leaks test statistics and can assign different codes to the
# same category.
# NOTE: transform() raises if the test set contains a category the encoder did
# not see during fit, unless the encoder is configured to handle unknowns.
ckd_test_data_prepared = full_pipeline.transform(ckd_test_data)

In [None]:
# Make the predictions using random forest classifier
ckd_test_data_RF_predictions = rnd_clf.predict(ckd_test_data_prepared)

In [None]:
ckd_test_data_RF_predictions

In [None]:
# Lets look at the confusion matrix of the test predictions
ckd_test_data_binary_labels = (ckd_test_data_label == 'ckd')
confusion_matrix(ckd_test_data_binary_labels, ckd_test_data_RF_predictions) 

In [None]:
# Prediction using the linear SVC
ckd_test_data_Linear_SVC_predictions = svm_clf1.predict(ckd_test_data_prepared)


In [None]:
# Confusion matrix of Linear SVC predictions
confusion_matrix(ckd_test_data_binary_labels, ckd_test_data_Linear_SVC_predictions)

In [None]:
from sklearn.metrics import precision_score, recall_score
precision_score(ckd_test_data_binary_labels, ckd_test_data_Linear_SVC_predictions)

In [None]:
recall_score(ckd_test_data_binary_labels, ckd_test_data_Linear_SVC_predictions)

In [None]:
from sklearn.metrics import f1_score
f1_score(ckd_test_data_binary_labels, ckd_test_data_Linear_SVC_predictions)

In [None]:
ckd_test_data_prediction_scores = cross_val_predict(svm_clf1, ckd_test_data_prepared, ckd_test_data_binary_labels, cv=3, method="decision_function")

In [None]:
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(ckd_test_data_binary_labels, ckd_test_data_prediction_scores)

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve with pyplot, together with a dashed diagonal.

    Parameters
    ----------
    fpr : array-like
        False positive rates (x values).
    tpr : array-like
        True positive rates (y values), same length as ``fpr``.
    label : str, optional
        Legend entry for the curve. A legend is drawn only when a label is given.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0,1], [0,1], 'k--') # Dashed diagonal
    plt.xlabel('False Positive Rate (Fall-Out)', fontsize=16)
    plt.ylabel('True Positive Rate (Recall)', fontsize=16)
    plt.grid(True)
    if label is not None:
        # Without a legend the label passed to plot() is never displayed.
        plt.legend(loc="lower right", fontsize=16)

In [None]:
import matplotlib.pyplot as plt

# Draw the ROC curve computed above and render the figure.
plot_roc_curve(fpr=fpr, tpr=tpr, label="SVC")
plt.show()