In [4]:
import pandas as pd
import numpy as np
import scipy as sp

import sklearn
import joblib

import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Read the kidney-disease CSV: comma-separated, header row inferred,
# and the literal string 'NaN' parsed as a missing value.
kidney_df = pd.read_csv(
    'Kidney_Disease_Dataset.csv', sep=',', header='infer', na_values=['NaN']
)

# Preview the first five rows
kidney_df.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,SocioeconomicStatus,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,...,Itching,QualityOfLifeScore,HeavyMetalsExposure,OccupationalExposureChemicals,WaterQuality,MedicalCheckupsFrequency,MedicationAdherence,HealthLiteracy,Diagnosis,DoctorInCharge
0,1,71,0,0,0,2,31.069414,1,5.128112,1.67622,...,7.556302,76.0768,0,0,1,1.018824,4.966808,9.871449,1,Confidential
1,2,34,0,0,1,3,29.692119,1,18.609552,8.377574,...,6.836766,40.128498,0,0,0,3.923538,8.189275,7.161765,1,Confidential
2,3,80,1,1,0,1,37.394822,1,11.882429,9.607401,...,2.144722,92.872842,0,1,1,1.429906,7.624028,7.354632,1,Confidential
3,4,40,0,2,0,1,31.32968,0,16.020165,0.408871,...,7.077188,90.080321,0,0,0,3.226416,3.282688,6.629587,1,Confidential
4,5,43,0,1,1,2,23.726311,0,7.944146,0.780319,...,3.553118,5.258372,0,0,1,0.285466,3.849498,1.437385,1,Confidential


In [12]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

In [15]:
# EXTRACT CATEGORICAL FEATURES
def extract_categorical(df, include_dtypes, exclude_vars):
    """Select columns by dtype and cast them to the pandas ``category`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select columns from. It is not modified.
    include_dtypes : str or list
        Dtype selectors forwarded to ``DataFrame.select_dtypes(include=...)``.
    exclude_vars : list of str
        Column names to leave out of the result. Names that are not among
        the dtype-selected columns are skipped rather than raising
        ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        New frame with the selected columns, each cast to ``category``.
    """
    selected = df.select_dtypes(include=include_dtypes)
    # errors='ignore': an excluded name may already be absent from the dtype
    # selection (e.g. a float column when only integers are included); that
    # already satisfies the exclusion, so it should not be an error.
    selected = selected.drop(columns=exclude_vars, errors='ignore')
    # Coerce the remaining columns to the pandas category type
    return selected.astype('category')

# Build the categorical frame from the integer- and object-typed columns,
# leaving out the identifier columns and the columns that are handled as
# numeric features instead.
cat_df = extract_categorical(
    df=kidney_df,
    include_dtypes=['integer', 'object'],
    exclude_vars=['Age', 'SystolicBP', 'DiastolicBP',
                  'PatientID', 'DoctorInCharge'],
)

# View metadata. DataFrame.info() writes its report itself and returns None,
# so it is called directly: wrapping it in print() appends a stray "None".
cat_df.info()

# First five categorical observations. Only the last expression of a cell is
# rendered, so this preview appears only when the statement ends its cell.
cat_df.head(5)



# NUMERIC FEATURES
# Start from the full frame, remove every categorical column, then remove
# the two identifier columns ('PatientID' and 'DoctorInCharge').
num_df = (
    kidney_df
    .drop(columns=cat_df.columns)
    .drop(columns=['PatientID', 'DoctorInCharge'])
)

# Categorical features treated as ordinal
ordinal_cat_df = cat_df[['SocioeconomicStatus', 'EducationLevel']]

# Nominal categorical features: the categorical frame without the ordinal
# columns and without Diagnosis (the target)
nominal_cat_df = cat_df.drop(
    columns=['SocioeconomicStatus', 'EducationLevel', 'Diagnosis']
)

# View nominal categorical features
nominal_cat_df

# ############################################ #
#         Ordinal Processing Pipeline          #
# ############################################ #
# Impute missing categorical values with each column's most frequent value
cat_impute = SimpleImputer(
    strategy='most_frequent'
)
# Encode categories as integer codes; a category not seen during fit raises
ordinal_encode = OrdinalEncoder(
    handle_unknown='error'
)

# Develop a pipeline that processes ordinal categorical data
ord_cat_pipe = Pipeline(steps=[
    ('imputation', cat_impute),
    ('ordinal-encoder', ordinal_encode)
])


# ############################################ #
#         Nominal Processing Pipeline          #
# ############################################ #
# A Pipeline keeps the estimator objects it is handed (it does not copy them),
# so the nominal pipeline gets its own imputer. Sharing `cat_impute` between
# both pipelines would let fitting one pipeline overwrite the fitted state
# the other relies on whenever the pipelines are used on their own.
nom_cat_impute = SimpleImputer(
    strategy='most_frequent'
)
# One-hot encoder; categories not seen during fit are ignored at transform
one_hot_encode = OneHotEncoder(
    handle_unknown='ignore'
)

# Develop a pipeline that processes nominal categorical data
nom_cat_pipe = Pipeline(steps=[
    ('imputation', nom_cat_impute),
    ('one-hot-encode', one_hot_encode),
])


# ############################################ #
#         Numeric Processing Pipeline          #
# ############################################ #
# Impute missing data using the median column value
median_impute = SimpleImputer(
    strategy='median'
)
# Standardize each column (StandardScaler defaults: center and scale)
standardize = StandardScaler()

# Develop a pipeline that processes numeric data
num_pipe = Pipeline(steps=[
    ('imputation', median_impute),
    ('standardize', standardize)
])


# COLUMN TRANSFORMER --> route each group of columns through its own pipeline.
# Every entry is (name, pipeline, list of columns the pipeline is applied to).
processing_pipe = ColumnTransformer(
    transformers=[
        ('ordinal-cat-pipe', ord_cat_pipe, list(ordinal_cat_df.columns)),
        ('nominal-cat-pipe', nom_cat_pipe, list(nominal_cat_df.columns)),
        ('numeric-pipe', num_pipe, list(num_df.columns)),
    ],
)

# Display the assembled transformer
processing_pipe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1659 entries, 0 to 1658
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   Gender                         1659 non-null   category
 1   Ethnicity                      1659 non-null   category
 2   SocioeconomicStatus            1659 non-null   category
 3   EducationLevel                 1659 non-null   category
 4   Smoking                        1659 non-null   category
 5   FamilyHistoryKidneyDisease     1659 non-null   category
 6   FamilyHistoryHypertension      1659 non-null   category
 7   FamilyHistoryDiabetes          1659 non-null   category
 8   PreviousAcuteKidneyInjury      1659 non-null   category
 9   UrinaryTractInfections         1659 non-null   category
 10  ACEInhibitors                  1659 non-null   category
 11  Diuretics                      1659 non-null   category
 12  Statins                        165

In [19]:
# Put the categorical and numeric columns side by side (column-wise concat)
processed_df = pd.concat([cat_df, num_df], axis='columns')

# Response: Diagnosis --> 0 / 1 (Kidney Disease)
y = processed_df['Diagnosis']

# Predictors: every column except the response
X = processed_df.loc[:, processed_df.columns != 'Diagnosis']
X

Unnamed: 0,Gender,Ethnicity,SocioeconomicStatus,EducationLevel,Smoking,FamilyHistoryKidneyDisease,FamilyHistoryHypertension,FamilyHistoryDiabetes,PreviousAcuteKidneyInjury,UrinaryTractInfections,...,CholesterolTriglycerides,NSAIDsUse,FatigueLevels,NauseaVomiting,MuscleCramps,Itching,QualityOfLifeScore,MedicalCheckupsFrequency,MedicationAdherence,HealthLiteracy
0,0,0,0,2,1,0,0,0,0,0,...,212.095215,4.563139,3.563894,6.992244,4.518513,7.556302,76.076800,1.018824,4.966808,9.871449
1,0,0,1,3,1,1,1,0,0,0,...,255.451314,9.097002,5.327336,0.356290,2.202222,6.836766,40.128498,3.923538,8.189275,7.161765
2,1,1,0,1,1,0,0,0,0,0,...,251.902583,3.851249,4.855420,4.674069,5.967271,2.144722,92.872842,1.429906,7.624028,7.354632
3,0,2,0,1,0,0,0,0,0,0,...,392.338425,7.881765,8.531685,5.691455,2.176387,7.077188,90.080321,3.226416,3.282688,6.629587
4,0,1,1,2,0,0,0,0,0,0,...,370.523877,4.179459,1.422320,2.273459,6.800993,3.553118,5.258372,0.285466,3.849498,1.437385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1654,0,0,1,2,1,0,0,0,0,0,...,320.668245,5.408175,6.973771,1.541242,6.972093,2.138976,81.102765,0.951836,9.547583,2.046212
1655,0,0,2,1,0,0,0,0,0,0,...,334.818155,3.229984,0.465757,4.224176,0.465297,7.911566,10.600428,3.604147,1.609847,0.324417
1656,0,0,2,3,0,0,0,0,0,0,...,162.784996,8.367022,4.537944,6.061171,2.281970,0.015531,69.633427,0.801955,5.768617,4.935108
1657,0,0,2,2,0,0,0,0,0,0,...,389.349914,9.994551,1.844736,4.151696,0.257814,3.432765,31.858023,0.560298,2.744519,0.322592


## Training-Testing Split

In [24]:
from sklearn.model_selection import train_test_split

# Split predictors and response into training and testing partitions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,   # hold out 25% of the rows as the testing partition
    shuffle=True,     # shuffle the rows before splitting
    random_state=7,   # fixed seed so the shuffle can be replicated
)

# Preview the predictors of the testing partition
X_test

Unnamed: 0,Gender,Ethnicity,SocioeconomicStatus,EducationLevel,Smoking,FamilyHistoryKidneyDisease,FamilyHistoryHypertension,FamilyHistoryDiabetes,PreviousAcuteKidneyInjury,UrinaryTractInfections,...,CholesterolTriglycerides,NSAIDsUse,FatigueLevels,NauseaVomiting,MuscleCramps,Itching,QualityOfLifeScore,MedicalCheckupsFrequency,MedicationAdherence,HealthLiteracy
913,0,0,0,1,0,0,0,0,0,0,...,395.774690,5.778928,9.831580,3.938730,1.412725,7.059117,39.222388,3.531148,3.818084,6.756600
488,1,3,1,1,0,0,0,0,0,0,...,108.310247,1.819244,0.316005,1.389562,2.390393,8.118661,41.961538,2.162870,2.137252,5.891382
359,1,0,0,3,1,0,0,0,0,0,...,128.488268,5.786623,3.864071,4.693073,4.035657,7.263931,12.768648,3.578111,3.385254,6.845726
981,1,0,1,2,1,0,0,0,0,1,...,313.397275,8.807043,1.308437,6.393938,0.238078,8.482967,5.617130,2.023292,8.775366,6.118574
188,1,3,1,2,0,0,0,0,1,0,...,312.935469,9.579242,8.483994,1.472749,3.762790,8.634496,25.192556,0.540133,3.842780,3.736396
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1049,1,0,0,2,0,0,0,0,0,1,...,195.516592,2.328954,8.881058,1.287741,2.551971,5.350874,87.684583,2.872881,7.544269,8.394102
620,0,0,1,1,0,0,0,0,0,1,...,338.291237,2.153394,4.242217,1.558943,3.939655,7.525267,94.516671,0.474895,8.610023,0.228197
1436,1,0,0,0,0,1,0,0,0,1,...,389.020074,5.902814,4.580478,2.045427,1.756051,1.275583,66.456298,0.171102,5.822374,2.648489
581,1,0,2,1,0,0,0,0,1,0,...,226.172949,2.376036,9.557638,5.448011,2.078208,1.216881,23.528706,1.003602,2.104349,9.307882


## Connect Pipeline to Classification Tree

In [28]:
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid to evaluate: 4 depths x 3 split sizes = 12 combinations.
# The 'model__' prefix addresses the step named 'model' in the pipeline below.
param_grid = {
    'model__max_depth': range(4, 12, 2),        # depths [4, 6, 8, 10]
    'model__min_samples_split': range(2, 5, 1)  # split sizes [2, 3, 4]
}

# Classification tree --> sklearn model.
# The seed is fixed because tree fitting is randomized (features are permuted
# at each split); an unseeded tree can produce different cross-validation
# scores, and therefore a different "best" model, on every run.
class_tree = DTC(random_state=7)

# Processing pipeline + model come together in a single pipeline
tree_pipe = Pipeline(
    steps=[
        ('preprocessing', processing_pipe),  # Preprocessing stage of pipeline
        ('model', class_tree)  # Model training, fitting, and prediction stage
    ]
)

# 5-fold cross-validation for each hyperparameter combination
## 12 combinations * 5 folds = 60 fitted models
tree_pipe_cv = GridSearchCV(
    tree_pipe, param_grid,
    cv=5, n_jobs=1
)

tree_pipe_cv


## Train the Decision Tree and Evaluate the Optimal Hyperparameters

In [29]:
%%time

# Run the grid search on the training partition: the tree pipeline is fit
# for the hyperparameter combinations in the grid
tree_pipe_cv.fit(X_train, y_train)

CPU times: user 3.46 s, sys: 1.05 ms, total: 3.47 s
Wall time: 3.47 s


In [32]:
# Collect the cross-validation results in a DataFrame
cv_results = pd.DataFrame(tree_pipe_cv.cv_results_)

# Best rank first; equal ranks are ordered by the smaller standard deviation
# of the test score
cv_results = cv_results.sort_values(by=['rank_test_score', 'std_test_score'])

# Show the sorted results
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,0.037118,0.003668,0.011797,0.003214,4,4,"{'model__max_depth': 4, 'model__min_samples_sp...",0.907631,0.911647,0.911647,0.899598,0.919355,0.909975,0.00643,1
0,0.040008,0.010808,0.010222,0.00221,4,2,"{'model__max_depth': 4, 'model__min_samples_sp...",0.907631,0.915663,0.907631,0.899598,0.919355,0.909975,0.006914,1
5,0.046677,0.001404,0.010535,0.002455,6,4,"{'model__max_depth': 6, 'model__min_samples_sp...",0.927711,0.923695,0.915663,0.875502,0.907258,0.909966,0.018606,3
1,0.035383,0.002996,0.009111,0.001884,4,3,"{'model__max_depth': 4, 'model__min_samples_sp...",0.907631,0.907631,0.915663,0.88755,0.923387,0.908372,0.011944,4
4,0.038697,0.002244,0.010103,0.001196,6,3,"{'model__max_depth': 6, 'model__min_samples_sp...",0.915663,0.923695,0.919679,0.871486,0.903226,0.90675,0.018919,5
3,0.043519,0.003522,0.011185,0.002897,6,2,"{'model__max_depth': 6, 'model__min_samples_sp...",0.927711,0.911647,0.903614,0.871486,0.903226,0.903537,0.018317,6
8,0.047542,0.002066,0.008748,0.001187,8,4,"{'model__max_depth': 8, 'model__min_samples_sp...",0.915663,0.911647,0.907631,0.871486,0.903226,0.90193,0.015773,7
7,0.046456,0.003836,0.014184,0.002933,8,3,"{'model__max_depth': 8, 'model__min_samples_sp...",0.927711,0.927711,0.915663,0.86747,0.870968,0.901904,0.02707,8
6,0.049904,0.011777,0.010741,0.0009,8,2,"{'model__max_depth': 8, 'model__min_samples_sp...",0.919679,0.919679,0.899598,0.863454,0.891129,0.898708,0.020884,9
9,0.053161,0.003304,0.008389,0.000757,10,2,"{'model__max_depth': 10, 'model__min_samples_s...",0.903614,0.911647,0.911647,0.839357,0.903226,0.893898,0.027518,10


## Retrieve the Best Estimator (Optimal Tree)

In [34]:
# Best-scoring pipeline (preprocessing + tree) found by the grid search
optimal_tree = tree_pipe_cv.best_estimator_

# Display the selected pipeline
optimal_tree