# Task: predict whether a person has ckd or notckd

## ckd - chronic kidney disease
## notckd - not chronic kidney disease

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the kidney-disease records from the Kaggle input directory.
data = pd.read_csv(r'../input/kidney-kronicle/kidney_disease.csv')
# Preview the first rows to sanity-check the load.
data.head()

In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
data.shape

## Features:
*     age - age
*     bp - blood pressure
*     sg - specific gravity
*     al - albumin
*     su - sugar
*     rbc - red blood cells
*     pc - pus cell
*     pcc - pus cell clumps
*     ba - bacteria
*     bgr - blood glucose random
*     bu - blood urea
*     sc - serum creatinine
*     sod - sodium
*     pot - potassium
*     hemo - haemoglobin
*     pcv - packed cell volume
*     wc - white blood cell count
*     rc - red blood cell count
*     htn - hypertension
*     dm - diabetes mellitus
*     cad - coronary artery disease
*     appet - appetite
*     pe - pedal edema
*     ane - anemia
*     classification - class

In [None]:
# Parse the dataset description file, splitting each line on '-', and turn the
# parsed index into a regular column; the two resulting columns are then
# labelled as the abbreviated name and the descriptive name.
columns = pd.read_csv(
    '../input/kidney-kronicle/data_description.txt',
    sep='-',
).reset_index()
columns.columns = ['cols', 'abb_col_names']
columns

In [None]:
# Look at the frame once more while it still carries the abbreviated headers.
data.head()

In [None]:
# Replace the abbreviated headers with the descriptive names parsed from the
# description file. The assignment is positional, so it relies on the
# description rows being listed in the same order as the CSV columns.
data.columns=columns['abb_col_names'].values
data.head()

In [None]:
# Summarise dtypes and non-null counts per column.
data.info()

In [None]:
# List the column labels now in use.
data.columns

As can be seen, `red blood cell count`, `packed cell volume` and `white blood cell count` have object dtype. We need to
convert them to a numerical dtype.

In [None]:
# Columns that need to be converted to a numeric dtype.
features = [
    'packed cell volume',
    'white blood cell count',
    'red blood cell count',
]


def convert_dtype(data, feature):
    """Coerce one column of ``data`` to a numeric dtype, in place.

    Entries that cannot be parsed as numbers become NaN (``errors='coerce'``).
    The frame is modified directly and nothing is returned.
    """
    numeric_values = pd.to_numeric(data[feature], errors='coerce')
    data[feature] = numeric_values

In [None]:
# Convert every column listed in `features` to numeric (modifies `data` in
# place), then re-check the dtypes to confirm the conversion.
for feature in features:
    convert_dtype(data,feature)
data.info()

# Extract Numerical & Categorical Features

In [None]:
def extract_cat_num(data):
    """Partition the columns of ``data`` by dtype.

    Returns a ``(cat_col, num_col)`` tuple of column-name lists: columns whose
    dtype is ``object`` go into the first list and all remaining columns into
    the second, each in the frame's original column order.
    """
    categorical, numerical = [], []
    for name in data.columns:
        bucket = categorical if data[name].dtype == 'object' else numerical
        bucket.append(name)
    return categorical, numerical
# Split the current frame's columns into object-dtype and non-object-dtype name lists.
cat_col,num_col=extract_cat_num(data)

In [None]:
# Columns with object dtype (treated as categorical).
cat_col

In [None]:
# Columns with any non-object dtype (treated as numerical).
num_col

### Unique values of each categorical feature, to check whether the data contains any dirty entries

In [None]:
# Print the distinct values of every categorical column so that stray
# whitespace or misspelled labels become visible.
for col in cat_col:
    print('{} has {} values '.format(col,data[col].unique()))
    print('\n')

So we need to correct two features and the target variable, which contain inconsistently formatted values (stray tabs and spaces).

In [None]:
# Replace incorrect values: remove the stray tab/space characters so that each
# category has a single spelling.
# NOTE: the result of replace() is assigned back WITHOUT inplace=True. With
# inplace=True, replace() returns None, and assigning that None would overwrite
# the entire 'diabetes mellitus' column with missing values.
data['diabetes mellitus'] = data['diabetes mellitus'].replace(
    to_replace={'\tno': 'no', '\tyes': 'yes', ' yes': 'yes'}
)
data['coronary artery disease'] = data['coronary artery disease'].replace(to_replace='\tno', value='no')
data['class'] = data['class'].replace(to_replace='ckd\t', value='ckd')

# Re-print the distinct values to confirm the clean-up.
for col in cat_col:
    print('{} has {} values  '.format(col, data[col].unique()))
    print('\n')

In [None]:
!pip install pandas-profiling


In [None]:
# Imported here rather than at the top because the package is installed by the
# preceding cell.
from pandas_profiling import ProfileReport
# Build and display an automated exploratory report for the frame.
ProfileReport(data, title="EDA Report")

In [None]:
# Keep 99% of the rows for modelling and hold out the remaining 1% as
# "unseen" data for a final prediction check.
data1 = data.sample(frac=0.99, random_state=42)
# Drop the sampled rows by their original index labels BEFORE any index reset.
data_unseen = data.drop(data1.index).reset_index(drop=True)
# Reset the index of the modelling split itself (previously the full frame was
# reset by mistake, leaving data1 with its shuffled index).
data1 = data1.reset_index(drop=True)
# Report the size of the modelling split, i.e. the frame passed to setup().
print('Data for Modeling: ' + str(data1.shape))

In [None]:
# Report the size of the held-out split.
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

In [None]:
!pip install pycaret

In [None]:
from pycaret.classification import *

In [None]:
# Configure the PyCaret classification experiment on the modelling split.
kidney = setup(
    data=data1,
    target='class',
    session_id=123,
    normalize=True,
    transformation=True,
    log_experiment=True,
    train_size=0.8,  # train on 80% of the available data
    handle_unknown_categorical=True,
    unknown_categorical_method='most_frequent',
    remove_multicollinearity=True,  # drop one of two features that are highly correlated with each other
    ignore_low_variance=True,  # remove categorical features with statistically insignificant variance
    combine_rare_levels=True,  # merge categorical levels below rare_level_threshold into a single level
    numeric_imputation='median',
    categorical_imputation='mode',
    ignore_features=['id'],
    feature_selection=True,
)

In [None]:
# Train and cross-validate the candidate classifiers of the experiment and
# keep the top-ranked one.
best = compare_models()

In [None]:
# Fit the model registered under the 'et' ID (Extra Trees classifier in PyCaret).
et = create_model('et')

In [None]:
# Show the fitted estimator together with its hyperparameters.
print(et)

In [None]:
# Search for better hyperparameters for the 'et' model using PyCaret's default tuning settings.
tuned_et = tune_model(et)

In [None]:
# Open PyCaret's interactive set of diagnostic plots for the tuned model.
evaluate_model(tuned_et)

In [None]:
# With no `data` argument, predict_model scores the model on the hold-out
# split that setup() set aside (20% here, given train_size=0.8).
predict_model(tuned_et)

In [None]:
# Refit the tuned model on the full modelling data (train + hold-out).
final_et = finalize_model(tuned_et)
final_et

In [None]:
# NOTE(review): the finalized model has already been trained on the hold-out
# rows, so the metrics shown here are optimistic and not a true test score.
predict_model(final_et)

In [None]:
# Predict on the rows that were held out before setup() and never used for training.
unseen_predictions = predict_model(final_et, data=data_unseen)
unseen_predictions.head()

In [None]:
# Average of the per-row `Score` column over the unseen rows, rounded to two
# decimals. NOTE(review): `Score` is the PyCaret 2.x column name — confirm it
# matches the installed version.
print("Confidence Score :   {}".format(round(unseen_predictions.Score.mean(),2)))