# HealthPredictor
<hr>

## Midterm Jupyter Notebook, Group 1 - Healthcare
## CIS-579-002, Introduction to Artificial Intelligence
### Avinash Shete, Chandana Bhadravati Nagaraj, Jim Small, Ritesh Revansiddappa Honnalli

### Setup environment/notebook:

In [None]:
# Needed libraries:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

plt.style.use('fivethirtyeight')
%matplotlib inline
pd.set_option('display.max_columns', 26)

In [None]:
# Load data:
# Note:  Using forward slashes ("/" rather than "\") so the path works on Windows as well as Linux/macOS:
data = './Data/chronickidneydisease.csv'
df= pd.read_csv(data)

# Explore first few rows:
df.head()

### First look at data/dataframe:

In [None]:
# Dimensions (rows, columns) of dataframe:
df.shape

In [None]:
# Show overview of dataframe columns:
df.info()

In [None]:
# Show number of unique values per column:
df.nunique()

In [None]:
# Examine descriptive statistics for each column - average, standard deviation, quartiles, and more:
df.describe()

### Data Preprocessing:

In [None]:
# Remove id column:
df.drop('id', axis = 1, inplace = True)

In [None]:
# Rename columns to make them more user-friendly.
# NOTE(review): the names are assigned by position, so this assumes the dataframe (after dropping
# 'id') has exactly these 25 columns in this order - confirm against the CSV header:
df.columns = ['age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar', 'red_blood_cells', 'pus_cell',
              'pus_cell_clumps', 'bacteria', 'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium',
              'potassium', 'hemoglobin', 'packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count',
              'hypertension', 'diabetes_mellitus', 'coronary_artery_disease', 'appetite', 'peda_edema',
              'anemia', 'class']

In [None]:
# Again look at first few rows to see column name changes:
df.head()

In [None]:
# Generate summary statistics of numerical columns, including standard deviation and quartiles:
df.describe()

In [None]:
# Show overview of dataframe columns:
df.info()

In [None]:
# Convert relevant columns to numerical type; entries that cannot be parsed become NaN (errors='coerce'):
for count_col in ('packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count'):
    df[count_col] = pd.to_numeric(df[count_col], errors='coerce')

In [None]:
# Confirm changes - note the 3 columns data types are now floating point:
df.info()

In [None]:
# Collect categorical and numerical columns.
# Select by dtype family instead of comparing against the literal 'object' dtype, so text columns
# stored with a dedicated string dtype are still treated as categorical:
cat_cols = df.select_dtypes(exclude='number').columns.tolist()
num_cols = df.select_dtypes(include='number').columns.tolist()

In [None]:
# Review unique values in categorical columns to check for missing values (nans):
def display_cols(cols, val_title='Values', op='unique'):
    """Print a two-column text table summarising columns of the notebook dataframe `df`.

    Args:
        cols: Iterable of column names in `df`; rows are printed in sorted name order.
        val_title: Header text for the value column.
        op: Name of a no-argument Series method to call on each column
            (for example 'unique' or 'nunique').

    Scalar results are printed as-is; array-like results are joined with ', '.
    """
    cols = list(cols)
    # Width of the name column; fall back to the header width so an empty `cols`
    # prints just the header instead of raising on max() of an empty sequence.
    colsize = max((len(col) for col in cols), default=len('Column:'))
    trailing = int(colsize * 1.5)
    print(f'{"Column:":>{colsize}} | {val_title}:')
    print('-' * colsize, '|', '-' * trailing)
    for col in sorted(cols):
        res = getattr(df[col], op)()
        # is_scalar also recognises numpy numbers and strings, which a plain
        # isinstance(res, int) check would wrongly try to iterate over.
        output = res if pd.api.types.is_scalar(res) else ', '.join(map(str, res))
        print(f'{col:>{colsize}} | {output}')

display_cols(cat_cols)

In [None]:
# Remove extra whitespace:
for dfcol in ('diabetes_mellitus', 'coronary_artery_disease', 'class'):
    df[dfcol] = df[dfcol].str.strip()

In [None]:
# Re-check:
display_cols(cat_cols)

# Note the nans - more cleanup to do...

### Visually Explore Data:

In [None]:
# Visualize numerical features distribution:
plt.figure(figsize = (20, 15))

# The grid is 3 x 5; only the first 14 numeric columns are drawn (same cap as before).
for pn, column in enumerate(num_cols[:14], 1):
    ax = plt.subplot(3, 5, pn)
    # sns.distplot is deprecated and scheduled for removal; a density-scaled histogram
    # with a KDE overlay is the documented replacement.
    sns.histplot(df[column], kde=True, stat='density')
    plt.xlabel(column)

plt.tight_layout()
plt.show()

In [None]:
df.info()

In [None]:
# Visualize categorical column data distribution:
plt.figure(figsize = (20, 15))

# The grid is 3 x 4; only the first 11 categorical columns are drawn (same cap as before).
for i, column in enumerate(cat_cols[:11], 1):
    ax = plt.subplot(3, 4, i)
    # Pass the column explicitly as `x`: newer seaborn binds the first positional
    # argument to `data`, so a bare Series is no longer read as the x variable.
    sns.countplot(x=df[column], palette = 'rocket')
    plt.xlabel(column)

plt.tight_layout()
plt.show()

In [None]:
# Remap values - patients with Chronic Kidney Disease ('ckd') = 1, without ('notckd') = 0:
df['class'] = df['class'].map({'ckd': 1, 'notckd': 0})
df['class'] = pd.to_numeric(df['class'], errors='coerce')
# Note:  Doing this after plotting as the plot looks better before converting it to a numeric column

In [None]:
df.info()

In [None]:
# Heatmap of data:
plt.figure(figsize = (15, 8))

# Note:  Had to add numeric_only=True argument or errors out as there are non-numeric values:
sns.heatmap(df.corr(numeric_only=True), annot = True, linewidths = 2, linecolor = 'lightgrey')
plt.show()

In [None]:
# Show data columns:
df.columns

### Visual Data Analysis:

In [None]:
# Functions to plot data:

def violin(col):
    """Show a violin plot of `col` split by the 'class' column of the notebook dataframe `df`.

    A box plot is drawn inside each violin (box=True). Returns the result of `fig.show()`.
    """
    class_colors = {0: '#636EFA', 1: '#EF553B'}
    figure = px.violin(
        df,
        x='class',
        y=col,
        color='class',
        box=True,
        template='plotly_dark',
        color_discrete_map=class_colors,
    )
    figure.update_layout(legend_traceorder='reversed')
    return figure.show()

def kde(col):
    """Draw Kernel Density Estimation curves of `col`, one per value of 'class'.

    Uses the notebook dataframe `df`; the curves visualise the distribution of the
    column for those with and without CKD, and a legend is added for the hue.
    """
    # FacetGrid.map and add_legend both return the grid, so the calls can be chained.
    sns.FacetGrid(df, hue='class', height=6, aspect=2).map(sns.kdeplot, col).add_legend()

def scatter(col1, col2):
    """Show a scatter plot of `col1` (x) against `col2` (y), coloured by 'class'.

    Uses the notebook dataframe `df`, so the relationship between the two columns can be
    compared for those with and without CKD. Returns the result of `fig.show()`.
    """
    # NOTE(review): 'class' is numeric by the time this runs; confirm plotly applies the
    # discrete colour map rather than a continuous colour scale here.
    class_colors = {0: '#636EFA', 1: '#EF553B'}
    figure = px.scatter(
        df,
        x=col1,
        y=col2,
        color='class',
        template='plotly_dark',
        color_discrete_map=class_colors,
    )
    figure.update_layout(legend_traceorder='reversed')
    return figure.show()

In [None]:
violin('red_blood_cell_count')

In [None]:
kde('red_blood_cell_count')

In [None]:
violin('white_blood_cell_count')

In [None]:
kde('white_blood_cell_count')

In [None]:
violin('packed_cell_volume')

In [None]:
kde('packed_cell_volume')

In [None]:
violin('hemoglobin')

In [None]:
kde('hemoglobin')

In [None]:
violin('albumin')

In [None]:
kde('albumin')

In [None]:
violin('blood_glucose_random')

In [None]:
kde('blood_glucose_random')

In [None]:
violin('sodium')

In [None]:
kde('sodium')

In [None]:
violin('blood_urea')

In [None]:
kde('blood_urea')

In [None]:
violin('specific_gravity')

In [None]:
kde('specific_gravity')

In [None]:
scatter('hemoglobin', 'packed_cell_volume')

In [None]:
scatter('red_blood_cell_count', 'packed_cell_volume')

In [None]:
scatter('red_blood_cell_count', 'albumin')

In [None]:
scatter('sugar', 'blood_glucose_random')

In [None]:
scatter('packed_cell_volume','blood_urea')

In [None]:
px.bar(df, x="specific_gravity", y="packed_cell_volume", color='class', barmode='group', template = 'plotly_dark', height = 400)

In [None]:
px.bar(df, x="specific_gravity", y="albumin", color='class', barmode='group', template = 'plotly_dark', height = 400)

In [None]:
px.bar(df, x="blood_pressure", y="packed_cell_volume", color='class', barmode='group', template = 'plotly_dark', height = 400)

In [None]:
px.bar(df, x="blood_pressure", y="hemoglobin", color='class', barmode='group', template = 'plotly_dark', height = 400)

### Data Cleaning:

In [None]:
# Check for null values:
df.isna().sum().sort_values(ascending = False)

In [None]:
df[num_cols].isnull().sum()

In [None]:
df[cat_cols].isnull().sum()

In [None]:
# For filling null values we use two methods: random sampling from a column's observed values
# and mode imputation (most frequent value):
def random_value_imputation(feature, random_state=None):
    """Fill missing values of `df[feature]` in place with values drawn from its observed entries.

    Args:
        feature: Name of the column in the notebook dataframe `df` to impute.
        random_state: Optional seed forwarded to `Series.sample` for reproducible draws.

    Raises:
        ValueError: If the column has missing values but no observed value to draw from.
    """
    missing = df[feature].isna()
    n_missing = int(missing.sum())
    if n_missing == 0:
        return

    observed = df[feature].dropna()
    if observed.empty:
        raise ValueError(f"Cannot impute '{feature}': column has no observed values")

    # Draw without replacement when possible (original behaviour); fall back to sampling with
    # replacement when there are more gaps than observed values, which would otherwise raise.
    draws = observed.sample(n_missing, replace=n_missing > len(observed), random_state=random_state)
    # Assign by position: the drawn values keep their source index, which must not be aligned on.
    df.loc[missing, feature] = draws.to_numpy()
    
def impute_mode(feature):
    """Replace missing values of `df[feature]` with the column's most frequent value.

    When several values tie, the first entry returned by `Series.mode()` is used.
    """
    column = df[feature]
    df[feature] = column.fillna(column.mode()[0])

In [None]:
# Filling num_cols null values using random sampling method:
for col in num_cols:
    random_value_imputation(col)

In [None]:
df[num_cols].isnull().sum()

In [None]:
# Filling "red_blood_cells" and "pus_cell" using random sampling method
# and rest of cat_cols using mode imputation:
random_value_imputation('red_blood_cells')
random_value_imputation('pus_cell')

for col in cat_cols:
    impute_mode(col)

In [None]:
df[cat_cols].isnull().sum()

### Feature Encoding:

In [None]:
display_cols(cat_cols, val_title='Categories', op='nunique')

### Label Encoding:

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit one encoder per column and keep it, so each column's label -> integer mapping
# (encoder.classes_) stays available; a single shared encoder only remembers the last column.
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
df.head()

<a id = '5.0'></a>
<p style = "font-size : 45px; color : #34656d ; font-family : 'Comic Sans MS'; text-align : center; background-color : #f9b208; border-radius: 5px 5px;"><strong>Model Building</strong></p> 

In [None]:
# Target column and the independent (feature) columns - everything except the target:
dep_col = 'class'
ind_col = [name for name in df.columns if name != dep_col]

X, y = df[ind_col], df[dep_col]

In [None]:
# Split data into training and test sets:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)

In [None]:
from IPython.display import display as display

# Build confusion matrix results dataframe:
def get_cm_results(cm):
    # cm is the result of sklearn.metrics.confusion_matrix(y_test, classifier.predict(X_test))
    tn, fp, fn, tp = cm.ravel()
    data = [
        {'Result': 'True Positive', 'Number': tp,
         'Description': 'Actual occurrence, correctly predicted'},
        {'Result': 'True Negative', 'Number': tn,
         'Description': 'Non-occurrence, correctly predicted'},
        {'Result': 'False Negative', 'Number': fn,
         'Description': 'Actual occurrence, incorrectly predicted (Type II error)'},
        {'Result': 'False Positive', 'Number': fp,
         'Description': 'Non-occurrence, incorrectly predicted (Type I error)'},
    ]
    return pd.DataFrame(data).style.set_properties(
        subset=['Description'], **{'text-align': 'left'}
    ).hide(axis='index')


# Build confusion matrix metrics dataframe:
def get_cm_metrics(cm):
    # cm is the result of sklearn.metrics.confusion_matrix(y_test, classifier.predict(X_test))
    tn, fp, fn, tp = cm.ravel()
    prec_val = tp/(tp + fp)
    rec_val = tp/(tp + fn)
    data = [
        {'Metric': 'Accuracy', 'Value': (tp + tn)/(tp + tn + fp + fn),
         'Description': 'How often is classifier correct'},
        {'Metric': 'Precision', 'Value': prec_val,
         'Description': 'How often is TP correctly predicted (% of correct positives)'},
        {'Metric': 'Recall', 'Value': rec_val,
         'Description': 'How often is actual occurrence correctly predicted'},
        {'Metric': 'F1-Score', 'Value': (2 * prec_val * rec_val)/(prec_val + rec_val),
         'Description': 'Account for both precision and recall using harmonic mean'},
        {'Metric': 'Misclassification', 'Value': (fp + fn)/(tp + tn + fp + fn),
         'Description': 'How often is classifier wrong'},
        {'Metric': 'NPV', 'Value': tn/(tn + fn),
         'Description': 'How often is TN correctly predicted (% of correct negatives)'},
        {'Metric': 'FPR', 'Value': fp/(tn + fp),
         'Description': 'How often is non-occurrence falsely predicted'},
        {'Metric': 'Specificity', 'Value': tn/(tn + fp),
         'Description': 'How often is actual non-occurrence correctly predicted'},
        {'Metric': 'Prevalence', 'Value': (fn + tp)/(tp + tn + fp + fn),
         'Description': 'How often does actually occurrence occur in sample'},
    ]
    return pd.DataFrame(data).style.set_properties(
        subset=['Description'], **{'text-align': 'left'}
    ).format({'Value': '{:.2%}'}).hide(axis='index')

def cm_heatmap(cm):
    """Plot a confusion matrix as an annotated heatmap.

    Args:
        cm: 2x2 confusion matrix; rows are labelled as actual and columns as predicted
            classes, in the order ['No CKD', 'CKD'].

    Returns:
        The result of `plt.show()`.
    """
    labels = ['No CKD', 'CKD']
    # Label both axes, then reverse rows and columns so 'CKD' is listed first on each.
    flipped = pd.DataFrame(cm, index=labels, columns=labels).iloc[::-1, ::-1]

    plt.figure(figsize=(5,4))
    sns.heatmap(
        flipped,
        annot=True,
        fmt='d',
        cmap='coolwarm_r',
        linewidths=0.5,
        cbar_kws={'label': 'Count'},
    )
    plt.title('Confusion Matrix Heatmap')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    return plt.show()

<a id = '5.2'></a>
<p style = "font-size : 25px; color : #34656d ; font-family : 'Comic Sans MS'; text-align : center; background-color : #fbc6a4; border-radius: 5px 5px;"><strong>Decision Tree Classifier</strong></p> 

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fit a decision tree with default hyperparameters:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

# Predict once per split and reuse the predictions for every metric below:
dtc_train_pred = dtc.predict(X_train)
dtc_test_pred = dtc.predict(X_test)

# Accuracy score, confusion matrix and classification report of decision tree:
dtc_acc = accuracy_score(y_test, dtc_test_pred)

print(f"Training Accuracy of Decision Tree Classifier is {accuracy_score(y_train, dtc_train_pred)}")
print(f"Test Accuracy of Decision Tree Classifier is {dtc_acc} \n")

cm = confusion_matrix(y_test, dtc_test_pred)
print(f"Confusion Matrix :- \n{cm}\n")
print(f"Classification Report :- \n {classification_report(y_test, dtc_test_pred)}")

In [None]:
# Plot confusion matrix heatmap:
cm_heatmap(cm)

# Show classifier results and metrics:
display(get_cm_results(cm))
display(get_cm_metrics(cm))

In [None]:
# Hyper parameter tuning of decision tree:
from sklearn.model_selection import GridSearchCV

# Search space for the decision tree. Every value must be accepted by
# DecisionTreeClassifier, otherwise those parameter combinations fail to fit.
grid_param = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [3, 5, 7, 10],
    'splitter' : ['best', 'random'],
    'min_samples_leaf' : [1, 2, 3, 5, 7],
    # As an integer, min_samples_split must be at least 2 (a node needs two samples to split).
    'min_samples_split' : [2, 3, 5, 7],
    # 'auto' was an alias of 'sqrt' for classification trees and is no longer accepted.
    'max_features' : ['sqrt', 'log2']
}

# 5-fold cross-validated exhaustive search using all available cores:
grid_search_dtc = GridSearchCV(dtc, grid_param, cv = 5, n_jobs = -1, verbose = 1)
grid_search_dtc.fit(X_train, y_train)

In [None]:
# Best parameters and best score:
print(grid_search_dtc.best_params_)
print(grid_search_dtc.best_score_)

In [None]:
# Best estimator found by the grid search:
dtc = grid_search_dtc.best_estimator_

# Predict once per split and reuse the predictions for every metric below:
dtc_train_pred = dtc.predict(X_train)
dtc_test_pred = dtc.predict(X_test)

# Accuracy score, confusion matrix and classification report of decision tree
dtc_acc = accuracy_score(y_test, dtc_test_pred)

print(f"Training Accuracy of Decision Tree Classifier is {accuracy_score(y_train, dtc_train_pred)}")
print(f"Test Accuracy of Decision Tree Classifier is {dtc_acc} \n")

cm = confusion_matrix(y_test, dtc_test_pred)
print(f"Confusion Matrix :- \n{cm}\n")
print(f"Classification Report :- \n {classification_report(y_test, dtc_test_pred)}")

In [None]:
# Plot confusion matrix heatmap:
cm_heatmap(cm)

# Show classifier results and metrics:
display(get_cm_results(cm))
display(get_cm_metrics(cm))

<a id = '5.3'></a>
<p style = "font-size : 25px; color : #34656d ; font-family : 'Comic Sans MS'; text-align : center; background-color : #fbc6a4; border-radius: 5px 5px;"><strong>Random Forest Classifier</strong></p>

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Note:  The default for max_features changed from 'auto' to 'sqrt', updating:
rd_clf = RandomForestClassifier(
    n_estimators = 130,
    criterion = 'entropy',
    max_depth = 11,
    max_features = 'sqrt',
    min_samples_leaf = 2,
    min_samples_split = 3,
)
rd_clf.fit(X_train, y_train)

# Predict once per split and reuse the predictions for every metric below:
rd_train_pred = rd_clf.predict(X_train)
rd_test_pred = rd_clf.predict(X_test)

# Accuracy score, confusion matrix and classification report of random forest:
rd_clf_acc = accuracy_score(y_test, rd_test_pred)

print(f"Training Accuracy of Random Forest Classifier is {accuracy_score(y_train, rd_train_pred)}")
print(f"Test Accuracy of Random Forest Classifier is {rd_clf_acc} \n")

cm = confusion_matrix(y_test, rd_test_pred)
print(f"Confusion Matrix :- \n{cm}\n")
print(f"Classification Report :- \n {classification_report(y_test, rd_test_pred)}")

In [None]:
# Plot confusion matrix heatmap:
cm_heatmap(cm)

# Show classifier results and metrics:
display(get_cm_results(cm))
display(get_cm_metrics(cm))

<a id = '6.0'></a>
<p style = "font-size : 35px; color : #34656d ; font-family : 'Comic Sans MS'; text-align : center; background-color : #f9b208; border-radius: 5px 5px;"><strong>Models Comparison</strong></p> 

In [None]:
models = pd.DataFrame({
    'Model' : ['Decision Tree Classifier', 'Random Forest Classifier'],
    'Score' : [dtc_acc, rd_clf_acc]
})

models.sort_values(by = 'Score', ascending = False)

In [None]:
px.bar(data_frame = models, x = 'Score', y = 'Model', color = 'Score', template = 'plotly_dark', 
       title = 'Models Comparison')

### Select random forest classifier - more reliable:

In [None]:
# Top 10 Features: rank the forest's feature importances and keep the ten largest.
feature_scores = pd.DataFrame(
    rd_clf.feature_importances_,
    index=X_train.columns,
    columns=['Score'],
).sort_values(by='Score', ascending=False)
top10_feature = feature_scores.nlargest(n=10, columns=['Score'])

# Bar chart of the ten scores, with slanted tick labels so the feature names stay readable:
plt.figure(figsize=(14,6))
g = sns.barplot(x=top10_feature.index, y=top10_feature['Score'])
p = g.set_title('Top 10 Features with Random Forest')
p = g.set_xlabel('Feature name')
p = g.set_ylabel('Random Forest score')
p = g.set_xticklabels(g.get_xticklabels(), horizontalalignment='right', rotation=45)

In [None]:
top10_feature.index

In [None]:
X.columns

In [None]:
# Prune columns not in top 10 (single drop instead of one drop per column):
X = X.drop(columns=[name for name in X.columns if name not in top10_feature.index])

In [None]:
X.head()

In [None]:
# Feature subset (and column order) used for the final model; defined once so the
# training and test sets cannot drift apart:
selected_features = [
    'specific_gravity', 'hemoglobin', 'serum_creatinine', 'albumin',
    'packed_cell_volume', 'diabetes_mellitus', 'hypertension',
    'blood_glucose_random', 'red_blood_cell_count', 'blood_urea',
]
X_train = X_train[selected_features]
X_test = X_test[selected_features]
rd_clf.fit(X_train, y_train)

### Testing Predictions:

In [None]:
# Prediction 1 - no CKD:
# The model was fitted on a DataFrame, so pass the sample as a DataFrame carrying the same
# column names; this lets scikit-learn check the features against those seen during fit.
# Values are listed in the column order of X_train.
sample = pd.DataFrame([[1.025, 15.8, 1.1, 0.0, 53.0, 0, 0, 131.0, 6.1, 18.0]], columns=X_train.columns)
prediction = rd_clf.predict(sample)[0]
if prediction:
    print('Oops! You have Chronic Kidney Disease.')
else:
    print("Great! You don't have Chronic Kidney Disease.")

In [None]:
# Prediction 2 - CKD:
# The model was fitted on a DataFrame, so pass the sample as a DataFrame carrying the same
# column names; this lets scikit-learn check the features against those seen during fit.
# Values are listed in the column order of X_train.
sample = pd.DataFrame([[1.020, 15.4, 1.2, 1.0, 44.0, 1, 1, 121.0, 5.2, 36.0]], columns=X_train.columns)
prediction = rd_clf.predict(sample)[0]
if prediction:
    print('Oops! You have Chronic Kidney Disease.')
else:
    print("Great! You don't have Chronic Kidney Disease.")

### Serialize and save model:

In [None]:
import pickle

# Use a context manager so the file handle is flushed and closed once the model is written:
with open("CKD.pkl", "wb") as model_file:
    pickle.dump(rd_clf, model_file)