# Predicting Patient Attributes

In [4]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_validate, StratifiedKFold
import matplotlib.pyplot as plt
from ipywidgets import interact

## Data
All data in this notebook is dummy data: the Indian Liver Patient Dataset (ILPD), found here: https://archive.ics.uci.edu/ml/datasets/ILPD+(Indian+Liver+Patient+Dataset), augmented with one dummy categorical feature.

Reference: Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

In [7]:
# Relative path of the CSV file that is loaded into `data` below.
data_path: str = 'data/dummy_data.csv'

In [8]:
# Load the dataset; the first CSV column is used as the row index.
data = pd.read_csv(data_path, index_col=0)

In [9]:
# Overview of the loaded frame: column names, dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 579 entries, 0 to 582
Data columns (total 12 columns):
age                           579 non-null int64
gender                        579 non-null int64
total_bilirubin               579 non-null float64
direct_bilirubin              579 non-null float64
alkaline_phosphotase          579 non-null int64
alamine_aminotransferase      579 non-null int64
aspartate_aminotransferase    579 non-null int64
total_proteins                579 non-null float64
albumin                       579 non-null float64
albumin_and_globulin_ratio    579 non-null float64
liver_patient                 579 non-null int64
dummy_category                579 non-null object
dtypes: float64(5), int64(6), object(1)
memory usage: 58.8+ KB


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
data.describe()

Unnamed: 0,age,gender,total_bilirubin,direct_bilirubin,alkaline_phosphotase,alamine_aminotransferase,aspartate_aminotransferase,total_proteins,albumin,albumin_and_globulin_ratio,liver_patient
count,579.0,579.0,579.0,579.0,579.0,579.0,579.0,579.0,579.0,579.0,579.0
mean,44.782383,0.758204,3.315371,1.494128,291.366149,81.126079,110.414508,6.481693,3.138515,0.947064,1.284974
std,16.221786,0.428542,6.227716,2.816499,243.561863,183.182845,289.850034,1.084641,0.794435,0.319592,0.451792
min,4.0,0.0,0.4,0.1,63.0,10.0,10.0,2.7,0.9,0.3,1.0
25%,33.0,1.0,0.8,0.2,175.5,23.0,25.0,5.8,2.6,0.7,1.0
50%,45.0,1.0,1.0,0.3,208.0,35.0,42.0,6.6,3.1,0.93,1.0
75%,58.0,1.0,2.6,1.3,298.0,61.0,87.0,7.2,3.8,1.1,2.0
max,90.0,1.0,75.0,19.7,2110.0,2000.0,4929.0,9.6,5.5,2.8,2.0


In [11]:
# Pairwise correlation of the numeric columns.
# `data` also holds the string column `dummy_category`; recent pandas
# versions raise on it instead of silently dropping it, so the numeric
# columns are selected explicitly (this works on old and new pandas alike).
data.select_dtypes(include='number').corr()

Unnamed: 0,age,gender,total_bilirubin,direct_bilirubin,alkaline_phosphotase,alamine_aminotransferase,aspartate_aminotransferase,total_proteins,albumin,albumin_and_globulin_ratio,liver_patient
age,1.0,0.055881,0.011,0.006784,0.078878,-0.087799,-0.020499,-0.186248,-0.264211,-0.216408,-0.133164
gender,0.055881,1.0,0.088068,0.09916,-0.029368,0.081339,0.079421,-0.095149,-0.095579,-0.003424,-0.081349
total_bilirubin,0.011,0.088068,1.0,0.874481,0.205739,0.213375,0.237323,-0.007906,-0.222087,-0.206267,-0.220218
direct_bilirubin,0.006784,0.09916,0.874481,1.0,0.234008,0.23318,0.257022,3.3e-05,-0.228409,-0.200125,-0.246273
alkaline_phosphotase,0.078878,-0.029368,0.205739,0.234008,1.0,0.124777,0.16658,-0.027062,-0.163419,-0.234166,-0.183363
alamine_aminotransferase,-0.087799,0.081339,0.213375,0.23318,0.124777,1.0,0.791862,-0.042432,-0.028658,-0.002375,-0.163117
aspartate_aminotransferase,-0.020499,0.079421,0.237323,0.257022,0.16658,0.791862,1.0,-0.025751,-0.084915,-0.07004,-0.151834
total_proteins,-0.186248,-0.095149,-0.007906,3.3e-05,-0.027062,-0.042432,-0.025751,1.0,0.783112,0.234887,0.033614
albumin,-0.264211,-0.095579,-0.222087,-0.228409,-0.163419,-0.028658,-0.084915,0.783112,1.0,0.689632,0.15977
albumin_and_globulin_ratio,-0.216408,-0.003424,-0.206267,-0.200125,-0.234166,-0.002375,-0.07004,0.234887,0.689632,1.0,0.163131


In [14]:
# Column used as the regression target (y); every other remaining column is a feature.
target_variable: str = 'age'  # regression y variable chosen for this dummy example

## Feature Scaling

In [15]:
# Scale every numeric feature to unit standard deviation.
# The standard deviation is computed on numeric columns only: `data` also
# contains the string column `dummy_category`, on which recent pandas
# versions raise instead of silently skipping it.
sds = data.select_dtypes(include='number').std()

# The regression target and the `liver_patient` label (used later for
# stratification) are left on their original scale.
for col in sds.index.drop([target_variable, 'liver_patient']):
    data[col] = data[col] / sds[col]

## Modelling

In [16]:
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge, ElasticNet, Lasso
from sklearn.ensemble import RandomForestRegressor

In [17]:
# Models to compare against each other on every sub-dataset.
# MLPRegressor and RandomForestRegressor draw random numbers while fitting,
# so they get a fixed seed; without it the reported scores change on every
# Restart & Run All.
models = [
    MLPRegressor(random_state=0),
    KNeighborsRegressor(),
    GaussianProcessRegressor(),
    SVR(),
    BayesianRidge(),
    ElasticNet(),
    Lasso(),
    RandomForestRegressor(random_state=0),
]

### Ignore Warnings for Demo

In [18]:
import warnings
warnings.simplefilter('ignore')

Split the dataset along categories. For each sub-dataset, compare the performance of all algorithms. For this example, the dummy categorical variable is used to segment the dataset.

In [19]:
# Cross-validate every model on every sub-dataset (one sub-dataset per value
# of `dummy_category`) and collect per-fold metrics and predictions.

scores = []  # one dict of metrics per (category, model, fold)
preds = []   # one Series of held-out predictions per (category, model, fold)

model_id = -1  # incremented once per (category, model) pair

n_folds = 3
cv_seed = 0  # fixed seed so the shuffled fold assignment is reproducible

for cat in data.dummy_category.unique():  # train a unique algorithm on each subdataset

    df_Xy = data[data.dummy_category == cat].drop('dummy_category', axis=1)

    # Folds are stratified on `liver_patient` and built once per category,
    # so every model is evaluated on exactly the same splits.
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=cv_seed)
    kfold_indices = list(skf.split(X=df_Xy, y=df_Xy['liver_patient']))

    for model in models:
        model_id += 1
        for ifold, (train_indices, test_indices) in enumerate(kfold_indices):
            df_train, df_test = df_Xy.iloc[train_indices], df_Xy.iloc[test_indices]

            X, y = df_train.drop(target_variable, axis=1), df_train[target_variable]
            X_test, y_test = df_test.drop([target_variable], axis=1), df_test[target_variable]

            # fit() re-estimates the model from scratch, so reusing the same
            # estimator object across folds and categories is safe.
            model = model.fit(X, y)

            pred = model.predict(X_test).clip(0)  # age cannot be negative

            scores.append({'model': type(model).__name__, 'model_id': model_id, 'fold': ifold,
                           'RMSE': mean_squared_error(y_test, pred)**0.5,
                           'MAE': mean_absolute_error(y_test, pred),
                           'r2': r2_score(y_test, pred), 'category': cat})

            # Keep the predictions aligned with the original row labels.
            preds.append(pd.Series(pred, index=X_test.index,
                                   name=f'predId{model_id}Fold{ifold}'))

# Build the prediction table once instead of joining inside the loop:
# one column per (model_id, fold), rows are the union of all row labels
# (NaN where a row was not in that fold's test set), sorted by label.
df_preds = pd.concat(preds, axis=1).sort_index() if preds else pd.DataFrame()

## Process Results

In [20]:
# One row per (category, model, fold), indexed by category; the model name
# column is stored as plain strings.
df_results = (
    pd.DataFrame(scores)
    .assign(model=lambda frame: frame['model'].astype(str))
    .set_index('category')
)

### Display all results for one subdataset (split along the categories)

In [23]:
def display_results_for_one_category(category):
    """Show all rows of `df_results` whose index equals `category`."""
    rows_for_category = df_results.loc[category]
    display(rows_for_category)
interact(display_results_for_one_category,
         category=df_results.index.unique().values)

interactive(children=(Dropdown(description='category', options=('orange', 'yellow', 'red'), value='orange'), O…

<function __main__.display_results_for_one_category(category)>

In [24]:
# Average each metric over the folds of one (category, model) pair,
# which is what a single model_id identifies.
results_by_model_id = df_results.groupby('model_id')
mae_fold_mean = results_by_model_id['MAE'].mean()
r2_fold_mean = results_by_model_id['r2'].mean()
RMSE_fold_mean = results_by_model_id['RMSE'].mean()

In [25]:
# Attach the per-model_id fold averages to every result row.
for metric_name, fold_means in (('MAE', mae_fold_mean),
                                ('r2', r2_fold_mean),
                                ('RMSE', RMSE_fold_mean)):
    df_results[metric_name + '_fold_mean'] = df_results['model_id'].map(fold_means)

## Compare the performance of algorithms across all data

In [26]:
# Collapse the per-fold rows to a single row per model_id, keeping the
# category (moved out of the index), model name and fold-averaged metrics.
summary_columns = ['model_id', 'model', 'MAE_fold_mean',
                   'r2_fold_mean', 'RMSE_fold_mean']
df_results_means = (
    df_results[summary_columns]
    .reset_index()
    .groupby('model_id')
    .first()
)

In [27]:
def compare_algorithms_for_different_metrics(metric):
    """Bar chart of the fold-averaged `metric` per model, one bar per category."""
    metric_column = metric + '_fold_mean'
    df_plot = df_results_means.pivot(index='model', columns='category',
                                     values=metric_column)
    ax = df_plot.plot.bar(figsize=(15, 8), stacked=False)
    ax.set_ylabel(metric)
    plt.show()
interact(compare_algorithms_for_different_metrics, metric=['MAE', 'r2', 'RMSE'])

interactive(children=(Dropdown(description='metric', options=('MAE', 'r2', 'RMSE'), value='MAE'), Output()), _…

<function __main__.compare_algorithms_for_different_metrics(metric)>

## Best Results and Model Per Category

In [28]:
# For each category, keep the row with the lowest fold-averaged MAE and show
# the winning model together with its fold-averaged metrics.
best_columns = ['model', 'MAE_fold_mean', 'r2_fold_mean', 'RMSE_fold_mean']
(
    df_results
    .sort_values('MAE_fold_mean', ascending=True)
    .groupby('category')
    .first()[best_columns]
)

Unnamed: 0_level_0,model,MAE_fold_mean,r2_fold_mean,RMSE_fold_mean
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
orange,BayesianRidge,13.127314,-0.003182,15.992155
red,ElasticNet,12.839579,-0.079034,15.783508
yellow,Lasso,12.75772,0.100933,15.590587
