<a href="https://colab.research.google.com/github/hantswilliams/HHA-507-2022/blob/main/autoML/autoML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Pacakges

In [1]:
!pip install tpot mljar-supervised

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tpot
  Downloading TPOT-0.11.7-py3-none-any.whl (87 kB)
[K     |████████████████████████████████| 87 kB 3.0 MB/s 
[?25hCollecting mljar-supervised
  Downloading mljar-supervised-0.11.3.tar.gz (112 kB)
[K     |████████████████████████████████| 112 kB 59.0 MB/s 
Collecting xgboost>=1.1.0
  Downloading xgboost-1.6.2-py3-none-manylinux2014_x86_64.whl (255.9 MB)
[K     |████████████████████████████████| 255.9 MB 50 kB/s 
[?25hCollecting stopit>=1.1.1
  Downloading stopit-1.1.2.tar.gz (18 kB)
Collecting deap>=1.2
  Downloading deap-1.3.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (139 kB)
[K     |████████████████████████████████| 139 kB 46.0 MB/s 
Collecting update-checker>=0.16
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting lightgbm>=3.0.0
  Downloading lightgbm-3.3.3-py3-none-manylinux1_x86_64.whl

In [28]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML


# Options Available

- mode — the package ships with four built-in models. 
  - The Explain mode is ideal for explaining and understanding the data. It results in visualizations of feature importance as well as tree visualizations.
  - The Perform is used when building ML models for production. 
  - The Compete is meant to build models used in machine learning competitions. 
  - The Optuna mode is used to search for highly-tuned ML models.
- algorithms — specifies the algorithms you would like to use. They are usually passed in as a list.
- results_path — the path where the results will be stored
- total_time_limit — the total time in seconds for training the model
- train_ensemble — dictates if an ensemble will be created at the end of the training process
- stack_models — determines if a models stack will be created
- eval_metric — the metric that will be optimized. If auto the logloss is used for classification problems while the rmse is used for regression problems

In [None]:
#automl = AutoML(
    # mode="Explain"
    # algorithms=""
    # results_path="AutoML_22",
    # total_time_limit=30 * 60,
    # train_ensemble=True,
    # stack_models="",
    # eval_metric=""
#)

# Healthcare Dataset - SPARCS

## Load in dataset

In [2]:
import pandas as pd
sparcs = pd.read_csv('https://raw.githubusercontent.com/hantswilliams/HHA-507-2022/main/autoML/datasets/data_sparcs.csv')
sparcs

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
0,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,30 to 49,147,M,White,Not Span/Hispanic,...,Minor,Medical,Private Health Insurance,,,0,N,Y,4757.01,4747.83
1,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,18 to 29,148,F,White,Not Span/Hispanic,...,Minor,Medical,Blue Cross/Blue Shield,Self-Pay,Self-Pay,0,N,N,5090.25,2985.64
2,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,0 to 17,147,M,White,Not Span/Hispanic,...,Minor,Medical,Self-Pay,Self-Pay,Self-Pay,2900,N,N,4948.50,2129.67
3,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,70 or Older,148,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Medicare,Self-Pay,0,N,Y,4719.75,8454.41
4,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,50 to 69,148,M,White,Not Span/Hispanic,...,Major,Medical,Blue Cross/Blue Shield,Medicare,Self-Pay,0,N,Y,50384.75,34565.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23578,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Private Health Insurance,,0,N,Y,50833.00,8961.40
23579,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,0 to 17,117,F,Other Race,Spanish/Hispanic,...,Minor,Medical,Private Health Insurance,,,3200,N,N,10948.00,2214.06
23580,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,30 to 49,117,M,White,Not Span/Hispanic,...,Minor,Medical,Medicaid,,,0,N,N,46421.00,11083.24
23581,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,M,White,Not Span/Hispanic,...,Major,Medical,Medicare,Medicare,,0,N,Y,46122.00,7951.26


In [3]:
sparcs.columns

Index(['Health Service Area', 'Hospital County',
       'Operating Certificate Number', 'Facility Id', 'Facility Name',
       'Age Group', 'Zip Code - 3 digits', 'Gender', 'Race', 'Ethnicity',
       'Length of Stay', 'Type of Admission', 'Patient Disposition',
       'Discharge Year', 'CCS Diagnosis Code', 'CCS Diagnosis Description',
       'CCS Procedure Code', 'CCS Procedure Description', 'APR DRG Code',
       'APR DRG Description', 'APR MDC Code', 'APR MDC Description',
       'APR Severity of Illness Code', 'APR Severity of Illness Description',
       'APR Risk of Mortality', 'APR Medical Surgical Description',
       'Payment Typology 1', 'Payment Typology 2', 'Payment Typology 3',
       'Birth Weight', 'Abortion Edit Indicator',
       'Emergency Department Indicator', 'Total Charges', 'Total Costs'],
      dtype='object')

## Potential variables of interest

- APR Risk of Mortality (categorical) 
- Total costs (continuous) 
- Length of Stay

In [4]:
sparcs['Length of Stay'].describe()

count     23583
unique       97
top           2
freq       5378
Name: Length of Stay, dtype: object

In [5]:
sparcs['Total Costs'].describe()

count    2.358300e+04
mean     1.472282e+04
std      2.718098e+04
min      6.700000e-01
25%      4.471700e+03
50%      8.320120e+03
75%      1.590874e+04
max      1.591541e+06
Name: Total Costs, dtype: float64

In [14]:
sparcs['APR Risk of Mortality'].value_counts()

Minor       13990
Moderate     4952
Major        3452
Extreme      1187
Name: APR Risk of Mortality, dtype: int64

## Create some simplified binary versions

In [15]:
sparcs['Length of Stay'] = pd.to_numeric(sparcs['Length of Stay'], errors='coerce')
sparcs['sparcs_los'] = sparcs['Length of Stay'].apply(lambda x: 'long' if x > 3 else 'short')
sparcs.drop('Length of Stay', axis=1, inplace=True)
sparcs['sparcs_los'].value_counts()


short    13008
long     10575
Name: sparcs_los, dtype: int64

# MLJar Examples

## Binary Classifier Example 1 - SPARCS

### **Create new model**

In [16]:
X = sparcs.drop(columns=['sparcs_los'])

In [17]:
y = sparcs["sparcs_los"]

In [18]:
X

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
0,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,30 to 49,147,M,White,Not Span/Hispanic,...,Minor,Medical,Private Health Insurance,,,0,N,Y,4757.01,4747.83
1,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,18 to 29,148,F,White,Not Span/Hispanic,...,Minor,Medical,Blue Cross/Blue Shield,Self-Pay,Self-Pay,0,N,N,5090.25,2985.64
2,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,0 to 17,147,M,White,Not Span/Hispanic,...,Minor,Medical,Self-Pay,Self-Pay,Self-Pay,2900,N,N,4948.50,2129.67
3,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,70 or Older,148,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Medicare,Self-Pay,0,N,Y,4719.75,8454.41
4,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,50 to 69,148,M,White,Not Span/Hispanic,...,Major,Medical,Blue Cross/Blue Shield,Medicare,Self-Pay,0,N,Y,50384.75,34565.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23578,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Private Health Insurance,,0,N,Y,50833.00,8961.40
23579,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,0 to 17,117,F,Other Race,Spanish/Hispanic,...,Minor,Medical,Private Health Insurance,,,3200,N,N,10948.00,2214.06
23580,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,30 to 49,117,M,White,Not Span/Hispanic,...,Minor,Medical,Medicaid,,,0,N,N,46421.00,11083.24
23581,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,M,White,Not Span/Hispanic,...,Major,Medical,Medicare,Medicare,,0,N,Y,46122.00,7951.26


In [19]:
y

0        short
1        short
2        short
3        short
4         long
         ...  
23578     long
23579    short
23580     long
23581     long
23582     long
Name: sparcs_los, Length: 23583, dtype: object

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25)

In [26]:
X_test

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
18664,New York City,Queens,7003004.0,1630.0,Long Island Jewish Medical Center,18 to 29,113,M,Other Race,Spanish/Hispanic,...,Minor,Medical,Medicaid,Medicaid,Self-Pay,0,N,Y,36909.00,7266.25
6994,Hudson Valley,Dutchess,1302001.0,181.0,Vassar Brothers Medical Center,70 or Older,126,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,,,0,N,Y,33749.32,8782.71
18784,New York City,Queens,7003007.0,1633.0,Queens Hospital Center,70 or Older,114,F,White,Unknown,...,Minor,Medical,Medicare,Medicare,Private Health Insurance,0,N,N,22400.00,11047.39
17631,New York City,Manhattan,7002054.0,3975.0,New York Presbyterian Hospital - Allen Hospital,30 to 49,OOS,F,Black/African American,Not Span/Hispanic,...,Minor,Medical,Medicaid,Self-Pay,,0,N,N,680561.47,161077.77
11362,New York City,Kings,7001003.0,1288.0,Brooklyn Hospital Center - Downtown Campus,50 to 69,114,F,Black/African American,Unknown,...,Moderate,Surgical,Blue Cross/Blue Shield,Self-Pay,,0,N,Y,55032.23,13654.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,Western NY,Chautauqua,602001.0,103.0,Woman's Christian Association,70 or Older,147,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Medicare,,0,N,Y,4825.80,2401.68
21182,Long Island,Nassau,2950001.0,527.0,South Nassau Communities Hospital,70 or Older,115,M,White,Spanish/Hispanic,...,Moderate,Medical,Medicare,Medicare,,0,N,Y,56879.72,10790.79
5579,Capital/Adirond,Albany,101004.0,5.0,St Peters Hospital,70 or Older,123,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Private Health Insurance,,0,N,Y,16730.69,3729.33
21325,Long Island,Nassau,2950001.0,527.0,South Nassau Communities Hospital,70 or Older,115,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Medicare,,0,N,Y,51917.30,11391.66


In [29]:
automl = AutoML(results_path="sparcs_binary_los", mode="Explain")

In [30]:
automl.fit(X_train, y_train)

Linear algorithm was disabled.
AutoML directory: sparcs_binary_los
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 2 models
1_Baseline logloss 0.687821 trained in 0.58 seconds




2_DecisionTree logloss 0.448005 trained in 15.09 seconds
* Step default_algorithms will try to check up to 3 models
3_Default_Xgboost logloss 0.259742 trained in 20.64 seconds
4_Default_NeuralNetwork logloss 0.400019 trained in 7.95 seconds
5_Default_RandomForest logloss 0.391253 trained in 20.0 seconds
* Step ensemble will try to check up to 1 model
Ensemble logloss 0.259742 trained in 1.13 seconds
AutoML fit time: 75.1 seconds
AutoML best model: 3_Default_Xgboost


AutoML(results_path='sparcs_binary_los')

In [31]:
pred = automl.predict(X_test)
pred

array(['long', 'long', 'long', ..., 'short', 'long', 'short'],
      dtype=object)

In [None]:
automl.report()

### **Test new (not really) data**

In [34]:
# load in the data model 

automl_sparcs_los = AutoML(results_path="sparcs_binary_los")

In [40]:
# create a new dataset that follows the same data structure as the training set
X_withlos = sparcs.sample(25)
X_withoutlos = X_withlos.drop(columns=['sparcs_los'])

In [36]:
X_withlos

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,...,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs,sparcs_los
15552,New York City,Manhattan,7002024.0,1456.0,Mount Sinai Hospital,30 to 49,113,F,White,Not Span/Hispanic,...,Surgical,Private Health Insurance,Self-Pay,,0,N,N,36155.14,14532.5,long
6324,Capital/Adirond,Rensselaer,4102002.0,756.0,Samaritan Hospital,50 to 69,125,M,White,Not Span/Hispanic,...,Surgical,Private Health Insurance,Self-Pay,Self-Pay,0,N,Y,113306.08,44874.17,long
14296,New York City,Manhattan,7002002.0,1439.0,Mount Sinai Beth Israel,50 to 69,100,M,Black/African American,Spanish/Hispanic,...,Surgical,Medicaid,Medicaid,Self-Pay,0,N,Y,78430.1,19496.22,long
13714,New York City,Manhattan,7002054.0,1437.0,New York-Presbyterian/Lower Manhattan Hospital,30 to 49,112,M,White,Not Span/Hispanic,...,Surgical,Private Health Insurance,Self-Pay,,0,N,Y,18995.85,5751.28,short
4629,Central NY,Onondaga,3301007.0,635.0,University Hospital SUNY Health Science Center,50 to 69,130,M,Other Race,Not Span/Hispanic,...,Surgical,Private Health Insurance,,,0,N,N,124879.4,56514.75,long


In [37]:
X_withoutlos

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
15552,New York City,Manhattan,7002024.0,1456.0,Mount Sinai Hospital,30 to 49,113,F,White,Not Span/Hispanic,...,Minor,Surgical,Private Health Insurance,Self-Pay,,0,N,N,36155.14,14532.5
6324,Capital/Adirond,Rensselaer,4102002.0,756.0,Samaritan Hospital,50 to 69,125,M,White,Not Span/Hispanic,...,Moderate,Surgical,Private Health Insurance,Self-Pay,Self-Pay,0,N,Y,113306.08,44874.17
14296,New York City,Manhattan,7002002.0,1439.0,Mount Sinai Beth Israel,50 to 69,100,M,Black/African American,Spanish/Hispanic,...,Moderate,Surgical,Medicaid,Medicaid,Self-Pay,0,N,Y,78430.1,19496.22
13714,New York City,Manhattan,7002054.0,1437.0,New York-Presbyterian/Lower Manhattan Hospital,30 to 49,112,M,White,Not Span/Hispanic,...,Minor,Surgical,Private Health Insurance,Self-Pay,,0,N,Y,18995.85,5751.28
4629,Central NY,Onondaga,3301007.0,635.0,University Hospital SUNY Health Science Center,50 to 69,130,M,Other Race,Not Span/Hispanic,...,Minor,Surgical,Private Health Insurance,,,0,N,N,124879.4,56514.75


In [41]:
predict = automl.predict(X_withoutlos)
predict

array(['short', 'short', 'long', 'short', 'short', 'short', 'short',
       'long', 'long', 'short', 'short', 'long', 'short', 'short',
       'short', 'short', 'long', 'short', 'short', 'long', 'short',
       'long', 'short', 'short', 'long'], dtype=object)

In [42]:
# actual values from X_withlos
values_actual = X_withlos['sparcs_los'].values.tolist()
values_predicted = predict.tolist()
output = pd.DataFrame({'actual': values_actual, 'predicted': values_predicted})
output

Unnamed: 0,actual,predicted
0,short,short
1,short,short
2,long,long
3,short,short
4,short,short
5,short,short
6,short,short
7,long,long
8,long,long
9,short,short


## Binary Classifier Example 2 - GENERIC

In [None]:
import pandas as pd
from supervised.automl import AutoML
import os

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

df = pd.read_csv("https://raw.githubusercontent.com/hantswilliams/HHA-507-2022/main/autoML/datasets/data_binary_bank.csv")

X = df[df.columns[:-1]]
y = df["y"]

In [None]:
X

In [None]:
y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25)


In [None]:
automl = AutoML(
    # results_path="AutoML_22",
    # total_time_limit=30 * 60,
    # start_random_models=10,
    # hill_climbing_steps=3,
    # top_models_to_improve=3,
    # train_ensemble=True,
    mode="Explain"
)

automl.fit(X_train, y_train)

In [None]:
pred = automl.predict(X_test)
pred
# print("Test accuracy", accuracy_score(y_test, pred["label"]))

In [None]:
automl.report()

## Regression - Example - GENERIC

## Regression - Example 2 - GENERIC

In [None]:
import numpy as np
import pandas as pd
from supervised.automl import AutoML

df = pd.read_csv("https://raw.githubusercontent.com/hantswilliams/HHA-507-2022/main/autoML/datasets/data_regression_housing.csv")
x_cols = [c for c in df.columns if c != "MEDV"]
X = df[x_cols]
y = df["MEDV"]

In [None]:
df

In [None]:
x_cols

In [None]:
X

In [None]:
y

In [None]:
automl = AutoML()
automl.fit(X, y)

In [None]:
df["predictions"] = automl.predict(X)


In [None]:
print("Predictions")
print(df[["MEDV", "predictions"]].head())

## Multiclass Classifier - GENERIC

In [None]:
import pandas as pd
import numpy as np
from supervised.automl import AutoML
import supervised


import warnings

from sklearn import datasets
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA

from supervised import AutoML
from supervised.exceptions import AutoMLException

# warnings.filterwarnings('error')
warnings.filterwarnings(
    "error", category=pd.core.common.SettingWithCopyWarning
)  # message="*ndarray*")

df = pd.read_csv("https://raw.githubusercontent.com/hantswilliams/HHA-507-2022/main/autoML/datasets/data_classes_iris.csv")
X = df[["feature_1", "feature_2", "feature_3", "feature_4"]]
y = df["class"]



In [None]:
df

In [None]:
X

In [None]:
y.value_counts()

In [None]:
automl = AutoML()


In [None]:
automl.fit(X, y)

In [None]:
predictions = automl.predict_all(X)


In [None]:
print(predictions.head())
print(predictions.tail())

print(X.shape)
print(predictions.shape)

# Download outputs

In [32]:
# get current working directory
import os
os.getcwd()

'/content'

In [None]:
folders = os.listdir()
foldersML = [x for x in folders if x.startswith('sparcs')]
print(foldersML)

In [33]:
!zip -r /content/sparcs_binary_los.zip /content/sparcs_binary_los

  adding: content/sparcs_binary_los/ (stored 0%)
  adding: content/sparcs_binary_los/Ensemble/ (stored 0%)
  adding: content/sparcs_binary_los/Ensemble/roc_curve.png (deflated 9%)
  adding: content/sparcs_binary_los/Ensemble/precision_recall_curve.png (deflated 11%)
  adding: content/sparcs_binary_los/Ensemble/predictions_ensemble.csv (deflated 62%)
  adding: content/sparcs_binary_los/Ensemble/lift_curve.png (deflated 7%)
  adding: content/sparcs_binary_los/Ensemble/confusion_matrix_normalized.png (deflated 18%)
  adding: content/sparcs_binary_los/Ensemble/calibration_curve_curve.png (deflated 14%)
  adding: content/sparcs_binary_los/Ensemble/cumulative_gains_curve.png (deflated 7%)
  adding: content/sparcs_binary_los/Ensemble/learning_curves.png (deflated 11%)
  adding: content/sparcs_binary_los/Ensemble/confusion_matrix.png (deflated 17%)
  adding: content/sparcs_binary_los/Ensemble/README.md (deflated 67%)
  adding: content/sparcs_binary_los/Ensemble/ensemble.json (deflated 44%)
  a