# AutoGluon Tabular Example
>__NOTE:__ Make sure to use the Python 3 (Data Science) Jupyter Kernel.

## Prerequisites

### Installing the Image Build CLI

In [None]:
%%capture
import sys
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

!{sys.executable} -m pip install -U pip sagemaker-studio-image-build

### Configuring the AutoGluon Training/Testing Script

In [None]:
%%writefile train.py
import os
import json
import boto3
import json
import warnings
import numpy as np
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor

# Only DeprecationWarning is muted; every other warning category still surfaces.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

# All paths used by this script hang off a single root directory.
prefix = "/opt/ml"
input_path, output_path, model_path = (
    os.path.join(prefix, leaf) for leaf in ("input/data", "output", "model")
)
# JSON file the __main__ block loads the job parameters from.
param_path = os.path.join(prefix, "input/config/hyperparameters.json")


def train(params):
    """Fit AutoGluon tabular models on the ``training`` channel.

    Loads ``training.csv`` from the ``training`` sub-directory of
    ``input_path``, fits a ``TabularPredictor`` that is given ``model_path``
    as its path, and writes the printed form of ``fit_summary()`` to
    ``Fit_Summary.txt`` inside ``model_path``.

    Args:
        params: Mapping of job parameters. Only ``params["label"]`` (the name
            of the target column) is read here.

    Returns:
        The predictor returned by ``TabularPredictor.fit``.
    """
    target = params["label"]
    csv_file = os.path.join(input_path, "training", "training.csv")
    frame = TabularDataset(csv_file)
    fitted = TabularPredictor(label=target, path=model_path).fit(frame)

    summary_file = os.path.join(model_path, "Fit_Summary.txt")
    with open(summary_file, "w") as handle:
        # Persist the textual form of whatever fit_summary() returns.
        handle.write(str(fitted.fit_summary()) + "\n")
    return fitted
    

def test(params, predictor):
    """Evaluate a fitted predictor on the ``testing`` channel.

    Loads ``testing.csv`` from the ``testing`` sub-directory of ``input_path``,
    predicts on it with the label column removed, and writes two files into
    ``model_path``:

    * ``Model_Evaluation.txt`` - the ``evaluate_predictions`` result as
      4-space-indented JSON.
    * ``Leaderboard.csv`` - the predictor's leaderboard for the test data.

    Args:
        params: Mapping of job parameters. Only ``params["label"]`` is read.
        predictor: A fitted predictor, as returned by ``train``.
    """
    target = params["label"]
    holdout = TabularDataset(os.path.join(input_path, "testing", "testing.csv"))
    y_true = holdout[target]
    features = holdout.drop(columns=target)
    y_pred = predictor.predict(features)

    report_file = os.path.join(model_path, "Model_Evaluation.txt")
    with open(report_file, "w") as report:
        metrics = predictor.evaluate_predictions(
            y_true=y_true, y_pred=y_pred, auxiliary_metrics=True
        )
        report.write(json.dumps(metrics, indent=4) + "\n")

    # The leaderboard receives the full test frame, label column included.
    board = predictor.leaderboard(holdout, silent=True)
    board.to_csv(os.path.join(model_path, "Leaderboard.csv"))


if __name__ == "__main__":
    # Script entry point: load the job parameters from param_path, fit the
    # models, then evaluate them. Progress is reported on stdout.
    print("Loading Parameters\n")
    with open(param_path) as f:
        params = json.load(f)
    print("Training Models\n")
    predictor = train(params)
    print("Testing Models\n")
    test(params, predictor)
    print("AutoGluon Job Complete")

### Container Image Build Instructions (Dockerfile)

In [None]:
%%writefile Dockerfile
# Image that runs train.py on top of an AutoGluon training base image.
# REGION is declared before FROM so it can be substituted into the base image
# reference; it is supplied at build time via --build-arg.
ARG REGION
FROM 763104351884.dkr.ecr.${REGION}.amazonaws.com/autogluon-training:0.3.1-cpu-py37-ubuntu18.04
RUN pip install -U pip
# NOTE(review): bokeh is pinned here, presumably so AutoGluon can render the
# SummaryOfModels.html plot that the notebook displays at the end - confirm.
RUN pip install bokeh==2.0.1
# Directories for the training script and for the /opt/ml tree train.py uses.
RUN mkdir -p /opt/program
RUN mkdir -p /opt/ml
COPY train.py /opt/program
WORKDIR /opt/program
# The container's entry point is the training script itself.
ENTRYPOINT ["python", "train.py"]

### Container Build Process

In [None]:
import boto3
import sagemaker

# Look up the region of the current SageMaker session and hand it to the image
# build as the REGION build argument, which the Dockerfile's FROM line uses.
aws_region = sagemaker.Session().boto_session.region_name
# The trailing "." makes the current directory the build context.
!sm-docker build --build-arg REGION={aws_region} .

---

## AutoGluon Experiment

### Download the Abalone Data

In [None]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split

# The raw file has no header row, so the column names are supplied here.
# "rings" is the column later passed to the training job as the label.
column_names = ["sex", "length", "diameter", "height", "whole_weight", "shucked_weight", "viscera_weight", "shell_weight", "rings"]
abalone_data = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data", names=column_names)

# Hold out 10% for testing. The seed is fixed so Restart & Run All yields the
# same split (and therefore comparable leaderboard scores) every time.
training_data, testing_data = train_test_split(abalone_data, test_size=0.1, random_state=42)

# index=False: without it pandas writes the row index as an unnamed first
# column, which the training job would read back as an extra feature.
training_data.to_csv("training.csv", index=False)
testing_data.to_csv("testing.csv", index=False)

### Experiment Parameters

>__NOTE:__ Update the `image_uri` parameter with the _Image URI_ output by the __Container Build Process__.

In [None]:
import sagemaker
import datetime

# Replace the placeholder with the image URI printed by the sm-docker build.
image_uri = "<Enter the Image URI from the sm-docker output>"

# SageMaker context shared by the remaining cells.
role = sagemaker.get_execution_role()
session = sagemaker.Session()
bucket = session.default_bucket()

# Timestamp down to milliseconds: %f yields six digits, the slice drops the last three.
job_version = f"{datetime.datetime.now():%Y-%m-%d-%H-%M-%S-%f}"[:-3]
job_name = "abalone-autogluon-" + job_version

### Create the AutoGluon Estimator 

In [None]:
from sagemaker.estimator import Estimator

# Estimator backed by the custom image built above. Of the hyperparameters,
# the train.py written earlier in this notebook only reads "label"; "bucket"
# and "training_job" are passed along but not read by that script.
autogluon = Estimator(
    role=role,
    image_uri=image_uri,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    volume_size=20,
    base_job_name=job_name,
    output_path="s3://{}/{}".format(bucket, job_name),
    hyperparameters=dict(label="rings", bucket=bucket, training_job=job_name),
)

### Execute the Experiment

In [None]:
# Upload each CSV under "<job_name>/input" and start the job with one input
# channel per file. The channel names match the sub-directories train.py reads
# from ("training" and "testing").
autogluon.fit(
    inputs={
        channel: session.upload_data(
            f"{channel}.csv",
            bucket=bucket,
            key_prefix=f"{job_name}/input",
        )
        for channel in ("training", "testing")
    }
)

### Experiment Results

#### Download Model Artifacts

In [None]:
!mkdir extract
sagemaker.s3.S3Downloader.download(autogluon.model_data, "./")
!tar xfz ./model.tar.gz -C extract

#### Review Model Leaderboard

In [None]:
# Keep only the model name and its two scores, best validation score first,
# with a fresh 0..n-1 index for display.
df = (
    pd.read_csv("./extract/Leaderboard.csv")
    .filter(items=["model", "score_test", "score_val"])
    .sort_values(by="score_val", ascending=False)
    .reset_index(drop=True)
)
df

#### Plot Model Comparison

In [None]:
# Render the HTML model summary found in the extracted model artifacts inline.
# NOTE(review): train.py never writes this file explicitly; it is presumably
# produced by AutoGluon's fit_summary() - confirm.
import IPython
IPython.display.HTML(filename="./extract/SummaryOfModels.html")