# Automated ML

In [None]:
import pickle
import requests
import json

from azureml.core import Environment
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.core.experiment import Experiment
from azureml.core.dataset import Dataset
from azureml.core.workspace import Workspace
from azureml.core.webservice.aci import AciWebservice
from azureml.core.model import InferenceConfig, Model

from azureml.train.automl import AutoMLConfig

## Dataset

### Overview
In this experiment we will be using the **Kaggle - Credit Card Fraud Dataset**, which can be downloaded from [here](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud). The dataset consists of roughly **99.8% Non-Fraudulent** transactions and only about **0.2% Fraudulent** transactions, so the data is heavily imbalanced. There are no null values, and the meaning of every column except **transaction** and **amount** is undisclosed, presumably for privacy reasons. As an additional note, all the data in this dataset has already been scaled.

In [None]:
# Name of the experiment and the local folder handed to AutoML later on.
experiment_name = "creditcard-experiment"
project_folder = './creditcard-pipeline-project'

# Load the workspace described by the local config file, then open
# the experiment with the chosen name inside it.
ws = Workspace.from_config(path="./config.json")
experiment = Experiment(workspace=ws, name=experiment_name)

In [None]:
key = "creditcard-dataset"
description = "Credit Card - Dealing from Imbalance Datasets from https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud"

# Reuse the dataset when it is already registered under `key`;
# otherwise build it from the hosted CSV and register it.
found = key in ws.datasets
if found:
    print("Found existing dataset, use it.")
    dataset = ws.datasets[key]
else:
    # CSV mirror uploaded to Git so it can be fetched directly.
    example_data = "https://media.githubusercontent.com/media/satriawadhipurusa/ml-dataset-collection/master/Fraud-Detection/creditcard-fraud.csv"
    dataset = Dataset.Tabular.from_delimited_files(example_data)
    dataset = dataset.register(workspace=ws, name=key, description=description)

## AutoML Configuration

An AutoML run usually consists of the following configurations:

* Compute Target: We use a `STANDARD_D2_v3` cluster (2 CPUs, 8 GB memory, 50 GB storage) at low priority, which was created earlier, with a maximum of 4 nodes; this allows more trials to run in parallel while training Automated ML
* Task: Since we're predicting Fraud (0/1), this should be binary **Classification** task
* Early Stopping 
  * Timeout: We set the timeout to be 30 mins instead of 60 mins, so we can iterate faster
  * Primary Metric: We are interested in **AUC Weighted** with an exit score of **0.98**, since this is an imbalanced dataset. An accuracy of 0.99 would be misleading, because the same accuracy can be reached by predicting only 0s while precision/recall stay low.

### Compute Cluster

In [None]:
amlcompute_cluster_name = "automl-cls"

# Attach to the named cluster if the lookup succeeds; when it raises
# ComputeTargetException, provision a new cluster under that name.
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_v3",
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=amlcompute_cluster_name,
        provisioning_configuration=compute_config,
    )
else:
    print("Found existing cluster, use it.")

# Wait (up to 10 minutes) for the cluster operation to finish.
compute_target.wait_for_completion(
    show_output=True,
    min_node_count=0,
    timeout_in_minutes=10,
)

In [None]:
# Run-level limits and the optimisation target for the AutoML experiment.
automl_settings = dict(
    experiment_timeout_minutes=30,
    max_concurrent_iterations=4,
    primary_metric="AUC_weighted",
    experiment_exit_score=0.98,
)

# Classification on the registered dataset, with "Class" as the label,
# executed on the compute cluster.
automl_config = AutoMLConfig(
    task="classification",
    compute_target=compute_target,
    training_data=dataset,
    label_column_name="Class",
    featurization="auto",
    enable_early_stopping=True,
    path=project_folder,
    debug_log="automl_errors.log",
    **automl_settings,
)

In [None]:
# Submit the AutoML configuration to the experiment; the returned run
# handle is used by the following cells.
remote_run = experiment.submit(config=automl_config)

## Run Details

A number of **models**, **preprocessors**, and **hyperparameters** were trained in this Automated ML experiment; some of the models are:

* LightGBM
* RandomForest
* XGBoost
* ExtremeRandomTrees
* LogisticRegression
* VotingEnsemble
* StackEnsemble

The **worst** model is a combination of **PCA** and **LightGBM** with an AUC Weighted of **0.72**, while the best one is a combination of **StandardScalerWrapper** and **LightGBM** with an AUC Weighted of **0.9739**.

This discrepancy is due to the nature of the data and of the modeling applied to that data.
* Since this dataset is already very condensed in information, applying PCA to reduce it further may backfire and lower the overall metric
* A standard scaler that rescales the various numerical values can help the learning algorithm classify fraud / non-fraud better.

The results of the other combinations likewise stem from ill-suited preprocessors, or from algorithms or hyperparameters that may overfit.

In [None]:
from azureml.widgets import RunDetails

# Show the notebook widget for the submitted AutoML run.
details_widget = RunDetails(remote_run)
details_widget.show()

In [None]:
# Wait for the submitted AutoML run to complete, with output shown in the cell.
remote_run.wait_for_completion(show_output=True)

## Best Model

In [None]:
# Create the local "outputs" folder used when saving the best model.
# os.makedirs with exist_ok=True is portable (no shell magic needed) and
# does not fail when the notebook is re-run and the folder already exists.
import os

os.makedirs("outputs", exist_ok=True)

In [None]:
# get_output() yields a pair: the best child run and its fitted model.
automl_output = remote_run.get_output()
best_automl_run, best_automl_model = automl_output

print(f"Best AutoML Run:\n\n{best_automl_run}")
print("==============")
print(f"Best AutoML Model:\n\n{best_automl_model}")

In [None]:
# Persist the best AutoML model locally as a pickle file.
print("Saving the best Model.....")
model_path = "outputs/model.pkl"
# Use a context manager so the file handle is flushed and closed even if
# pickling raises; the bare open() call previously leaked the handle.
with open(model_path, "wb") as model_file:
    pickle.dump(best_automl_model, model_file)

## Model Deployment

Remember you have to deploy only one of the two models you trained but you still need to register both the models. Perform the steps in the rest of this notebook only if you wish to deploy this model.

In [None]:
# Register the best run's model file under the name used for deployment.
best_automl_run.register_model(
    model_path="outputs/model.pkl",
    model_name="credit-fraud-model",
)

In [None]:
# Download the conda environment file from the best run and build the
# inference environment from it.
best_automl_run.download_file("outputs/conda_env_v_1_0_0.yml", "conda.yaml")
env = Environment.from_conda_specification(name="env", file_path="conda.yaml")

# Download the scoring script from the best run and use it as the entry script.
best_automl_run.download_file("outputs/scoring_file_v_2_0_0.py", "score.py")
inference_config = InferenceConfig(entry_script="score.py", environment=env)

# ACI deployment with 1 CPU core, 1 GB memory and authentication enabled.
deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1, auth_enabled=True)

# Look the model up by name and originating run only. Pinning version=1
# breaks as soon as the notebook is re-run: the re-registered model gets a
# higher version number, so version 1 no longer matches the current run id.
# Without `version`, the latest matching registration is returned.
model = Model(ws, "credit-fraud-model", run_id=best_automl_run.id)
webservice = Model.deploy(ws, "credit-fraud-model",
                          models=[model],
                          inference_config=inference_config,
                          deployment_config=deployment_config,
                          overwrite=True)
webservice.wait_for_deployment(show_output=True)

In [None]:
# Print the endpoint address.
print(f"Scoring URI:\n\n{webservice.scoring_uri}")
print("==============")

# get_keys() returns a pair of keys; the primary one is printed here and
# reused when calling the endpoint.
service_keys = webservice.get_keys()
primary_key, secondary_key = service_keys
print(f"Primary Key:\n\n{primary_key}")

In [None]:
# Load the dataset into pandas, drop the "Class" label column and draw
# two random rows to send to the endpoint.
df = dataset.to_pandas_dataframe()
feature_frame = df.drop(columns=["Class"])
sample = feature_frame.sample(n=2)

In [None]:
# Convert the sampled rows into a list of per-row dicts. pandas requires
# the full orient name "records"; the abbreviation "record" was deprecated
# and is rejected by newer pandas versions.
sample_json = sample.to_dict(orient="records")

# Request body sent to the scoring endpoint.
data = {
    "Inputs": {
        "data": sample_json
    },
    "GlobalParameters": {
        "method": "predict"
    }
}

print(f"Sample Data:\n\n:{json.dumps(data, indent=2)}")

# The service was deployed with auth enabled, so the key is sent as a
# bearer token.
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {primary_key}"
}
# Explicit timeout so an unresponsive endpoint cannot hang the notebook.
response = requests.post(webservice.scoring_uri, json=data, headers=headers, timeout=60)
print("=========")
print(response.text)

In [None]:
# Turn on Application Insights for the deployed service.
webservice.update(enable_app_insights=True)

# Fetch the service logs and print them one line at a time.
logs = webservice.get_logs()
print(*logs.split('\n'), sep='\n')

**Submission Checklist**
- I have registered the model.
- I have deployed the model with the best accuracy as a webservice.
- I have tested the webservice by sending a request to the model endpoint.
- I have deleted the webservice and shutdown all the computes that I have used.
- I have taken a screenshot showing the model endpoint as active.
- The project includes a file containing the environment details.
