# Train a scikit-learn model in SageMaker and track it with MLflow

## Set up the environment

In [1]:
!pip install -q --upgrade pip
!pip install -q --upgrade sagemaker==2.117.0

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
awscli 1.27.46 requires botocore==1.29.46, but you have botocore 1.29.74 which is incompatible.
aiobotocore 2.0.1 requires botocore<1.22.9,>=1.22.8, but you have botocore 1.29.74 which is incompatible.

In [None]:
!pip install scikit-learn==1.1.3

In [2]:
import os

import pandas as pd
import sagemaker
from sagemaker.sklearn.estimator import SKLearn
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# SageMaker session, the execution role handed to the estimator below, and the
# session's default S3 bucket that the CSV files are uploaded to.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()

# URI of your remote MLflow server. Set the MLFLOW_TRACKING_URI environment
# variable to point at your own deployment; the fallback is the endpoint this
# notebook was originally run against.
tracking_uri = os.environ.get(
    'MLFLOW_TRACKING_URI',
    'http://mlflo-mlflo-110tkpke38vv-e1e1dc1c5f95c722.elb.us-east-1.amazonaws.com/',
)

## Prepare data
We load a dataset from scikit-learn, split it into training and test sets, write each set to a CSV file, and upload the files to S3.

In [3]:
# Load the Boston housing dataset and hold out 25% of the rows for testing.
data = load_boston()
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.25,
    random_state=42,
)


def _with_target(features, target):
    """Return a DataFrame of `features` (named columns) plus a 'target' column."""
    frame = pd.DataFrame(features, columns=data.feature_names)
    frame['target'] = target
    return frame


trainX = _with_target(X_train, y_train)
testX = _with_target(X_test, y_test)

# Write each split to a local CSV file (the DataFrame index is written too).
for frame, csv_name in ((trainX, 'boston_train.csv'), (testX, 'boston_test.csv')):
    frame.to_csv(csv_name)


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [4]:
# Upload both CSV files to S3 under a shared key prefix; the returned paths
# are handed to the training job as its input channels.
s3_prefix = 'sagemaker/sklearncontainer'
train_path, test_path = (
    sess.upload_data(path=csv_file, bucket=bucket, key_prefix=s3_prefix)
    for csv_file in ('boston_train.csv', 'boston_test.csv')
)

## Train

In [5]:
# MLflow-related settings forwarded to the training script.
mlflow_settings = {
    'tracking_uri': tracking_uri,
    'experiment_name': 'boston-housing-trial-2',
}

# Model and dataset settings forwarded to the training script.
model_settings = {
    'n-estimators': 100,
    'min-samples-leaf': 3,
    'features': 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT',
    'target': 'target',
}

# Key order matches the original single literal: MLflow settings first.
hyperparameters = {**mlflow_settings, **model_settings}

# The regex captures the number that follows "AE-at-50th-percentile: ".
# NOTE(review): presumably train.py logs a line in this format - confirm.
metric_definitions = [
    {
        'Name': 'median-AE',
        'Regex': "AE-at-50th-percentile: ([0-9.]+).*$",
    },
]

# Collect the estimator arguments in one place, then build the estimator.
estimator_config = dict(
    entry_point='train.py',
    source_dir='source_dir',
    role=role,
    metric_definitions=metric_definitions,
    hyperparameters=hyperparameters,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    framework_version='1.0-1',
    base_job_name='mlflow',
)
estimator = SKLearn(**estimator_config)

In [6]:
# Map each input channel name to the S3 location uploaded earlier, then
# launch the training job.
input_channels = {
    'train': train_path,
    'test': test_path,
}
estimator.fit(input_channels)

2023-02-18 06:29:36 Starting - Starting the training job...
2023-02-18 06:29:59 Starting - Preparing the instances for trainingProfilerReport-1676701775: InProgress
......
2023-02-18 06:31:02 Downloading - Downloading input data
2023-02-18 06:31:02 Training - Downloading the training image...
2023-02-18 06:31:33 Training - Training image download completed. Training in progress...[34m2023-02-18 06:31:40,163 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2023-02-18 06:31:40,167 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-02-18 06:31:40,175 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2023-02-18 06:31:40,365 sagemaker-training-toolkit INFO     Installing module with the following command:[0m
[34m/miniconda3/bin/python -m pip install . -r requirements.txt[0m
[34mProcessing /opt/ml/code
  Preparing metadata (setup.py): started
  Preparing metada

[34mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.[0m
[34msagemaker-sklearn-container 2.0 requires numpy==1.19.2, but you have numpy 1.23.5 which is incompatible.[0m
[34mSuccessfully installed Mako-1.2.4 alembic-1.9.4 cloudpickle-2.2.1 contourpy-1.0.7 cycler-0.11.0 databricks-cli-0.17.4 docker-6.0.1 entrypoints-0.4 fonttools-4.38.0 gitdb-4.0.10 gitpython-3.1.31 importlib-metadata-5.2.0 importlib-resources-5.12.0 kiwisolver-1.4.4 llvmlite-0.39.1 markdown-3.4.1 matplotlib-3.7.0 mlflow-2.0.1 numba-0.56.4 numpy-1.23.5 oauthlib-3.2.2 packaging-21.3 pyarrow-10.0.1 pyjwt-2.6.0 pyparsing-3.0.9 pyyaml-6.0 querystring-parser-1.2.4 sagemaker-example-1.0 shap-0.41.0 slicer-0.0.7 smmap-5.0.0 sqlalchemy-1.4.46 sqlparse-0.4.3 tabulate-0.9.0 websocket-client-1.5.1 zipp-3.14.0[0m
[34m[notice] A new release of pip available: 22.3.1 -> 23.0.1[0m
[34m[notice] To update, 


2023-02-18 06:32:20 Uploading - Uploading generated training model
2023-02-18 06:32:20 Completed - Training job completed
Training seconds: 92
Billable seconds: 92
