In [1]:
%%writefile train.py
import os
import sys
import pandas as pd
import re
import joblib
import json
from sklearn.ensemble import RandomForestClassifier

def load_dataset(path):
    # Take the set of files and read them all into a single pandas dataframe
    files = [ os.path.join(path, file) for file in os.listdir(path) ]
    
    if len(files) == 0:
        raise ValueError("Invalid # of files in dir: {}".format(path))

    raw_data = [ pd.read_csv(file, sep=",", header=None ) for file in files ]
    data = pd.concat(raw_data)

    # labels are in the first column
    y = data.iloc[:,0]
    X = data.iloc[:,1:]
    return X,y
    
def start(args):
    print("Training mode")

    try:
        X_train, y_train = load_dataset(args.train)
        X_test, y_test = load_dataset(args.validation)
        
        hyperparameters = {
            "max_depth": args.max_depth,
            "verbose": 1, # show all logs
            "n_jobs": args.n_jobs,
            "n_estimators": args.n_estimators
        }
        print("Training the classifier")
        model = RandomForestClassifier()
        model.set_params(**hyperparameters)
        model.fit(X_train, y_train)
        print("Score: {}".format( model.score(X_test, y_test)) )
        joblib.dump(model, open(os.path.join(args.model_dir, "iris_model.pkl"), "wb"))
    
    except Exception as e:
        # Write out an error file. This will be returned as the failureReason in the
        # DescribeTrainingJob result.
        trc = traceback.format_exc()
        with open(os.path.join(args.output_dir, "failure"), "w") as s:
            s.write("Exception during training: " + str(e) + "\\n" + trc)
            
        # Printing this causes the exception to be in the training job logs, as well.
        print("Exception during training: " + str(e) + "\\n" + trc, file=sys.stderr)
        
        # A non-zero exit code causes the training job to be marked as Failed.
        sys.exit(255)

Writing train.py


In [2]:
%%writefile handler.py
import os
import sys
import joblib
from sagemaker_inference.default_inference_handler import DefaultInferenceHandler
from sagemaker_inference.default_handler_service import DefaultHandlerService
from sagemaker_inference import content_types, errors, transformer, encoder, decoder

class HandlerService(DefaultHandlerService, DefaultInferenceHandler):
    def __init__(self):
        op = transformer.Transformer(default_inference_handler=self)
        super(HandlerService, self).__init__(transformer=op)
    
    ## Loads the model from the disk
    def default_model_fn(self, model_dir):
        model_filename = os.path.join(model_dir, "iris_model.pkl")
        return joblib.load(open(model_filename, "rb"))
    
    ## Parse and check the format of the input data
    def default_input_fn(self, input_data, content_type):
        if content_type != "text/csv":
            raise Exception("Invalid content-type: %s" % content_type)
        return decoder.decode(input_data, content_type).reshape(1,-1)
    
    ## Run our model and do the prediction
    def default_predict_fn(self, payload, model):
        return model.predict( payload ).tolist()
    
    ## Gets the prediction output and format it to be returned to the user
    def default_output_fn(self, prediction, accept):
        if accept != "text/csv":
            raise Exception("Invalid accept: %s" % accept)
        return encoder.encode(prediction, accept)

Writing handler.py


In [3]:
%%writefile main.py
import train
import argparse
import sys
import os
import traceback
from sagemaker_inference import model_server
from sagemaker_training import environment

if __name__ == "__main__":
    if len(sys.argv) < 2 or ( not sys.argv[1] in [ "serve", "train" ] ):
        raise Exception("Invalid argument: you must inform 'train' for training mode or 'serve' predicting mode") 
        
    if sys.argv[1] == "train":
        
        env = environment.Environment()
        
        parser = argparse.ArgumentParser()
        # https://github.com/aws/sagemaker-training-toolkit/blob/master/ENVIRONMENT_VARIABLES.md
        parser.add_argument("--max-depth", type=int, default=10)
        parser.add_argument("--n-jobs", type=int, default=env.num_cpus)
        parser.add_argument("--n-estimators", type=int, default=120)
        
        # reads input channels training and testing from the environment variables
        parser.add_argument("--train", type=str, default=env.channel_input_dirs["train"])
        parser.add_argument("--validation", type=str, default=env.channel_input_dirs["validation"])

        parser.add_argument("--model-dir", type=str, default=env.model_dir)
        parser.add_argument("--output-dir", type=str, default=env.output_dir)
        
        args,unknown = parser.parse_known_args()
        train.start(args)
    else:
        model_server.start_model_server(handler_service="serving.handler")

Writing main.py


In [4]:
%%writefile Dockerfile
FROM python:3.7-buster

# Set a docker label to advertise multi-model support on the container
LABEL com.amazonaws.sagemaker.capabilities.multi-models=false
# Set a docker label to enable container to use SAGEMAKER_BIND_TO_PORT environment variable if present
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true

RUN apt-get update -y && apt-get -y install --no-install-recommends default-jdk
RUN rm -rf /var/lib/apt/lists/*

RUN pip --no-cache-dir install multi-model-server sagemaker-inference sagemaker-training
RUN pip --no-cache-dir install pandas numpy scipy scikit-learn

ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE
ENV PYTHONPATH="/opt/ml/code:${PATH}"

COPY main.py /opt/ml/code/main.py
COPY train.py /opt/ml/code/train.py
COPY handler.py /opt/ml/code/serving/handler.py

ENTRYPOINT ["python", "/opt/ml/code/main.py"]

Writing Dockerfile


In [5]:
%%writefile buildspec.yml
version: 0.2

phases:
  install:
    runtime-versions:
      docker: 18

  pre_build:
    commands:
      - echo Logging in to Amazon ECR...
      - $(aws ecr get-login --no-include-email --region $AWS_DEFAULT_REGION)
  build:
    commands:
      - echo Build started on `date`
      - echo Building the Docker image...
      - docker build -t $IMAGE_REPO_NAME:$IMAGE_TAG .
      - docker tag $IMAGE_REPO_NAME:$IMAGE_TAG $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$IMAGE_REPO_NAME:$IMAGE_TAG

  post_build:
    commands:
      - echo Build completed on `date`
      - echo Pushing the Docker image...
      - echo docker push $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$IMAGE_REPO_NAME:$IMAGE_TAG
      - docker push $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$IMAGE_REPO_NAME:$IMAGE_TAG
      - echo $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$IMAGE_REPO_NAME:$IMAGE_TAG > image.url
      - echo Done
artifacts:
  files:
    - image.url
  name: image_url
  discard-paths: yes

Writing buildspec.yml


In [6]:
!docker build -f Dockerfile -t iris_model:1.0 .

DEPRECATED: The legacy builder is deprecated and will be removed in a future release.
            BuildKit is currently disabled; enable it by removing the DOCKER_BUILDKIT=0
            environment-variable.

Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?


In [11]:
export DOCKER_BUILDKIT=1


SyntaxError: invalid syntax (3673499175.py, line 1)

In [12]:
aws ecr get-login --region us-east1 --no-include-email

SyntaxError: invalid syntax (3275204144.py, line 1)

In [13]:
!aws ecr get-login --region us-east-1 --no-include-email



usage: aws [options] <command> <subcommand> [<subcommand> ...] [parameters]
To see help text, you can run:

  aws help
  aws <command> help
  aws <command> <subcommand> help

aws: error: argument operation: Invalid choice, valid choices are:

batch-check-layer-availability           | batch-delete-image                      
batch-get-image                          | batch-get-repository-scanning-configuration
complete-layer-upload                    | create-pull-through-cache-rule          
create-repository                        | create-repository-creation-template     
delete-lifecycle-policy                  | delete-pull-through-cache-rule          
delete-registry-policy                   | delete-repository                       
delete-repository-creation-template      | delete-repository-policy                
describe-image-replication-status        | describe-image-scan-findings            
describe-images                          | describe-pull-through-cache-rules     

In [14]:
!aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 853973692277.dkr.ecr.us-east-1.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credential-stores

Login Succeeded


In [15]:
!docker build -f Dockerfile -t iris_model:1.0 .

DEPRECATED: The legacy builder is deprecated and will be removed in a future release.
            BuildKit is currently disabled; enable it by removing the DOCKER_BUILDKIT=0
            environment-variable.

Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?


In [16]:
!sudo systemctl status docker


sudo: systemctl: command not found


In [17]:
!sudo service docker status


docker: unrecognized service


In [18]:
!pip install docker




In [19]:
sudo usermod -aG docker $(whoami)


SyntaxError: invalid syntax (3369212131.py, line 1)

In [20]:
!sudo usermod -aG docker $(whoami)


usermod: group 'docker' does not exist


In [21]:
!sudo service docker status


docker: unrecognized service


In [22]:
!docker --version


Docker version unknown-version, build unknown-commit


In [23]:
!docker build -f Dockerfile -t iris_model:1.0 .

DEPRECATED: The legacy builder is deprecated and will be removed in a future release.
            BuildKit is currently disabled; enable it by removing the DOCKER_BUILDKIT=0
            environment-variable.

Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?


In [None]:
!rm -rf input
!mkdir -p input/data/train
!mkdir -p input/data/validation

import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()

dataset = np.insert(iris.data, 0, iris.target,axis=1)

df = pd.DataFrame(data=dataset, columns=["iris_id"] + iris.feature_names)
X = df.iloc[:,1:]
y = df.iloc[:,0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

train_df = X_train.copy()
train_df.insert(0, "iris_id", y_train)
train_df.to_csv("input/data/train/training.csv", sep=",", header=None, index=None)

test_df = X_test.copy()
test_df.insert(0, "iris_id", y_test)
test_df.to_csv("input/data/validation/testing.csv", sep=",", header=None, index=None)

df.head()