# Session 2

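This session trains a TensorFlow model as a SageMaker Training Job, deploys the trained model to an endpoint, runs a few predictions against it, and deletes the endpoint afterwards.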

### SageMaker Training Jobs

- TensorFlow Estimators
- The train.py script
- Inputs
- Hyperparameters
- Saving the model
- Training Logs


In [1]:
%%writefile train.py


import os
import argparse

import numpy as np
import random
import tensorflow as tf

from pathlib import Path
from sklearn.metrics import accuracy_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Conv2D, Dense, MaxPooling2D
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.datasets import mnist
from tensorflow.keras.layers import Input, Dropout


def train(base_directory, epochs=10, batch_size=32):
    """Fit a small CNN on MNIST, print its test accuracy, and save the model.

    Args:
        base_directory: Root directory for outputs; the model is written to
            ``<base_directory>/model/0001``.
        epochs: Number of epochs passed to ``model.fit``.
        batch_size: Batch size passed to ``model.fit``.
    """
    (train_images, train_labels), (test_images, test_labels) = mnist.load_data()

    # Divide pixel values by 255.0, then append a trailing channel axis so the
    # arrays match the (28, 28, 1) input shape declared on the first layer.
    train_images = np.expand_dims(train_images / 255.0, axis=-1)
    test_images = np.expand_dims(test_images / 255.0, axis=-1)

    layers = [
        Conv2D(32, (3, 3), activation="relu", input_shape=(28, 28, 1)),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(100, activation="relu"),
        Dense(10, activation="softmax"),
    ]
    model = Sequential(layers)
    model.compile(
        optimizer=SGD(learning_rate=0.01, momentum=0.9),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )

    model.fit(train_images, train_labels, epochs=epochs, batch_size=batch_size)

    # The class with the highest score along the last axis is the prediction.
    class_scores = model.predict(test_images)
    predicted_labels = np.argmax(class_scores, axis=-1)
    print(f"Accuracy: {accuracy_score(test_labels, predicted_labels)}")

    model.save(os.path.join(base_directory, "model", "0001"))
    
    
def parse_args(argv=None):
    """Parse the training script's command-line arguments.

    Args:
        argv: Optional list of argument strings. When ``None``, argparse
            reads the arguments from ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``base_directory``, ``train_folder``,
        ``validation_folder``, ``epochs`` and ``batch_size``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_directory", type=str, default="/opt/ml/")
    # The channel folders default to the SM_CHANNEL_* environment variables
    # and are None when those variables are not set.
    parser.add_argument("--train_folder", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", None))
    parser.add_argument("--validation_folder", type=str, default=os.environ.get("SM_CHANNEL_VALIDATION", None))
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--batch_size", type=int, default=32)

    # parse_known_args tolerates arguments this script does not declare
    # instead of exiting with an error.
    args, _ = parser.parse_known_args(argv)
    return args


def main():
    """Parse the command line and run the training."""
    args = parse_args()

    # args.train_folder is where the train data is
    # args.validation_folder is where the validation data is

    train(
        base_directory=args.base_directory,
        epochs=args.epochs,
        batch_size=args.batch_size
    )


# Only train when executed as a script, so that importing this module
# (e.g. to reuse train()) does not start a training run.
if __name__ == "__main__":
    main()


Overwriting train.py


In [2]:
import os
import sagemaker
import numpy as np

from sagemaker.tensorflow import TensorFlow
from sagemaker.inputs import FileSystemInput

from utils import to_tensor

In [3]:
CONFIGURATION_DIRECTORY = "/tmp/training"

!mkdir -p $CONFIGURATION_DIRECTORY
!cp train.py $CONFIGURATION_DIRECTORY
!cp requirements.txt $CONFIGURATION_DIRECTORY

In [4]:
!ls $CONFIGURATION_DIRECTORY

requirements.txt  train.py


In [5]:
# Hyperparameters for the job; train.py reads them through its
# --epochs and --batch_size arguments.
hyperparameters = {
    "epochs": 10,
    "batch_size": 32,
}

# Optional data channels. Left empty here because train.py loads MNIST
# itself; uncomment the entries to attach S3 datasets to the job.
inputs = {
    # "train": "s3://MyBucketName/dataset/train",
    # "validation": "s3://MyBucketName/dataset/validation",
}

# This is the configuration of a Training Job.
estimator = TensorFlow(
    base_job_name="sample-training",
    source_dir=CONFIGURATION_DIRECTORY,
    entry_point="train.py",
    role=sagemaker.get_execution_role(),
    hyperparameters=hyperparameters,
    instance_type="ml.m5.large",
    instance_count=1,
    py_version="py37",
    framework_version="2.4",
    debugger_hook_config=False,
    script_mode=True,
    volume_size=5,
)

# Pass the channel mapping defined above; an empty mapping is sent as None
# (no input channels), which is what this cell did before. wait=False starts
# the job without blocking the notebook until it finishes.
estimator.fit(inputs=inputs or None, wait=False)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: sample-training-2023-02-07-15-08-13-159


In [None]:
# tuner = HyperparameterTuner(
#     estimator,
#     objective_metric_name,
#     hyperparameter_ranges,
#     metric_definitions,
#     max_jobs=3,
#     max_parallel_jobs=3,
#     objective_type=objective_type,
# )

# tuner.fit(inputs=channels)

### Deploying the model using the same estimator

In [6]:
# The training job was started with wait=False, so block here until it has
# finished; otherwise a top-to-bottom run reaches this cell while the job is
# still training. logs="None" waits without streaming the training logs.
estimator.latest_training_job.wait(logs="None")

# Create an endpoint serving the model produced by the training job.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.c5.xlarge",
)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker.tensorflow.model:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating model with name: sample-training-2023-02-07-15-15-01-530
INFO:sagemaker:Creating endpoint-config with name sample-training-2023-02-07-15-15-01-530
INFO:sagemaker:Creating endpoint with name sample-training-2023-02-07-15-15-01-530


----!

In [7]:
IMAGES_DIRECTORY = "./images"

# os.listdir returns entries in arbitrary order; sort them so the printed
# results come out in the same order on every run.
for image_name in sorted(os.listdir(IMAGES_DIRECTORY)):
    if not image_name.endswith(".png"):
        continue

    # to_tensor comes from the local utils module (not shown here);
    # presumably it loads the image into the model's input format.
    image = to_tensor(os.path.join(IMAGES_DIRECTORY, image_name))

    result = predictor.predict(image)
    # Index of the highest score in the response is the predicted class.
    prediction = np.argmax(result["predictions"])
    print(f"{image_name} ---> {prediction}")
        

7.png ---> 7
1.png ---> 1
2.png ---> 2


In [20]:
# Clean up: delete the endpoint created by estimator.deploy() above
# (the log output shows its endpoint configuration is deleted as well).
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: sample-training-2023-02-07-15-15-01-530
INFO:sagemaker:Deleting endpoint with name: sample-training-2023-02-07-15-15-01-530
