Importamos las librerías

In [None]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the running kernel rather than whatever pip is on PATH.
%pip install tensorflow

In [None]:
import os
import sagemaker
import tensorflow as tf
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.tensorflow import TensorFlow
from time import gmtime, strftime 

Instanciamos una sesión de SageMaker y creamos los directorios de trabajo

In [None]:
# SageMaker session and the default bucket it reports.
sess = sagemaker.Session()
bucket = sess.default_bucket()

# Local working directories, all resolved against the current working directory.
data_dir, train_dir, test_dir, raw_dir = (
    os.path.join(os.getcwd(), subdir)
    for subdir in ('data', 'data/train', 'data/test', 'data/raw')
)

# Create them in the same order as they are declared; existing ones are kept.
for directory in (data_dir, train_dir, test_dir, raw_dir):
    os.makedirs(directory, exist_ok=True)

Definimos una función para separar el dataset

In [None]:
def load_data(test_size=0.2, path='dataset_eval.csv', random_state=None):
    """Load a CSV dataset and split it into train and test sets.

    The file is read as comma-separated numeric data and its first row
    (the header) is skipped. Every column except the last is treated as a
    feature; the last column is the target and is returned as a column
    vector of shape (n_rows, 1).

    Args:
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        path: CSV file to read. Defaults to ``'dataset_eval.csv'``, the
            file this function always read before the parameter existed.
        random_state: Forwarded to ``train_test_split``. Pass an integer to
            make the split reproducible across kernel restarts; the default
            ``None`` keeps the previous, unseeded behaviour.

    Returns:
        The result of ``train_test_split(x, y, ...)``, i.e. the sequence
        ``x_train, x_test, y_train, y_test``.
    """
    # ndmin=2 keeps the array two-dimensional even when the file holds a
    # single data row or a single column, so the column slicing below is
    # always valid.
    data = np.loadtxt(path, delimiter=',', skiprows=1, ndmin=2)
    x = data[:, :-1]
    y = data[:, -1].reshape(-1, 1)
    return train_test_split(x, y, test_size=test_size, random_state=random_state)

In [None]:
# Split the dataset, holding out 20% of the rows as the test set.
x_train, x_test, y_train, y_test = load_data(test_size=0.2)

Guardamos los resultados

In [None]:
# Persist every split as a .npy file: the feature arrays go to the raw
# directory, the target arrays go directly to the train/test directories.
for directory, filename, array in (
    (raw_dir, 'x_train.npy', x_train),
    (raw_dir, 'x_test.npy', x_test),
    (train_dir, 'y_train.npy', y_train),
    (test_dir, 'y_test.npy', y_test),
):
    np.save(os.path.join(directory, filename), array)

Subimos los datos raw a un bucket de S3

In [None]:
# Key prefix under which every artifact of this workflow is stored.
s3_prefix = 'tf-2-workflow'
rawdata_s3_prefix = f'{s3_prefix}/data/raw'

# Upload the local raw directory and print the location returned by the session.
raw_s3 = sess.upload_data(key_prefix=rawdata_s3_prefix, path='./data/raw/')
print(raw_s3)

Preparamos la tarea de preprocesado de los datos

In [None]:
# scikit-learn processor used to run the preprocessing script on one instance.
sklearn_processor = SKLearnProcessor(
    role=get_execution_role(),
    framework_version='0.20.0',
    instance_count=1,
    instance_type='ml.t3.medium',
)

In [None]:
# Day-hour-minute-second stamp (UTC) appended to the job name.
job_timestamp = strftime("%d-%H-%M-%S", gmtime())
processing_job_name = f"tf-2-workflow-{job_timestamp}"
output_destination = f's3://{bucket}/{s3_prefix}/data'

# Raw arrays uploaded earlier are mounted at the container's input path.
raw_input = ProcessingInput(
    source=raw_s3,
    destination='/opt/ml/processing/input',
    s3_data_distribution_type='ShardedByS3Key',
)

# Whatever the script writes to these container paths is copied to S3.
train_output = ProcessingOutput(
    output_name='train',
    source='/opt/ml/processing/train',
    destination=f'{output_destination}/train',
)
test_output = ProcessingOutput(
    output_name='test',
    source='/opt/ml/processing/test',
    destination=f'{output_destination}/test',
)

sklearn_processor.run(
    code='utils/preprocessing.py',
    job_name=processing_job_name,
    inputs=[raw_input],
    outputs=[train_output, test_output],
)

# Description of the most recent job launched by this processor.
preprocessing_job_description = sklearn_processor.jobs[-1].describe()

Copiamos el resultado del preprocesado almacenado en S3 a nuestra máquina

In [None]:
train_in_s3 = '{}/train/x_train.npy'.format(output_destination)
test_in_s3 = '{}/test/x_test.npy'.format(output_destination)
!aws s3 cp {train_in_s3} ./data/train/x_train.npy
!aws s3 cp {test_in_s3} ./data/test/x_test.npy

Especificamos las rutas y subimos los datos a un bucket de S3

In [None]:
# Key prefixes for the train and test channels.
traindata_s3_prefix = f'{s3_prefix}/data/train'
testdata_s3_prefix = f'{s3_prefix}/data/test'

# Upload both local directories.
train_s3 = sess.upload_data(key_prefix=traindata_s3_prefix, path='./data/train/')
test_s3 = sess.upload_data(key_prefix=testdata_s3_prefix, path='./data/test/')

# Channel name -> uploaded location; passed to the estimator when fitting.
inputs = dict(train=train_s3, test=test_s3)

print(inputs)

Preparamos el entrenamiento del modelo

In [None]:
# Repository and branch from which the training code is fetched.
git_config = dict(
    repo='https://github.com/ricardoferrero/tfm-bme',
    branch='master',
)
model_dir = '/opt/ml/model'
train_instance_type = 'ml.c5.xlarge'
hyperparameters = dict(epochs=30, batch_size=128, learning_rate=0.01)

# TensorFlow estimator running train_aws.py from models/tf_model in that repo.
estimator = TensorFlow(
    entry_point='train_aws.py',
    source_dir='models/tf_model',
    git_config=git_config,
    role=sagemaker.get_execution_role(),
    base_job_name='tf-2-workflow',
    instance_count=1,
    instance_type=train_instance_type,
    framework_version='2.2',
    py_version='py37',
    script_mode=True,
    model_dir=model_dir,
    hyperparameters=hyperparameters,
)

In [None]:
# Launch training with the train/test channels uploaded above.
estimator.fit(inputs=inputs)

Deploy del modelo

In [None]:
# Deploy the trained model behind a single-instance endpoint.
tuning_predictor = estimator.deploy(
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
)

Predecimos con el modelo

In [None]:
# Predict on the *preprocessed* test features downloaded from S3. The model
# was trained on the processing job's output, whereas the in-memory `x_test`
# is still the raw split, so feeding it to the endpoint would not match the
# features seen during training.
# NOTE(review): assumes utils/preprocessing.py keeps the row order of
# x_test.npy, so the rows still line up with y_test - confirm.
x_test_processed = np.load(os.path.join(test_dir, 'x_test.npy'))

results = tuning_predictor.predict(x_test_processed[:10])['predictions']

# Flatten the nested prediction lists and round to one decimal for display.
predictions = np.asarray(results, dtype=float).ravel().round(decimals=1)
print('predictions: \t{}'.format(predictions))
print('target values: \t{}'.format(y_test[:10].round(decimals=1)))

Borramos el endpoint

In [None]:
# Tear down through the predictor itself: by default this removes both the
# endpoint and its endpoint configuration, so no configuration is left
# behind, and it does not depend on the predictor's `endpoint` attribute.
tuning_predictor.delete_endpoint()