In [10]:
import sagemaker
from sagemaker.tensorflow import TensorFlow
from time import gmtime, strftime
import os


In [11]:
# S3 bucket used for the training data and the model output in the cells below.
bucket_name = 'sagemaker-mlops-demo-dev-2024'  # Your bucket

# SageMaker session handle and the IAM execution role the training job will use.
session = sagemaker.Session()
role = sagemaker.get_execution_role()

In [12]:
# Materialize the training entry point on disk; the estimator cell points its
# `source_dir` / `entry_point` arguments at this folder and file.
os.makedirs('source_dir', exist_ok=True)
script_path = 'source_dir/train.py'

# Everything inside the string literal is the content of train.py. It is only
# written here, not executed in this kernel.
with open(script_path, 'w', encoding='utf-8') as f:
    f.write('''
import tensorflow as tf
import argparse
import os
import numpy as np


def parse_args():
    """Parse hyperparameters and data/model locations from the command line.

    The directory arguments default to the SM_MODEL_DIR / SM_CHANNEL_TRAIN /
    SM_CHANNEL_TEST environment variables. When a variable is missing (for
    example when the script is run outside a SageMaker container) the standard
    container path is used instead of raising KeyError.

    Unknown command-line arguments are ignored.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--learning-rate', type=float, default=0.001)
    parser.add_argument(
        '--model-dir', type=str,
        default=os.environ.get('SM_MODEL_DIR', '/opt/ml/model'))
    parser.add_argument(
        '--train', type=str,
        default=os.environ.get('SM_CHANNEL_TRAIN', '/opt/ml/input/data/train'))
    parser.add_argument(
        '--test', type=str,
        default=os.environ.get('SM_CHANNEL_TEST', '/opt/ml/input/data/test'))
    return parser.parse_known_args()[0]


def load_data(train_dir, test_dir):
    """Load the train/test image and label arrays from .npy files.

    Returns ((train_images, train_labels), (test_images, test_labels)).
    """
    # NOTE(review): create_model() expects 784 features per sample - confirm
    # the stored image arrays are already flattened to that width.
    train_images = np.load(os.path.join(train_dir, 'train_images.npy'))
    train_labels = np.load(os.path.join(train_dir, 'train_labels.npy'))
    test_images = np.load(os.path.join(test_dir, 'test_images.npy'))
    test_labels = np.load(os.path.join(test_dir, 'test_labels.npy'))
    return (train_images, train_labels), (test_images, test_labels)


def create_model():
    """Build the (uncompiled) dense classifier: 784 inputs -> 512 -> 10 classes."""
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(512, activation='relu', input_shape=(784,)),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    return model


if __name__ == '__main__':
    args = parse_args()

    (train_images, train_labels), (test_images, test_labels) = load_data(args.train, args.test)

    model = create_model()
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=args.learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    # The test split doubles as the validation set during training.
    model.fit(
        train_images, train_labels,
        batch_size=args.batch_size,
        epochs=args.epochs,
        validation_data=(test_images, test_labels)
    )

    test_loss, test_accuracy = model.evaluate(test_images, test_labels)
    print(f'Test accuracy: {test_accuracy}')

    # Saved under a "1" subfolder of the model directory.
    model.save(os.path.join(args.model_dir, '1'))
''')


In [13]:
# Describe the TensorFlow training job; nothing is launched until .fit() is called.
estimator = TensorFlow(
    # Code to run: train.py inside the local source_dir folder.
    source_dir='source_dir',
    entry_point='train.py',
    # Framework / interpreter versions requested for the job.
    py_version='py39',
    framework_version='2.11.0',
    # Permissions and compute.
    role=role,
    instance_type='ml.m5.large',
    instance_count=1,
    # S3 destination for the job output.
    output_path=f's3://{bucket_name}/mnist/models',
    # Names match the argparse flags defined in train.py.
    hyperparameters={
        'epochs': 5,
        'batch-size': 32,
        'learning-rate': 0.001,
    },
)

In [14]:
# Announce the launch, then start the job. Both input channels are fed from
# the same S3 prefix.
print("Iniciando trabajo de entrenamiento...")
estimator.fit({
    channel: f's3://{bucket_name}/mnist/data'
    for channel in ('train', 'test')
})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: tensorflow-training-2024-11-24-09-08-21-911


Iniciando trabajo de entrenamiento...
2024-11-24 09:08:23 Starting - Starting the training job...
2024-11-24 09:08:38 Starting - Preparing the instances for training...
2024-11-24 09:09:11 Downloading - Downloading input data......
2024-11-24 09:10:11 Downloading - Downloading the training image......
2024-11-24 09:11:17 Training - Training image download completed. Training in progress..
2024-11-24 09:11:22.153036: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX512F
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-24 09:11:22.258289: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
[34m2024-11-24 09:11:22.258946: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profi