In [None]:
# preparar os dados
# mover estes dados para um s3 bucket
# criar o modelo
# treinar o modelo
# deploy do modelo

In [1]:
import urllib.request
urllib.request.urlretrieve('https://archive.ics.uci.edu/static/public/53/iris.zip',
                          'data.zip')

('data.zip', <http.client.HTTPMessage at 0x7f64b020b190>)

In [2]:
#preparando os dados
!mkdir data
!unzip data.zip -d data/

Archive:  data.zip
  inflating: data/Index              
  inflating: data/bezdekIris.data    
  inflating: data/iris.data          
  inflating: data/iris.names         


In [11]:
#passo a passo da preparação

import pandas as pd

#ler os dados
data = pd.read_csv('data/iris.data', header=None)

#converter para valores numericos
data[4] = data[4].replace('Iris-setosa', 0)
data[4] = data[4].replace('Iris-virginica', 1)
data[4] = data[4].replace('Iris-versicolor', 2)

#shuffle
data = data.sample(frac=1).reset_index(drop=True)

#modificar o rótulo da coluna de indice
data = data[[4, 0, 1, 2, 3]] #definr a coluna de categorias como a primeira do dataset

#dividir os dados em um conjunto de treinamento e um conjunto de validação
#possui 150 instancias
#80% será para treinamento e 20% para validação
train_data = data[:120] #até o indice 120
val_data = data[120:] #a partir do indice 120

  data[4] = data[4].replace('Iris-versicolor', 2)


In [13]:
#mover dados para o s3 bucket

import boto3

bucket_name = 'sagemaker-build-and-deploy-model-cn'

train_data.to_csv('data.csv', header=False, index=False)
key = 'data/train/data' #localização dos dados que serão salvos no s3 bucket
url = 's3://{}/{}'.format(bucket_name, key)
boto3.Session().resource('s3').Bucket(bucket_name).Object(key).upload_file('data.csv')

val_data.to_csv('data.csv', header=False, index=False)
key = 'data/val/data' #localização dos dados que serão salvos no s3 bucket
url = 's3://{}/{}'.format(bucket_name, key)
boto3.Session().resource('s3').Bucket(bucket_name).Object(key).upload_file('data.csv')

In [19]:
#criar o modelo
import scipy.sparse
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker import get_execution_role
key = 'model/xgb_model'
s3_output_location = url = 's3://{}/{}'.format(bucket_name, key)

#o algoritmo que será utilizado é o xgboost
#criação do objeto python que será utilizado para treinar o modelo
xgb_model = sagemaker.estimator.Estimator(
    get_image_uri(boto3.Session().region_name, 'xgboost'),
    get_execution_role(),
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    train_volume_size=5, #5 GB
    output_path=s3_output_location,
    sagemaker_session=sagemaker.Session()
)

xgb_model.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    silent=0,
    objective='multi:softmax',
    num_class=3,
    num_round=10 #quantas iterações serão feitas no treinamento
)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [20]:
#treinar o modelo
train_data = 's3://{}/{}'.format(bucket_name, 'data/train')
val_data = 's3://{}/{}'.format(bucket_name, 'data/val')

train_channel = sagemaker.session.s3_input(train_data, content_type='text/csv')
val_channel = sagemaker.session.s3_input(val_data, content_type='text/csv')

data_channels = {'train': train_channel, 'validation': val_channel}

xgb_model.fit(inputs=data_channels)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Creating training-job with name: xgboost-2024-08-29-02-18-20-354


2024-08-29 02:18:21 Starting - Starting the training job...
2024-08-29 02:18:35 Starting - Preparing the instances for training...
2024-08-29 02:19:09 Downloading - Downloading input data...
2024-08-29 02:19:44 Downloading - Downloading the training image......
2024-08-29 02:20:51 Training - Training image download completed. Training in progress.
2024-08-29 02:20:51 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2024-08-29:02:20:42:INFO] Running standalone xgboost training.[0m
[34m[2024-08-29:02:20:42:INFO] File size need to be processed in the node: 0.0mb. Available memory size in the node: 8460.46mb[0m
[34m[2024-08-29:02:20:42:INFO] Determined delimiter of CSV input is ','[0m
[34m[02:20:42] S3DistributionType set as FullyReplicated[0m
[34m[02:20:42] 120x4 matrix with 480 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-08-29:02:20:42:INFO] Determined delimiter of CSV input is ','[0m
[34m[02:20:

In [21]:
#fazer deploy do modelo

xgb_predictor = xgb_model.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: xgboost-2024-08-29-02-29-07-947
INFO:sagemaker:Creating endpoint-config with name xgboost-2024-08-29-02-29-07-947
INFO:sagemaker:Creating endpoint with name xgboost-2024-08-29-02-29-07-947


-------!