In [37]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

np.set_printoptions(threshold=np.inf)
pd.set_option('display.max_colwidth', None)

In [38]:
import boto3
import sagemaker

role = sagemaker.get_execution_role()

s3 = boto3.client("s3")
sagemaker = boto3.client("sagemaker")

In [39]:
df = pd.read_csv('./iris_extended.csv')
df = pd.get_dummies(df, columns=['soil_type'], dtype='float32')
df

Unnamed: 0,species,elevation,sepal_length,sepal_width,petal_length,petal_width,sepal_area,petal_area,sepal_aspect_ratio,petal_aspect_ratio,...,sepal_petal_width_diff,petal_curvature_mm,petal_texture_trichomes_per_mm2,leaf_area_cm2,sepal_area_sqrt,petal_area_sqrt,area_ratios,soil_type_clay,soil_type_loamy,soil_type_sandy
0,setosa,161.8,5.16,3.41,1.64,0.26,17.5956,0.4264,1.513196,6.307692,...,3.15,5.33,18.33,53.21,4.194711,0.652993,41.265478,0.0,0.0,1.0
1,setosa,291.4,5.48,4.05,1.53,0.37,22.1940,0.5661,1.353086,4.135135,...,3.68,5.90,20.45,52.53,4.711051,0.752396,39.205087,1.0,0.0,0.0
2,setosa,144.3,5.10,2.80,1.47,0.38,14.2800,0.5586,1.821429,3.868421,...,2.42,5.66,24.62,50.25,3.778889,0.747395,25.563910,0.0,0.0,1.0
3,setosa,114.6,4.64,3.44,1.53,0.17,15.9616,0.2601,1.348837,9.000000,...,3.27,4.51,22.91,50.85,3.995197,0.510000,61.367166,1.0,0.0,0.0
4,setosa,110.9,4.85,2.87,1.23,0.26,13.9195,0.3198,1.689895,4.730769,...,2.61,4.03,21.56,40.57,3.730885,0.565509,43.525641,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,virginica,268.8,5.36,2.51,5.16,1.93,13.4536,9.9588,2.135458,2.673575,...,0.58,11.61,11.52,59.46,3.667915,3.155757,1.350926,0.0,1.0,0.0
1196,virginica,125.4,7.49,3.06,7.68,2.17,22.9194,16.6656,2.447712,3.539171,...,0.89,13.85,6.99,77.12,4.787421,4.082352,1.375252,1.0,0.0,0.0
1197,virginica,73.6,6.79,3.25,4.72,2.26,22.0675,10.6672,2.089231,2.088496,...,0.99,13.13,9.16,74.39,4.697606,3.266068,2.068725,1.0,0.0,0.0
1198,virginica,239.6,6.38,2.24,5.30,1.71,14.2912,9.0630,2.848214,3.099415,...,0.53,11.01,6.46,73.90,3.780370,3.010482,1.576873,0.0,0.0,1.0


In [44]:
df['species'] = pd.Categorical(df['species'])
df['species'] = df['species'].cat.codes
df

Unnamed: 0,species,elevation,sepal_length,sepal_width,petal_length,petal_width,sepal_area,petal_area,sepal_aspect_ratio,petal_aspect_ratio,...,sepal_petal_width_diff,petal_curvature_mm,petal_texture_trichomes_per_mm2,leaf_area_cm2,sepal_area_sqrt,petal_area_sqrt,area_ratios,soil_type_clay,soil_type_loamy,soil_type_sandy
0,0,161.8,5.16,3.41,1.64,0.26,17.5956,0.4264,1.513196,6.307692,...,3.15,5.33,18.33,53.21,4.194711,0.652993,41.265478,0.0,0.0,1.0
1,0,291.4,5.48,4.05,1.53,0.37,22.1940,0.5661,1.353086,4.135135,...,3.68,5.90,20.45,52.53,4.711051,0.752396,39.205087,1.0,0.0,0.0
2,0,144.3,5.10,2.80,1.47,0.38,14.2800,0.5586,1.821429,3.868421,...,2.42,5.66,24.62,50.25,3.778889,0.747395,25.563910,0.0,0.0,1.0
3,0,114.6,4.64,3.44,1.53,0.17,15.9616,0.2601,1.348837,9.000000,...,3.27,4.51,22.91,50.85,3.995197,0.510000,61.367166,1.0,0.0,0.0
4,0,110.9,4.85,2.87,1.23,0.26,13.9195,0.3198,1.689895,4.730769,...,2.61,4.03,21.56,40.57,3.730885,0.565509,43.525641,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,2,268.8,5.36,2.51,5.16,1.93,13.4536,9.9588,2.135458,2.673575,...,0.58,11.61,11.52,59.46,3.667915,3.155757,1.350926,0.0,1.0,0.0
1196,2,125.4,7.49,3.06,7.68,2.17,22.9194,16.6656,2.447712,3.539171,...,0.89,13.85,6.99,77.12,4.787421,4.082352,1.375252,1.0,0.0,0.0
1197,2,73.6,6.79,3.25,4.72,2.26,22.0675,10.6672,2.089231,2.088496,...,0.99,13.13,9.16,74.39,4.697606,3.266068,2.068725,1.0,0.0,0.0
1198,2,239.6,6.38,2.24,5.30,1.71,14.2912,9.0630,2.848214,3.099415,...,0.53,11.01,6.46,73.90,3.780370,3.010482,1.576873,0.0,0.0,1.0


In [45]:
train_data, test_data = train_test_split(df, train_size=.7, test_size=.3, shuffle=True)
validation_data, test_data = train_test_split(test_data, train_size=.2, test_size=.1, shuffle=True)

Bucket = "sagemaker-ap-northeast-2-648911607072"

s3.put_object(Bucket=Bucket, Key="inputs/train/data.csv", Body=train_data.to_csv(header=False, index=False))
s3.put_object(Bucket=Bucket, Key="inputs/validation/data.csv", Body=validation_data.to_csv(header=False, index=False))
s3.put_object(Bucket=Bucket, Key="inputs/test/data.csv", Body=test_data.to_csv(header=False, index=False))

{'ResponseMetadata': {'RequestId': 'TPY050N9Z1TTGF4J',
  'HostId': 'u9SQHvYRgXR1+CFqgdWn1Z/LrVS3yxJYGfyujrzX69oP/5yZGC6XeNfhDm0kqtcBsB+9uIu7eArDVVkJ1GWY8SOmEvBMTMAYJmXexHg6PSs=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'u9SQHvYRgXR1+CFqgdWn1Z/LrVS3yxJYGfyujrzX69oP/5yZGC6XeNfhDm0kqtcBsB+9uIu7eArDVVkJ1GWY8SOmEvBMTMAYJmXexHg6PSs=',
   'x-amz-request-id': 'TPY050N9Z1TTGF4J',
   'date': 'Fri, 08 Nov 2024 11:34:23 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"0368fdc5f14dcd74d307132df565cd64"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"0368fdc5f14dcd74d307132df565cd64"',
 'ServerSideEncryption': 'AES256'}

In [46]:
from sagemaker.image_uris import retrieve

container = retrieve(framework="xgboost", region="ap-northeast-2", version="latest")
container

'306986355934.dkr.ecr.ap-northeast-2.amazonaws.com/xgboost:latest'

In [47]:
from time import gmtime, strftime

training_job_name = f"project-classification-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"

# Ensure that the training and validation data folders generated above are reflected in the "InputDataConfig" parameter below.

create_training_params = {
    "AlgorithmSpecification": {"TrainingImage": container, "TrainingInputMode": "File"},
    "RoleArn": role,
    "OutputDataConfig": {"S3OutputPath": f"s3://{Bucket}/outputs"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.m5.2xlarge", "VolumeSizeInGB": 5},
    "TrainingJobName": training_job_name,
    "HyperParameters": {
        "max_depth": "5",
        "eta": "0.2",
        "gamma": "4",
        "min_child_weight": "6",
        "subsample": "0.7",
        "objective": "multi:softmax",
        "num_round": "50",
        "verbosity": "2",
        "num_class": "3"
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 3600},
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": f"s3://{Bucket}/inputs/train",
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "text/csv",
            "CompressionType": "None",
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": f"s3://{Bucket}/inputs/validation",
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "text/csv",
            "CompressionType": "None",
        },
    ],
}

sagemaker.create_training_job(**create_training_params)

{'TrainingJobArn': 'arn:aws:sagemaker:ap-northeast-2:648911607072:training-job/project-classification-2024-11-08-11-34-25',
 'ResponseMetadata': {'RequestId': 'baa578ac-928e-4ff4-99a4-f6a227b880e3',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'baa578ac-928e-4ff4-99a4-f6a227b880e3',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '122',
   'date': 'Fri, 08 Nov 2024 11:34:25 GMT'},
  'RetryAttempts': 0}}

In [48]:
model_name = f"{training_job_name}-model"
print(model_name)

info = sagemaker.describe_training_job(TrainingJobName=training_job_name)
model_data = info["ModelArtifacts"]["S3ModelArtifacts"]
print(model_data)

primary_container = {"Image": container, "ModelDataUrl": model_data}

create_model_response = sagemaker.create_model(
    ModelName=model_name, ExecutionRoleArn=role, PrimaryContainer=primary_container
)

print(create_model_response["ModelArn"])

project-classification-2024-11-08-11-34-25-model
s3://sagemaker-ap-northeast-2-648911607072/outputs/project-classification-2024-11-08-11-34-25/output/model.tar.gz
arn:aws:sagemaker:ap-northeast-2:648911607072:model/project-classification-2024-11-08-11-34-25-model


In [49]:
endpoint_config_name = f"{training_job_name}-endpointconf"
print(f"Creating endpoint config with name: {endpoint_config_name}.")
create_endpoint_config_response = sagemaker.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "InstanceType": "ml.m5.xlarge",
            "InitialVariantWeight": 1,
            "InitialInstanceCount": 1,
            "ModelName": model_name,
            "VariantName": "AllTraffic",
        }
    ],
)

print(f"Endpoint Config Arn: {create_endpoint_config_response['EndpointConfigArn']}")

Creating endpoint config with name: project-classification-2024-11-08-11-34-25-endpointconf.
Endpoint Config Arn: arn:aws:sagemaker:ap-northeast-2:648911607072:endpoint-config/project-classification-2024-11-08-11-34-25-endpointconf


In [50]:
endpoint_name = f"{training_job_name}-endpoint"
print(
    f"Creating endpoint with name: {endpoint_name}. This will take between 9 and 11 minutes to complete."
)
create_endpoint_response = sagemaker.create_endpoint(
    EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
)
print(create_endpoint_response["EndpointArn"])

Creating endpoint with name: project-classification-2024-11-08-11-34-25-endpoint. This will take between 9 and 11 minutes to complete.
arn:aws:sagemaker:ap-northeast-2:648911607072:endpoint/project-classification-2024-11-08-11-34-25-endpoint


In [57]:
runtime_client = boto3.client("runtime.sagemaker")

x = test_data.drop(['species'], axis=1)
t = test_data['species']

response = runtime_client.invoke_endpoint(
    EndpointName=endpoint_name, ContentType="text/csv", Body=x.to_csv(header=False, index=False))

y=response["Body"].read().decode('utf-8').split(',')

pd.DataFrame({
    'y': y,
    't': t
})

Unnamed: 0,y,t
243,0.0,0
885,2.0,2
440,2.0,1
19,0.0,0
1190,2.0,2
719,1.0,1
92,0.0,0
1081,2.0,2
1016,1.0,2
1080,2.0,2
