In [93]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

np.set_printoptions(threshold=np.inf)
pd.set_option('display.max_colwidth', None)

In [94]:
import boto3
import sagemaker

role = sagemaker.get_execution_role()

s3 = boto3.client("s3")
sagemaker = boto3.client("sagemaker")

In [95]:
df = pd.read_csv('./housing.csv')[[
      'LotFrontage',
      'LotArea',
      'GrLivArea',
      'BsmtUnfSF',
      'GarageArea',
      'BsmtFinSF1',
      'YearRemodAdd',
      'TotalBsmtSF',
      'YearBuilt',
      'SalePrice']]
df

Unnamed: 0,LotFrontage,LotArea,GrLivArea,BsmtUnfSF,GarageArea,BsmtFinSF1,YearRemodAdd,TotalBsmtSF,YearBuilt,SalePrice
0,65.0,8450,1710,150,548,706,2003,856,2003,208500
1,80.0,9600,1262,284,460,978,1976,1262,1976,181500
2,68.0,11250,1786,434,608,486,2002,920,2001,223500
3,60.0,9550,1717,540,642,216,1970,756,1915,140000
4,84.0,14260,2198,490,836,655,2000,1145,2000,250000
...,...,...,...,...,...,...,...,...,...,...
1455,62.0,7917,1647,953,460,0,2000,953,1999,175000
1456,85.0,13175,2073,589,500,790,1988,1542,1978,210000
1457,66.0,9042,2340,877,252,275,2006,1152,1941,266500
1458,68.0,9717,1078,0,240,49,1996,1078,1950,142125


In [96]:
cols = df.columns.tolist()
df = df[cols[-1:] + cols[:-1]]
df

Unnamed: 0,SalePrice,LotFrontage,LotArea,GrLivArea,BsmtUnfSF,GarageArea,BsmtFinSF1,YearRemodAdd,TotalBsmtSF,YearBuilt
0,208500,65.0,8450,1710,150,548,706,2003,856,2003
1,181500,80.0,9600,1262,284,460,978,1976,1262,1976
2,223500,68.0,11250,1786,434,608,486,2002,920,2001
3,140000,60.0,9550,1717,540,642,216,1970,756,1915
4,250000,84.0,14260,2198,490,836,655,2000,1145,2000
...,...,...,...,...,...,...,...,...,...,...
1455,175000,62.0,7917,1647,953,460,0,2000,953,1999
1456,210000,85.0,13175,2073,589,500,790,1988,1542,1978
1457,266500,66.0,9042,2340,877,252,275,2006,1152,1941
1458,142125,68.0,9717,1078,0,240,49,1996,1078,1950


In [97]:
train_data, test_data = train_test_split(df, train_size=.7, test_size=.3, shuffle=True)
validation_data, test_data = train_test_split(test_data, train_size=.2, test_size=.1, shuffle=True)

Bucket = "sagemaker-ap-northeast-2-648911607072"

s3.put_object(Bucket=Bucket, Key="inputs/train/data.csv", Body=train_data.to_csv(header=False, index=False))
s3.put_object(Bucket=Bucket, Key="inputs/validation/data.csv", Body=validation_data.to_csv(header=False, index=False))
s3.put_object(Bucket=Bucket, Key="inputs/test/data.csv", Body=test_data.to_csv(header=False, index=False))

{'ResponseMetadata': {'RequestId': '02MTH134QQAFZ4ZH',
  'HostId': 'wYGRfg8RO+F8wtjjNnjFwWA3qUXIANppgmxMsEBBLlLNWzzh2cUdQkhKkHOCvXW+HE7N27US0v8l7bXKe33Ne+hHbCUXJxL8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'wYGRfg8RO+F8wtjjNnjFwWA3qUXIANppgmxMsEBBLlLNWzzh2cUdQkhKkHOCvXW+HE7N27US0v8l7bXKe33Ne+hHbCUXJxL8',
   'x-amz-request-id': '02MTH134QQAFZ4ZH',
   'date': 'Fri, 08 Nov 2024 08:53:57 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"47a4108231fb62ee90b40a1002311d01"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"47a4108231fb62ee90b40a1002311d01"',
 'ServerSideEncryption': 'AES256'}

In [98]:
from sagemaker.image_uris import retrieve

container = retrieve(framework="xgboost", region="ap-northeast-2", version="latest")
container

'306986355934.dkr.ecr.ap-northeast-2.amazonaws.com/xgboost:latest'

In [99]:
from time import gmtime, strftime

training_job_name = f"project-regression-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"

# Ensure that the training and validation data folders generated above are reflected in the "InputDataConfig" parameter below.

create_training_params = {
    "AlgorithmSpecification": {"TrainingImage": container, "TrainingInputMode": "File"},
    "RoleArn": role,
    "OutputDataConfig": {"S3OutputPath": f"s3://{Bucket}/outputs"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.m5.2xlarge", "VolumeSizeInGB": 5},
    "TrainingJobName": training_job_name,
    "HyperParameters": {
        "max_depth": "5",
        "eta": "0.2",
        "gamma": "4",
        "min_child_weight": "6",
        "subsample": "0.7",
        "objective": "reg:linear",
        "num_round": "50",
        "verbosity": "2",
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 3600},
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": f"s3://{Bucket}/inputs/train",
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "text/csv",
            "CompressionType": "None",
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": f"s3://{Bucket}/inputs/validation",
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "text/csv",
            "CompressionType": "None",
        },
    ],
}

sagemaker.create_training_job(**create_training_params)

{'TrainingJobArn': 'arn:aws:sagemaker:ap-northeast-2:648911607072:training-job/project-regression-2024-11-08-08-53-58',
 'ResponseMetadata': {'RequestId': 'fa006088-f843-48a9-8933-9d41118f4e35',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'fa006088-f843-48a9-8933-9d41118f4e35',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '118',
   'date': 'Fri, 08 Nov 2024 08:53:58 GMT'},
  'RetryAttempts': 0}}

In [100]:
model_name = f"{training_job_name}-model"
print(model_name)

info = sagemaker.describe_training_job(TrainingJobName=training_job_name)
model_data = info["ModelArtifacts"]["S3ModelArtifacts"]
print(model_data)

primary_container = {"Image": container, "ModelDataUrl": model_data}

create_model_response = sagemaker.create_model(
    ModelName=model_name, ExecutionRoleArn=role, PrimaryContainer=primary_container
)

print(create_model_response["ModelArn"])

project-regression-2024-11-08-08-53-58-model
s3://sagemaker-ap-northeast-2-648911607072/outputs/project-regression-2024-11-08-08-53-58/output/model.tar.gz
arn:aws:sagemaker:ap-northeast-2:648911607072:model/project-regression-2024-11-08-08-53-58-model


In [101]:
endpoint_config_name = f"{training_job_name}-endpointconf"
print(f"Creating endpoint config with name: {endpoint_config_name}.")
create_endpoint_config_response = sagemaker.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "InstanceType": "ml.m5.xlarge",
            "InitialVariantWeight": 1,
            "InitialInstanceCount": 1,
            "ModelName": model_name,
            "VariantName": "AllTraffic",
        }
    ],
)

print(f"Endpoint Config Arn: {create_endpoint_config_response['EndpointConfigArn']}")

Creating endpoint config with name: project-regression-2024-11-08-08-53-58-endpointconf.
Endpoint Config Arn: arn:aws:sagemaker:ap-northeast-2:648911607072:endpoint-config/project-regression-2024-11-08-08-53-58-endpointconf


In [102]:
endpoint_name = f"{training_job_name}-endpoint"
print(
    f"Creating endpoint with name: {endpoint_name}. This will take between 9 and 11 minutes to complete."
)
create_endpoint_response = sagemaker.create_endpoint(
    EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
)
print(create_endpoint_response["EndpointArn"])

Creating endpoint with name: project-regression-2024-11-08-08-53-58-endpoint. This will take between 9 and 11 minutes to complete.
arn:aws:sagemaker:ap-northeast-2:648911607072:endpoint/project-regression-2024-11-08-08-53-58-endpoint


In [104]:
runtime_client = boto3.client("runtime.sagemaker")

x = test_data.drop('SalePrice', axis=1)
t = test_data['SalePrice']

response = runtime_client.invoke_endpoint(
    EndpointName=endpoint_name, ContentType="text/csv", Body=x.to_csv(header=False, index=False))

y=response["Body"].read().decode('utf-8').split(',')

pd.DataFrame({
    'y': y,
    't': t
})

Unnamed: 0,y,t
1408,100536.1171875,125500
316,211321.4375,260000
508,128797.1875,161000
1274,114801.3046875,139000
916,82247.0234375,35311
728,143986.34375,110000
504,135566.421875,147000
959,179204.453125,155000
876,158136.078125,132250
1289,267500.71875,281000
