In [1]:
import sagemaker
from sagemaker import image_uris
from sagemaker.serializers import CSVSerializer
from sagemaker.inputs import TrainingInput
import boto3
import json
import numpy as np
import io
from sklearn.metrics import f1_score
from sagemaker.predictor import Predictor
from sagemaker.tuner import ContinuousParameter, IntegerParameter, HyperparameterTuner
from datetime import datetime
import pandas as pd

# S3 bucket used as the SageMaker session's default bucket and for every
# data / output path built in this notebook.
S3_BUCKET = "sagemaker-experiments-ml"
# Key prefix under which the video-games train/validation/test data and the
# training output live.
S3_PREFIX_VIDEOGAMES = "videogames"
# Key prefix presumably for an IMDB dataset; not referenced by the visible cells.
S3_PREFIX_IMDB = "imdb"
# Endpoint name used when deploying, invoking and deleting the model endpoint.
VIDEOGAME_ENDPOINT_NAME = "videogames"
# IAM role *name*; get_role() resolves it to the ARN.
ROLE = "sagemaker-experiments-ml-role"


In [2]:

def get_session():
    """Return a new SageMaker session whose default bucket is ``S3_BUCKET``."""
    return sagemaker.Session(default_bucket=S3_BUCKET)


def get_role():
    """Resolve the IAM role named by ``ROLE`` and return its ARN string."""
    role_description = boto3.client('iam').get_role(RoleName=ROLE)
    return role_description['Role']['Arn']


def delete_endpoint(*endpoints):
    """Delete every SageMaker endpoint whose name is given.

    Args:
        *endpoints: Endpoint names to delete. Calling with no names does
            nothing (no session is created).
    """
    if not endpoints:
        return

    # The session does not depend on the endpoint, so build it once instead
    # of once per name.
    session = get_session()
    for name in endpoints:
        Predictor(endpoint_name=name, sagemaker_session=session).delete_endpoint()


def get_data_from_s3(key):
    """Download a CSV object from ``S3_BUCKET`` and parse it into a DataFrame.

    Args:
        key: Object key inside ``S3_BUCKET``.

    Returns:
        pandas.DataFrame parsed from the object's UTF-8 decoded body.
    """
    client = boto3.client('s3')
    response = client.get_object(Bucket=S3_BUCKET, Key=key)

    # The whole object is read into memory and decoded before parsing.
    data = response['Body'].read().decode('utf-8')
    return pd.read_csv(io.StringIO(data))

 
        

def create_xgboost_estimator():
    """Build an (untrained) SageMaker estimator backed by the XGBoost image.

    Returns:
        sagemaker.estimator.Estimator configured with the fixed
        hyperparameters below, a single ``ml.m4.4xlarge`` instance, and an
        output location under the video-games prefix of ``S3_BUCKET``.
    """
    # Hyperparameters that stay fixed for every training job (string-valued).
    fixed_hyperparameters = {
        "eval_metric": "auc",
        "scale_pos_weight": "2.0",
        "subsample": "0.5",
        "objective": "binary:logistic",
        "num_round": "50",
    }

    model_output_location = 's3://{}/{}/output'.format(S3_BUCKET, S3_PREFIX_VIDEOGAMES)

    # NOTE(review): region is hard-coded to us-east-1 — confirm it matches
    # the region the session actually runs in.
    container_image = sagemaker.image_uris.retrieve("xgboost", "us-east-1", "1.5-1")

    return sagemaker.estimator.Estimator(
        image_uri=container_image,
        hyperparameters=fixed_hyperparameters,
        role=get_role(),
        instance_count=1,
        instance_type='ml.m4.4xlarge',
        volume_size=5,  # 5 GB
        output_path=model_output_location,
    )


def train_estimator(estimator, max_jobs=3, max_parallel_jobs=3):
    """Run a hyperparameter tuning job for ``estimator`` and return the tuner.

    The tuner searches ``eta``, ``lambda`` and ``max_depth`` using
    ``validation:auc`` as the objective metric, reading CSV data from the
    ``train/`` and ``validation/`` folders under the video-games prefix of
    ``S3_BUCKET``.

    Args:
        estimator: SageMaker estimator to tune.
        max_jobs: Total number of training jobs the tuner may launch.
        max_parallel_jobs: Maximum number of training jobs run concurrently.

    Returns:
        The ``HyperparameterTuner`` after ``fit`` has been called.
    """
    objective_metric_name = "validation:auc"

    # eta (learning rate) is capped at 1: the SageMaker XGBoost documentation
    # gives its valid range as [0, 1], so sampling above 1 would propose
    # values outside that range.
    hyperparameter_ranges = {
        "eta": ContinuousParameter(0.01, 1, scaling_type="Logarithmic"),
        "lambda": ContinuousParameter(0.01, 10, scaling_type="Logarithmic"),
        "max_depth": IntegerParameter(3, 7),
    }

    # Training and validation channels, both CSV.
    content_type = "csv"
    train_input = TrainingInput(
        "s3://{}/{}/{}/".format(S3_BUCKET, S3_PREFIX_VIDEOGAMES, 'train'),
        content_type=content_type,
    )
    validation_input = TrainingInput(
        "s3://{}/{}/{}/".format(S3_BUCKET, S3_PREFIX_VIDEOGAMES, 'validation'),
        content_type=content_type,
    )

    tuner = HyperparameterTuner(
        estimator,
        objective_metric_name,
        hyperparameter_ranges,
        max_jobs=max_jobs,
        max_parallel_jobs=max_parallel_jobs,
    )

    # The timestamp suffix keeps the tuning job name unique across runs.
    tuner.fit(
        {"train": train_input, "validation": validation_input},
        include_cls_metadata=False,
        job_name="xgb-hpsearch-" + datetime.now().strftime("%Y%m%d-%H-%M-%S"),
    )
    return tuner


def deploy_endpoint(estimator):
    """Deploy ``estimator`` to an endpoint named ``VIDEOGAME_ENDPOINT_NAME``.

    The endpoint is created with one ``ml.m5.xlarge`` instance and a CSV
    serializer. The object returned by ``deploy`` is discarded.

    Args:
        estimator: Object exposing a SageMaker-style ``deploy`` method.
    """
    deploy_options = {
        "initial_instance_count": 1,
        "endpoint_name": VIDEOGAME_ENDPOINT_NAME,
        "instance_type": "ml.m5.xlarge",
        "serializer": CSVSerializer(),
    }
    estimator.deploy(**deploy_options)
    
    
def get_prediction(data, endpoint_name, content_type="text/csv", runtime_client=None):
    """Invoke a SageMaker endpoint and return its response split into lines.

    Args:
        data: Request payload, passed through as the ``Body`` of the call.
        endpoint_name: Name of the endpoint to invoke.
        content_type: MIME type sent as ``ContentType``.
        runtime_client: Optional client exposing ``invoke_endpoint``. When
            omitted, a ``runtime.sagemaker`` boto3 client is created here, so
            the function no longer depends on a module-level ``runtime``
            variable having been defined by an earlier cell.

    Returns:
        list[str]: The UTF-8 decoded response body, one element per line.
    """
    if runtime_client is None:
        runtime_client = boto3.client('runtime.sagemaker')

    response = runtime_client.invoke_endpoint(EndpointName=endpoint_name,
                                              ContentType=content_type,
                                              Body=data)
    return response['Body'].read().decode('utf-8').splitlines()



In [3]:
# Load the training split from S3 and preview its first rows.
key_train=f'{S3_PREFIX_VIDEOGAMES}/train/train.csv'

df_train = get_data_from_s3(key_train)
df_train.head()

Unnamed: 0,Y,Critic_Score,User_Score,Platform_3DS,Platform_DC,Platform_DS,Platform_GBA,Platform_GC,Platform_PC,Platform_PS,...,Publisher_bitComposer Games,Publisher_id Software,Publisher_inXile Entertainment,Rating_AO,Rating_E,Rating_E10+,Rating_K-A,Rating_M,Rating_RP,Rating_T
0,0,75.0,6.4,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,59.0,7.3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,66.0,6.6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,64.0,6.9,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,65.0,8.7,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [4]:
# Build the XGBoost estimator (no training happens here) and show its repr.
estimator = create_xgboost_estimator()
estimator

<sagemaker.estimator.Estimator at 0x7fb0828e5a00>

In [8]:
# Launch the tuning job: up to 16 training jobs, at most 4 running at once.
tuner = train_estimator(estimator,max_jobs=16, max_parallel_jobs=4)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


.........................................................................!


In [9]:
# Ask the SageMaker API for the status of the tuning job launched by `tuner`.
boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name
)["HyperParameterTuningJobStatus"]

'Completed'

In [10]:
# Collect the tuning job's per-training-job results into a DataFrame and display it.
df_log = sagemaker.HyperparameterTuningJobAnalytics(
    tuner.latest_tuning_job.job_name
).dataframe()
df_log

Unnamed: 0,eta,lambda,max_depth,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,0.307458,0.521497,7.0,xgb-hpsearch-20221227-04-53-44-016-f4570370,Completed,0.84367,2022-12-27 04:59:19+00:00,2022-12-27 04:59:47+00:00,28.0
1,0.308007,0.551265,3.0,xgb-hpsearch-20221227-04-53-44-015-d08fc74b,Completed,0.85062,2022-12-27 04:59:07+00:00,2022-12-27 04:59:39+00:00,32.0
2,0.30995,0.742571,3.0,xgb-hpsearch-20221227-04-53-44-014-96c72a54,Completed,0.84876,2022-12-27 04:58:41+00:00,2022-12-27 04:59:13+00:00,32.0
3,0.300079,0.673198,4.0,xgb-hpsearch-20221227-04-53-44-013-49a3826c,Completed,0.84964,2022-12-27 04:58:40+00:00,2022-12-27 04:59:07+00:00,27.0
4,0.295273,0.412384,3.0,xgb-hpsearch-20221227-04-53-44-012-92ce54a8,Completed,0.8475,2022-12-27 04:58:27+00:00,2022-12-27 04:58:54+00:00,27.0
5,0.328129,6.133059,7.0,xgb-hpsearch-20221227-04-53-44-011-1bd0860c,Completed,0.84814,2022-12-27 04:58:42+00:00,2022-12-27 04:59:14+00:00,32.0
6,0.305554,0.03566,7.0,xgb-hpsearch-20221227-04-53-44-010-12a0c552,Completed,0.83803,2022-12-27 04:58:01+00:00,2022-12-27 04:58:28+00:00,27.0
7,0.302021,0.374912,3.0,xgb-hpsearch-20221227-04-53-44-009-e317d7cc,Completed,0.85033,2022-12-27 04:58:00+00:00,2022-12-27 04:58:27+00:00,27.0
8,0.547469,1.355174,3.0,xgb-hpsearch-20221227-04-53-44-008-1edb51b8,Completed,0.83508,2022-12-27 04:57:39+00:00,2022-12-27 04:58:11+00:00,32.0
9,0.301146,2.095995,7.0,xgb-hpsearch-20221227-04-53-44-007-5a05852f,Completed,0.8482,2022-12-27 04:57:19+00:00,2022-12-27 04:57:46+00:00,27.0


In [11]:
# The tuner (not the estimator) is passed here; deploy_endpoint only calls
# `.deploy(...)` on it. Per the SageMaker SDK docs, HyperparameterTuner.deploy
# deploys the best training job — TODO confirm for the SDK version in use.
deploy_endpoint(tuner)


2022-12-27 04:59:41 Starting - Found matching resource for reuse
2022-12-27 04:59:41 Downloading - Downloading input data
2022-12-27 04:59:41 Training - Training image download completed. Training in progress.
2022-12-27 04:59:41 Uploading - Uploading generated training model
2022-12-27 04:59:41 Completed - Resource retained for reuse
-------!

In [12]:
# Load the test split and serialise its features (label column Y removed)
# as header-less, index-less CSV text in an in-memory buffer.
key_test=f'{S3_PREFIX_VIDEOGAMES}/test/test.csv'
test_df = get_data_from_s3(key_test)
csv_file = io.StringIO()
test_df.drop(['Y'], axis=1).to_csv(csv_file, sep=",", header=False, index=False)

In [13]:
# Create the SageMaker runtime client, then send the serialised test features
# to the deployed endpoint; `preds` is the response split into lines.
runtime = boto3.client('runtime.sagemaker')
preds = get_prediction(csv_file.getvalue(), VIDEOGAME_ENDPOINT_NAME)

In [14]:
# Threshold the returned scores at 0.5 to get 0/1 labels, store them on
# test_df, and compute the weighted F1 score against the true labels Y.
test_df['predictions'] = np.where(pd.Series(preds).astype('float') > 0.5, 1, 0)
f1_score(test_df['Y'], test_df['predictions'], average='weighted')  


0.872077847433612

In [15]:
# Tear down the endpoint created above so it stops accruing cost.
delete_endpoint(VIDEOGAME_ENDPOINT_NAME)