In [40]:
import sagemaker
import boto3
# from sagemaker.image_uris import retrieve
from sagemaker import image_uris
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput

In [41]:
bucket_name = 'seteamperformance'
my_region = boto3.session.Session().region_name
print(my_region)

us-east-2


In [4]:
s3 = boto3.resource('s3')
try:
    if my_region == 'us-east-2':
        s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={'LocationConstraint': my_region})
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error:', e)

S3 error: An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


In [5]:
prefix = 'xgboost-as-a-built-in-algo'
output_path = 's3://{}/{}/output'.format(bucket_name, prefix)
print(output_path)

s3://seteamperformance/xgboost-as-a-built-in-algo/output


In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# from sklearn.tree import DecisionTreeClassifier
# import xgboost as xgb

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import seaborn as sns

projectInterval = pd.read_csv('setapProcessT9.csv')

cols = ['averageResponsesByStudent', 'SE Process grade']

processesDf = pd.DataFrame(projectInterval, columns=cols)

label_encoder = LabelEncoder()
processesDf['SE Process grade'] = label_encoder.fit_transform(processesDf['SE Process grade'])

X = processesDf[['averageResponsesByStudent']]
y = processesDf['SE Process grade']

print(processesDf)

train_data, test_data = np.split(processesDf.sample(frac=1, random_state=1729), [int(0.7 * len(processesDf))])

print(train_data.shape, test_data.shape)

    averageResponsesByStudent  SE Process grade
0                     10.1429                 1
1                      9.5714                 0
2                     10.5000                 0
3                     11.1667                 0
4                      9.1667                 1
..                        ...               ...
69                    11.1667                 0
70                    12.0000                 1
71                    11.3333                 0
72                    10.8000                 0
73                    10.0000                 0

[74 rows x 2 columns]
(51, 2) (23, 2)


In [7]:
# train data to bucket
import os
pd.concat([train_data['SE Process grade'], train_data.drop(['SE Process grade'], axis=1)], axis=1).to_csv('train.csv', index=False, header=False)

boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
s3_input_train = TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [8]:
# test data to bucket
import os
pd.concat([test_data['SE Process grade'], test_data.drop(['SE Process grade'], axis=1)], axis=1).to_csv('test.csv', index=False, header=False)

boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test/test.csv')).upload_file('test.csv')
s3_input_test = TrainingInput(s3_data='s3://{}/{}/test'.format(bucket_name, prefix), content_type='csv')

print(s3_input_test)

<sagemaker.inputs.TrainingInput object at 0x7f0c9b9557e0>


In [9]:
print(sagemaker.__version__)
# xgb image uyi builds xgb container
# container = retrieve(boto3.Session().region_name,
#                          'xgboost')
# Specify the AWS region
region_name = boto3.Session().region_name

# Get the container image URI for XGBoost
container = image_uris.retrieve('xgboost', region_name, '1.0-1')

# Use the 'container' variable for further processing

2.155.0


In [10]:
hyperparameters = {
    "max_depth": "6",
    "eta": "0.005",
    "gamma": "4",
    "min_child_weight": "10",
    "subsample": "0.7",
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "num_round": "50",
}

In [11]:
# sagemaker estimator that calls xgboost
estimator = sagemaker.estimator.Estimator(image_uri=container,
                                         hyperparameters=hyperparameters,
                                         role=sagemaker.get_execution_role(),
                                         instance_count=1,
                                         instance_type='ml.m5.4xlarge',
                                         volume_size=5, # gb
                                         output_path=output_path,
                                         use_spot_instances=True,
                                         max_run=300,
                                         max_wait=600)

In [12]:
# print(s3_input_train.config, s3_input_test.config)
print(s3_input_train.config['DataSource']['S3DataSource']['S3Uri'])
print(s3_input_test.config['DataSource']['S3DataSource']['S3Uri'])
estimator.fit({'train': s3_input_train, 'validation': s3_input_test})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-06-11-20-14-31-647


s3://seteamperformance/xgboost-as-a-built-in-algo/train
s3://seteamperformance/xgboost-as-a-built-in-algo/test
2023-06-11 20:14:31 Starting - Starting the training job...
2023-06-11 20:14:45 Starting - Preparing the instances for training...
2023-06-11 20:15:29 Downloading - Downloading input data......
2023-06-11 20:16:20 Training - Training image download completed. Training in progress..[34m[2023-06-11 20:16:27.582 ip-10-0-221-53.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value logloss to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xg

In [13]:
xgb_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-06-11-20-17-13-478
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-06-11-20-17-13-478
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-06-11-20-17-13-478


--------!

In [33]:
from sagemaker.serializers import CSVSerializer

test_data_array = test_data.drop(['SE Process grade'], axis=1).values
# test_data_array = test_data['averageResponsesByStudent'].values

print(test_data_array)

xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = CSVSerializer()

print(test_data_array[2])

predictions = xgb_predictor.predict(test_data_array).decode('utf-8')
predictions_array = np.fromstring(predictions[1:], sep=',')

predictions_array

[[ 9.6   ]
 [ 9.6667]
 [11.6667]
 [10.1429]
 [12.    ]
 [ 9.8   ]
 [11.1667]
 [11.    ]
 [10.7143]
 [11.    ]
 [ 8.6667]
 [11.    ]
 [10.1667]
 [11.    ]
 [10.8   ]
 [12.    ]
 [11.2   ]
 [10.8333]
 [10.    ]
 [12.    ]
 [10.3333]
 [12.    ]
 [10.7143]]
[11.6667]


array([0.36229271, 0.36229271, 0.36229271, 0.36229271, 0.36229271,
       0.36229271, 0.36229271, 0.36229271, 0.36229271, 0.36229271,
       0.36229271, 0.36229271, 0.36229271, 0.36229271, 0.36229271,
       0.36229271, 0.36229271, 0.36229271, 0.36229271, 0.36229271,
       0.36229271, 0.36229271, 0.36229271])

In [44]:
# delete endpoints

# sagemaker.Session().delete_endpoint(xgb_predictor.endpoint_name)
# bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
# bucket_to_delete.objects.all().delete()

# s3_client = boto3.client('s3')
# s3_client.delete_bucket(Bucket=bucket_name)

# need to delete correctly