In [2]:
import sagemaker
import sys

import boto3
import pandas as pd
import numpy as np
import io
import time
from time import gmtime, strftime, sleep
import datetime

# SageMaker session and the values derived from it.
sess = sagemaker.Session()
region = sess.boto_region_name
bucket = sess.default_bucket()

# Execution role; passed to the Estimator further down.
role = sagemaker.get_execution_role()

# S3 bucket and key prefix under which this notebook stores Athena query
# results, training/validation/test CSVs and model output.
default_s3_bucket_name = "sagemaker-featurestore-pov"
prefix = "feature-group-storage"

In [3]:
# Feature group holding the original attributes and the labels.
feature_group1_name = "bankruptcy-fg-2023-02-21-12-55-30"
# Feature group holding the additional engineered attributes.
feature_group2_name = "bankruptcy-new-features-2023-02-21-12-56-19"

In [4]:
from sagemaker.feature_store.feature_group import FeatureGroup

# Build one FeatureGroup handle per name, all bound to the shared session.
feature_group1, feature_group2 = (
    FeatureGroup(name=group_name, sagemaker_session=sess)
    for group_name in (feature_group1_name, feature_group2_name)
)

In [5]:
# Athena query handle for the first feature group and the table backing it.
feature_query = feature_group1.athena_query()
table_name1 = feature_query.table_name
print(table_name1)

# Read every row of that table.
query_string = f'SELECT * FROM "{table_name1}"'
print(f'Running {query_string}')

# Run the query, block until it finishes, then load the result into pandas.
feature_query.run(
    query_string=query_string,
    output_location='s3://' + default_s3_bucket_name + '/' + prefix,
)
feature_query.wait()
dataset1 = feature_query.as_dataframe()

bankruptcy-fg-2023-02-21-12-55-30-1676984149
Running SELECT * FROM "bankruptcy-fg-2023-02-21-12-55-30-1676984149"


In [6]:
# Display the DataFrame loaded from the Athena query on the first feature group.
dataset1

Unnamed: 0,attr7,attr12,attr15,attr27,attr41,attr55,id,event_time,year,labels,write_time,api_invocation_time,is_deleted
0,0.239510,0.582210,591.63000,0.817450,0.055369,160590.00,3517,1.676984e+09,1year,0,2023-02-21 13:02:22.423,2023-02-21 12:57:14.000,False
1,0.005846,0.015571,10715.00000,0.483730,0.807540,-77114.00,3546,1.676984e+09,1year,0,2023-02-21 13:02:22.423,2023-02-21 12:57:15.000,False
2,0.140900,0.265620,1839.40000,0.494540,0.191350,8185.60,31,1.676984e+09,1year,0,2023-02-21 13:02:22.423,2023-02-21 12:57:15.000,False
3,0.106680,0.391730,902.39000,1.226300,0.073469,14161.00,80,1.676984e+09,1year,0,2023-02-21 13:02:22.423,2023-02-21 12:57:15.000,False
4,-0.110380,-0.306440,-4326.20000,-0.397840,-0.333680,2199.20,3650,1.676984e+09,1year,0,2023-02-21 13:02:22.423,2023-02-21 12:57:16.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17195,0.352560,7.891600,42.57900,3.883800,0.003757,92033.00,3180,1.676984e+09,1year,0,2023-02-21 13:02:22.279,2023-02-21 12:57:54.000,False
17196,0.301050,0.742140,325.30000,1321.989035,0.028721,-293.09,6918,1.676984e+09,1year,1,2023-02-21 13:02:22.279,2023-02-21 12:57:55.000,False
17197,0.347020,0.624530,522.06000,55.708000,0.047016,1523.20,3350,1.676984e+09,1year,0,2023-02-21 13:02:22.279,2023-02-21 12:57:56.000,False
17198,0.159030,0.265670,1465.80000,0.648010,0.138540,1846.00,3457,1.676984e+09,1year,0,2023-02-21 13:02:22.279,2023-02-21 12:57:57.000,False


In [7]:
# Athena query handle for the second feature group and the table backing it.
new_feature_query = feature_group2.athena_query()
table_name2 = new_feature_query.table_name
print(table_name2)

# Read every row of that table.
query_string = f'SELECT * FROM "{table_name2}"'
print(f'Running {query_string}')

# Run the query, block until it finishes, then load the result into pandas.
new_feature_query.run(
    query_string=query_string,
    output_location='s3://' + default_s3_bucket_name + '/' + prefix,
)
new_feature_query.wait()
dataset2 = new_feature_query.as_dataframe()

bankruptcy-new-features-2023-02-21-12-56-19-1676984204
Running SELECT * FROM "bankruptcy-new-features-2023-02-21-12-56-19-1676984204"


In [8]:
# Display the DataFrame loaded from the Athena query on the second feature group.
dataset2

Unnamed: 0,id,event_time,year,attr66,attr67,attr68,write_time,api_invocation_time,is_deleted
0,5136,1.676985e+09,2year,-0.330212,7446.617452,255.941214,2023-02-21 13:11:26.121,2023-02-21 13:06:32.000,False
1,54,1.676985e+09,2year,-5.623758,6722.446538,856.021285,2023-02-21 13:11:26.121,2023-02-21 13:06:33.000,False
2,5226,1.676985e+09,2year,0.000000,1031.844383,41.150914,2023-02-21 13:11:26.121,2023-02-21 13:06:34.000,False
3,189,1.676985e+09,2year,0.000000,3319.824861,344.314621,2023-02-21 13:11:26.121,2023-02-21 13:06:34.000,False
4,198,1.676985e+09,2year,0.000000,1915.465898,2507.697137,2023-02-21 13:11:26.121,2023-02-21 13:06:34.000,False
...,...,...,...,...,...,...,...,...,...
17195,3394,1.676984e+09,1year,-5.152416,1112.246424,135.501412,2023-02-21 13:03:33.088,2023-02-21 12:58:47.000,False
17196,3395,1.676984e+09,1year,0.000000,861.466745,38.977883,2023-02-21 13:03:33.088,2023-02-21 12:58:47.000,False
17197,6928,1.676984e+09,1year,0.000000,3380.009657,130.790444,2023-02-21 13:03:33.088,2023-02-21 12:58:47.000,False
17198,6953,1.676984e+09,1year,0.000000,3650.995341,314.999985,2023-02-21 13:03:33.088,2023-02-21 12:58:47.000,False


In [9]:

# Join the two feature-group tables on record id and event time.
# NOTE(review): `SELECT *` returns the shared columns of both tables; pandas
# renames the right-hand duplicates with a ".1" suffix (id.1, event_time.1, ...),
# as the displayed result shows.
# NOTE(review): this is a LEFT JOIN with an equality test on event_time, so a
# row of the first table without an exact (id, event_time) match in the second
# keeps NULLs in the attr66..attr68 columns - confirm that is intended.
query_string = '''
SELECT * FROM "%s" t1 LEFT JOIN "%s" t2 ON t1."id" = t2."id" AND t1."event_time" = t2."event_time"
''' % (table_name1, table_name2)

print("Running " + query_string)

# Run the Athena query and load the result into a pandas DataFrame.
# The output location is derived from the shared bucket/prefix settings (it
# resolves to the same URI that used to be hard-coded here), because the next
# cell rebuilds the result-file path from those same settings.
feature_query.run(
    query_string=query_string,
    output_location=f's3://{default_s3_bucket_name}/{prefix}/query_results/',
)
feature_query.wait()
dataset = feature_query.as_dataframe()

dataset

Running 
SELECT * FROM "bankruptcy-fg-2023-02-21-12-55-30-1676984149" t1 LEFT JOIN "bankruptcy-new-features-2023-02-21-12-56-19-1676984204" t2 ON t1."id" = t2."id" AND t1."event_time" = t2."event_time"



Unnamed: 0,attr7,attr12,attr15,attr27,attr41,attr55,id,event_time,year,labels,...,is_deleted,id.1,event_time.1,year.1,attr66,attr67,attr68,write_time.1,api_invocation_time.1,is_deleted.1
0,0.175510,0.437000,837.63,11.169000,0.071537,4265.3000,3779,1.676984e+09,1year,0,...,False,3779,1.676984e+09,1year,0.000000,59547.942146,15131.976922,2023-02-21 13:03:03.918,2023-02-21 12:58:08.000,False
1,0.020479,0.032554,228.89,1321.989035,0.021252,-994.2700,3794,1.676984e+09,1year,0,...,False,3794,1.676984e+09,1year,-2.109185,4621.072690,200.232622,2023-02-21 13:03:03.918,2023-02-21 12:58:08.000,False
2,0.249590,0.350490,1041.40,3.193300,0.081002,1.3406,338,1.676984e+09,1year,0,...,False,338,1.676984e+09,1year,0.000000,0.250565,1.091594,2023-02-21 13:03:03.918,2023-02-21 12:58:08.000,False
3,-0.010328,-0.017930,3775.40,0.453020,0.176850,-14997.0000,3920,1.676984e+09,1year,0,...,False,3920,1.676984e+09,1year,4.786793,38292.819937,-707.874223,2023-02-21 13:03:03.918,2023-02-21 12:58:09.000,False
4,0.083469,0.158400,2212.60,82.479000,0.199800,670.3600,403,1.676984e+09,1year,0,...,False,403,1.676984e+09,1year,0.000000,10.102934,120.070983,2023-02-21 13:03:03.918,2023-02-21 12:58:09.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17195,0.006654,0.007339,27691.00,0.642680,0.455200,1241.0000,6826,1.676984e+09,1year,1,...,False,6826,1.676984e+09,1year,0.000000,144.990186,87.983006,2023-02-21 13:03:05.789,2023-02-21 12:58:45.000,False
17196,0.370760,0.608900,591.07,56.435000,0.053188,502.4500,3266,1.676984e+09,1year,0,...,False,3266,1.676984e+09,1year,0.620132,1723.197750,2109.888236,2023-02-21 13:03:05.789,2023-02-21 12:58:45.000,False
17197,0.017391,0.044831,6676.60,0.000000,1.950500,647.1100,3284,1.676984e+09,1year,0,...,False,3284,1.676984e+09,1year,0.000000,4789.150385,154.463340,2023-02-21 13:03:05.789,2023-02-21 12:58:45.000,False
17198,0.056850,0.145190,1818.00,4.053500,0.138890,1200.6000,3412,1.676984e+09,1year,0,...,False,3412,1.676984e+09,1year,0.000000,7068.173790,622.662692,2023-02-21 13:03:05.789,2023-02-21 12:58:47.000,False


In [11]:
# Rebuild the S3 URI of the CSV produced by the most recent Athena query run
# on `feature_query`.
query_execution = feature_query.get_query_execution()
query_result = "".join(
    [
        f's3://{default_s3_bucket_name}/{prefix}',
        "/query_results/",
        query_execution["QueryExecution"]["QueryExecutionId"],
        ".csv",
    ]
)
print(query_result)

# Columns kept for training; the target column ("labels") comes first.
training_columns = [
    "labels",
    "attr7",
    "attr12",
    "attr15",
    "attr27",
    "attr41",
    "attr55",
    "attr66",
    "attr67",
    "attr68",
]
dataset_filtered = dataset.loc[:, training_columns]

# Save locally as CSV without header or index, then copy the file to S3.
dataset_filtered.to_csv("dataset_filtered.csv", index=False, header=False)
s3_client = boto3.client("s3", region_name=region)
s3_client.upload_file(
    Filename="dataset_filtered.csv",
    Bucket=default_s3_bucket_name,
    Key=prefix + "/training_input/dataset_filtered.csv",
)
dataset_uri_prefix = f's3://{default_s3_bucket_name}/{prefix}/training_input/'

dataset_filtered

s3://sagemaker-featurestore-pov/feature-group-storage/query_results/2e627b29-2dbb-4420-95a0-3dbda0f07d99.csv


Unnamed: 0,labels,attr7,attr12,attr15,attr27,attr41,attr55,attr66,attr67,attr68
0,0,0.175510,0.437000,837.63,11.169000,0.071537,4265.3000,0.000000,59547.942146,15131.976922
1,0,0.020479,0.032554,228.89,1321.989035,0.021252,-994.2700,-2.109185,4621.072690,200.232622
2,0,0.249590,0.350490,1041.40,3.193300,0.081002,1.3406,0.000000,0.250565,1.091594
3,0,-0.010328,-0.017930,3775.40,0.453020,0.176850,-14997.0000,4.786793,38292.819937,-707.874223
4,0,0.083469,0.158400,2212.60,82.479000,0.199800,670.3600,0.000000,10.102934,120.070983
...,...,...,...,...,...,...,...,...,...,...
17195,1,0.006654,0.007339,27691.00,0.642680,0.455200,1241.0000,0.000000,144.990186,87.983006
17196,0,0.370760,0.608900,591.07,56.435000,0.053188,502.4500,0.620132,1723.197750,2109.888236
17197,0,0.017391,0.044831,6676.60,0.000000,1.950500,647.1100,0.000000,4789.150385,154.463340
17198,0,0.056850,0.145190,1818.00,4.053500,0.138890,1200.6000,0.000000,7068.173790,622.662692


In [39]:
from sklearn.model_selection import train_test_split

# X,y split
def train_validation_test(data, target_col, test_size=0.2, validation_size=0.25,
                          random_state=42, validation_random_state=1):
    """Split ``data`` into train, validation and test frames.

    The data is first split into (train + validation) and test; the remainder
    is then split again into train and validation. With the default fractions
    (0.2, then 0.25 of the remaining 0.8) this is a 60/20/20 split.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns and the target column.
    target_col : str
        Name of the target column in ``data``.
    test_size : float, default 0.2
        Fraction of ``data`` held out as the test set.
    validation_size : float, default 0.25
        Fraction of the non-test rows held out as the validation set.
    random_state : int, default 42
        Seed for the first (test) split.
    validation_random_state : int, default 1
        Seed for the second (validation) split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validation, test)``. ``train`` and ``validation`` have the
        target as their first column, cast to int and named ``target_col``,
        followed by the features. ``test`` holds the features only.
    """
    y = data[target_col]
    X = data.loc[:, data.columns != target_col]

    # Hold out the test rows first; their labels are not returned.
    X_train, X_test, y_train, _ = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Carve the validation rows out of what is left.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=validation_size, random_state=validation_random_state
    )

    def _label_first(features, labels):
        # Put the integer-cast target in front of the features. The column is
        # named after `target_col` rather than a fixed 'labels'.
        return pd.concat([labels.astype(int).rename(target_col), features], axis=1)

    train = _label_first(X_train, y_train)
    validation = _label_first(X_val, y_val)
    test = X_test

    return train, validation, test

In [40]:
# Split the filtered dataset, using the "labels" column as the target.
train, validation, test = train_validation_test(
    data=dataset_filtered,
    target_col='labels',
)


In [22]:
# Display the training split (label column first, then the features).
train

Unnamed: 0,labels,attr7,attr12,attr15,attr27,attr41,attr55,attr66,attr67,attr68
1054,0,0.057921,0.112080,3761.600,1.93890,0.262970,1330.5000,0.737505,5036.148227,518.220395
7109,0,0.108450,0.223220,1780.400,7.07470,0.143720,87.4300,0.000000,221.431466,49.422128
11751,0,0.314480,0.578800,615.880,1.21900,0.055415,18548.0000,1.813215,3034.040535,10939.666828
15201,0,0.076601,0.335020,748.400,7.31340,0.061829,68.8470,0.000000,1224.861230,92.371276
78,0,0.022386,0.032828,6175.600,1.48040,0.279990,3664.4000,0.000000,6069.500116,670.786568
...,...,...,...,...,...,...,...,...,...,...
1723,0,0.086070,0.271690,681.710,0.74661,0.059176,16958.0000,0.909086,72183.203507,8828.024006
13594,0,0.285870,7.726400,42.034,5.57950,0.004187,38657.0000,1.994193,14675.043657,12796.872568
9098,0,0.030890,0.070185,4118.700,2.06750,0.268090,1.3012,-2.220006,4.194443,0.235347
12343,0,0.049230,0.093555,2095.500,0.67201,0.183910,372.3400,0.782084,7619.924689,649.473777


In [23]:
# Display the validation split (label column first, then the features).
validation

Unnamed: 0,labels,attr7,attr12,attr15,attr27,attr41,attr55,attr66,attr67,attr68
15010,0,0.108900,0.399670,969.810,3.21680,0.079197,2.4774,0.786777,2.000161,0.533797
8576,0,0.190990,0.532160,568.770,884.02000,0.056929,1290.1000,2.343892,1330.946756,780.846772
1873,0,0.000519,0.000788,10158.000,0.00000,0.947600,1237.6000,-156.707129,472.564817,2.601516
565,0,0.058887,1.497100,171.470,-0.32359,0.129760,131840.0000,8.252076,45447.964425,9004.576036
7317,0,0.143920,0.349430,186.330,0.00000,0.019846,1218.1000,0.000000,5200.444008,1570.587278
...,...,...,...,...,...,...,...,...,...,...
16709,0,0.282360,1.673400,333.550,4.17230,0.030155,16990.0000,1.806417,19859.963296,12391.183009
4243,0,0.166500,0.312260,1321.500,10.80800,0.113640,4613.5000,-0.527910,412.103618,1272.908695
16808,0,0.034963,0.105590,1450.200,0.18809,0.122450,4287.2000,5.048480,24660.339373,1126.698076
2520,0,0.757580,7.918300,47.398,9.42550,0.004165,22314.0000,1.551651,4300.001927,22295.165150


In [41]:
# Display the test split; it holds the feature columns only, without labels.
test

Unnamed: 0,attr7,attr12,attr15,attr27,attr41,attr55,attr66,attr67,attr68
16847,-0.010551,-0.044056,9101.700,-0.12914,0.795600,7147.600,0.978391,38446.560164,-530.746171
10383,0.073908,0.246020,2115.700,0.00000,0.604475,1738.100,0.000000,1077.023175,297.401247
3053,0.002269,0.026894,534.440,-0.01564,0.050348,4317.300,-15.855443,19059.244217,77.862033
6505,0.376250,3.585600,80.997,2.87280,0.006947,4966.700,0.799973,3713.698220,2919.038199
3501,0.045524,0.060236,3775.400,0.11630,0.413980,-2200.000,3.755382,36339.009927,4868.263549
...,...,...,...,...,...,...,...,...,...
15657,0.055993,0.156790,3871.300,2.51700,0.206760,-467.790,-0.091588,10712.175685,697.525946
13583,-0.023771,-0.136000,6260.300,-13.90500,0.604800,1171.100,0.000000,1162.382134,-67.219342
11020,0.165720,4.626700,99.816,217.82000,0.009102,9020.000,-1.132090,4897.117107,1946.002464
5802,0.014898,0.051094,2595.900,0.62425,0.144460,4365.100,-0.863941,4563.617355,101.882163


In [31]:
# Store each split as CSV without header or index column.
# For the train and validation files the first column is the output (label)
# column; the test file holds features only.
train.to_csv('train.csv', index=False, header=False)
validation.to_csv('validation.csv', index=False, header=False)
test.to_csv('test.csv', index=False, header=False)

s3_client = boto3.client("s3", region_name=region)

# Every split gets its own S3 prefix. The train split must NOT go under
# "training_input/": that prefix already holds dataset_filtered.csv (the full,
# unsplit dataset uploaded earlier in this notebook), and the train channel is
# read with s3_data_type="S3Prefix", i.e. every object under the prefix. Sharing
# the prefix would feed the validation and test rows into training.
s3_client.upload_file("train.csv", default_s3_bucket_name, prefix + "/train_input/train.csv")
train_data_uri = f's3://{default_s3_bucket_name}/{prefix}/train_input/'

s3_client.upload_file("validation.csv", default_s3_bucket_name, prefix + "/validation_input/validation.csv")
validation_data_uri = f's3://{default_s3_bucket_name}/{prefix}/validation_input/'

s3_client.upload_file("test.csv", default_s3_bucket_name, prefix + "/test_input/test.csv")
test_data_uri = f's3://{default_s3_bucket_name}/{prefix}/test_input/'


In [32]:
# Look up the XGBoost 1.2-1 container image URI for the session's region.
training_image = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.2-1",
)

In [33]:
# S3 location handed to the estimator as its output path.
training_output_path = f"s3://{default_s3_bucket_name}/{prefix}/training_output"

from sagemaker.estimator import Estimator

# Generic estimator wrapping the XGBoost container image retrieved above.
model = Estimator(
    image_uri=training_image,
    role=role,
    sagemaker_session=sess,
    output_path=training_output_path,
    input_mode="File",
    instance_type="ml.m4.xlarge",
    instance_count=1,
    volume_size=5,
    max_run=3600,
)

In [34]:
# Binary classification objective, trained for 50 boosting rounds.
model.set_hyperparameters(
    num_round=50,
    objective="binary:logistic",
)

In [36]:

# Training channel: CSV objects under the train prefix.
train_input = sagemaker.inputs.TrainingInput(
    s3_data=train_data_uri,
    content_type="text/csv",
    s3_data_type="S3Prefix",
    distribution="FullyReplicated",
)

# Validation channel: same settings, pointed at the validation prefix.
validation_input = sagemaker.inputs.TrainingInput(
    s3_data=validation_data_uri,
    content_type="text/csv",
    s3_data_type="S3Prefix",
    distribution="FullyReplicated",
)

In [38]:
# Start the training job on both channels and stream its logs until it ends.
model.fit(
    inputs={"train": train_input, "validation": validation_input},
    wait=True,
    logs=True,
)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-02-21-14-22-25-278


2023-02-21 14:22:25 Starting - Starting the training job...
2023-02-21 14:22:51 Starting - Preparing the instances for training......
2023-02-21 14:23:51 Downloading - Downloading input data...
2023-02-21 14:24:21 Training - Downloading the training image......
2023-02-21 14:25:16 Training - Training image download completed. Training in progress..[34m[2023-02-21 14:25:27.864 ip-10-0-76-41.eu-central-1.compute.internal:8 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV in

### Make Predictions with Batch Transform

In [43]:
# Batch transform reads the test CSV prefix and writes under batch-prediction.
batch_input = test_data_uri
batch_output = f's3://{default_s3_bucket_name}/{prefix}/batch-prediction'

In [44]:
# Create a batch transformer from the trained estimator.
transformer = model.transformer(
    output_path=batch_output,
    instance_type='ml.m4.xlarge',
    instance_count=1,
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-02-21-14-28-57-977


In [45]:
# Run the transform job over the CSV objects under `batch_input`, splitting
# the input by line, then block until the job has finished.
transformer.transform(
    data=batch_input,
    content_type='text/csv',
    split_type='Line',
    data_type='S3Prefix',
)
transformer.wait()

INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2023-02-21-14-29-33-597


....................................
[34m[2023-02-21:14:35:21:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-02-21:14:35:21:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2023-02-21:14:35:21:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2023-02-21:14:35:21:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-02-21:14:35:21:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $p