In this example we will create a training dataset with FeatureValues from the bankruptcy-fg and the bankruptcy-new-fg. This is done by utilizing the auto-build Catalog. We run an Athena query that does a simple select all in the offline store in S3 from the FeatureGroup.

## Required Imports

In [2]:
import sagemaker
import sys

import boto3
import pandas as pd
import numpy as np
import io
import time
from time import gmtime, strftime, sleep
import datetime

role = sagemaker.get_execution_role()
sess = sagemaker.Session()
region = sess.boto_region_name
bucket = sess.default_bucket()


default_s3_bucket_name = "sagemaker-featurestore-pov"
prefix = "feature-group-storage"

In [3]:
feature_group1_name = 'bankruptcy-fg-2023-02-21-12-55-30'
feature_group2_name = 'bankruptcy-new-features-2023-02-21-12-56-19'

In [4]:
from sagemaker.feature_store.feature_group import FeatureGroup

feature_group1 = FeatureGroup(name=feature_group1_name, sagemaker_session=sess)
feature_group2 = FeatureGroup(name=feature_group2_name, sagemaker_session=sess)

In [5]:
feature_query = feature_group1.athena_query()
table_name1 = feature_query.table_name
print(table_name1)
query_string = ('SELECT * FROM "%s"' % table_name1)
print('Running ' + query_string)

feature_query.run(query_string=query_string,
          output_location=f's3://{default_s3_bucket_name}/{prefix}')
feature_query.wait()
dataset1 = feature_query.as_dataframe()

bankruptcy-fg-2023-02-21-12-55-30-1676984149
Running SELECT * FROM "bankruptcy-fg-2023-02-21-12-55-30-1676984149"


In [6]:
new_feature_query = feature_group2.athena_query()
table_name2 = new_feature_query.table_name
print(table_name2)
query_string = ('SELECT * FROM "%s"' % table_name2)
print('Running ' + query_string)

new_feature_query.run(query_string=query_string,
          output_location=f's3://{default_s3_bucket_name}/{prefix}')
new_feature_query.wait()
dataset2 = new_feature_query.as_dataframe()

bankruptcy-new-features-2023-02-21-12-56-19-1676984204
Running SELECT * FROM "bankruptcy-new-features-2023-02-21-12-56-19-1676984204"


In [7]:
dataset1

Unnamed: 0,attr7,attr12,attr15,attr27,attr41,attr55,id,event_time,year,labels,write_time,api_invocation_time,is_deleted
0,0.036752,0.080598,400.15,0.000000,0.039824,3159.80,44,1.676984e+09,1year,0,2023-02-21 13:02:22.280,2023-02-21 12:57:15.000,False
1,0.315710,1.830400,185.30,2.844800,0.016786,132630.00,3568,1.676984e+09,1year,0,2023-02-21 13:02:22.280,2023-02-21 12:57:15.000,False
2,0.069060,0.114840,2275.60,0.988060,0.170600,1270.10,3578,1.676984e+09,1year,0,2023-02-21 13:02:22.280,2023-02-21 12:57:15.000,False
3,0.289480,0.629640,577.64,2.299500,0.042427,12949.00,95,1.676984e+09,1year,0,2023-02-21 13:02:22.280,2023-02-21 12:57:15.000,False
4,0.064126,0.094067,3808.60,21.134000,0.331670,478.28,3730,1.676984e+09,1year,0,2023-02-21 13:02:22.280,2023-02-21 12:57:17.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17195,0.004723,0.007844,3606.30,1321.989035,0.265640,279.95,6966,1.676984e+09,1year,1,2023-02-21 13:02:22.365,2023-02-21 12:57:56.000,False
17196,0.223570,0.499310,721.78,38.793000,0.065540,5074.50,3382,1.676984e+09,1year,0,2023-02-21 13:02:22.365,2023-02-21 12:57:56.000,False
17197,0.032301,0.171450,705.96,0.893910,0.058494,836.77,3385,1.676984e+09,1year,0,2023-02-21 13:02:22.365,2023-02-21 12:57:56.000,False
17198,0.468350,1.119000,316.47,12.590000,0.026780,2316.40,3399,1.676984e+09,1year,0,2023-02-21 13:02:22.365,2023-02-21 12:57:57.000,False


In [8]:
dataset2

Unnamed: 0,id,event_time,year,attr66,attr67,attr68,write_time,api_invocation_time,is_deleted
0,5106,1.676985e+09,2year,3.312370,12112.355902,2026.849014,2023-02-21 13:11:23.675,2023-02-21 13:06:32.000,False
1,20,1.676985e+09,2year,0.188266,3400.969787,305.999741,2023-02-21 13:11:23.675,2023-02-21 13:06:32.000,False
2,5148,1.676985e+09,2year,-0.252407,780.574966,1604.910413,2023-02-21 13:11:23.675,2023-02-21 13:06:33.000,False
3,69,1.676985e+09,2year,1.808571,29542.654579,7205.185463,2023-02-21 13:11:23.675,2023-02-21 13:06:33.000,False
4,5171,1.676985e+09,2year,3.625320,32867.590184,2768.400974,2023-02-21 13:11:23.675,2023-02-21 13:06:33.000,False
...,...,...,...,...,...,...,...,...,...
17195,6888,1.676984e+09,1year,-1.064455,2031.999456,382.991936,2023-02-21 13:03:05.891,2023-02-21 12:58:46.000,False
17196,3430,1.676984e+09,1year,-2.610860,390.359313,217.025782,2023-02-21 13:03:05.891,2023-02-21 12:58:47.000,False
17197,6984,1.676984e+09,1year,-99.722421,11173.964759,941.319869,2023-02-21 13:03:05.891,2023-02-21 12:58:47.000,False
17198,7004,1.676984e+09,1year,1.000000,9885.423443,-5999.659403,2023-02-21 13:03:05.891,2023-02-21 12:58:48.000,False


In [9]:
dataset1.value_counts('event_time')

event_time
1.676985e+09    10173
1.676984e+09     7027
dtype: int64

In [10]:
dataset2.value_counts('event_time')

event_time
1.676985e+09    10173
1.676984e+09     7027
dtype: int64

In [11]:

query_string = '''
SELECT * FROM "%s" t1 LEFT JOIN "%s" t2 ON t1."id" = t2."id" AND t1."event_time" = t2."event_time"
''' % (table_name1, table_name2)

print("Running " + query_string)

# run Athena query. The output is loaded to a Pandas dataframe.
# dataset = pd.DataFrame()
feature_query.run(
    query_string=query_string,
    output_location=f's3://sagemaker-featurestore-pov/feature-group-storage/query_results/',
)
feature_query.wait()
dataset = feature_query.as_dataframe()

dataset

Running 
SELECT * FROM "bankruptcy-fg-2023-02-21-12-55-30-1676984149" t1 LEFT JOIN "bankruptcy-new-features-2023-02-21-12-56-19-1676984204" t2 ON t1."id" = t2."id" AND t1."event_time" = t2."event_time"



Unnamed: 0,attr7,attr12,attr15,attr27,attr41,attr55,id,event_time,year,labels,...,is_deleted,id.1,event_time.1,year.1,attr66,attr67,attr68,write_time.1,api_invocation_time.1,is_deleted.1
0,0.118290,12.530000,22.182,10.451000,0.001875,1920.100,6788,1.676985e+09,2year,0,...,False,6788,1.676985e+09,2year,0.000000,628.284415,253.620733,2023-02-21 13:11:24.884,2023-02-21 13:06:53.000,False
1,0.354110,0.501290,672.150,38.298000,0.059901,53.313,32,1.676985e+09,2year,0,...,False,32,1.676985e+09,2year,-0.863884,10.515639,57.594892,2023-02-21 13:11:24.884,2023-02-21 13:06:32.000,False
2,0.190190,1.354600,243.950,10.540000,0.021991,189.500,51,1.676985e+09,2year,0,...,False,51,1.676985e+09,2year,0.795520,3276.789265,749.467379,2023-02-21 13:11:24.884,2023-02-21 13:06:33.000,False
3,0.004435,0.006069,8872.600,0.121960,0.538200,5985.100,6854,1.676985e+09,2year,0,...,False,6854,1.676985e+09,2year,1.117537,5498.484153,134.017441,2023-02-21 13:11:24.884,2023-02-21 13:06:54.000,False
4,0.370920,1.223200,280.480,3.941500,0.025317,1222.000,3494,1.676985e+09,2year,0,...,False,3494,1.676985e+09,2year,0.000000,249.515059,605.593481,2023-02-21 13:11:24.884,2023-02-21 13:07:15.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17195,0.000240,0.000380,4571.200,1321.989035,0.273750,-34.420,6820,1.676984e+09,1year,1,...,False,6820,1.676984e+09,1year,1.000000,2603.237029,1.670874,2023-02-21 13:03:03.907,2023-02-21 12:58:45.000,False
17196,0.021552,0.043019,8275.600,0.241520,0.392180,-1502.400,3235,1.676984e+09,1year,0,...,False,3235,1.676984e+09,1year,0.315238,5469.637396,54.012624,2023-02-21 13:03:03.907,2023-02-21 12:58:45.000,False
17197,0.095129,0.284530,1247.200,10.920000,0.111240,1671.000,3315,1.676984e+09,1year,0,...,False,3315,1.676984e+09,1year,0.000000,3864.030524,791.005966,2023-02-21 13:03:03.907,2023-02-21 12:58:46.000,False
17198,0.247840,1.109200,289.970,1.736100,0.025364,5673.800,3341,1.676984e+09,1year,0,...,False,3341,1.676984e+09,1year,2.793657,3261.929401,2851.858911,2023-02-21 13:03:03.907,2023-02-21 12:58:46.000,False


In [12]:
dataset.loc[dataset['id'] == 50]

Unnamed: 0,attr7,attr12,attr15,attr27,attr41,attr55,id,event_time,year,labels,...,is_deleted,id.1,event_time.1,year.1,attr66,attr67,attr68,write_time.1,api_invocation_time.1,is_deleted.1
6573,-0.037519,-0.46603,505.79,-0.47547,0.048809,65262.0,50,1676984000.0,1year,0,...,False,50,1676984000.0,1year,0.562462,32140.852007,-4631.623208,2023-02-21 13:03:03.921,2023-02-21 12:58:05.000,False
7101,0.173,0.82642,383.42,1.4494,0.03613,34824.0,50,1676985000.0,2year,0,...,False,50,1676985000.0,2year,2.198555,21914.291108,10271.099407,2023-02-21 13:11:23.750,2023-02-21 13:06:33.000,False


In [14]:
# Prepare query results for training.
query_execution = feature_query.get_query_execution()
query_result = (
    "s3://sagemaker-featurestore-pov/feature-group-storage"
    + "/query_results/"
    + query_execution["QueryExecution"]["QueryExecutionId"]
    + ".csv"
)
print(query_result)

# Select useful columns for training with target column as the first.
dataset = dataset[
    [
        "labels",
        "attr7",
        "attr12",
        "attr15",
        "attr27",
        "attr41",
        "attr55",
        "attr66",
        "attr67",
        "attr68",
    ]
]

# Write to csv in S3 without headers and index column.
dataset.to_csv("dataset.csv", header=False, index=False)
s3_client = boto3.client("s3", region_name=region)
s3_client.upload_file("dataset.csv", "sagemaker-featurestore-pov", "feature-group-storage" + "/training_input/dataset.csv")
dataset_uri_prefix = "s3://" + "sagemaker-featurestore-pov" + "/feature-group-storage" + "/training_input/"

dataset

s3://sagemaker-featurestore-pov/feature-group-storage/query_results/e37f5be0-8b9c-4006-aa05-e2faec032f4e.csv


Unnamed: 0,labels,attr7,attr12,attr15,attr27,attr41,attr55,attr66,attr67,attr68
0,0,0.107640,0.324670,753.09,0.718980,0.066952,1164.50,2.534653,30453.998640,4343.638450
1,0,-0.025266,-1.912500,-3130.60,1321.989035,0.604475,4433.40,-0.774480,4819.856059,0.000000
2,0,0.229480,1.041300,746.89,29.868000,0.066496,1497.80,0.000000,8512.161855,2946.550742
3,0,0.082692,0.242260,983.82,0.431960,0.082286,1577.90,0.729962,4432.303371,550.804995
4,0,0.030620,0.053646,2733.40,1.709600,0.178770,-792.21,0.000000,1970.720665,50.330993
...,...,...,...,...,...,...,...,...,...,...
17195,0,-0.133890,-0.150970,-2417.70,-4.856000,-0.264180,18.44,0.445799,3.637169,-21.827704
17196,1,0.019775,0.044790,3733.20,1243.186724,0.217200,-4055.00,1.708774,20647.690819,1003.974978
17197,1,0.057748,0.393010,1022.20,473.550000,0.493940,34057.00,0.000000,32933.952229,3171.993968
17198,0,-0.301200,-3.125000,-131.40,1243.186724,0.721041,72.00,0.000000,3.000000,-24.999597


## Train-test split

Using Sklearn, split the dataset into a training set and a test set. 

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
# X,y split
def train_test(data, target_col):
    y=data[target_col]
    X= data.loc[:, data.columns != target_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

In [26]:
X_train, X_test, y_train, y_test = train_test(dataset, 'labels')


Split the training set to separate out a validation set. The validation set is used to evaluate the performance of the trained model while tuning the model's hyperparameters. 

In [27]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

In [28]:
X_train

Unnamed: 0,attr7,attr12,attr15,attr27,attr41,attr55,id,event_time,year,write_time,...,is_deleted,id.1,event_time.1,year.1,attr66,attr67,attr68,write_time.1,api_invocation_time.1,is_deleted.1
1054,0.177280,1.035300,499.24,1.842400,0.042890,49795.000,9364,1.676985e+09,2year,2023-02-21 13:10:51.240,...,False,9364,1.676985e+09,2year,1.545578,80374.148562,22469.318383,2023-02-21 13:11:26.225,2023-02-21 13:07:27.000,False
7109,0.055931,0.101860,3104.00,2.070700,0.167090,-145.860,7138,1.676985e+09,2year,2023-02-21 13:10:51.372,...,False,7138,1.676985e+09,2year,0.000000,4280.306365,369.573738,2023-02-21 13:11:23.750,2023-02-21 13:06:57.000,False
11751,0.040912,0.065386,4364.50,1.822000,0.268520,2576.000,7617,1.676985e+09,2year,2023-02-21 13:10:43.517,...,False,7617,1.676985e+09,2year,0.000000,2218.968042,496.013206,2023-02-21 13:11:23.618,2023-02-21 13:07:04.000,False
15201,0.109230,0.262510,1431.50,3.500000,0.098047,1574.000,1291,1.676985e+09,2year,2023-02-21 13:10:45.036,...,False,1291,1.676985e+09,2year,0.000000,361.997194,279.001815,2023-02-21 13:11:23.666,2023-02-21 13:06:47.000,False
78,0.021731,0.058292,3181.50,1.362200,0.195740,82.081,2991,1.676985e+09,2year,2023-02-21 13:10:42.172,...,False,2991,1.676985e+09,2year,0.219626,13012.413006,319.234045,2023-02-21 13:11:24.884,2023-02-21 13:07:09.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723,-0.043738,-0.149820,-6032.10,1321.989035,-1.108600,7418.000,6885,1.676984e+09,1year,2023-02-21 13:02:22.351,...,False,6885,1.676984e+09,1year,1.151607,18209.043154,-1822.976395,2023-02-21 13:03:04.164,2023-02-21 12:58:46.000,False
13594,-0.479160,-3.702700,-746.46,-0.347280,-0.422240,-102.000,6900,1.676984e+09,1year,2023-02-21 13:02:20.804,...,False,6900,1.676984e+09,1year,0.263691,17726.798749,-9701.135371,2023-02-21 13:03:03.892,2023-02-21 12:58:46.000,False
9098,0.161970,0.688110,874.37,4.920900,0.064432,5819.200,3726,1.676984e+09,1year,2023-02-21 13:02:22.557,...,False,3726,1.676984e+09,1year,0.353954,4219.257541,1646.086310,2023-02-21 13:03:32.999,2023-02-21 12:58:07.000,False
12343,0.128300,0.272870,995.62,4.456600,0.075050,508.070,5861,1.676984e+09,1year,2023-02-21 13:02:22.378,...,False,5861,1.676984e+09,1year,-2.934762,280.918943,191.064223,2023-02-21 13:03:32.983,2023-02-21 12:58:33.000,False


In [18]:
X_val

Unnamed: 0,attr7,attr12,attr15,attr27,attr41,attr55,id,event_time,year,write_time,...,is_deleted,id.1,event_time.1,year.1,attr66,attr67,attr68,write_time.1,api_invocation_time.1,is_deleted.1
15010,0.299240,0.386460,872.89,15.104000,0.077048,-1458.2,681,1.676985e+09,2year,2023-02-21 13:10:43.531,...,False,681,1.676985e+09,2year,-0.011432,3749.550013,3038.026652,2023-02-21 13:11:24.423,2023-02-21 13:06:40.000,False
8576,0.222450,0.584270,1101.10,422.640000,0.103170,1297.7,783,1.676984e+09,1year,2023-02-21 13:02:22.231,...,False,783,1.676984e+09,1year,0.000000,81.284059,399.958640,2023-02-21 13:03:03.941,2023-02-21 12:58:14.000,False
1873,0.019895,0.059077,14106.00,0.000000,79.317000,1538.5,6066,1.676984e+09,1year,2023-02-21 13:02:23.846,...,False,6066,1.676984e+09,1year,0.000000,4722.947045,187.828041,2023-02-21 13:03:33.063,2023-02-21 12:58:35.000,False
565,0.005260,0.008690,5031.80,0.313550,0.262940,-6889.1,7995,1.676985e+09,2year,2023-02-21 13:10:50.899,...,False,7995,1.676985e+09,2year,9.912548,25026.701057,156.636040,2023-02-21 13:11:23.797,2023-02-21 13:07:09.000,False
7317,0.160130,0.578280,703.64,1.167200,0.067823,28646.0,3063,1.676984e+09,1year,2023-02-21 13:02:22.557,...,False,3063,1.676984e+09,1year,1.785799,46276.372330,13243.866315,2023-02-21 13:03:32.999,2023-02-21 12:58:42.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16709,0.059301,0.075981,4110.20,1.563500,0.589970,-1534.0,4873,1.676985e+09,2year,2023-02-21 13:10:45.036,...,False,4873,1.676985e+09,2year,-3.410904,5907.043013,1066.077314,2023-02-21 13:11:23.666,2023-02-21 13:07:33.000,False
4243,-0.057430,-0.069549,-5660.70,0.652320,0.230430,2028.5,8345,1.676985e+09,2year,2023-02-21 13:10:42.123,...,False,8345,1.676985e+09,2year,0.000000,16.347006,-950.455230,2023-02-21 13:11:24.384,2023-02-21 13:07:14.000,False
16808,0.105650,0.392900,1016.10,1.677400,0.070710,89593.0,1329,1.676984e+09,1year,2023-02-21 13:02:22.290,...,False,1329,1.676984e+09,1year,2.628206,93461.365936,26453.245906,2023-02-21 13:03:33.088,2023-02-21 12:58:20.000,False
2520,0.134010,0.190280,1889.30,0.210130,0.245580,9555.8,365,1.676984e+09,1year,2023-02-21 13:02:20.797,...,False,365,1.676984e+09,1year,2.035893,630.745875,3736.410461,2023-02-21 13:03:03.886,2023-02-21 12:58:09.000,False


In [19]:
X_test

Unnamed: 0,attr7,attr12,attr15,attr27,attr41,attr55,id,event_time,year,write_time,...,is_deleted,id.1,event_time.1,year.1,attr66,attr67,attr68,write_time.1,api_invocation_time.1,is_deleted.1
16847,0.028389,0.098191,6457.800,3.981300,0.441720,-105.00,6928,1.676984e+09,1year,2023-02-21 13:02:22.290,...,False,6928,1.676984e+09,1year,0.000000,3380.009657,130.790444,2023-02-21 13:03:33.088,2023-02-21 12:58:47.000,False
10383,0.063820,0.156250,5496.400,4.820600,0.687770,915.13,9776,1.676985e+09,2year,2023-02-21 13:10:43.732,...,False,9776,1.676985e+09,2year,-1.679097,180.503375,98.729772,2023-02-21 13:11:23.672,2023-02-21 13:07:32.000,False
3053,0.080525,0.342710,826.090,0.539470,0.072757,9909.50,4170,1.676985e+09,2year,2023-02-21 13:10:43.363,...,False,4170,1.676985e+09,2year,3.841043,5801.814988,1614.295810,2023-02-21 13:11:23.590,2023-02-21 13:07:24.000,False
6505,0.001990,0.008304,1739.500,0.194000,0.115430,6164.20,3212,1.676984e+09,1year,2023-02-21 13:02:20.750,...,False,3212,1.676984e+09,1year,85.371859,7595.214332,410.446042,2023-02-21 13:03:03.921,2023-02-21 12:58:44.000,False
3501,0.174560,9.423600,32.921,11.644000,0.003420,2284.00,4708,1.676985e+09,2year,2023-02-21 13:10:43.575,...,False,4708,1.676985e+09,2year,2.155992,12031.184155,2136.983908,2023-02-21 13:11:23.721,2023-02-21 13:07:31.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15657,0.137160,0.516890,540.430,174.940000,0.051656,1531.20,1077,1.676985e+09,2year,2023-02-21 13:10:43.542,...,False,1077,1.676985e+09,2year,0.000000,365.110401,262.358678,2023-02-21 13:11:24.501,2023-02-21 13:06:45.000,False
13583,0.005187,0.047894,2657.100,-0.094986,0.269820,15564.00,6336,1.676984e+09,1year,2023-02-21 13:02:20.804,...,False,6336,1.676984e+09,1year,-3.057451,75450.843514,23.985442,2023-02-21 13:03:03.892,2023-02-21 12:58:39.000,False
11020,0.005790,0.028014,4738.600,1.586600,0.314170,363.60,1074,1.676985e+09,2year,2023-02-21 13:10:51.381,...,False,1074,1.676985e+09,2year,-6.895897,92.620424,2.871994,2023-02-21 13:11:24.603,2023-02-21 13:06:45.000,False
5802,0.148790,2.113100,135.260,2.569500,0.015272,114830.00,451,1.676985e+09,2year,2023-02-21 13:10:51.408,...,False,451,1.676985e+09,2year,1.770011,214479.164721,42584.341509,2023-02-21 13:11:24.553,2023-02-21 13:06:37.000,False


In [20]:
y_train.value_counts()

0    9931
1     389
Name: labels, dtype: int64

In [21]:
y_test.value_counts()

0    3290
1     150
Name: labels, dtype: int64

Using the pandas package, explicitly align each dataset by concatenating the numeric features with the true labels.

In [22]:

train = pd.concat([pd.Series(y_train, index=X_train.index,
                             name='labels', dtype=int), X_train], axis=1)
validation = pd.concat([pd.Series(y_val, index=X_val.index,
                            name='labels', dtype=int), X_val], axis=1)
test = pd.concat([pd.Series(y_test, index=X_test.index,
                            name='labels', dtype=int), X_test], axis=1)

In [23]:
train

Unnamed: 0,labels,attr7,attr12,attr15,attr27,attr41,attr55,id,event_time,year,...,is_deleted,id.1,event_time.1,year.1,attr66,attr67,attr68,write_time.1,api_invocation_time.1,is_deleted.1
1054,0,0.177280,1.035300,499.24,1.842400,0.042890,49795.000,9364,1.676985e+09,2year,...,False,9364,1.676985e+09,2year,1.545578,80374.148562,22469.318383,2023-02-21 13:11:26.225,2023-02-21 13:07:27.000,False
7109,0,0.055931,0.101860,3104.00,2.070700,0.167090,-145.860,7138,1.676985e+09,2year,...,False,7138,1.676985e+09,2year,0.000000,4280.306365,369.573738,2023-02-21 13:11:23.750,2023-02-21 13:06:57.000,False
11751,0,0.040912,0.065386,4364.50,1.822000,0.268520,2576.000,7617,1.676985e+09,2year,...,False,7617,1.676985e+09,2year,0.000000,2218.968042,496.013206,2023-02-21 13:11:23.618,2023-02-21 13:07:04.000,False
15201,0,0.109230,0.262510,1431.50,3.500000,0.098047,1574.000,1291,1.676985e+09,2year,...,False,1291,1.676985e+09,2year,0.000000,361.997194,279.001815,2023-02-21 13:11:23.666,2023-02-21 13:06:47.000,False
78,0,0.021731,0.058292,3181.50,1.362200,0.195740,82.081,2991,1.676985e+09,2year,...,False,2991,1.676985e+09,2year,0.219626,13012.413006,319.234045,2023-02-21 13:11:24.884,2023-02-21 13:07:09.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723,1,-0.043738,-0.149820,-6032.10,1321.989035,-1.108600,7418.000,6885,1.676984e+09,1year,...,False,6885,1.676984e+09,1year,1.151607,18209.043154,-1822.976395,2023-02-21 13:03:04.164,2023-02-21 12:58:46.000,False
13594,1,-0.479160,-3.702700,-746.46,-0.347280,-0.422240,-102.000,6900,1.676984e+09,1year,...,False,6900,1.676984e+09,1year,0.263691,17726.798749,-9701.135371,2023-02-21 13:03:03.892,2023-02-21 12:58:46.000,False
9098,0,0.161970,0.688110,874.37,4.920900,0.064432,5819.200,3726,1.676984e+09,1year,...,False,3726,1.676984e+09,1year,0.353954,4219.257541,1646.086310,2023-02-21 13:03:32.999,2023-02-21 12:58:07.000,False
12343,0,0.128300,0.272870,995.62,4.456600,0.075050,508.070,5861,1.676984e+09,1year,...,False,5861,1.676984e+09,1year,-2.934762,280.918943,191.064223,2023-02-21 13:03:32.983,2023-02-21 12:58:33.000,False


### Convert the Train and Validation Datasets to CSV Files and uploading them to S3

In [38]:
# Use 'csv' format to store the data
# The first column is expected to be the output column
train.to_csv('train.csv', index=False, header=False)
validation.to_csv('validation.csv', index=False, header=False)

In [39]:
# Write to csv in S3 without headers and index column.
s3_client = boto3.client("s3", region_name=region)
s3_client.upload_file("train.csv", "sagemaker-featurestore-pov", "feature-group-storage" + "/training_input/train.csv")
train_data_uri= "s3://" + "sagemaker-featurestore-pov" + "/feature-group-storage" + "/training_input/"

In [40]:
# Write to csv in S3 without headers and index column.
s3_client = boto3.client("s3", region_name=region)
s3_client.upload_file("validation.csv", "sagemaker-featurestore-pov", "feature-group-storage" + "/validation_input/validation.csv")
val_data_uri= "s3://" + "sagemaker-featurestore-pov" + "/feature-group-storage" + "/validation_input/"

## Xgboost for Training Model

In [22]:
training_image = sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")

In [23]:
training_output_path = "s3://" + default_s3_bucket_name + "/" + prefix + "/training_output"

from sagemaker.estimator import Estimator

training_model = Estimator(
    training_image,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    volume_size=5,
    max_run=3600,
    input_mode="File",
    output_path=training_output_path,
    sagemaker_session=sess,
)

In [24]:
training_model.set_hyperparameters(objective="binary:logistic", num_round=50)

In [51]:

train_input = sagemaker.inputs.TrainingInput(
    train_data_uri,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)

validation_input = sagemaker.inputs.TrainingInput(
    val_data_uri,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)

#data_channels = {"train": train_data}

In [52]:
training_model.fit({"train": train_input, "validation": validation_input}, wait=True, logs=True)

2023-02-14 17:50:08 Starting - Starting the training job...
2023-02-14 17:50:35 Starting - Preparing the instances for trainingProfilerReport-1676397008: InProgress
......
2023-02-14 17:51:32 Downloading - Downloading input data...
2023-02-14 17:51:57 Training - Downloading the training image......
2023-02-14 17:52:52 Training - Training image download completed. Training in progress..[34m[2023-02-14 17:53:09.315 ip-10-0-178-217.eu-central-1.compute.internal:6 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m

## Set up a Hosting for the model

Once the training is done, we can deploy the trained model as an Amazon SageMaker real-time hosted endpoint. This will allow us to make predictions (or inference) from the model. Note that we don't have to host on the same instance (or type of instance) that we used to train. The endpoint deployment can be accomplished as follows. This takes 8-10 minutes to complete.

In [53]:
predictor = training_model.deploy(initial_instance_count=1, instance_type="ml.m5.xlarge")

-----!

## SageMaker FeatureStore During Inference

SageMaker FeatureStore can be useful in supplementing data for inference requests because of the low-latency GetRecord functionality. For this demo, we will be given an Id and query our online FeatureGroups for data on the transaction to build our inference request.

In [54]:
def get_feature_value(record, feature_name):
    return str(list(filter(lambda r: r["FeatureName"] == feature_name, record))[0]["ValueAsString"])


In [55]:
boto_session = boto3.Session(region_name=region)
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=region
)

In [62]:
# Incoming inference request.
id = str(10000)

# Helper to parse the feature value from the record.
def get_feature_value(record, feature_name):
    return str(list(filter(lambda r: r["FeatureName"] == feature_name, record))[0]["ValueAsString"])


bankruptcy_response = featurestore_runtime.get_record(
    FeatureGroupName=feature_group1_name, RecordIdentifierValueAsString=id
)

bankruptcy_record = bankruptcy_response["Record"]

bankruptcy_test_data = [
    get_feature_value(bankruptcy_record, "attr7"),
    get_feature_value(bankruptcy_record, "attr12"),
    get_feature_value(bankruptcy_record, "attr15"),
    get_feature_value(bankruptcy_record, "attr27"),
    get_feature_value(bankruptcy_record, "attr41"),
    get_feature_value(bankruptcy_record, "attr55"),
]


bankruptcy_new_response = featurestore_runtime.get_record(
    FeatureGroupName=feature_group2_name, RecordIdentifierValueAsString=id
)

bankruptcy_new_record = bankruptcy_new_response["Record"]

bankruptcy_new_test_data = [
    get_feature_value(bankruptcy_new_record, "attr66"),
    get_feature_value(bankruptcy_new_record, "attr67"),
    get_feature_value(bankruptcy_new_record, "attr68"),
]

# Join all pieces for inference request.
inference_request = []
inference_request.extend(bankruptcy_test_data[:])
inference_request.extend(bankruptcy_new_test_data[:])

inference_request

['0.1375',
 '0.21917',
 '1512.0',
 '2.207',
 '0.12795',
 '794.0',
 '1.017090909090909',
 '1235.0095658801388',
 '606.8047207355564']

In [63]:
import json

results = predictor.predict(",".join(inference_request), initial_args={"ContentType": "text/csv"})
prediction = json.loads(results)
print(prediction)


0.044454336166381836


In [65]:
predictor.delete_endpoint()