In [3]:
import sagemaker
import sys

import boto3
import pandas as pd
import numpy as np
import io
import time
import json
from time import gmtime, strftime, sleep
import datetime

from sagemaker.feature_store.feature_group import FeatureGroup

role = sagemaker.get_execution_role()
sess = sagemaker.Session()
region = sess.boto_region_name
bucket = sess.default_bucket()

In [3]:
############################################################

# Loads the 5 raw .arff files into pandas dataframes
def load_dataframes():
    data_loc = 's3://sagemaker-featurestore-pov/bankruptcy data/3year.csv'
    return pd.read_csv(data_loc)

# dataframes is the list of pandas dataframes for the 5 year datafiles.  
dataframe = load_dataframes()


In [4]:
dataframe

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,56,57,58,59,60,61,62,63,64,65
0,0.174190,0.41299,0.143710,1.34800,-28.9820,0.603830,0.219460,1.122500,1.19610,0.463590,...,0.163960,0.375740,0.83604,0.000007,9.7145,6.2813,84.291,4.3303,4.0341,0
1,0.146240,0.46038,0.282300,1.62940,2.5952,0.000000,0.171850,1.172100,1.60180,0.539620,...,0.027516,0.271000,0.90108,0.000000,5.9882,4.1103,102.190,3.5716,5.9500,0
2,0.000595,0.22612,0.488390,3.15990,84.8740,0.191140,0.004572,2.988100,1.00770,0.675660,...,0.007639,0.000881,0.99236,0.000000,6.7742,3.7922,64.846,5.6287,4.4581,0
3,0.024526,0.43236,0.275460,1.78330,-10.1050,0.569440,0.024526,1.305700,1.05090,0.564530,...,0.048398,0.043445,0.95160,0.142980,4.2286,5.0528,98.783,3.6950,3.4844,0
4,0.188290,0.41504,0.342310,1.92790,-58.2740,0.000000,0.233580,1.409400,1.33930,0.584960,...,0.176480,0.321880,0.82635,0.073039,2.5912,7.0756,100.540,3.6303,4.6375,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10498,0.002861,0.58067,-0.223860,0.51658,-31.8660,0.002861,0.002861,0.618550,1.01200,0.359170,...,0.011834,0.007966,0.98817,0.327360,38.1420,39.1500,42.202,8.6489,5.2646,1
10499,-0.051968,0.55254,0.147150,2.16980,12.7480,-0.051968,-0.034361,0.669830,0.94694,0.370110,...,-0.056037,-0.140410,1.05600,1.153000,14.4930,9.2851,32.761,11.1410,1.9276,1
10500,-0.135900,0.83954,-0.342010,0.46526,-145.3100,-0.219120,-0.131860,0.191130,1.09990,0.160460,...,0.144120,-0.846930,0.85427,0.814200,6.2737,9.6966,212.230,1.7198,1.5659,1
10501,0.009423,0.50028,0.261630,1.52300,-10.1580,0.009423,0.007700,0.989900,1.01230,0.495230,...,0.012186,0.019027,0.98781,0.000000,5.9675,4.3536,98.240,3.7154,7.8068,1


In [5]:
dataframe = dataframe.fillna(dataframe.mean())

# Adding columns with Id and Year for Record Identifier and Event Time
# Add event_time timestamp. This is a point in time when a new event occurs that corresponds to the creation or update of a record in a feature group
### timestamp = str(datetime.datetime.now())

dataframe['id'] = np.arange(len(dataframe))

current_time_sec = int(round(time.time()))
dataframe['event_time'] = current_time_sec
dataframe = dataframe.astype({'event_time': 'float64'})
dataframe.rename({'65': 'labels'}, axis=1, inplace=True)

new_columns = ['id', 'event_time', 'labels']

def new_feature_addition(data):
            feature1 = data['6'] / data['7']
            feature2 = data['55'] / data['28']
            feature3 = (data['55'] * data['1']) / data['3']

            Id = data['id']
            Time = data['event_time']

            new_df = pd.DataFrame([Id, Time, feature1, feature2, feature3])
            new_df = new_df.T
            new_df.columns = ['id', 'event_time', 'attr66', 'attr67', 'attr68']

            new_df.replace([np.inf, -np.inf], np.nan, inplace=True)
            new_df = new_df.fillna(dataframe.mean())
            new_df = new_df.astype({'id':'int'})

            return (new_df)

df_new = new_feature_addition(dataframe)

schema = ['7', '12', '15', '27', '41', '55']

for i in range(len(schema)):
    dataframe.rename({schema[i]: 'attr' + schema[i]}, axis=1, inplace=True)
    schema[i] = 'attr' + schema[i]
    i += i
    

schema.extend(new_columns)

df = dataframe[schema]

In [5]:
feature_group1_name = 'bankruptcy-fg-2023-02-21-12-55-30'
feature_group2_name = 'bankruptcy-new-features-2023-02-21-12-56-19'

In [8]:
df

Unnamed: 0,attr7,attr12,attr15,attr27,attr41,attr55,id,event_time,labels
0,0.219460,0.531390,592.24,1.973700,0.041124,127280.0,0,1.676398e+09,0
1,0.171850,0.383170,829.46,44.593000,0.074333,3387.8,1,1.676398e+09,0
2,0.004572,0.020219,2094.10,0.321530,0.098528,20453.0,2,1.676398e+09,0
3,0.024526,0.069747,3299.40,0.448440,0.180500,5012.6,3,1.676398e+09,0
4,0.233580,0.633170,602.31,44.947000,0.054712,13730.0,4,1.676398e+09,0
...,...,...,...,...,...,...,...,...,...
10498,0.002861,0.006178,2879.30,1185.945272,0.195420,-4321.5,10498,1.676398e+09,1
10499,-0.034361,-0.273160,6875.90,1185.945272,0.903660,13488.0,10499,1.676398e+09,1
10500,-0.131860,-0.206160,-2324.00,-2.769400,-0.287440,-4233.0,10500,1.676398e+09,1
10501,0.007700,0.015392,6789.00,1185.945272,0.386400,2686.9,10501,1.676398e+09,1


In [9]:
df_new

Unnamed: 0,id,event_time,attr66,attr67,attr68
0,0,1.676398e+09,2.751435,392633.494771,154275.298866
1,1,1.676398e+09,0.000000,3230.783902,1754.983606
2,2,1.676398e+09,41.808478,11955.924475,24.929803
3,3,1.676398e+09,23.217810,6785.796478,446.304464
4,4,1.676398e+09,0.000000,11583.565342,7552.282142
...,...,...,...,...,...
10498,10498,1.676398e+09,1.000000,14685.991980,55.230106
10499,10499,1.676398e+09,1.512412,66643.608874,-4763.468461
10500,10500,1.676398e+09,1.661762,8693.955514,-1682.011345
10501,10501,1.676398e+09,1.223685,2445.081445,96.767631


In [6]:

feature_group1 = FeatureGroup(name=feature_group1_name, sagemaker_session=sess)
feature_group2 = FeatureGroup(name=feature_group2_name, sagemaker_session=sess)

In [11]:
feature_group1.ingest(data_frame=df, max_workers=3, max_processes = 1, wait=True)

IngestionManagerPandas(feature_group_name='bankruptcy-fg-2023-02-14-16-51-58', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7fb43439ef10>, max_workers=3, max_processes=1, profile_name=None, _async_result=<multiprocess.pool.MapResult object at 0x7fb434134d50>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

In [12]:
feature_group2.ingest(data_frame=df_new, max_workers=3, max_processes = 1, wait=True)

IngestionManagerPandas(feature_group_name='bankruptcy-new-features-2023-02-14-16-52-29', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7fb43439ef10>, max_workers=3, max_processes=1, profile_name=None, _async_result=<multiprocess.pool.MapResult object at 0x7fb46c36dd90>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

In [7]:
query = feature_group1.athena_query()
table_name = query.table_name
print(table_name)
query_string = ('SELECT * FROM "%s"' % table_name)
print('Running ' + query_string)

query.run(query_string=query_string,
          output_location=f's3://sagemaker-featurestore-pov/feature-group-storage')
query.wait()
dataset1 = query.as_dataframe()
dataset1

bankruptcy-fg-2023-02-14-16-51-58-1676393524
Running SELECT * FROM "bankruptcy-fg-2023-02-14-16-51-58-1676393524"


Unnamed: 0,attr7,attr12,attr15,attr27,attr41,attr55,id,event_time,labels,write_time,api_invocation_time,is_deleted
0,0.321420,2.244800,147.34,3.503300,0.013675,19993.00,27,1.676393e+09,0,2023-02-14 16:58:22.215,2023-02-14 16:53:18.000,False
1,0.038831,0.076431,1875.30,1.775700,0.132410,875.53,3541,1.676393e+09,0,2023-02-14 16:58:22.215,2023-02-14 16:53:18.000,False
2,0.228800,1.196100,454.02,32.073000,0.040910,5516.80,3581,1.676393e+09,0,2023-02-14 16:58:22.215,2023-02-14 16:53:18.000,False
3,0.039185,0.074460,2981.00,0.289010,0.194230,49293.00,88,1.676393e+09,0,2023-02-14 16:58:22.215,2023-02-14 16:53:19.000,False
4,0.122230,0.350570,919.43,2.210200,0.079996,3170.40,3601,1.676393e+09,0,2023-02-14 16:58:22.215,2023-02-14 16:53:19.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...
27698,0.186370,1.021800,501.75,1.629000,0.045094,3271.30,6650,1.676394e+09,0,2023-02-14 17:08:13.355,2023-02-14 17:03:57.000,False
27699,-0.090807,-1.458900,-1026.90,0.000000,0.721041,117.52,6694,1.676394e+09,0,2023-02-14 17:08:13.355,2023-02-14 17:03:57.000,False
27700,0.021858,0.029136,12527.00,1243.186724,0.721041,295.00,6696,1.676394e+09,0,2023-02-14 17:08:13.355,2023-02-14 17:03:57.000,False
27701,0.066713,0.130520,2064.80,1.745400,0.140340,1119.60,6709,1.676394e+09,0,2023-02-14 17:08:13.355,2023-02-14 17:03:57.000,False


In [8]:
query = feature_group2.athena_query()
table_name = query.table_name
print(table_name)
query_string = ('SELECT * FROM "%s"' % table_name)
print('Running ' + query_string)

query.run(query_string=query_string,
          output_location=f's3://sagemaker-featurestore-pov/feature-group-storage')
query.wait()
dataset2 = query.as_dataframe()
dataset2

bankruptcy-new-features-2023-02-14-16-52-29-1676393554
Running SELECT * FROM "bankruptcy-new-features-2023-02-14-16-52-29-1676393554"


Unnamed: 0,id,event_time,attr66,attr67,attr68,write_time,api_invocation_time,is_deleted
0,18,1.676398e+09,0.000000,1574.992674,681.992413,2023-02-14 18:12:08.154,2023-02-14 18:07:08.000,False
1,22,1.676398e+09,-3.985193,45456.652162,3713.594135,2023-02-14 18:12:08.154,2023-02-14 18:07:08.000,False
2,33,1.676398e+09,0.000000,16.278497,1.421979,2023-02-14 18:12:08.154,2023-02-14 18:07:09.000,False
3,7096,1.676398e+09,1.652582,8963.402951,-2829.831481,2023-02-14 18:12:08.154,2023-02-14 18:07:09.000,False
4,139,1.676398e+09,9.596876,7493.974383,-899.190700,2023-02-14 18:12:08.154,2023-02-14 18:07:10.000,False
...,...,...,...,...,...,...,...,...
27698,10336,1.676398e+09,0.846816,189.193988,-5.643208,2023-02-14 18:12:08.440,2023-02-14 18:07:52.000,False
27699,10343,1.676398e+09,0.000000,31.370122,1.870042,2023-02-14 18:12:08.440,2023-02-14 18:07:52.000,False
27700,10390,1.676398e+09,6.764325,128.958567,-34.444535,2023-02-14 18:12:08.440,2023-02-14 18:07:52.000,False
27701,10453,1.676398e+09,0.000000,227.436435,541.979530,2023-02-14 18:12:08.440,2023-02-14 18:07:53.000,False


In [9]:
def get_feature_value(record, feature_name):
    return str(list(filter(lambda r: r["FeatureName"] == feature_name, record))[0]["ValueAsString"])


In [10]:
boto_session = boto3.Session(region_name=region)
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=region
)

In [11]:
# Incoming inference request.
id = str(10500)

# Helper to parse the feature value from the record.
def get_feature_value(record, feature_name):
    return str(list(filter(lambda r: r["FeatureName"] == feature_name, record))[0]["ValueAsString"])


bankruptcy_response = featurestore_runtime.get_record(
    FeatureGroupName=feature_group1_name, RecordIdentifierValueAsString=id
)

bankruptcy_record = bankruptcy_response["Record"]

bankruptcy_test_data = [
    get_feature_value(bankruptcy_record, "attr7"),
    get_feature_value(bankruptcy_record, "attr12"),
    get_feature_value(bankruptcy_record, "attr15"),
    get_feature_value(bankruptcy_record, "attr27"),
    get_feature_value(bankruptcy_record, "attr41"),
    get_feature_value(bankruptcy_record, "attr55"),
]



bankruptcy_new_response = featurestore_runtime.get_record(
    FeatureGroupName=feature_group2_name, RecordIdentifierValueAsString=id
)

bankruptcy_new_record = bankruptcy_new_response["Record"]

bankruptcy_new_test_data = [
    get_feature_value(bankruptcy_new_record, "attr66"),
    get_feature_value(bankruptcy_new_record, "attr67"),
    get_feature_value(bankruptcy_new_record, "attr68"),
]

# Join all pieces for inference request.
inference_request = []
inference_request.extend(bankruptcy_test_data[:])
inference_request.extend(bankruptcy_new_test_data[:])

inference_request

['-0.13186',
 '-0.20616',
 '-2324.0',
 '-2.7694',
 '-0.28744',
 '-4233.0',
 '1.6617624753526468',
 '8693.955513565692',
 '-1682.0113446975233']

In [12]:
training_image = sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")

In [13]:
path = 's3://sagemaker-featurestore-pov/feature-group-storage/training_output/sagemaker-xgboost-2023-02-14-17-50-08-305/output/model.tar.gz'

trained_model = sagemaker.model.Model(
    model_data=path,
    image_uri=training_image,
    role=role)  


In [14]:
predictor = trained_model.deploy(initial_instance_count=1, instance_type="ml.m5.xlarge")

-----!

In [17]:
type(predictor)

NoneType

In [43]:
predictor = sagemaker.predictor.Predictor(
    endpoint_name='endpoint-name',
    sagemaker_session=sess, 
    content_type='text/csv',
    accept='application/json')

content_type is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
accept is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [15]:
results = predictor.predict(",".join(inference_request), initial_args={"ContentType": "text/csv"})
prediction = json.loads(results)
print(prediction)

AttributeError: 'NoneType' object has no attribute 'predict'

In [None]:
trained_model.