In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import boto3
import sagemaker
import io
from sagemaker.session import Session
from sagemaker import get_execution_role

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [3]:
import warnings 
warnings.filterwarnings('ignore') 

In [4]:
#Import data from S3 bucket

bucket = Session().default_bucket()
region = boto3.Session().region_name
s3_client = boto3.client("s3", region_name=region)

Housing_bucket_name = bucket
gmaps_data_key = (
    "Housing_Assignment3/csv/housing_gmaps_data_raw.csv"
)
housing_data_key = (
    "Housing_Assignment3/csv/housing.csv"
)

gmaps_data_object = s3_client.get_object(
    Bucket=Housing_bucket_name, Key=gmaps_data_key
)
housing_data_object = s3_client.get_object(
    Bucket=Housing_bucket_name, Key=housing_data_key
)

gmaps_data = pd.read_csv(io.BytesIO(gmaps_data_object["Body"].read()))
housing_data = pd.read_csv(io.BytesIO(housing_data_object["Body"].read()))


In [5]:
#View first couple of rows from both datasets
print(gmaps_data.head())

  street_number                   route locality-political  \
0          3130  Grizzly Peak Boulevard           Berkeley   
1          2005             Tunnel Road            Oakland   
2          6886             Chabot Road            Oakland   
3          6365           Florio Street            Oakland   
4          5407           Bryant Avenue            Oakland   

  administrative_area_level_2-political administrative_area_level_1-political  \
0                        Alameda County                            California   
1                        Alameda County                            California   
2                        Alameda County                            California   
3                        Alameda County                            California   
4                        Alameda County                            California   

  country-political  postal_code  \
0     United States      94705.0   
1     United States      94611.0   
2     United States      94618.0

In [6]:
print(housing_data.head())

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  


In [7]:
#Setup SageMaker Feature Store 
boto_session = boto3.Session(region_name=region)

sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=region
)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

default_s3_bucket_name = feature_store_session.default_bucket()
prefix = "sagemaker-featurestore-demo"

print(default_s3_bucket_name)

sagemaker-us-east-1-385225481548


In [8]:
#Setup IAM role
role = get_execution_role()
print(role)

arn:aws:iam::385225481548:role/LabRole


In [9]:
#Merge datasets for data cleaning and select columns used for the feature group and 
#rename columns
df_total = pd.merge(housing_data, gmaps_data, on=('longitude','latitude'))
df = df_total[['neighborhood-political','ocean_proximity','median_house_value',
               'housing_median_age','households','total_bedrooms','postal_code',
               'locality-political']]
df.rename(columns = {'neighborhood-political':'neighborhood'}, inplace = True)
print(df.head())

      neighborhood ocean_proximity  median_house_value  housing_median_age  \
0              NaN        NEAR BAY            452600.0                41.0   
1       Merriewood        NEAR BAY            358500.0                21.0   
2  Upper Rockridge        NEAR BAY            352100.0                52.0   
3        Rockridge        NEAR BAY            341300.0                52.0   
4        Rockridge        NEAR BAY            342200.0                52.0   

   households  total_bedrooms  postal_code locality-political  
0       126.0           129.0      94705.0           Berkeley  
1      1138.0          1106.0      94611.0            Oakland  
2       177.0           190.0      94618.0            Oakland  
3       219.0           235.0      94618.0            Oakland  
4       259.0           280.0      94618.0            Oakland  


In [10]:
#Identify columns with Nan values
df.isna().any()

neighborhood           True
ocean_proximity       False
median_house_value    False
housing_median_age    False
households            False
total_bedrooms         True
postal_code            True
locality-political     True
dtype: bool

In [11]:
#Fill neighborhood missing values from locality
df['neighborhood'] = df['neighborhood'].fillna(df['locality-political'])
df = df.dropna(subset = ['neighborhood']).reset_index()

In [12]:
#Encode ocean distance 
encoded_ocean = pd.get_dummies(df['ocean_proximity'])
df = pd.concat([df, encoded_ocean], axis=1)
df = df.drop(['ocean_proximity'], axis=1)
df.rename(columns = {'<1H OCEAN':'1H_OCEAN','NEAR BAY':'NEAR_BAY',
                     'NEAR OCEAN':'NEAR_OCEAN'}, inplace = True)
df.replace({False: 0, True: 1}, inplace=True)

In [13]:
#Average median house value across all records for a neighborhood
MedHouseVal = df.groupby(['neighborhood'])['median_house_value'].median()
MedHouseVal.where(MedHouseVal < 500000, 500000, inplace = True)

for i in range(0,len(df.index)):
    df['median_house_value'][i] = MedHouseVal[df['neighborhood'][i]]

In [14]:
#Average median house age across all records for a neighborhood
MedHouseAge = df.groupby(['neighborhood'])['housing_median_age'].median()
print(MedHouseAge.max())

52.0


In [15]:
dis_MedHouseAge = pd.cut(x=MedHouseAge, bins=[0, 9, 19, 29, 39, 49, 59],
                    labels=['0-9','10-19','20-29','30-39','40-49', '50-59'])

for i in range(0,len(df.index)):
    df['housing_median_age'][i] = dis_MedHouseAge[df['neighborhood'][i]]

In [16]:
#Average household value across all records for a neighborhood
AvgHouse = df.groupby(['neighborhood'])['households'].mean()
RoundAvgHouse = np.ceil(AvgHouse)

for i in range(0,len(df.index)):
    df['households'][i] = RoundAvgHouse[df['neighborhood'][i]]

In [17]:
#Add the bedrooms per household 
df['bedrooms_per_household'] = np.nan
for i in range(0,len(df.index)):
    df['bedrooms_per_household'][i] = df['total_bedrooms'][i]/df['households'][i]
    
AvgBedPerHouse = df.groupby(['postal_code'])
df['bedrooms_per_household'] = AvgBedPerHouse['bedrooms_per_household'].transform(
    lambda x: x.fillna(x.mean()))

In [18]:
#Encode locality-political value without one hot encoding to prevent large number 
# of columns
df['locality-political']=df['locality-political'].astype('category').cat.codes

In [19]:
#Create Feature Groups
from time import gmtime, strftime, sleep
from sagemaker.feature_store.feature_group import FeatureGroup

house_feature_group_name = "house-feature-group-" + strftime("%d-%H-%M-%S", gmtime())

house_feature_group = FeatureGroup(
    name=house_feature_group_name, sagemaker_session=feature_store_session
)

In [None]:
import time

current_time_sec = int(round(time.time()))


def cast_object_to_string(data_frame):
    for label in data_frame.columns:
        if data_frame.dtypes[label] == "object":
            data_frame[label] = data_frame[label].astype("str").astype("string")


# cast object dtype to string. The SageMaker FeatureStore Python SDK will then 
#map the string dtype to String feature type.
cast_object_to_string(df)

# record identifier and event time feature names
record_identifier_feature_name = "neighborhood"
event_time_feature_name = "EventTime"

# append EventTime feature
df[event_time_feature_name] = pd.Series(
    [current_time_sec] * len(df), dtype="float64"
)

# load feature definitions to the feature group
house_feature_group.load_feature_definitions(data_frame=df)
# output is suppressed

In [21]:
def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")


house_feature_group.create(
    s3_uri=f"s3://{default_s3_bucket_name}/{prefix}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True,
)

wait_for_feature_group_creation_complete(feature_group=house_feature_group)

Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup house-feature-group-29-20-00-28 successfully created.


In [None]:
print(house_feature_group.describe())
# output is suppressed

In [None]:
print(sagemaker_client.list_feature_groups())
# output is suppressed

In [None]:
#Put records into Feature Group
house_feature_group.ingest(data_frame=df, max_workers=3, wait=True)
# output is suppressed

In [25]:
featurestore_runtime.get_record(
    FeatureGroupName=house_feature_group_name,
    RecordIdentifierValueAsString='Brooktree',
)

{'ResponseMetadata': {'RequestId': '67c70fdc-5694-44ee-b540-192e94566caf',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '67c70fdc-5694-44ee-b540-192e94566caf',
   'content-type': 'application/json',
   'content-length': '1148',
   'date': 'Wed, 29 May 2024 20:02:56 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'index', 'ValueAsString': '17825'},
  {'FeatureName': 'neighborhood', 'ValueAsString': 'Brooktree'},
  {'FeatureName': 'median_house_value', 'ValueAsString': '257400.0'},
  {'FeatureName': 'housing_median_age', 'ValueAsString': '0-9'},
  {'FeatureName': 'households', 'ValueAsString': '1438.0'},
  {'FeatureName': 'postal_code', 'ValueAsString': '95131.0'},
  {'FeatureName': 'locality-political', 'ValueAsString': '787'},
  {'FeatureName': '1H_OCEAN', 'ValueAsString': '1'},
  {'FeatureName': 'INLAND', 'ValueAsString': '0'},
  {'FeatureName': 'ISLAND', 'ValueAsString': '0'},
  {'FeatureName': 'NEAR_BAY', 'ValueAsString': '0'},
  {'FeatureName': 'NEAR_OCE

In [26]:
featurestore_runtime.get_record(
    FeatureGroupName=house_feature_group_name,
    RecordIdentifierValueAsString="Fisherman's Wharf",
)

{'ResponseMetadata': {'RequestId': '33ea43dd-da5a-403f-b3d2-12f705369a69',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '33ea43dd-da5a-403f-b3d2-12f705369a69',
   'content-type': 'application/json',
   'content-length': '1226',
   'date': 'Wed, 29 May 2024 20:02:56 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'index', 'ValueAsString': '15616'},
  {'FeatureName': 'neighborhood', 'ValueAsString': "Fisherman's Wharf"},
  {'FeatureName': 'median_house_value', 'ValueAsString': '500000.0'},
  {'FeatureName': 'housing_median_age', 'ValueAsString': '50-59'},
  {'FeatureName': 'households', 'ValueAsString': '250.0'},
  {'FeatureName': 'total_bedrooms', 'ValueAsString': '317.0'},
  {'FeatureName': 'postal_code', 'ValueAsString': '94133.0'},
  {'FeatureName': 'locality-political', 'ValueAsString': '781'},
  {'FeatureName': '1H_OCEAN', 'ValueAsString': '0'},
  {'FeatureName': 'INLAND', 'ValueAsString': '0'},
  {'FeatureName': 'ISLAND', 'ValueAsString': '0'},
  {'Feat

In [27]:
featurestore_runtime.get_record(
    FeatureGroupName=house_feature_group_name,
    RecordIdentifierValueAsString='Los Osos',
)

{'ResponseMetadata': {'RequestId': '5a247d33-d8c0-45f6-acdc-87381e0ebd71',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '5a247d33-d8c0-45f6-acdc-87381e0ebd71',
   'content-type': 'application/json',
   'content-length': '1228',
   'date': 'Wed, 29 May 2024 20:02:56 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'index', 'ValueAsString': '16636'},
  {'FeatureName': 'neighborhood', 'ValueAsString': 'Los Osos'},
  {'FeatureName': 'median_house_value', 'ValueAsString': '194400.0'},
  {'FeatureName': 'housing_median_age', 'ValueAsString': '10-19'},
  {'FeatureName': 'households', 'ValueAsString': '612.0'},
  {'FeatureName': 'total_bedrooms', 'ValueAsString': '699.0'},
  {'FeatureName': 'postal_code', 'ValueAsString': '93402.0'},
  {'FeatureName': 'locality-political', 'ValueAsString': '55'},
  {'FeatureName': '1H_OCEAN', 'ValueAsString': '0'},
  {'FeatureName': 'INLAND', 'ValueAsString': '0'},
  {'FeatureName': 'ISLAND', 'ValueAsString': '0'},
  {'FeatureName': 