<!-- instructions -->
The housing data set contains information about houses and their values, and the Google Maps raw data set contains information about addresses and their designations. Imagine we are building an ML tool to predict housing prices. To aid with prediction, we want to create a Neighborhood feature group. We can envision this neighborhood feature group helping us predict house prices by giving us a bucket to group new houses into.

In [2]:
import boto3
import sagemaker
import pandas as pd
import time
import datetime
import numpy as np
from time import gmtime, strftime, sleep
from sagemaker.feature_store.feature_group import FeatureGroup

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


#### Auth with AWS

In [3]:
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")
boto_session = boto3.Session(region_name=region)

In [4]:
sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=region
)

### Get Data

In [5]:
# jupyter path
gmaps_houses_df = pd.read_csv('~/aai-540-homework/homework-3-1/housing_gmaps_data_raw.csv')
housing_df = pd.read_csv('~/aai-540-homework/homework-3-1/housing.csv')

# locally 
# gmaps_houses_df = pd.read_csv('/Users/Steve/dev/aiMasters/aai-540-homework/homework-3-1/housing_gmaps_data_raw.csv')
# housing_df = pd.read_csv('/Users/Steve/dev/aiMasters/aai-540-homework/homework-3-1/housing.csv')

display(gmaps_houses_df.head())
display(housing_df.head())

Unnamed: 0,street_number,route,locality-political,administrative_area_level_2-political,administrative_area_level_1-political,country-political,postal_code,address,longitude,latitude,...,establishment-natural_feature,airport-establishment-point_of_interest,political-sublocality-sublocality_level_1,administrative_area_level_3-political,post_box,establishment-light_rail_station-point_of_interest-transit_station,establishment-point_of_interest,aquarium-establishment-park-point_of_interest-tourist_attraction-zoo,campground-establishment-lodging-park-point_of_interest-rv_park-tourist_attraction,cemetery-establishment-park-point_of_interest
0,3130,Grizzly Peak Boulevard,Berkeley,Alameda County,California,United States,94705.0,"3130 Grizzly Peak Blvd, Berkeley, CA 94705, USA",-122.23,37.88,...,,,,,,,,,,
1,2005,Tunnel Road,Oakland,Alameda County,California,United States,94611.0,"2005 Tunnel Rd, Oakland, CA 94611, USA",-122.22,37.86,...,,,,,,,,,,
2,6886,Chabot Road,Oakland,Alameda County,California,United States,94618.0,"6886 Chabot Rd, Oakland, CA 94618, USA",-122.24,37.85,...,,,,,,,,,,
3,6365,Florio Street,Oakland,Alameda County,California,United States,94618.0,"6365 Florio St, Oakland, CA 94618, USA",-122.25,37.85,...,,,,,,,,,,
4,5407,Bryant Avenue,Oakland,Alameda County,California,United States,94618.0,"5407 Bryant Ave, Oakland, CA 94618, USA",-122.25,37.84,...,,,,,,,,,,


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [25]:
df = pd.merge(housing_df, gmaps_houses_df, on=['longitude', 'latitude'], how='inner')

df['neighborhood'] = df['neighborhood-political']
df = df.dropna(subset=['neighborhood'])
df = df.drop(columns=['neighborhood-political'])
df['event_time'] = datetime.datetime.now()
df['ocean_proximity'] = df['ocean_proximity'].str.replace(' ', '_')


ocean_proximity_dummies = pd.get_dummies(df['ocean_proximity'], dtype=int)
df = pd.concat([df, ocean_proximity_dummies], axis=1)

df['median_house_value'] = df.groupby('neighborhood')['median_house_value'].transform('mean')
df['median_house_value'] = df['median_house_value'].clip(upper=500000)

# Average 'median_house_age', into 10-year bins
df['median_house_age'] = df.groupby('neighborhood')['housing_median_age'].transform('mean')
df['median_house_age'] = pd.cut(df['median_house_age'], bins=np.arange(0, 101, 10), right=False, labels=[f"{i}-{i+9}" for i in range(0, 100, 10)]).astype("string")

# Total households (average per neighborhood, rounded up)
df['total_households'] = df.groupby('neighborhood')['households'].transform('mean').apply(np.ceil).astype(int)
# Bedrooms per household (average and impute missing values)
df['total_bedrooms'] = df.groupby('neighborhood')['total_bedrooms'].transform(
    lambda x: x.fillna(x.mean()) if x.mean() > 0 else x.fillna(0)
)

df['bedrooms_per_household'] = df['total_bedrooms'] / df['households']

# Replacing locaility code with postal code - unsure from directions 
df['locality_code'] = df['postal_code']

# Selecting the final columns
final_cols = ['neighborhood', 'event_time', '<1H_OCEAN', 'INLAND', 'NEAR_BAY', 'NEAR_OCEAN',
               'median_house_value', 'median_house_age', 'total_households', 'bedrooms_per_household', 'locality_code']

# Generate the final dataframe
final_df = df[final_cols]

In [None]:
final_df.rename(columns={"<1H_OCEAN": "LESS_THAN_1H_OCEAN"})
display(final_df.head())
print(final_df.shape)
final_df = final_df.dropna()
print(final_df.isna().sum())
print(final_df.shape)

### Define Feature Group

In [29]:
neighborhood_feature_group_name = "neighborhood-feature-group-" + strftime("%d-%H-%M-%S", gmtime())

neighborhood_feature_group = FeatureGroup(
    name=neighborhood_feature_group_name, sagemaker_session=sess
)

def cast_object_to_string(data_frame):
    for label in data_frame.columns:
        if pd.api.types.is_object_dtype(data_frame[label]):
            data_frame[label] = data_frame[label].astype("string")
    return data_frame

final_df = cast_object_to_string(final_df)

In [30]:
# record identifier and event time feature names
record_identifier_feature_name = "neighborhood"
event_time_feature_name = "event_time"
# final_df.name = 'final_df'

current_time_sec = int(round(time.time()))

# append EventTime feature
final_df[event_time_feature_name] = pd.Series(
    [current_time_sec] * len(final_df), dtype="float64"
)

# Load feature definitions into the feature group
neighborhood_feature_group.load_feature_definitions(data_frame=final_df)

[FeatureDefinition(feature_name='neighborhood', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='event_time', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='<1H_OCEAN', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='INLAND', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='NEAR_BAY', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='NEAR_OCEAN', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='median_house_value', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='median_house_age', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinit

#### Create FeatureGroups in SageMaker FeatureStore

In [31]:
def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")

s3_private_data_path = "s3://{}/feature_groups/".format(bucket)

neighborhood_feature_group.create(
    s3_uri= s3_private_data_path,
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True,
)

wait_for_feature_group_creation_complete(feature_group=neighborhood_feature_group)

ClientError: An error occurred (ValidationException) when calling the CreateFeatureGroup operation: 1 validation error detected: Value '<1H_OCEAN' at 'featureDefinitions.3.member.featureName' failed to satisfy constraint: Member must satisfy regular expression pattern: ^[a-zA-Z0-9]([-_]*[a-zA-Z0-9]){0,63}

In [None]:
# Validate feature group
neighborhood_feature_group.describe()
sagemaker_client.list_feature_groups()

In [None]:
# Put Records into feature group 
neighborhood_feature_group.ingest(data_frame=final_df, max_workers=5, wait=True)

### 3 Homework Queries

In [None]:
# retreive a record from the online store
# Brooktree, Fisherman’s Wharf, Los Osos

# change this 
record_identifier_value = 'Brooktree'

featurestore_runtime.get_record(
    FeatureGroupName=neighborhood_feature_group_name,
    RecordIdentifierValueAsString=record_identifier_value,
)

In [None]:
transaction_id = str(3450774)


# Helper to parse the feature value from the record.
def get_feature_value(record, feature_name):
    return str(list(filter(lambda r: r["FeatureName"] == feature_name, record))[0]["ValueAsString"])


transaction_response = featurestore_runtime.get_record(
    FeatureGroupName=neighborhood_feature_group_name, RecordIdentifierValueAsString=transaction_id
)
transaction_record = transaction_response["Record"]


get_feature_value(transaction_record, "TransactionDT")

In [None]:
# Clean up feature group
neighborhood_feature_group.delete()


----------------

In [11]:
!aws s3 cp "dataset_clean.csv" $s3_private_data_path/

upload: ./dataset_clean.csv to s3://sagemaker-us-east-1-106006112223/w2-musicData/csv/dataset_clean.csv


In [12]:
!aws s3 ls $s3_private_data_path/

2024-09-17 05:56:57   16931936 dataset_clean.csv
