In [7]:
# Cell 1: Install and Import
!pip install -U sagemaker pandas boto3

import pandas as pd
import numpy as np
import time
import boto3
import sagemaker
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.feature_store.feature_definition import FeatureDefinition, FeatureTypeEnum

# Set up SageMaker session
sagemaker_session = sagemaker.Session()
region = 'us-east-1'  # Your region
bucket = 'custom-sagemaker-bucket-s3-feature-engineering123'  # Your S3 bucket
role = 'arn:aws:iam::696623242873:role/service-role/AmazonSageMaker-ExecutionRole-20250818T084167'  # Replace YOUR_ACCOUNT_ID with your AWS account ID
prefix = 'feature-store-demo'

# Note: Replace YOUR_ACCOUNT_ID above with your actual AWS account ID (find it in AWS Console > IAM).



In [2]:
# Cell 2: Load Data
s3_path = f's3://custom-sagemaker-bucket-s3-feature-engineering123/Mall_Customers.csv'
df = pd.read_csv(s3_path)

# Quick inspection
print(df.head())
print(df.info())

   CustomerID   Genre  Age Annual_Income  Spending_Score
0           1    Male   19     EUR 15.00              39
1           2    Male   21     EUR 15.00              81
2           3  Female   20     EUR 16.00               6
3           4  Female   23     EUR 16.00              77
4           5  Female   31     EUR 17.00              40
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   CustomerID      200 non-null    int64 
 1   Genre           200 non-null    object
 2   Age             200 non-null    int64 
 3   Annual_Income   200 non-null    object
 4   Spending_Score  200 non-null    int64 
dtypes: int64(3), object(2)
memory usage: 7.9+ KB
None


In [3]:
# Cell 3: Feature Engineering
# Clean Annual_Income
df['Annual_Income'] = df['Annual_Income'].str.replace('EUR ', '').str.replace('.00', '').astype(float)

# Rename Genre to Gender
df.rename(columns={'Genre': 'Gender'}, inplace=True)

# New features
df['Age_Group'] = pd.cut(df['Age'], bins=[0, 25, 45, 100], labels=['Young', 'Adult', 'Senior'])
df['Income_Spending_Ratio'] = df['Annual_Income'] / df['Spending_Score'].replace(0, np.nan)  # Avoid div by zero
df['High_Spender'] = (df['Spending_Score'] > 50).astype(int)

# Add required columns for Feature Store
df['RecordId'] = df['CustomerID'].astype(str)  # Unique record identifier
df['EventTime'] = time.time()  # Unix timestamp for ingestion

# Convert categoricals to string
df['Age_Group'] = df['Age_Group'].astype(str)
df['Gender'] = df['Gender'].astype(str)

# Drop CustomerID if not needed as feature
df.drop(columns=['CustomerID'], inplace=True)

# Inspection
print(df.head())
print(df.dtypes)

   Gender  Age  Annual_Income  Spending_Score Age_Group  \
0    Male   19           15.0              39     Young   
1    Male   21           15.0              81     Young   
2  Female   20           16.0               6     Young   
3  Female   23           16.0              77     Young   
4  Female   31           17.0              40     Adult   

   Income_Spending_Ratio  High_Spender RecordId     EventTime  
0               0.384615             0        1  1.755532e+09  
1               0.185185             1        2  1.755532e+09  
2               2.666667             0        3  1.755532e+09  
3               0.207792             1        4  1.755532e+09  
4               0.425000             0        5  1.755532e+09  
Gender                    object
Age                        int64
Annual_Income            float64
Spending_Score             int64
Age_Group                 object
Income_Spending_Ratio    float64
High_Spender               int64
RecordId                  obje

In [8]:
# Cell 4: Define Feature Group
feature_group_name = 'mall-customers-features'

# Define feature definitions based on DataFrame
feature_definitions = [
    FeatureDefinition('RecordId', FeatureTypeEnum.STRING),
    FeatureDefinition('Gender', FeatureTypeEnum.STRING),
    FeatureDefinition('Age', FeatureTypeEnum.INTEGRAL),
    FeatureDefinition('Annual_Income', FeatureTypeEnum.FRACTIONAL),
    FeatureDefinition('Spending_Score', FeatureTypeEnum.INTEGRAL),
    FeatureDefinition('Age_Group', FeatureTypeEnum.STRING),
    FeatureDefinition('Income_Spending_Ratio', FeatureTypeEnum.FRACTIONAL),
    FeatureDefinition('High_Spender', FeatureTypeEnum.INTEGRAL),
    FeatureDefinition('EventTime', FeatureTypeEnum.FRACTIONAL)
]

# Create Feature Group
feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=sagemaker_session,
    feature_definitions=feature_definitions
)

# Create the group with offline and online stores enabled
feature_group.create(
    s3_uri=f's3://custom-sagemaker-bucket-s3-feature-engineering123/features',
    record_identifier_name='RecordId',
    event_time_feature_name='EventTime',
    role_arn=role,
    enable_online_store=True  # Enables online store
)

# Wait for creation (poll status)
status = feature_group.describe()['FeatureGroupStatus']
while status == 'Creating':
    print('Waiting for Feature Group Creation...')
    time.sleep(5)
    status = feature_group.describe()['FeatureGroupStatus']
print(f'Feature Group {feature_group_name} created successfully!')

Waiting for Feature Group Creation...
Waiting for Feature Group Creation...
Waiting for Feature Group Creation...
Waiting for Feature Group Creation...
Waiting for Feature Group Creation...
Waiting for Feature Group Creation...
Feature Group mall-customers-features created successfully!


In [9]:
# Cell 5: Ingest Data
feature_group.ingest(data_frame=df, max_workers=3, wait=True)
print('Data ingested successfully!')

Data ingested successfully!


In [10]:
# Cell 6: Query Online Store
runtime_client = boto3.client('sagemaker-featurestore-runtime', region_name=region)

# Get a single record
response = runtime_client.get_record(
    FeatureGroupName=feature_group_name,
    RecordIdentifierValueAsString='1'  # Example RecordId (from original CustomerID=1)
)
print(response)# Cell 6: Query Online Store
runtime_client = boto3.client('sagemaker-featurestore-runtime', region_name=region)

# Get a single record
response = runtime_client.get_record(
    FeatureGroupName=feature_group_name,
    RecordIdentifierValueAsString='1'  # Example RecordId (from original CustomerID=1)
)
print(response)

{'ResponseMetadata': {'RequestId': '5597ba47-00c0-414b-a9a7-0c26e574f61b', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '5597ba47-00c0-414b-a9a7-0c26e574f61b', 'content-type': 'application/json', 'content-length': '746', 'date': 'Mon, 18 Aug 2025 16:00:41 GMT'}, 'RetryAttempts': 0}, 'Record': [{'FeatureName': 'RecordId', 'ValueAsString': '1'}, {'FeatureName': 'Gender', 'ValueAsString': 'Male'}, {'FeatureName': 'Age', 'ValueAsString': '19'}, {'FeatureName': 'Annual_Income', 'ValueAsString': '15.0'}, {'FeatureName': 'Spending_Score', 'ValueAsString': '39'}, {'FeatureName': 'Age_Group', 'ValueAsString': 'Young'}, {'FeatureName': 'Income_Spending_Ratio', 'ValueAsString': '0.38461538461538464'}, {'FeatureName': 'High_Spender', 'ValueAsString': '0'}, {'FeatureName': 'EventTime', 'ValueAsString': '1755531902.587493'}]}
{'ResponseMetadata': {'RequestId': '60fc8259-5a69-49d6-b28d-de78ec546dca', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '60fc8259-5a69-49d6-b28d-de7

In [12]:
# Cell 7: Query Athena from Notebook (Optional)
athena_query = feature_group.athena_query()
table_name = athena_query.table_name

# Run query
athena_query.run(query_string=f'SELECT * FROM "{table_name}" LIMIT 5', output_location=f's3://{bucket}/query_results/')
athena_query.wait()
result_df = athena_query.as_dataframe()
print(result_df)

Empty DataFrame
Columns: [recordid, gender, age, annual_income, spending_score, age_group, income_spending_ratio, high_spender, eventtime, write_time, api_invocation_time, is_deleted]
Index: []


In [14]:
# Cell 8: Cleanup
feature_group.delete()