# 📓 Notebook: Feature Store Verification & Sampling

### Imports & Setup

In [None]:
import os
import boto3
import sagemaker
import pandas as pd
from sagemaker.feature_store.feature_group import FeatureGroup
from datetime import datetime

### Set Region, Session & Feature Group Name

In [None]:
# Resolve the region the Feature Group was created in.
# AWS_REGION wins; otherwise fall back to AWS_DEFAULT_REGION (the variable
# boto3 itself reads) so an already-configured environment is not silently
# overridden by the hard-coded default. Empty values are treated as unset.
region = (
    os.environ.get("AWS_REGION")
    or os.environ.get("AWS_DEFAULT_REGION")
    or "us-east-1"
)

# Pin both the boto3 and the SageMaker session to that region so every later
# Feature Store call in this notebook targets the same place.
boto_sess = boto3.Session(region_name=region)
sm_session = sagemaker.Session(boto_session=boto_sess)

print("Region:", region)
print("SageMaker session initialized")

# Feature Group name (must match your pipeline)
FEATURE_GROUP_NAME = "mlops-data-preprocessing-pipeline-employee-features"

### Load Feature Group & Describe Configuration

In [None]:
# Bind a FeatureGroup handle to the configured name and session, then fetch
# its description once; later cells reuse `fg_desc`.
feature_group = FeatureGroup(name=FEATURE_GROUP_NAME, sagemaker_session=sm_session)
fg_desc = feature_group.describe()

# Print the headline fields of the description, one "label value" line each.
for _label, _key in (
    ("Feature Group Name:", "FeatureGroupName"),
    ("Status:", "FeatureGroupStatus"),
    ("Record Identifier:", "RecordIdentifierFeatureName"),
    ("Event Time Feature:", "EventTimeFeatureName"),
    ("Creation Time:", "CreationTime"),
):
    print(_label, fg_desc[_key])

Customer Feature Group Name: customers-feature-group
Orders Feature Group Name: orders-feature-group


### Verify Online & Offline Store Configuration

In [None]:
# Online store: reported as False when the config section or flag is absent.
_online_cfg = fg_desc.get("OnlineStoreConfig", {})
online_status = _online_cfg.get("EnableOnlineStore", False)

# Offline store: the configured S3 URI, or None when any level is missing.
_offline_cfg = fg_desc.get("OfflineStoreConfig", {})
offline_status = _offline_cfg.get("S3StorageConfig", {}).get("S3Uri")

print("Online Store Enabled:", online_status)
print("Offline Store S3 Location:", offline_status)


### Inspect Feature Definitions (Schema Validation)

In [None]:
# Tabulate the feature definitions from the describe() response so the
# schema can be checked at a glance.
features = fg_desc["FeatureDefinitions"]
schema_df = pd.DataFrame(data=features)

# Last expression of the cell, so the notebook renders the table.
schema_df


{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:590183953604:feature-group/customers-feature-group',
 'FeatureGroupName': 'customers-feature-group',
 'RecordIdentifierFeatureName': 'customer_id',
 'EventTimeFeatureName': 'event_time',
 'FeatureDefinitions': [{'FeatureName': 'customer_id',
   'FeatureType': 'String'},
  {'FeatureName': 'sex', 'FeatureType': 'Integral'},
  {'FeatureName': 'is_married', 'FeatureType': 'Integral'},
  {'FeatureName': 'event_time', 'FeatureType': 'String'},
  {'FeatureName': 'age_18-29', 'FeatureType': 'Integral'},
  {'FeatureName': 'age_30-39', 'FeatureType': 'Integral'},
  {'FeatureName': 'age_40-49', 'FeatureType': 'Integral'},
  {'FeatureName': 'age_50-59', 'FeatureType': 'Integral'},
  {'FeatureName': 'age_60-69', 'FeatureType': 'Integral'},
  {'FeatureName': 'age_70-plus', 'FeatureType': 'Integral'},
  {'FeatureName': 'n_days_active', 'FeatureType': 'Fractional'}],
 'CreationTime': datetime.datetime(2025, 8, 9, 20, 1, 55, tzinfo=tzlocal()),
 'OnlineSt

### Query Online Store (Low Latency)

In [None]:
# Identifier of the record to look up in the online store, passed as a string
# (adjust if needed).
sample_employee_id = "0"

record = feature_group.get_record(record_identifier_value_as_string=sample_employee_id)

# Show the raw lookup result.
record

### Convert Online Record to DataFrame

In [None]:
# Normalise the lookup result before building the frame. It can arrive as:
#   * a raw response dict, whose feature list sits under "Record" (the key is
#     absent when the identifier was not found), or
#   * the feature list itself / None, which is what FeatureGroup.get_record()
#     hands back.
# Indexing record["Record"] unconditionally fails whenever the result is not
# a dict that contains that key.
_record_rows = record.get("Record") if isinstance(record, dict) else record

# An empty frame (rather than an exception) signals "no online record found".
online_df = pd.DataFrame(_record_rows or [])
online_df

{'ResponseMetadata': {'RequestId': 'e63bf0bf-9e37-4579-b2e0-2205a26e792b',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e63bf0bf-9e37-4579-b2e0-2205a26e792b',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Sat, 09 Aug 2025 20:03:59 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}

### Query Offline Store (Athena)

In [None]:
# Athena query handle for this Feature Group; it carries the table name
# used in the SQL below.
query = feature_group.athena_query()
_offline_table = query.table_name

# Ten rows, newest event_time first.
query_string = f"""
SELECT *
FROM "{_offline_table}"
ORDER BY event_time DESC
LIMIT 10
"""

print(query_string)


{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:590183953604:feature-group/customers-feature-group',
 'FeatureGroupName': 'customers-feature-group',
 'FeatureName': 'customer_id',
 'FeatureType': 'String',
 'CreationTime': datetime.datetime(2025, 8, 9, 20, 1, 55, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2025, 8, 9, 20, 3, 59, 743000, tzinfo=tzlocal()),
 'Description': 'The ID of the customer, it is also part of Order feature group',
 'Parameters': [{'Key': 'idType', 'Value': 'primarykey'}],
 'ResponseMetadata': {'RequestId': '089f7892-a3cc-47a3-89da-e8628c01bace',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '089f7892-a3cc-47a3-89da-e8628c01bace',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '393',
   'date': 'Sat, 09 Aug 2025 20:03:59 GMT'},
  'RetryAttempts': 0}}

### Run Athena Query

In [None]:
# AthenaQuery exposes no `output_location` attribute, so reading
# `query.output_location` raises AttributeError before the query even starts.
# Build the S3 results location explicitly from the session's default bucket.
# NOTE(review): default_bucket() may create the bucket if it does not exist
# yet — point this at a dedicated results bucket if that is not wanted.
athena_output_location = (
    f"s3://{sm_session.default_bucket()}/feature-store/athena-query-results/"
)

# Start the query; results are written under the location above.
query.run(query_string=query_string, output_location=athena_output_location)

# Wait for the query to finish before the next cell reads its results.
query.wait()

{'Results': [{'FeatureMetadata': {'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:590183953604:feature-group/customers-fg-09-19-26-55',
    'FeatureGroupName': 'customers-fg-09-19-26-55',
    'FeatureName': 'customer_id',
    'FeatureType': 'String',
    'CreationTime': datetime.datetime(2025, 8, 9, 19, 26, 55, tzinfo=tzlocal()),
    'LastModifiedTime': datetime.datetime(2025, 8, 9, 19, 58, 2, tzinfo=tzlocal()),
    'Description': 'The ID of the customer, it is also part of Order feature group',
    'Parameters': [{'Key': 'idType', 'Value': 'primarykey'}]}}],
 'ResponseMetadata': {'RequestId': '84d6739c-8933-4023-a8f5-1b3b03c340fc',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '84d6739c-8933-4023-a8f5-1b3b03c340fc',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '507',
   'date': 'Sat, 09 Aug 2025 20:03:59 GMT'},
  'RetryAttempts': 0}}

### Load Offline Store Results

In [None]:
# Load the result of the Athena query run above into `offline_df`; the
# data-quality and comparison cells below read this name.
offline_df = query.as_dataframe()
# Last expression of the cell, so the notebook renders the frame.
offline_df

IngestionManagerPandas(feature_group_name='customers-feature-group', feature_definitions={'customer_id': {'FeatureName': 'customer_id', 'FeatureType': 'String'}, 'sex': {'FeatureName': 'sex', 'FeatureType': 'Integral'}, 'is_married': {'FeatureName': 'is_married', 'FeatureType': 'Integral'}, 'event_time': {'FeatureName': 'event_time', 'FeatureType': 'String'}, 'age_18-29': {'FeatureName': 'age_18-29', 'FeatureType': 'Integral'}, 'age_30-39': {'FeatureName': 'age_30-39', 'FeatureType': 'Integral'}, 'age_40-49': {'FeatureName': 'age_40-49', 'FeatureType': 'Integral'}, 'age_50-59': {'FeatureName': 'age_50-59', 'FeatureType': 'Integral'}, 'age_60-69': {'FeatureName': 'age_60-69', 'FeatureType': 'Integral'}, 'age_70-plus': {'FeatureName': 'age_70-plus', 'FeatureType': 'Integral'}, 'n_days_active': {'FeatureName': 'n_days_active', 'FeatureType': 'Fractional'}}, sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7f833788dd90>, sagemaker_session=<sagemaker.session.Session ob

### Quick Data Quality Checks

In [None]:
# Size of the offline sample and the columns it carries.
print("Rows:", offline_df.shape[0])
print("Columns:", offline_df.columns.tolist())

# Missing-value count per column (rendered as the cell output).
offline_df.isna().sum()


### Compare Online vs Offline (Optional)

In [None]:
# Columns to inspect when comparing the offline sample with the online record.
comparison_cols = [
    "employee_id", "age", "salary",
    "department", "salary_category", "age_group",
]

# First rows restricted to those columns; a missing column raises KeyError.
offline_df.loc[:, comparison_cols].head()
