# ðŸ““ Notebook: Feature Store Verification & Sampling

### Imports & Setup

In [13]:
import os
import boto3
import sagemaker
import pandas as pd
from sagemaker.feature_store.feature_group import FeatureGroup
from datetime import datetime

### Set Region, Session & Feature Group Name

In [27]:
# Set region (same region where Feature Group was created)
region = os.environ.get("AWS_REGION", "us-east-1")

boto_sess = boto3.Session(region_name=region)
sm_session = sagemaker.Session(boto_session=boto_sess)
athena_output = f"s3://{sm_session.default_bucket()}/athena/feature-store/"

print("Region:", region)
print("SageMaker session initialized")

# Feature Group name (must match your pipeline)
FEATURE_GROUP_NAME = "mlops-data-preprocessing-pipeline-employee-features"

Region: us-east-1
SageMaker session initialized


### Load Feature Group & Describe Configuration

In [15]:
feature_group = FeatureGroup(
    name=FEATURE_GROUP_NAME,
    sagemaker_session=sm_session
)

fg_desc = feature_group.describe()

print("Feature Group Name:", fg_desc["FeatureGroupName"])
print("Status:", fg_desc["FeatureGroupStatus"])
print("Record Identifier:", fg_desc["RecordIdentifierFeatureName"])
print("Event Time Feature:", fg_desc["EventTimeFeatureName"])
print("Creation Time:", fg_desc["CreationTime"])

Feature Group Name: mlops-data-preprocessing-pipeline-employee-features
Status: Created
Record Identifier: employee_id
Event Time Feature: event_time
Creation Time: 2025-12-12 01:10:29.175000+00:00


### Verify Online & Offline Store Configuration

In [16]:
online_status = fg_desc.get("OnlineStoreConfig", {}).get("EnableOnlineStore", False)
offline_status = fg_desc.get("OfflineStoreConfig", {}).get("S3StorageConfig", {}).get("S3Uri")

print("Online Store Enabled:", online_status)
print("Offline Store S3 Location:", offline_status)


Online Store Enabled: True
Offline Store S3 Location: s3://mlops-data-preprocessing-pipeline-feature-store-bucket/offline-store


### Inspect Feature Definitions (Schema Validation)

In [17]:
features = fg_desc["FeatureDefinitions"]

schema_df = pd.DataFrame(features)
schema_df


Unnamed: 0,FeatureName,FeatureType
0,employee_id,String
1,event_time,String
2,age,Fractional
3,salary,Fractional
4,department,String
5,address,String
6,phone,String
7,email,String
8,address_length,Integral
9,salary_category,String


### Query Online Store (Low Latency)

In [18]:
# Pick a sample record identifier (adjust if needed)
sample_employee_id = "0"

record = feature_group.get_record(
    record_identifier_value_as_string=sample_employee_id
)

record

[{'FeatureName': 'employee_id', 'ValueAsString': '0'},
 {'FeatureName': 'event_time', 'ValueAsString': '2025-12-14T23:55:23Z'},
 {'FeatureName': 'age', 'ValueAsString': '48.0'},
 {'FeatureName': 'salary', 'ValueAsString': '60000.0'},
 {'FeatureName': 'department', 'ValueAsString': 'Marketing'},
 {'FeatureName': 'address', 'ValueAsString': 'Street 40, City 17'},
 {'FeatureName': 'phone', 'ValueAsString': '9277021151.0'},
 {'FeatureName': 'email', 'ValueAsString': 'email_937@example.com'},
 {'FeatureName': 'address_length', 'ValueAsString': '18'},
 {'FeatureName': 'salary_category', 'ValueAsString': 'medium'},
 {'FeatureName': 'age_group', 'ValueAsString': 'Senior'}]

### Convert Online Record to DataFrame

In [20]:
online_df = pd.DataFrame(record)
online_df

Unnamed: 0,FeatureName,ValueAsString
0,employee_id,0
1,event_time,2025-12-14T23:55:23Z
2,age,48.0
3,salary,60000.0
4,department,Marketing
5,address,"Street 40, City 17"
6,phone,9277021151.0
7,email,email_937@example.com
8,address_length,18
9,salary_category,medium


### Query Offline Store (Athena)

In [21]:
# Build Athena query
query = feature_group.athena_query()

query_string = f"""
SELECT *
FROM "{query.table_name}"
ORDER BY event_time DESC
LIMIT 10
"""

print(query_string)



SELECT *
FROM "mlops_data_preprocessing_pipeline_employee_features_1765501829"
ORDER BY event_time DESC
LIMIT 10



### Run Athena Query

In [28]:
query.run(
    query_string=query_string,
    output_location=athena_output
)

query.wait()

### Load Offline Store Results

In [29]:
offline_df = query.as_dataframe()
offline_df

Unnamed: 0,employee_id,event_time,age,salary,department,address,phone,email,address_length,salary_category,age_group,write_time,api_invocation_time,is_deleted
0,16700,2025-12-14T23:55:23Z,72.0,70000.0,HR,,,,3,medium,Experienced,2025-12-15 00:00:21.281,2025-12-14 23:55:26.000,False
1,33377,2025-12-14T23:55:23Z,57.0,70000.0,HR,"Street 95, City 22",6461593000.0,email_369@example.com,18,medium,Experienced,2025-12-15 00:00:21.281,2025-12-14 23:55:26.000,False
2,33405,2025-12-14T23:55:23Z,63.0,60000.0,HR,"Street 87, City 9",5814074000.0,email_204@example.com,17,medium,Experienced,2025-12-15 00:00:21.281,2025-12-14 23:55:26.000,False
3,33447,2025-12-14T23:55:23Z,47.0,60000.0,HR,"Street 54, City 23",7811807000.0,email_920@example.com,18,medium,Senior,2025-12-15 00:00:21.281,2025-12-14 23:55:27.000,False
4,16799,2025-12-14T23:55:23Z,27.0,70000.0,Unknown,"Street 54, City 35",9021193000.0,email_799@example.com,18,medium,Early Career,2025-12-15 00:00:21.281,2025-12-14 23:55:27.000,False
5,150,2025-12-14T23:55:23Z,60.0,60000.0,Unknown,"Street 8, City 5",1178840000.0,email_83@example.com,16,medium,Experienced,2025-12-15 00:00:21.281,2025-12-14 23:55:27.000,False
6,33536,2025-12-14T23:55:23Z,33.0,70000.0,IT,"Street 31, City 10",5150613000.0,email_610@example.com,18,medium,Early Career,2025-12-15 00:00:21.281,2025-12-14 23:55:28.000,False
7,210,2025-12-14T23:55:23Z,20.0,60000.0,HR,"Street 40, City 2",6569666000.0,email_522@example.com,17,medium,Young,2025-12-15 00:00:21.281,2025-12-14 23:55:28.000,False
8,16670,2025-12-14T23:55:23Z,27.0,60000.0,Marketing,,,,3,medium,Early Career,2025-12-15 00:00:21.281,2025-12-14 23:55:25.000,False
9,26,2025-12-14T23:55:23Z,22.0,60000.0,Marketing,"Street 2, City 26",8437614000.0,email_582@example.com,17,medium,Young,2025-12-15 00:00:21.281,2025-12-14 23:55:26.000,False


### Quick Data Quality Checks

In [30]:
print("Rows:", len(offline_df))
print("Columns:", offline_df.columns.tolist())

offline_df.isnull().sum()


Rows: 10
Columns: ['employee_id', 'event_time', 'age', 'salary', 'department', 'address', 'phone', 'email', 'address_length', 'salary_category', 'age_group', 'write_time', 'api_invocation_time', 'is_deleted']


employee_id            0
event_time             0
age                    0
salary                 0
department             0
address                2
phone                  2
email                  2
address_length         0
salary_category        0
age_group              0
write_time             0
api_invocation_time    0
is_deleted             0
dtype: int64

### Compare Online vs Offline (Optional)

In [31]:
comparison_cols = [
    "employee_id",
    "age",
    "salary",
    "department",
    "salary_category",
    "age_group"
]

offline_df[comparison_cols].head()


Unnamed: 0,employee_id,age,salary,department,salary_category,age_group
0,16700,72.0,70000.0,HR,medium,Experienced
1,33377,57.0,70000.0,HR,medium,Experienced
2,33405,63.0,60000.0,HR,medium,Experienced
3,33447,47.0,60000.0,HR,medium,Senior
4,16799,27.0,70000.0,Unknown,medium,Early Career
