In [3]:
import sagemaker
from sagemaker.feature_store.feature_group import FeatureGroup
import boto3
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
import time
from botocore.exceptions import ClientError

# --- SageMaker session, execution role and storage location -------------------
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
role = sagemaker.get_execution_role()
s3_bucket_name = sagemaker_session.default_bucket()  # replace with a custom bucket name if preferred
prefix = 'pisa-feature-store'  # S3 key prefix used when building the feature store URI

# --- Low-level boto3 clients, pinned to the same region as the session --------
sagemaker_client = boto3.client('sagemaker', region_name=region)
featurestore_runtime = boto3.client('sagemaker-featurestore-runtime', region_name=region)

# Echo the resolved configuration so it is visible in the notebook output
print(f"SageMaker Role: {role}\nSageMaker S3 Bucket: {s3_bucket_name}")

SageMaker Role: arn:aws:iam::767397858887:role/LabRole
SageMaker S3 Bucket: sagemaker-us-east-1-767397858887


Design Feature Groups

Features can be logically separated into groups based on their source and how frequently they might be updated.
This makes the system more modular. For example, demographic data is static, while academic performance might be updated annually.

I'll use `student_id` (the PISA student identifier `CNTSTUID`, cast to a string) as the `record_identifier_name`, and add an `event_time` timestamp to each record.

# Feature Engineering & Ingestion

In [None]:
## Load & Pre-process Data

In [4]:
# Load the US student-level extract from S3.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): added because this line emitted a warning when run — confirm it
# was the mixed-dtype (DtypeWarning) case.
df = pd.read_csv('s3://aai540group5/us_df.csv', low_memory=False)

# Use CNTSTUID as student ID (stored as a string)
df['student_id'] = df['CNTSTUID'].astype(str)

# Add an event time column: one ingestion timestamp (Unix epoch seconds) shared by all rows.
# Assigning a scalar broadcasts it to every row whatever df's index is; building a
# separate Series instead relies on index alignment and leaves NaN on any mismatch.
current_time_sec = int(round(time.time()))
df['event_time'] = float(current_time_sec)

  df = pd.read_csv('s3://aai540group5/us_df.csv')


## Feature Engineering: Academic Performance - use PCA to deal with multicollinearity

In [5]:
# Avg PV scores for each subject (as was done in EDA)
pv_subjects = ['MATH', 'READ', 'SCIE', 'SCEP', 'SCED', 'SCID', 'SKCO', 'SKPE', 'SSPH', 'SSLI', 'SSES']
PV_AVG_PREFIX = 'PV_AVG_'
pv_avg_cols = []
for subject in pv_subjects:
    # Raw plausible-value columns for this subject. The averages written by this
    # cell (PV_AVG_<subject>) also start with 'PV' and contain the subject code,
    # so they are excluded explicitly; otherwise re-running the cell would feed
    # each average back into its own calculation.
    pv_cols = [
        col for col in df.columns
        if col.startswith('PV') and not col.startswith(PV_AVG_PREFIX) and subject in col
    ]
    avg_col_name = f'{PV_AVG_PREFIX}{subject}'
    if pv_cols:
        df[avg_col_name] = df[pv_cols].mean(axis=1)
        pv_avg_cols.append(avg_col_name)

# Fail early with a clear message instead of an opaque scaler/PCA error
if not pv_avg_cols:
    raise ValueError("No plausible-value (PV*) columns found in df; cannot build academic_performance_index")

# Handle multicollinearity with PCA to get a composite 'academic_performance_index'
# Only rows with every subject average present are used to fit the scaler and PCA
pca_input_df = df[pv_avg_cols].dropna()
scaler = StandardScaler()
scaled_features = scaler.fit_transform(pca_input_df)
pca = PCA(n_components=1)
principal_components = pca.fit_transform(scaled_features)
# Add new PCA feature back into main DF. Re-using pca_input_df's index aligns the
# scores with their original rows; rows dropped above get NaN for the index.
# NOTE(review): the sign of a principal component is arbitrary — inspect
# pca.components_ before reading "higher" as "better performance".
df['academic_performance_index'] = pd.Series(principal_components.flatten(), index=pca_input_df.index)

## Create & Ingest Feature Groups

Feature Group 1: Student Demographics & Socioeconomic Status (SES):
This feature group stores static data about each student (age, grade, and gender) together with socioeconomic background indicators. This information rarely changes, so separating it into its own group is an efficient design.

In [15]:
def _wait_for_feature_group_created(fg, poll_interval_sec=5, timeout_sec=900):
    """Poll fg.describe() until the feature group reports status 'Created'.

    Raises:
        RuntimeError: the group reports any status other than 'Creating' or 'Created'.
        TimeoutError: the group is still 'Creating' after timeout_sec seconds.
    """
    deadline = time.monotonic() + timeout_sec
    while True:
        description = fg.describe()
        status = description.get('FeatureGroupStatus')
        if status == 'Created':
            return
        if status != 'Creating':
            reason = description.get('FailureReason', 'no failure reason reported')
            raise RuntimeError(f"Feature group {fg.name} is in status {status!r}: {reason}")
        if time.monotonic() >= deadline:
            raise TimeoutError(f"Feature group {fg.name} still 'Creating' after {timeout_sec} seconds")
        time.sleep(poll_interval_sec)


def create_and_ingest_fg(fg_name, fg_dataframe, sagemaker_session):
    """
    Create a SageMaker Feature Group (if it does not already exist) and ingest data.

    Uses module-level s3_bucket_name, prefix, and role variables

    Args:
        fg_name: Name of the feature group to create or reuse.
        fg_dataframe: pandas DataFrame to ingest. It must contain the
            'student_id' and 'event_time' columns, which are registered as the
            record identifier and event-time feature.
        sagemaker_session: SageMaker session used to build the FeatureGroup object.

    Raises:
        botocore.exceptions.ClientError: any creation error other than 'ResourceInUse'.
        RuntimeError / TimeoutError: the group never reaches the 'Created' status.
    """
    # Instantiate the SageMaker (SM) FeatureGroup (FG) object
    fg = FeatureGroup(name=fg_name, sagemaker_session=sagemaker_session)

    # Load schema definitions - infer feature names & data types from the pandas DF
    fg.load_feature_definitions(data_frame=fg_dataframe)

    try:
        print(f"Creating feature group: {fg_name}...")

        # Create FG in the SageMaker Feature Store backend
        fg.create(
            s3_uri=f"s3://{s3_bucket_name}/{prefix}",    # S3 path for the offline store
            record_identifier_name="student_id",         # Unique identifier for each record
            event_time_feature_name="event_time",        # Tracks timestamp of the feature data
            role_arn=role,                               # IAM role with necessary permissions
            enable_online_store=True                     # Enable the online store
        )
        print("Waiting for Feature Group to become active...")

    except ClientError as e:
        # If the FG already exists, the 'ResourceInUse' error is thrown
        if e.response['Error']['Code'] == 'ResourceInUse':
            print(f"{fg_name} already exists")
        else:
            # Raise any other errors
            raise

    # Confirm the group is usable before ingesting. This runs for pre-existing
    # groups too, since one may still be 'Creating' or be in a failed state.
    # Status is polled through describe() rather than a boto3 waiter.
    _wait_for_feature_group_created(fg)

    # Ingest data from DataFrame into Feature Group
    print(f"Ingesting {len(fg_dataframe):,} records...")
    fg.ingest(
        data_frame=fg_dataframe,
        max_workers=3,    # Number of parallel workers for ingestion
        wait=True         # Wait for ingestion to complete before returning
    )
    print("Ingestion complete\n")

Demographics & Socioeconomic Status Feature Group

In [7]:
# Columns that make up the demographics / SES feature group.
# 'student_id' is the unique record key; 'event_time' is the record timestamp.
demo_ses_columns = [
    'student_id', 'event_time',
    'AGE', 'GRADE', 'ST004D01T',   # demographic features
    'ESCS', 'HOMEPOS', 'WEALTH',   # socioeconomic status (SES) indicators
]

# Keep only rows where every selected column is present
demo_ses_df = df.loc[:, demo_ses_columns].dropna()

create_and_ingest_fg(
    fg_name='student-demographics-ses-fg',
    fg_dataframe=demo_ses_df,
    sagemaker_session=sagemaker_session,
)

Creating feature group: student-demographics-ses-fg...
student-demographics-ses-fg already exists
Ingesting 32,293 records...
Ingestion complete



Academic Performance Feature Group

In [None]:
# Feature group holding the record key, timestamp and the computed academic performance index
performance_columns = ['student_id', 'event_time', 'academic_performance_index']

# Rows with a missing value in any of these columns are left out
performance_df = df.loc[:, performance_columns].dropna()

create_and_ingest_fg(
    fg_name='student-performance-fg',
    fg_dataframe=performance_df,
    sagemaker_session=sagemaker_session,
)

Ingestion complete



Student Wellbeing Feature Group

In [None]:
# Record key and timestamp required by every feature group
wellbeing_key_columns = ['student_id', 'event_time']

# Wellbeing indices chosen during the initial EDA for their predictive power
wellbeing_index_columns = [
    'BELONG',           # sense of belonging at school
    'unfairteacher',    # perceived unfairness from teachers
    'SCIEEFF',          # science self-efficacy
    'DISCLISCI',        # disciplinary climate in science classes
    'MOTIVAT',          # achievement motivation
    'PARED',            # NOTE(review): originally labelled "parental emotional support"; in the PISA
                        # codebook PARED appears to be parental education in years — confirm
    'TEACHSUP',         # teacher support
    'EMOSUPS',          # emotional support
]

wellbeing_features = wellbeing_key_columns + wellbeing_index_columns

# Restrict to the selected columns and keep complete rows only
wellbeing_df = df.loc[:, wellbeing_features].dropna()

create_and_ingest_fg(
    fg_name='student-wellbeing-fg',
    fg_dataframe=wellbeing_df,
    sagemaker_session=sagemaker_session,
)


Creating feature group: student-wellbeing-fg...
student-wellbeing-fg already exists
Ingesting 25,141 records...
Ingestion complete



Target Variable Feature Group (Anxiety Level)

In [16]:
# Build the target frame: record key, timestamp and the raw anxiety band.
# .copy() gives an independent frame so the column added below does not
# trigger SettingWithCopyWarning.
target_columns = ['student_id', 'event_time', 'ANX_BAND_Q3_US']
target_df = df.loc[:, target_columns].dropna().copy()

# Map the categorical anxiety bands to integer codes
le = LabelEncoder()
le.fit(target_df['ANX_BAND_Q3_US'])
target_df['anxiety_level_encoded'] = le.transform(target_df['ANX_BAND_Q3_US'])

# Print the label -> code mapping so model outputs can be decoded later
print("Anxiety Level Encoding (for model interpretation):")
for code, band in enumerate(le.classes_):
    print(f"  {band} → {code}")
print()

# Only the key, timestamp and encoded target are ingested (the raw band is dropped)
target_ingest_df = target_df.loc[:, ['student_id', 'event_time', 'anxiety_level_encoded']]

create_and_ingest_fg(
    fg_name='student-anxiety-target-fg',
    fg_dataframe=target_ingest_df,
    sagemaker_session=sagemaker_session,
)


Anxiety Level Encoding (for model interpretation):
  high → 0
  low → 1
  medium → 2

Creating feature group: student-anxiety-target-fg...
student-anxiety-target-fg already exists
Ingesting 35,565 records...
Ingestion complete



In [17]:
# Row counts of the DataFrames prepared for each feature group
feature_group_sizes = [
    ("Demographics & SES", len(demo_ses_df)),
    ("Academic Performance", len(performance_df)),
    ("Student Wellbeing", len(wellbeing_df)),
    ("Anxiety Target", len(target_ingest_df)),
]

print("\nFeature Group Summary:")
for group_label, n_records in feature_group_sizes:
    print(f"  • {group_label}: {n_records:,} records")


Feature Group Summary:
  • Demographics & SES: 32,293 records
  • Academic Performance: 36,824 records
  • Student Wellbeing: 25,141 records
  • Anxiety Target: 35,565 records
