# Health Policy Data Preparation and Feature Store Integration
This notebook loads the integrated NHIS dataset with regional metrics and prepares it for feature engineering and storage in SageMaker Feature Store

In [1]:
import sagemaker
import boto3
import pandas as pd
import numpy as np
import io
import time
from datetime import datetime
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Initialize SageMaker session and AWS environment
sess = sagemaker.Session()
bucket = "usd-team1-ads508"  # Use your project bucket
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
sm = boto3.Session().client(service_name="sagemaker", region_name=region)
s3 = boto3.Session().client(service_name="s3", region_name=region)
s3_resource = boto3.resource('s3')
print(f"SageMaker Session: {sess}")
print(f"Bucket: {bucket}")
print(f"Region: {region}")

SageMaker Session: <sagemaker.session.Session object at 0x7f8aad4026d0>
Bucket: usd-team1-ads508
Region: us-east-1


# SECTION 1: Load the Dataset from S3

In [3]:
s3_data_key = "nhis_with_regional_metrics.csv"
s3_data_path = f"s3://{bucket}/{s3_data_key}"

In [4]:
# Download the CSV file from S3
obj = s3_resource.Object(bucket, s3_data_key)
data = obj.get()['Body'].read()
df = pd.read_csv(io.BytesIO(data))
print(f"Shape: {df.shape}")
print("\nFirst few rows:")
df.head()

Shape: (150220, 18)

First few rows:


Unnamed: 0,evercovd_a,shtcvd191_a,empdysmss3_a,hicov_a,emdindstn1_a,sex_a,agep_a,educp_a,region,industry_avg_estimate,avg_uninsured_rate,avg_obesity_rate,avg_poor_fair_health,avg_flu_vaccination_rate,avg_adult_smoking,avg_physical_inactivity,avg_median_household_income,srvy_yr
0,1.0,1.0,2.0,1,,2,36,8,1,,0.067532,0.315695,0.171813,0.495664,0.184924,0.269075,67477.836283,2022
1,1.0,1.0,2.0,1,,2,61,7,1,,0.067532,0.315695,0.171813,0.495664,0.184924,0.269075,67477.836283,2022
2,1.0,1.0,,1,,2,73,8,1,,0.067532,0.315695,0.171813,0.495664,0.184924,0.269075,67477.836283,2022
3,2.0,1.0,,1,,2,80,9,1,,0.067532,0.315695,0.171813,0.495664,0.184924,0.269075,67477.836283,2022
4,1.0,1.0,2.0,1,,2,27,9,1,,0.067532,0.315695,0.171813,0.495664,0.184924,0.269075,67477.836283,2022


In [5]:
# Display column information
print("\nOriginal column information:")
for col in df.columns:
    print(f"- {col}: {df[col].dtype}")


Original column information:
- evercovd_a: float64
- shtcvd191_a: float64
- empdysmss3_a: float64
- hicov_a: int64
- emdindstn1_a: float64
- sex_a: int64
- agep_a: int64
- educp_a: int64
- region: int64
- industry_avg_estimate: float64
- avg_uninsured_rate: float64
- avg_obesity_rate: float64
- avg_poor_fair_health: float64
- avg_flu_vaccination_rate: float64
- avg_adult_smoking: float64
- avg_physical_inactivity: float64
- avg_median_household_income: float64
- srvy_yr: int64


# SECTION 2: Data Quality Assessment

In [6]:
# Check for missing values
missing_values = df.isnull().sum()
print("\nMissing values per column:")
print(missing_values)


Missing values per column:
evercovd_a                      45853
shtcvd191_a                     71089
empdysmss3_a                    57656
hicov_a                             0
emdindstn1_a                    94783
sex_a                               0
agep_a                              0
educp_a                             0
region                              0
industry_avg_estimate          101102
avg_uninsured_rate                  0
avg_obesity_rate                    0
avg_poor_fair_health                0
avg_flu_vaccination_rate            0
avg_adult_smoking                   0
avg_physical_inactivity             0
avg_median_household_income         0
srvy_yr                             0
dtype: int64


In [7]:
# Calculate percentage of missing values
missing_percentage = (missing_values / len(df)) * 100
columns_to_check = missing_percentage[missing_percentage > 30].index.tolist()
print(f"\nColumns with >30% missing values: {columns_to_check}")


Columns with >30% missing values: ['evercovd_a', 'shtcvd191_a', 'empdysmss3_a', 'emdindstn1_a', 'industry_avg_estimate']


In [8]:
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
print("\nBasic statistics for key numeric columns:")
print(df[numeric_columns].describe())


Basic statistics for key numeric columns:
          evercovd_a   shtcvd191_a  empdysmss3_a       hicov_a  emdindstn1_a  \
count  104367.000000  79131.000000  92564.000000  150220.00000  55437.000000   
mean        1.783466      1.332221     14.343730       1.07527     53.588380   
std         0.700685      1.006633     99.821748       0.35957     21.392442   
min         1.000000      1.000000      0.000000       1.00000      1.000000   
25%         1.000000      1.000000      0.000000       1.00000     42.000000   
50%         2.000000      1.000000      0.000000       1.00000     61.000000   
75%         2.000000      1.000000      3.000000       1.00000     66.000000   
max         9.000000      9.000000    999.000000       9.00000     99.000000   

               sex_a         agep_a        educp_a         region  \
count  150220.000000  150220.000000  150220.000000  150220.000000   
mean        1.543370      52.928585       6.427074       2.697058   
std         0.502349      18.

In [9]:
# Check for special values that might represent missing data or refusals
print("\nChecking for special values in key categorical columns:")
categorical_cols = ['evercovd_a', 'shtcvd191_a', 'hicov_a', 'emdindstn1_a', 'sex_a', 'educp_a', 'region']
for col in categorical_cols:
    if col in df.columns:
        print(f"\n{col} value counts:")
        print(df[col].value_counts().sort_index())


Checking for special values in key categorical columns:

evercovd_a value counts:
evercovd_a
1.0    27289
2.0    76307
7.0      128
8.0      451
9.0      192
Name: count, dtype: int64

shtcvd191_a value counts:
shtcvd191_a
1.0    61998
2.0    15554
7.0      381
8.0     1135
9.0       63
Name: count, dtype: int64

hicov_a value counts:
hicov_a
1    140160
2      9853
7       101
9       106
Name: count, dtype: int64

emdindstn1_a value counts:
emdindstn1_a
1.0      395
2.0      229
3.0       52
4.0       19
5.0       64
        ... 
78.0    3111
79.0      37
97.0     774
98.0     186
99.0      50
Name: count, Length: 82, dtype: int64

sex_a value counts:
sex_a
1    68687
2    81517
7       10
9        6
Name: count, dtype: int64

educp_a value counts:
educp_a
0       152
1     10351
2      2340
3      3412
4     34023
5     23091
6      5830
7     13603
8     34425
9     16548
10     4299
11     1373
97      246
99      527
Name: count, dtype: int64

region value counts:
region
1    25

# SECTION 3: Create an Employee ID

In [10]:
# Create a unique ID by combining srvy_yr and other identifying fields
df['record_id'] = df.apply(
    lambda row: f"{row['srvy_yr']}_{row['region']}_{row['sex_a']}_{row['agep_a']}_{np.random.randint(10000, 99999)}", 
    axis=1
)

In [11]:
# Verify uniqueness
unique_ids = df['record_id'].nunique()
print(f"Created {unique_ids} unique record IDs out of {len(df)} records")

Created 150169 unique record IDs out of 150220 records


In [12]:
# If duplicate IDs exist, regenerate them to ensure uniqueness
if unique_ids < len(df):
    # Add another random component
    df['record_id'] = df.apply(
        lambda row: f"{row['srvy_yr']}_{row['region']}_{row['sex_a']}_{row['agep_a']}_{np.random.randint(10000, 99999)}_{np.random.randint(1000, 9999)}", 
        axis=1
    )
    unique_ids = df['record_id'].nunique()
    print(f"After regeneration: {unique_ids} unique record IDs out of {len(df)} records")

After regeneration: 150220 unique record IDs out of 150220 records


# SECTION 4: Data Cleaning and Preprocessing

In [13]:
# Helper function to identify and handle special values in survey data
def clean_survey_code(df, column, special_values=[7, 8, 9], replacement=0):
    """
    Replace special values (usually representing 'refused', 'don't know', etc.) with a specified value
    """
    if column in df.columns:
        # Create a copy to avoid modifying the original
        df[column] = df[column].copy()
        
        # Replace special values
        for val in special_values:
            df[column] = df[column].replace(val, replacement)
    
    return df

In [14]:
# Clean special values in the survey data columns
for col in ['evercovd_a', 'shtcvd191_a', 'hicov_a', 'emdindstn1_a', 'sex_a', 'educp_a']:
    df = clean_survey_code(df, col)

In [15]:
# Handle missing values for the target variable (days missed from work)
# First, clean any special codes in empdysmss3_a
df = clean_survey_code(df, 'empdysmss3_a', special_values=[999], replacement=np.nan)

In [16]:
# For empdysmss3_a, use 0 for missing values (assuming 0 days if not reported)
df['empdysmss3_a'] = df['empdysmss3_a'].fillna(0)

In [17]:
# For categorical variables, fill with most common value
categorical_cols = ['evercovd_a', 'shtcvd191_a', 'hicov_a', 'emdindstn1_a', 'sex_a', 'educp_a', 'region']
for col in categorical_cols:
    if col in df.columns:
        mode_value = df[col].mode()[0]
        df[col] = df[col].fillna(mode_value)

In [18]:
# For continuous variables, fill with median
continuous_cols = ['agep_a', 'avg_uninsured_rate', 'avg_obesity_rate', 
                  'avg_flu_vaccination_rate', 'avg_adult_smoking', 
                  'avg_physical_inactivity', 'avg_median_household_income']
for col in continuous_cols:
    if col in df.columns:
        median_value = df[col].median()
        df[col] = df[col].fillna(median_value)

In [19]:
# Ensure data types are correct
# Convert to int
for col in ['empdysmss3_a', 'hicov_a', 'sex_a', 'agep_a', 'educp_a', 'region', 'srvy_yr']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

In [20]:
# Handle potential binary columns
for col in ['evercovd_a', 'shtcvd191_a']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

In [21]:
# Convert continuous features to float
for col in ['avg_uninsured_rate', 'avg_obesity_rate', 'avg_poor_fair_health',
            'avg_flu_vaccination_rate', 'avg_adult_smoking', 'avg_physical_inactivity', 
            'avg_median_household_income']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0).astype(float)

In [22]:
# Handle industry_avg_estimate, which has many missing values
if 'industry_avg_estimate' in df.columns:
    df['industry_avg_estimate'] = pd.to_numeric(df['industry_avg_estimate'], errors='coerce').fillna(0.0).astype(float)

# SECTION 5: Feature Engineering

In [23]:
# Numeric Feature Scaling
def scale_numeric_features(df, columns_to_scale):
    scaler = StandardScaler()
    df_scaled = df.copy()
    
    # Remove non-finite values before scaling
    for col in columns_to_scale:
        df_scaled[col] = pd.to_numeric(df_scaled[col], errors='coerce')
        df_scaled[col] = df_scaled[col].fillna(df_scaled[col].median())
    
    # Create scaled versions (new columns)
    scaled_data = scaler.fit_transform(df_scaled[columns_to_scale])
    
    # Add scaled columns with new names
    for i, col in enumerate(columns_to_scale):
        df_scaled[f"{col}_scaled"] = scaled_data[:, i]
    
    return df_scaled, scaler

In [24]:
# Select numeric columns to scale (exclude target variable)
numeric_cols_to_scale = ['agep_a', 'avg_uninsured_rate', 'avg_obesity_rate', 
                          'avg_flu_vaccination_rate', 'avg_adult_smoking', 
                          'avg_physical_inactivity', 'avg_median_household_income']

In [25]:
# Apply scaling
df, feature_scaler = scale_numeric_features(df, numeric_cols_to_scale)

In [26]:
# Categorical Feature Encoding
# Binary encoding of Yes/No variables
binary_cols = ['evercovd_a', 'shtcvd191_a', 'hicov_a']
for col in binary_cols:
    if col in df.columns:
        # Map various codes to binary (1=Yes, 0=No/Unknown)
        df[f"{col}_binary"] = df[col].apply(lambda x: 1 if x == 2 else 0)

In [27]:
# One-hot encode region
df = pd.get_dummies(df, columns=['region'], prefix='region')

In [28]:
# Encode education level (ordinal encoding)
if 'educp_a' in df.columns:
    # Group education levels
    df['education_level'] = df['educp_a'].apply(
        lambda x: 1 if x in [0, 1, 2, 3] else  # Less than HS
                  2 if x == 4 else  # HS graduate
                  3 if x in [5, 6, 7] else  # Some college
                  4 if x == 8 else  # Bachelor's
                  5 if x in [9, 10, 11] else 3  # Graduate degree, default to some college
    )


In [29]:
# Encode industry categories into broader groups
if 'emdindstn1_a' in df.columns:
    # Create industry groups based on 2-digit codes
    industry_mapping = {
        'agriculture': list(range(1, 6)),
        'mining': list(range(6, 9)),
        'utilities': [9],
        'construction': [10],
        'manufacturing': list(range(11, 32)),
        'wholesale': list(range(32, 35)),
        'retail': list(range(35, 47)),
        'transportation': list(range(47, 50)),
        'information': list(range(50, 54)),
        'finance': list(range(54, 58)),
        'real_estate': list(range(58, 61)),
        'services': list(range(61, 72)),
        'accommodation_food': list(range(72, 74)),
        'other_services': list(range(74, 78)),
        'public_admin': [78],
        'military': [79]
    }
    
    # Create a function to map the industry code to a group
    def map_industry(code):
        if pd.isna(code):
            return 'unknown'
        code = int(code)
        for group, codes in industry_mapping.items():
            if code in codes:
                return group
        return 'other'
    
    # Apply the mapping
    df['industry_group'] = df['emdindstn1_a'].apply(map_industry)
    
    # One-hot encode the industry group
    df = pd.get_dummies(df, columns=['industry_group'], prefix='ind')

Create Feature Interactions

In [30]:
# Age and insurance interaction
df['age_insurance_interaction'] = df['agep_a'] * df['hicov_a_binary']

In [31]:
# Create health risk score (combining obesity, smoking, and physical inactivity)
df['health_risk_score'] = (
    df['avg_obesity_rate_scaled'] + 
    df['avg_adult_smoking_scaled'] + 
    df['avg_physical_inactivity_scaled']
) / 3

# SECTION 6: Train-Test Split for Model Development

In [32]:
# Add a column for split_type
df['split_type'] = 'train'  # Default all to train

In [33]:
# Create stratified split based on empdysmss3_a (binned)
# Create bins for the target variable
df['absenteeism_bin'] = pd.qcut(df['empdysmss3_a'], q=5, labels=False, duplicates='drop')

In [34]:
# Perform stratified split
train_indices, test_indices = train_test_split(
    df.index, 
    test_size=0.2, 
    random_state=42,
    stratify=df['absenteeism_bin']
)

In [35]:
# Assign split types
df.loc[test_indices, 'split_type'] = 'test'

In [36]:
# Verify the split
print(f"Training set: {len(train_indices)} records")
print(f"Testing set: {len(test_indices)} records")

Training set: 120176 records
Testing set: 30044 records


# SECTION 7: Feature Store Integration

In [37]:
from sagemaker.feature_store.feature_definition import (
    FeatureDefinition,
    FeatureTypeEnum,
)
from sagemaker.feature_store.feature_group import FeatureGroup

In [38]:
# Create timestamp for event time feature
current_timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
df['event_time'] = current_timestamp

In [39]:
# Define Feature Group name with timestamp to ensure uniqueness
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
feature_group_name = f"health-policy-features-{timestamp}"
print(f"Feature Group name: {feature_group_name}")

Feature Group name: health-policy-features-20250331-083045


In [40]:
# Define Feature Definitions - with appropriate types
feature_definitions = [
    # Record identifier
    FeatureDefinition(feature_name="record_id", feature_type=FeatureTypeEnum.STRING),
    
    # Event time
    FeatureDefinition(feature_name="event_time", feature_type=FeatureTypeEnum.STRING),
    
    # Target variable
    FeatureDefinition(feature_name="empdysmss3_a", feature_type=FeatureTypeEnum.INTEGRAL),
    
    # Original features - carefully typecasted
    FeatureDefinition(feature_name="srvy_yr", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="evercovd_a_binary", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="shtcvd191_a_binary", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="hicov_a_binary", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="sex_a", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="agep_a", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="education_level", feature_type=FeatureTypeEnum.INTEGRAL),
    
    # Regional metrics - all as FRACTIONAL
    FeatureDefinition(feature_name="avg_uninsured_rate", feature_type=FeatureTypeEnum.FRACTIONAL),
    FeatureDefinition(feature_name="avg_obesity_rate", feature_type=FeatureTypeEnum.FRACTIONAL),
    FeatureDefinition(feature_name="avg_poor_fair_health", feature_type=FeatureTypeEnum.FRACTIONAL),
    FeatureDefinition(feature_name="avg_flu_vaccination_rate", feature_type=FeatureTypeEnum.FRACTIONAL),
    FeatureDefinition(feature_name="avg_adult_smoking", feature_type=FeatureTypeEnum.FRACTIONAL),
    FeatureDefinition(feature_name="avg_physical_inactivity", feature_type=FeatureTypeEnum.FRACTIONAL),
    FeatureDefinition(feature_name="avg_median_household_income", feature_type=FeatureTypeEnum.FRACTIONAL),
    
    # Engineered features
    FeatureDefinition(feature_name="health_risk_score", feature_type=FeatureTypeEnum.FRACTIONAL),
    FeatureDefinition(feature_name="age_insurance_interaction", feature_type=FeatureTypeEnum.FRACTIONAL),
    
    # Region one-hot encoded columns
    FeatureDefinition(feature_name="region_1", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="region_2", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="region_3", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="region_4", feature_type=FeatureTypeEnum.INTEGRAL),
    
    # Split type for ML workflow
    FeatureDefinition(feature_name="split_type", feature_type=FeatureTypeEnum.STRING),
]

In [41]:
# Create Feature Group
feature_group = FeatureGroup(
    name=feature_group_name, 
    feature_definitions=feature_definitions,
    sagemaker_session=sess
)

In [42]:
# Set S3 Prefix for Offline Feature Store
offline_store_prefix = f"health-policy-feature-store-{timestamp}"
print(f"Offline store S3 prefix: {offline_store_prefix}")

Offline store S3 prefix: health-policy-feature-store-20250331-083045


In [43]:
# Function to prepare data for Feature Store
def prepare_for_feature_store(df, feature_definitions):
    """
    Prepare a DataFrame for ingestion into SageMaker Feature Store by ensuring
    all columns match the expected types and all required columns are present.
    """
    # Get feature names and types
    feature_names = [feat.feature_name for feat in feature_definitions]
    feature_types = {feat.feature_name: feat.feature_type for feat in feature_definitions}
    
    # Create a new DataFrame with only the columns needed for Feature Store
    cols_to_include = [col for col in feature_names if col in df.columns]
    df_features = df[cols_to_include].copy()
    
    # Make sure all required columns exist
    for col in feature_names:
        if col not in df_features.columns:
            if feature_types[col] == FeatureTypeEnum.STRING:
                df_features[col] = ""
            elif feature_types[col] == FeatureTypeEnum.INTEGRAL:
                df_features[col] = 0
            elif feature_types[col] == FeatureTypeEnum.FRACTIONAL:
                df_features[col] = 0.0
    
    # Ensure correct types for each column
    for col, feat_type in feature_types.items():
        if feat_type == FeatureTypeEnum.STRING:
            df_features[col] = df_features[col].fillna("").astype(str)
        elif feat_type == FeatureTypeEnum.INTEGRAL:
            df_features[col] = pd.to_numeric(df_features[col], errors='coerce').fillna(0).astype(int)
        elif feat_type == FeatureTypeEnum.FRACTIONAL:
            df_features[col] = pd.to_numeric(df_features[col], errors='coerce').fillna(0.0).astype(float)
    
    return df_features

In [44]:
# Prepare data for Feature Store
df_features = prepare_for_feature_store(df, feature_definitions)

In [45]:
# Verify data types match feature definitions
print("\nVerifying column types after preparation:")
for feat in feature_definitions:
    col_name = feat.feature_name
    if col_name in df_features.columns:
        print(f"- {col_name}: {df_features[col_name].dtype} (expected {feat.feature_type})")


Verifying column types after preparation:
- record_id: object (expected FeatureTypeEnum.STRING)
- event_time: object (expected FeatureTypeEnum.STRING)
- empdysmss3_a: int64 (expected FeatureTypeEnum.INTEGRAL)
- srvy_yr: int64 (expected FeatureTypeEnum.INTEGRAL)
- evercovd_a_binary: int64 (expected FeatureTypeEnum.INTEGRAL)
- shtcvd191_a_binary: int64 (expected FeatureTypeEnum.INTEGRAL)
- hicov_a_binary: int64 (expected FeatureTypeEnum.INTEGRAL)
- sex_a: int64 (expected FeatureTypeEnum.INTEGRAL)
- agep_a: int64 (expected FeatureTypeEnum.INTEGRAL)
- education_level: int64 (expected FeatureTypeEnum.INTEGRAL)
- avg_uninsured_rate: float64 (expected FeatureTypeEnum.FRACTIONAL)
- avg_obesity_rate: float64 (expected FeatureTypeEnum.FRACTIONAL)
- avg_poor_fair_health: float64 (expected FeatureTypeEnum.FRACTIONAL)
- avg_flu_vaccination_rate: float64 (expected FeatureTypeEnum.FRACTIONAL)
- avg_adult_smoking: float64 (expected FeatureTypeEnum.FRACTIONAL)
- avg_physical_inactivity: float64 (expec

In [46]:
# Create the Feature Group
try:
    feature_group.create(
        s3_uri=f"s3://{bucket}/{offline_store_prefix}",
        record_identifier_name="record_id",
        event_time_feature_name="event_time",
        role_arn=role,
        enable_online_store=False
    )
    print(f"Feature group {feature_group_name} creation initiated.")
except Exception as e:
    print(f"Error creating feature group: {e}")

Feature group health-policy-features-20250331-083045 creation initiated.


In [47]:
# Wait for Feature Group Creation to Complete
def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    print(f"Initial status: {status}")
    
    while status == "Creating":
        print("Waiting for Feature Group Creation.")
        time.sleep(10)
        status = feature_group.describe().get("FeatureGroupStatus")
        
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")

try:
    print("Waiting for feature group creation to complete.")
    wait_for_feature_group_creation_complete(feature_group=feature_group)
except Exception as e:
    print(f"Error waiting for feature group creation: {e}")

Waiting for feature group creation to complete.
Initial status: Creating
Waiting for Feature Group Creation.
Waiting for Feature Group Creation.
Waiting for Feature Group Creation.
FeatureGroup health-policy-features-20250331-083045 successfully created.


In [48]:
# Ingest Data into Feature Store
sample_size = min(10000, len(df_features))
df_sample = df_features.sample(n=sample_size, random_state=42)

try:
    print(f"\nIngesting {len(df_sample)} records into feature store.")
    feature_group.ingest(data_frame=df_sample, max_workers=3, wait=True)
    print("Data ingestion complete!")
except Exception as e:
    print(f"Error during ingestion: {e}")


Ingesting 10000 records into feature store.
Data ingestion complete!


# SECTION 7: Data Validation and Summary

In [49]:
# Wait for data in offline store to become available
try:
    print("Checking for data in offline store")
    offline_store_contents = None
    attempts = 0
    while offline_store_contents is None and attempts < 5:
        objects_in_bucket = s3.list_objects(Bucket=bucket, Prefix=offline_store_prefix)
        if "Contents" in objects_in_bucket and len(objects_in_bucket["Contents"]) > 1:
            offline_store_contents = objects_in_bucket["Contents"]
            print(f"Found {len(offline_store_contents)} objects in offline store.")
        else:
            print("Waiting for data in offline store")
            attempts += 1
            time.sleep(30)
except Exception as e:
    print(f"Error checking offline store: {e}")


Checking for data in offline store
Waiting for data in offline store
Waiting for data in offline store
Waiting for data in offline store
Waiting for data in offline store
Waiting for data in offline store


# SECTION 8: Prepare Data for Modeling

In [50]:
# Save processed features to S3 for model training
processed_data_key = f"processed-health-data-{timestamp}.csv"

try:
    # Select relevant columns for modeling
    modeling_columns = [
        # Target
        'empdysmss3_a',
        
        # Demographics
        'agep_a', 'sex_a', 'education_level',
        
        # Health indicators
        'evercovd_a_binary', 'shtcvd191_a_binary', 'hicov_a_binary',
        
        # Regional metrics
        'avg_uninsured_rate', 'avg_obesity_rate', 'avg_flu_vaccination_rate',
        'avg_adult_smoking', 'avg_physical_inactivity', 'avg_median_household_income',
        
        # Industry information (one-hot encoded columns)
        *[col for col in df.columns if col.startswith('ind_')],
        
        # Region indicators
        *[col for col in df.columns if col.startswith('region_')],
        
        # Engineered features
        'health_risk_score', 'age_insurance_interaction',
        
        # Split type
        'split_type'
    ]
    
    # Select only columns that exist in the dataframe
    existing_modeling_columns = [col for col in modeling_columns if col in df.columns]
    df_modeling = df[existing_modeling_columns]
    
    # Save to CSV and upload to S3
    csv_buffer = io.StringIO()
    df_modeling.to_csv(csv_buffer, index=False)
    s3.put_object(
        Bucket=bucket,
        Key=processed_data_key,
        Body=csv_buffer.getvalue()
    )
    print(f"Saved processed modeling data to s3://{bucket}/{processed_data_key}")
    
    # Print summary of key variables
    print("\nSummary of key variables for modeling:")
    print(f"  Target variable (empdysmss3_a) mean: {df_modeling['empdysmss3_a'].mean():.2f} days")
    print(f"  Target variable (empdysmss3_a) median: {df_modeling['empdysmss3_a'].median():.2f} days")
    print(f"  Target variable (empdysmss3_a) max: {df_modeling['empdysmss3_a'].max():.2f} days")
    print(f"  Records with health insurance: {df[df['hicov_a_binary'] == 1].shape[0]} ({df[df['hicov_a_binary'] == 1].shape[0]/df.shape[0]*100:.1f}%)")
    print(f"  Average age: {df_modeling['agep_a'].mean():.1f} years")
    
    print("\nData preparation complete! Your dataset is now ready for modeling.")
except Exception as e:
    print(f"Error in final data preparation: {e}")

Saved processed modeling data to s3://usd-team1-ads508/processed-health-data-20250331-083045.csv

Summary of key variables for modeling:
  Target variable (empdysmss3_a) mean: 4.12 days
  Target variable (empdysmss3_a) median: 0.00 days
  Target variable (empdysmss3_a) max: 998.00 days
  Records with health insurance: 9853 (6.6%)
  Average age: 52.9 years

Data preparation complete! Your dataset is now ready for modeling.


## Release Resources

In [51]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [52]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>