In [1]:
# Importing necessary libraries
import sagemaker
from sagemaker.feature_store.feature_group import FeatureGroup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sagemaker.xgboost import XGBoostModel
from sagemaker import image_uris, estimator
import boto3
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter
from sklearn.model_selection import train_test_split
import os
from sagemaker.predictor import Predictor
from sklearn.metrics import classification_report, confusion_matrix

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Initializing sagemaker session and clients
session = sagemaker.Session()
region = session.boto_region_name
role = sagemaker.get_execution_role()
bucket = session.default_bucket()
prefix = "student-anxiety-xgb"

print(f"Using region: {region}")
print(f"SageMaker role: {role}")

Using region: us-east-1
SageMaker role: arn:aws:iam::851725636446:role/LabRole


In [3]:
# Retrieving data from feature store
def load_feature_group(fg_name, record_identifier="student_id", event_time="event_time"):
    fg = FeatureGroup(name=fg_name, sagemaker_session=session)

    query = fg.athena_query()
    table = query.table_name

    # Running SQL query to get all data from the feature group
    sql = f'SELECT * FROM "{table}"'
    print(f"Running query for Feature Group: {fg_name}")
    query.run(query_string=sql, output_location=f"s3://{session.default_bucket()}/athena-results/")
    query.wait()

    # Loading results into pandas
    df = query.as_dataframe()

    # Dropping duplicates by keeping latest event_time per student_id
    df = df.sort_values(by=[record_identifier, event_time]).drop_duplicates(subset=[record_identifier], keep="last")

    return df


def clean_featurestore_df(df):
    drop_cols = ["write_time", "is_deleted", "api_invocation_time", "event_time"]
    return df.drop(columns=[c for c in drop_cols if c in df.columns])


# Loading each feature group
demo_ses_df = load_feature_group("student-demographics-ses-fg")
performance_df = load_feature_group("student-performance-fg")
wellbeing_df = load_feature_group("student-wellbeing-fg")
target_df = load_feature_group("student-anxiety-target-fg")

print("\nFeature Group shapes:")
print(f"Demographics & SES: {demo_ses_df.shape}")
print(f"Academic Performance: {performance_df.shape}")
print(f"Student Wellbeing: {wellbeing_df.shape}")
print(f"Anxiety Target: {target_df.shape}")

# Cleaning dataset by dropping metadata columns before merging
demo_ses_df = clean_featurestore_df(demo_ses_df)
performance_df = clean_featurestore_df(performance_df)
wellbeing_df = clean_featurestore_df(wellbeing_df)
target_df = clean_featurestore_df(target_df)

# Joining into single dataset on "student_id"
df_merged = (
    demo_ses_df
    .merge(performance_df, on="student_id", how="inner")
    .merge(wellbeing_df, on="student_id", how="inner")
    .merge(target_df, on="student_id", how="inner")
)

print(f"\nFinal merged dataset shape: {df_merged.shape}")
df_merged.head()

Running query for Feature Group: student-demographics-ses-fg
Running query for Feature Group: student-performance-fg
Running query for Feature Group: student-wellbeing-fg
Running query for Feature Group: student-anxiety-target-fg

Feature Group shapes:
Demographics & SES: (32293, 11)
Academic Performance: (36824, 6)
Student Wellbeing: (25141, 13)
Anxiety Target: (35565, 6)

Final merged dataset shape: (25112, 17)


Unnamed: 0,student_id,age,grade,st004d01t,escs,homepos,wealth,academic_performance_index,belong,unfairteacher,scieeff,disclisci,motivat,pared,teachsup,emosups,anxiety_level_encoded
0,3600001.0,16.0,0.0,Male,1.3515,1.8051,2.6171,5.537787,2.5915,6.0,1.2229,0.3363,0.1111,15.0,1.4475,1.0991,1
1,3600002.0,15.83,0.0,Male,1.1971,0.9351,0.9759,5.921402,-1.0536,18.0,1.2067,0.6,1.8543,15.0,-1.1047,0.025,2
2,3600004.0,16.17,0.0,Female,1.2256,1.2377,1.032,-2.584935,0.4153,7.0,-0.0176,-0.3506,0.5952,15.0,0.9209,1.0991,2
3,3600005.0,15.92,0.0,Male,0.8642,1.9803,1.4158,-0.075928,0.2947,19.0,0.3,-1.1864,-0.4982,12.0,-0.2651,-0.2495,0
4,3600006.0,15.92,0.0,Male,0.349,-0.085,0.3716,-3.949186,-1.4204,21.0,3.2775,0.0039,1.8543,14.0,-2.7195,1.0991,2


In [4]:
# Converting all object/string columns to categorical codes for use with XGBoost
for col in df_merged.select_dtypes(include='object').columns:
    df_merged[col] = df_merged[col].fillna('missing').astype('category').cat.codes

# Verifying all columns are now numeric or category
print(df_merged.dtypes[df_merged.dtypes == 'object'])  # Should be empty

Series([], dtype: object)


In [5]:
# Preprocessing for sagemaker xgboost

# Setting target column name
TARGET_COL = "anxiety_level_encoded"

# Ensuring merged dataframe is already loaded
assert TARGET_COL in df_merged.columns, f"'{TARGET_COL}' not found in df_merged columns!"

# 40% train, 10% validation, 10% test, 40% production (stratified)
train_df, temp_df = train_test_split(df_merged, test_size=0.6, stratify=df_merged[TARGET_COL], random_state=0)
val_df, temp2_df = train_test_split(temp_df, test_size=5/6, stratify=temp_df[TARGET_COL], random_state=0)
test_df, prod_df = train_test_split(temp2_df, test_size=4/5, stratify=temp2_df[TARGET_COL], random_state=0)

print(f"Train: {train_df.shape}, Validation: {val_df.shape}, Test: {test_df.shape}, Prod: {prod_df.shape}")

# Converting categorical/object columns to numeric codes
cat_cols = train_df.select_dtypes(include=["object"]).columns
for col in cat_cols:
    all_vals = pd.concat([train_df[col], val_df[col], test_df[col], prod_df[col]], axis=0)
    mapping = {val: i for i, val in enumerate(all_vals.unique())}
    for df in [train_df, val_df, test_df, prod_df]:
        df[col] = df[col].map(mapping)

# Ensuring same column order 
feature_cols = [c for c in df_merged.columns if c != TARGET_COL]
all_cols = [TARGET_COL] + feature_cols

train_df = train_df[all_cols]
val_df = val_df[all_cols]

# Saving test labels before dropping from test_df
test_labels = test_df[TARGET_COL]
test_labels_path = "sagemaker_csvs/test_labels.csv"
os.makedirs("sagemaker_csvs", exist_ok=True)
test_labels.to_csv(test_labels_path, header=False, index=False)

# Dropping target from test & production sets for inference
test_df = test_df[feature_cols]
prod_df = prod_df[feature_cols]

# Saving locally without headers
os.makedirs("sagemaker_csvs", exist_ok=True)

train_path = "sagemaker_csvs/train.csv"
val_path = "sagemaker_csvs/validation.csv"
test_path = "sagemaker_csvs/test.csv"
prod_path = "sagemaker_csvs/prod.csv"

train_df.to_csv(train_path, header=False, index=False)
val_df.to_csv(val_path, header=False, index=False)
test_df.to_csv(test_path, header=False, index=False)
prod_df.to_csv(prod_path, header=False, index=False)

train_s3_uri = session.upload_data(train_path, bucket=bucket, key_prefix=f"{prefix}/train")
val_s3_uri = session.upload_data(val_path, bucket=bucket, key_prefix=f"{prefix}/validation")
test_s3_uri = session.upload_data(test_path, bucket=bucket, key_prefix=f"{prefix}/test")
prod_s3_uri = session.upload_data(prod_path, bucket=bucket, key_prefix=f"{prefix}/production")

print("\n Uploaded to S3:")
print(f"  Train: {train_s3_uri}")
print(f"  Validation: {val_s3_uri}")
print(f"  Test (no label): {test_s3_uri}")
print(f"  Production (no label): {prod_s3_uri}")

Train: (10044, 17), Validation: (2511, 17), Test: (2511, 17), Prod: (10046, 17)

 Uploaded to S3:
  Train: s3://sagemaker-us-east-1-851725636446/student-anxiety-xgb/train/train.csv
  Validation: s3://sagemaker-us-east-1-851725636446/student-anxiety-xgb/validation/validation.csv
  Test (no label): s3://sagemaker-us-east-1-851725636446/student-anxiety-xgb/test/test.csv
  Production (no label): s3://sagemaker-us-east-1-851725636446/student-anxiety-xgb/production/prod.csv


In [6]:
# Defining baseline XGBoost estimator
xgboost_image = image_uris.retrieve(region=region, framework='xgboost', version='1.5-1')

xgb_estimator = estimator.Estimator(
    image_uri=xgboost_image,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size=5,
    max_run=3600,
    sagemaker_session=session
)

# Baseline hyperparameters
xgb_estimator.set_hyperparameters(
    objective="multi:softprob",
    num_class=len(df_merged[TARGET_COL].unique()),
    eval_metric="mlogloss",
    num_round=100,
    max_depth=5,
    eta=0.1,
    subsample=0.8,
    colsample_bytree=0.8
)

In [7]:
# Launching baseline training job
xgb_estimator.fit(
    {"train": TrainingInput(train_s3_uri, content_type="csv"),
     "validation": TrainingInput(val_s3_uri, content_type="csv")},
    wait=True
)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-10-18-05-56-50-956


2025-10-18 05:56:52 Starting - Starting the training job...
2025-10-18 05:57:06 Starting - Preparing the instances for training...
2025-10-18 05:57:54 Downloading - Downloading the training image......
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-10-18 05:58:52.376 ip-10-0-236-181.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-10-18 05:58:52.397 ip-10-0-236-181.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-10-18:05:58:52:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-10-18:05:58:52:INFO] Failed to parse hyperparameter eval_metric value mlogloss to Json.[0m
[34mReturning the value itself[0m
[34m[2025-10-18:05:58:52:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.[0m
[34mReturning the value itself[0m
[34m[2025-10-18:05:58:52:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-10-18:05:58:52:INFO] Running XGBoo

In [8]:
baseline_predictor = xgb_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge"
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-10-18-05-59-37-790
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2025-10-18-05-59-37-790
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2025-10-18-05-59-37-790


------!

In [9]:
# Creating and running hyperparameter tuning job
hyperparameter_ranges = {
    "eta": ContinuousParameter(0.01, 0.3),
    "max_depth": IntegerParameter(3, 10),
    "subsample": ContinuousParameter(0.5, 1.0),
    "colsample_bytree": ContinuousParameter(0.5, 1.0),
    "min_child_weight": IntegerParameter(1, 10),
    "gamma": ContinuousParameter(0, 0.5)
}

objective_metric_name = "validation:mlogloss"

tuner = HyperparameterTuner(
    estimator=xgb_estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=20,
    max_parallel_jobs=3,
    objective_type="Minimize"
)

tuner.fit(
    {"train": TrainingInput(train_s3_uri, content_type="csv"),
     "validation": TrainingInput(val_s3_uri, content_type="csv")},
    include_cls_metadata=False,
    wait=True
)

INFO:sagemaker:Creating hyperparameter tuning job with name: sagemaker-xgboost-251018-0603


.............................................................................................!


In [10]:
# Creating tuned model
best_estimator = tuner.best_estimator()
best_predictor = best_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge"
)


2025-10-18 06:08:52 Starting - Found matching resource for reuse
2025-10-18 06:08:52 Downloading - Downloading the training image
2025-10-18 06:08:52 Training - Training image download completed. Training in progress.
2025-10-18 06:08:52 Uploading - Uploading generated training model
2025-10-18 06:08:52 Completed - Resource reused by training job: sagemaker-xgboost-251018-0603-014-af6a1477

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-10-18-06-11-16-382





INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2025-10-18-06-11-16-382
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2025-10-18-06-11-16-382


------!

In [11]:
# Running predictions on test set
test_data = pd.read_csv("sagemaker_csvs/test.csv", header=None)

# Converting DataFrame to CSV string (for single-record prediction)
csv_data = test_data.to_csv(header=False, index=False).strip()

# Baseline predictions
baseline_preds_raw = baseline_predictor.predict(csv_data, initial_args={"ContentType": "text/csv"})
# Tuned predictions
tuned_preds_raw = best_predictor.predict(csv_data, initial_args={"ContentType": "text/csv"})

In [12]:
# Evaluating metrics for the models

# Decoding from bytes to string if needed
baseline_preds_str = baseline_preds_raw.decode("utf-8") if isinstance(baseline_preds_raw, bytes) else baseline_preds_raw
tuned_preds_str = tuned_preds_raw.decode("utf-8") if isinstance(tuned_preds_raw, bytes) else tuned_preds_raw

# Converting CSV string to numpy array
baseline_preds = np.array([list(map(float, line.split(","))) for line in baseline_preds_str.strip().split("\n")])
tuned_preds = np.array([list(map(float, line.split(","))) for line in tuned_preds_str.strip().split("\n")])

# Taking argmax to get predicted class labels
baseline_pred_labels = np.argmax(baseline_preds, axis=1)
tuned_pred_labels = np.argmax(tuned_preds, axis=1)

y_test = pd.read_csv("sagemaker_csvs/test_labels.csv")

# Aligning lengths
min_len = min(len(y_test), len(baseline_pred_labels))
y_test = y_test[:min_len]
baseline_pred_labels = baseline_pred_labels[:min_len]
tuned_pred_labels = tuned_pred_labels[:min_len]

# Classification reports
print("Baseline model report:\n", classification_report(y_test, baseline_pred_labels))
print("Tuned model report:\n", classification_report(y_test, tuned_pred_labels))


Baseline model report:
               precision    recall  f1-score   support

           0       0.33      0.32      0.32       821
           1       0.33      0.34      0.33       846
           2       0.33      0.32      0.33       843

    accuracy                           0.33      2510
   macro avg       0.33      0.33      0.33      2510
weighted avg       0.33      0.33      0.33      2510

Tuned model report:
               precision    recall  f1-score   support

           0       0.32      0.33      0.33       821
           1       0.33      0.35      0.34       846
           2       0.32      0.30      0.31       843

    accuracy                           0.32      2510
   macro avg       0.32      0.32      0.32      2510
weighted avg       0.32      0.32      0.32      2510

