# Train baseline & XGBoost models in SageMaker

In [19]:
# Imports (superset of original)
import os
import tarfile
import time
import joblib
import boto3
import sagemaker
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score,
                             ConfusionMatrixDisplay, f1_score)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.feature_store.feature_store import FeatureStore

from sagemaker.xgboost import XGBoostModel
from sagemaker.inputs import TrainingInput
from sagemaker.sklearn import SKLearn
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.s3 import S3Downloader

In [2]:
# Setup
sm_session = sagemaker.Session()
region = sm_session.boto_region_name
role = sagemaker.get_execution_role()
s3 = boto3.client("s3", region_name=region)

BUCKET = sm_session.default_bucket()
PREFIX = "student-anxiety-ml"
LOCAL_DIR = "local_artifacts"
os.makedirs(LOCAL_DIR, exist_ok=True)

print(f"Region: {region}")
print(f"Bucket: {BUCKET}")


Region: us-east-1
Bucket: sagemaker-us-east-1-767397858887


Load Feature Groups

In [3]:
def load_feature_group(fg_name):
    """Load data from Feature Store & clean metadata"""
    fg = FeatureGroup(name=fg_name, sagemaker_session=sm_session)
    q = fg.athena_query()
    table = q.table_name

    q.run(
        query_string=f'SELECT * FROM "{table}"',
        output_location=f"s3://{BUCKET}/athena-results/"
    )
    q.wait()
    df = q.as_dataframe()

    # keep most recent record per student
    df = df.sort_values(["student_id", "event_time"])
    df = df.drop_duplicates(subset=["student_id"], keep="last")

    # drop metadata columns
    meta_cols = ["write_time", "is_deleted", "api_invocation_time", "event_time"]
    for col in meta_cols:
        if col in df.columns:
            df = df.drop(columns=[col])

    return df

Merge & Preprocess

In [6]:
# Load Feature Groups
demo_df = load_feature_group("student-demographics-ses-fg")
performance_df = load_feature_group("student-performance-fg")
wellbeing_df = load_feature_group("student-wellbeing-fg")
target_df = load_feature_group("student-anxiety-target-fg")

print("\nFeature Group shapes:")
print(f"Demographics & SES: {demo_df.shape}")
print(f"Academic Performance: {performance_df.shape}")
print(f"Student Wellbeing: {wellbeing_df.shape}")
print(f"Anxiety Target: {target_df.shape}")

# Merge into one dataframe
df = demo_df.merge(performance_df, on="student_id")
df = df.merge(wellbeing_df, on="student_id")
df = df.merge(target_df, on="student_id")

print("Merged shape:", df.shape)
print(df.head())

# Data preprocessing
target_col = "anxiety_level_encoded"

# Convert categorical columns to numeric codes
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].fillna("missing")
        df[col] = df[col].astype("category").cat.codes

# Define feature columns (everything except target & ID)
feature_cols = [c for c in df.columns if c not in [target_col, "student_id"]]

print(f"Using {len(feature_cols)} features")


Feature Group shapes:
Demographics & SES: (32293, 7)
Academic Performance: (36824, 2)
Student Wellbeing: (25141, 9)
Anxiety Target: (35565, 2)
Merged shape: (25112, 17)
   student_id    age  grade st004d01t    escs  homepos  wealth  \
0   3600001.0  16.00    0.0      Male  1.3515   1.8051  2.6171   
1   3600002.0  15.83    0.0      Male  1.1971   0.9351  0.9759   
2   3600004.0  16.17    0.0    Female  1.2256   1.2377  1.0320   
3   3600005.0  15.92    0.0      Male  0.8642   1.9803  1.4158   
4   3600006.0  15.92    0.0      Male  0.3490  -0.0850  0.3716   

   academic_performance_index  belong  unfairteacher  scieeff  disclisci  \
0                    5.537787  2.5915            6.0   1.2229     0.3363   
1                    5.921402 -1.0536           18.0   1.2067     0.6000   
2                   -2.584935  0.4153            7.0  -0.0176    -0.3506   
3                   -0.075928  0.2947           19.0   0.3000    -1.1864   
4                   -3.949186 -1.4204           21.0 

Train/Validation/Test Split

In [7]:
# Train/val/test split
# Keep 40% aside as holdout set for future use
df_main, df_prod = train_test_split(df, test_size=0.4, random_state=0, stratify=df[target_col])

# Split remaining into train/val/test
df_train, df_temp = train_test_split(df_main, test_size=0.3333, random_state=0, stratify=df_main[target_col])
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=0, stratify=df_temp[target_col])

print("Dataset sizes - train:", len(df_train), "val:", len(df_val), "test:", len(df_test), "prod:", len(df_prod))

Dataset sizes - train: 10045 val: 2511 test: 2511 prod: 10045


Upload Data to S3

In [8]:
def upload_to_s3(dataframe, split_name, use_xgb_format=False):
    """Save dataframe to S3 in the correct format"""
    if use_xgb_format:
        # XGBoost: label first, no header
        cols = [target_col] + feature_cols
        local_path = f"{LOCAL_DIR}/{split_name}_xgb.csv"
        dataframe[cols].to_csv(local_path, header=False, index=False)
        s3_key = f"{PREFIX}/xgb/{split_name}/{split_name}.csv"
    else:
        # SKLearn: can have headers
        cols = [target_col] + feature_cols
        local_path = f"{LOCAL_DIR}/{split_name}_sklearn.csv"
        dataframe[cols].to_csv(local_path, index=False)
        s3_key = f"{PREFIX}/sklearn/{split_name}/{split_name}.csv"

    s3.upload_file(local_path, BUCKET, s3_key)
    return f"s3://{BUCKET}/{s3_key}"

# Upload all splits
for name, data in [("train", df_train), ("val", df_val), ("test", df_test), ("prod", df_prod)]:
    upload_to_s3(data, name, use_xgb_format=False)
    upload_to_s3(data, name, use_xgb_format=True)

Baseline Training Script

In [9]:
os.makedirs("baseline_lr", exist_ok=True)

baseline_script = '''
import os
import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score

if __name__ == "__main__":
    # Load data from SageMaker channels
    train_dir = os.environ["SM_CHANNEL_TRAIN"]
    val_dir = os.environ["SM_CHANNEL_VAL"]
    model_dir = os.environ["SM_MODEL_DIR"]

    train_df = pd.read_csv(os.path.join(train_dir, "train.csv"))
    val_df = pd.read_csv(os.path.join(val_dir, "val.csv"))

    # Separate features and target
    y_train = train_df["anxiety_level_encoded"]
    X_train = train_df.drop(columns=["anxiety_level_encoded"])
    y_val = val_df["anxiety_level_encoded"]
    X_val = val_df.drop(columns=["anxiety_level_encoded"])

    # Train model with scaling pipeline
    model = make_pipeline(
        StandardScaler(with_mean=False),
        LogisticRegression(max_iter=400, n_jobs=-1, random_state=0)
    )
    model.fit(X_train, y_train)

    # Evaluate
    predictions = model.predict(X_val)
    f1 = f1_score(y_val, predictions, average='macro')
    print(f"Validation F1 Score: {f1:.4f}")

    # Save model
    joblib.dump(model, os.path.join(model_dir, "model.joblib"))
'''

with open("baseline_lr/train.py", "w") as f:
    f.write(baseline_script)

Train Baseline Model in SageMaker

In [10]:
baseline_estimator = SKLearn(
    entry_point="train.py",
    source_dir="baseline_lr",
    role=role,
    framework_version="1.2-1",
    instance_type="ml.m5.large",
    instance_count=1,
    py_version="py3",
    sagemaker_session=sm_session,
)

print("Training baseline model...")
baseline_estimator.fit({
    "train": TrainingInput(f"s3://{BUCKET}/{PREFIX}/sklearn/train/"),
    "val": TrainingInput(f"s3://{BUCKET}/{PREFIX}/sklearn/val/")
})

baseline_model_path = baseline_estimator.model_data

INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2025-10-16-11-14-40-171


Training baseline model...
2025-10-16 11:14:45 Starting - Starting the training job...
2025-10-16 11:14:59 Starting - Preparing the instances for training...
2025-10-16 11:15:22 Downloading - Downloading input data...
  import pkg_resources[0m
[34m2025-10-16 11:16:51,302 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2025-10-16 11:16:51,307 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-10-16 11:16:51,309 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-10-16 11:16:51,325 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2025-10-16 11:16:51,597 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-10-16 11:16:51,601 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-10-16 11:16:51,619 sagemaker-training-to

Train XGBoost Model in SageMaker (using built-in algorithm)

In [15]:
from sagemaker.estimator import Estimator
from sagemaker import image_uris

num_classes = df[target_col].nunique()

# Get XGBoost container image
xgb_container = image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.7-1"
)

xgb_estimator = Estimator(
    image_uri=xgb_container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    sagemaker_session=sm_session,
    hyperparameters={
        "objective": "multi:softprob",
        "num_class": num_classes,
        "num_round": 300,
        "max_depth": 6,
        "eta": 0.1,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "eval_metric": "mlogloss"
    }
)

# Fixed by Claude
xgb_estimator.fit({
    "train": TrainingInput(
        f"s3://{BUCKET}/{PREFIX}/xgb/train/",
        content_type='text/csv'
    ),
    "validation": TrainingInput(
        f"s3://{BUCKET}/{PREFIX}/xgb/val/",
        content_type='text/csv'
    )
})

xgb_model_path = xgb_estimator.model_data

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-10-16-11-30-15-165


2025-10-16 11:30:15 Starting - Starting the training job...
2025-10-16 11:30:41 Starting - Preparing the instances for training...
2025-10-16 11:31:02 Downloading - Downloading input data...
2025-10-16 11:31:27 Downloading - Downloading the training image...
  import pkg_resources[0m
[34m[2025-10-16 11:32:26.294 ip-10-0-191-33.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-10-16 11:32:26.364 ip-10-0-191-33.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-10-16:11:32:26:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-10-16:11:32:26:INFO] Failed to parse hyperparameter eval_metric value mlogloss to Json.[0m
[34mReturning the value itself[0m
[34m[2025-10-16:11:32:26:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.[0m
[34mReturning the value itself[0m
[34m[2025-10-16:11:32:26:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-

Download & Load Models

In [16]:
# Prepare test data
X_test = df_test[feature_cols]
y_test = df_test[target_col]

# Download & extract baseline model
S3Downloader.download(baseline_model_path, f"{LOCAL_DIR}/baseline", sagemaker_session=sm_session)

baseline_tar = [f for f in os.listdir(f"{LOCAL_DIR}/baseline") if f.endswith(".tar.gz")][0]
with tarfile.open(os.path.join(f"{LOCAL_DIR}/baseline", baseline_tar)) as tar:
    tar.extractall(path=f"{LOCAL_DIR}/baseline_model")

import joblib
baseline_model = joblib.load(f"{LOCAL_DIR}/baseline_model/model.joblib")
y_pred_baseline = baseline_model.predict(X_test)

# Download & extract XGBoost model
S3Downloader.download(xgb_model_path, f"{LOCAL_DIR}/xgb", sagemaker_session=sm_session)

xgb_tar = [f for f in os.listdir(f"{LOCAL_DIR}/xgb") if f.endswith(".tar.gz")][0]
with tarfile.open(os.path.join(f"{LOCAL_DIR}/xgb", xgb_tar)) as tar:
    tar.extractall(path=f"{LOCAL_DIR}/xgb_model")

# Load XGBoost model & predict
import xgboost as xgb
booster = xgb.Booster()
booster.load_model(f"{LOCAL_DIR}/xgb_model/xgboost-model")

dtest = xgb.DMatrix(X_test.values)
y_pred_probs = booster.predict(dtest)
y_pred_xgb = y_pred_probs.argmax(axis=1)

  tar.extractall(path=f"{LOCAL_DIR}/baseline_model")
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
  tar.extractall(path=f"{LOCAL_DIR}/xgb_model")


Debug - Feature Importance

In [17]:
importance = booster.get_score(importance_type='gain')
sorted_importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)

for i, (feat, score) in enumerate(sorted_importance[:10], 1):
    print(f"  {i}. {feat}: {score}")

  1. f2: 5.541542053222656
  2. f7: 2.1988487243652344
  3. f11: 1.7921347618103027
  4. f6: 1.7707606554031372
  5. f12: 1.7003593444824219
  6. f8: 1.6842241287231445
  7. f9: 1.6275262832641602
  8. f5: 1.592592716217041
  9. f3: 1.559146523475647
  10. f4: 1.5353251695632935


Evaluate & Compare Models

In [20]:
print("BASELINE MODEL (Logistic Regression)")
print(classification_report(y_test, y_pred_baseline))
print(f"Macro F1: {f1_score(y_test, y_pred_baseline, average='macro'):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_baseline))

print("XGBOOST MODEL")
print(classification_report(y_test, y_pred_xgb))
print(f"Macro F1: {f1_score(y_test, y_pred_xgb, average='macro'):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))

BASELINE MODEL (Logistic Regression)
              precision    recall  f1-score   support

           0       0.50      0.61      0.55       821
           1       0.49      0.61      0.55       847
           2       0.42      0.23      0.30       843

    accuracy                           0.48      2511
   macro avg       0.47      0.48      0.47      2511
weighted avg       0.47      0.48      0.47      2511

Macro F1: 0.4656

Confusion Matrix:
[[498 191 132]
 [187 519 141]
 [307 339 197]]
XGBOOST MODEL
              precision    recall  f1-score   support

           0       0.51      0.51      0.51       821
           1       0.52      0.55      0.53       847
           2       0.42      0.39      0.40       843

    accuracy                           0.48      2511
   macro avg       0.48      0.48      0.48      2511
weighted avg       0.48      0.48      0.48      2511

Macro F1: 0.4825

Confusion Matrix:
[[421 168 232]
 [151 462 234]
 [254 258 331]]
