# Setup and Configuration

This section initializes the SageMaker environment, defines the S3 paths, and sets up the IAM role and session to be used throughout the notebook.

In [None]:
# Use the %pip magic so the packages are installed into the environment of the
# running kernel (a `!pip` shell call may resolve to a different Python).
# Installing everything in one command lets pip choose mutually compatible
# versions of awscli, s3fs and boto3 instead of upgrading them independently.
# s3fs is the backend pandas needs in order to read `s3://` URIs.
# NOTE: if boto3 was already imported in this kernel, restart the kernel after upgrading.
%pip install --upgrade awscli s3fs boto3

import boto3
import sagemaker
from sagemaker import get_execution_role

# S3 location shared by the rest of the notebook: the raw CSV, the train/test
# splits and the model artifacts are all addressed as s3://<bucket>/<prefix>/...
bucket = 'eks-cobol-logs-730pkp'
prefix = 'training'

# SageMaker session (used for S3 uploads and region lookup) and the IAM
# execution role that is handed to the estimator.
session = sagemaker.Session()
role = get_execution_role()

# Load and Explore Data

This section loads the training data from S3 and performs basic inspection of the dataset to understand its structure and content.


In [None]:
import pandas as pd

# Read the training CSV directly from S3 using the bucket/prefix from the setup cell.
s3_uri = f's3://{bucket}/{prefix}/training-data.csv'
df = pd.read_csv(s3_uri)

# Preview the first rows with the notebook's rich table renderer.
print("Sample Data:")
sample_rows = df.head()
display(sample_rows)

# Number of rows per value of the target column.
print("\nLabel Distribution:")
label_counts = df['label'].value_counts()
print(label_counts)

# Preprocess Data

This section encodes the categorical `description` column as integer category codes and separates the features from the `label` column. The dataset is then split into training (80%) and test (20%) sets using stratified sampling to preserve the class distribution.


In [None]:
from sklearn.model_selection import train_test_split

# Build the feature matrix from `df` WITHOUT modifying `df`, so this cell produces
# the same result however many times it is re-run. (Overwriting the column in place
# means a second run re-encodes the codes themselves; if any description is missing,
# its -1 code becomes a category of its own and every code shifts.)
# `description` is replaced by integer category codes. The category -> code mapping
# is not stored here, so data sent for inference must be encoded the same way.
# TODO(review): persist the mapping if the endpoint will receive new data.
features = df.drop(columns="label").assign(
    description=lambda frame: frame["description"].astype("category").cat.codes
)
labels = df["label"]

# Stratify on the label so both splits keep the original class proportions;
# the fixed random_state makes the 80/20 split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=42
)

# Save and Upload Train/Test Sets to S3

The training and test sets are saved locally as header-less CSV files with the label in the first column, then uploaded to S3 so they can be used by the SageMaker training job.


In [None]:
# Local file names for the two splits.
train_file = 'train.csv'
test_file = 'test.csv'

# Put the label in the first column, followed by the features.
train_data = pd.concat([y_train, X_train], axis=1)
test_data = pd.concat([y_test, X_test], axis=1)

# Write plain CSV: no header row and no index column.
csv_options = dict(header=False, index=False)
train_data.to_csv(train_file, **csv_options)
test_data.to_csv(test_file, **csv_options)

# Upload both files under <prefix>/xgboost/ and keep the returned S3 locations
# for the training job.
xgb_prefix = f'{prefix}/xgboost'
train_s3_path = session.upload_data(train_file, bucket=bucket, key_prefix=xgb_prefix)
test_s3_path = session.upload_data(test_file, bucket=bucket, key_prefix=xgb_prefix)

# Train XGBoost Model

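This section defines the XGBoost training job using SageMaker's built-in XGBoost container. It specifies the hyperparameters and starts the training job using the datasets uploaded to S3; the training split is passed as the `train` channel and the held-out test split as the `validation` channel.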


In [None]:
from sagemaker.inputs import TrainingInput

# Built-in XGBoost (version 1.3-1) image for the region of the current session.
container = sagemaker.image_uris.retrieve("xgboost", session.boto_region_name, "1.3-1")

# No GPU tree method (tree_method="gpu_hist") is configured below, so training
# runs on the CPU. A general-purpose CPU instance avoids paying for a GPU that
# would sit idle (this was previously ml.p3.2xlarge).
train_instance_type = "ml.m5.xlarge"

xgb_estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type=train_instance_type,
    output_path=f's3://{bucket}/{prefix}/output',  # where the model artifact is written
    sagemaker_session=session
)

xgb_estimator.set_hyperparameters(
    objective="binary:logistic",  # binary classification, outputs a probability
    num_round=100,                # number of boosting rounds
    max_depth=5,
    eta=0.2,                      # learning rate
    subsample=0.8,                # row sampling ratio per tree
    colsample_bytree=0.8          # column sampling ratio per tree
)

# Launch the training job; the test split doubles as the validation channel.
xgb_estimator.fit({
    "train": TrainingInput(train_s3_path, content_type="csv"),
    "validation": TrainingInput(test_s3_path, content_type="csv")
})


# Deploy Model and Test Inference

This section deploys the trained model to a SageMaker endpoint and performs a prediction on a sample row from the test set.


In [None]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# The model is trained without any GPU-specific settings, so a CPU instance is
# enough to serve it and is cheaper to keep running than a GPU instance
# (this was previously ml.p3.2xlarge).
endpoint_instance_type = "ml.m5.xlarge"

predictor = xgb_estimator.deploy(initial_instance_count=1, instance_type=endpoint_instance_type)
# Send requests as CSV rows and parse the responses as JSON.
predictor.serializer = CSVSerializer()
predictor.deserializer = JSONDeserializer()

# Smoke test: score the first test row (features only, no header or index).
sample = X_test.head(1).to_csv(header=False, index=False).strip()
print("Sample row:", sample)
print("Prediction:", predictor.predict(sample))


# Evaluate the Model

This section evaluates the model performance using standard metrics and displays a confusion matrix.


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Real-time endpoints limit the size of a single request body, so score the test
# set in row batches instead of sending everything in one call.
# NOTE(review): 500 rows per request assumes fairly narrow rows - lower it if
# requests are rejected as too large.
batch_rows = 500
decision_threshold = 0.5  # probability above which a row is classified as 1

# Run batch prediction; batches are sent in order, so predictions line up with X_test rows.
y_pred_proba = []
for start in range(0, len(X_test), batch_rows):
    payload = X_test.iloc[start:start + batch_rows].to_csv(header=False, index=False)
    y_pred_proba.extend(predictor.predict(payload)['predictions'])
y_pred = [int(float(p['score']) > decision_threshold) for p in y_pred_proba]

# Fail early with a clear message if the endpoint returned a different number of rows.
assert len(y_pred) == len(y_test), (
    f"expected {len(y_test)} predictions, got {len(y_pred)}"
)

# Classification report
print(classification_report(y_test, y_pred))

# Confusion matrix
# NOTE(review): display labels assume label 0 means "Valid" and 1 means "Error" - confirm.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Valid", "Error"])
disp.plot(cmap="Blues");  # trailing ';' hides the object repr under the figure

# Cleanup

This section deletes the deployed endpoint to avoid incurring charges.
