In [None]:
#Note: Used references from Billy Fong (TA for DS210), as well as AWS references:
#https://sagemaker-examples.readthedocs.io/en/latest/introduction_to_amazon_algorithms/xgboost_abalone/xgboost_managed_spot_training.html
#Note: This code was run previously, with limited later edits to clean up formatting and comments. The final code was not re-run, to avoid any conflicts with existing model files in S3.

In [None]:
#Note: only needed on initial install
#!pip install --upgrade sagemaker

## Setting up environment

In [None]:
#Importing key packages
import io
import os
import boto3
import sagemaker
import time
import botocore
from sagemaker import get_execution_role, image_uris, model_uris, script_uris, hyperparameters
from sagemaker.estimator import Estimator
from sagemaker.utils import name_from_base
from sagemaker.tuner import HyperparameterTuner
from time import gmtime, strftime
# `plt` must alias the pyplot module: the top-level matplotlib package has no
# show()/title()/tight_layout(), so `import matplotlib as plt` breaks plotting calls.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [None]:
#Setting up

# Execution role for SageMaker API calls, plus one boto3 session whose region
# is reused for the SageMaker session.
role = sagemaker.get_execution_role()
boto_session = boto3.Session()
region = boto_session.region_name
sess = sagemaker.Session(boto_session=boto_session)

print(f"Role: {role}")
print(f"Region: {region}")
print(f"SageMaker Session Region: {sess.boto_region_name}")

In [None]:
#Setting up Bucket Links/Info

bucket = 'sagemaker-us-east-2-917456409349'
s3_bucket_prefix = "sagemaker/adoption/Code/"
# `prefix` is an object-key prefix that later cells combine as
# f"s3://{bucket}/{prefix}/..." and f"{prefix}/train/...". It must therefore
# NOT contain the bucket name (that would yield s3://bucket/bucket/...) and
# must not end in "/" (that would yield "//" inside every derived key).
prefix = s3_bucket_prefix.strip("/")


In [None]:
prefix

In [None]:
#Set Container
# Image URI of the XGBoost 1.7-1 container for the current region.
container = sagemaker.image_uris.retrieve(framework="xgboost", region=region, version="1.7-1")


## Ingesting Data and splitting/training/test

In [None]:
#Load Source file to do encoding and split train/test
# Downloads the harmonized CSV from S3 into memory and parses it into `df`.

import boto3
import pandas as pd
import io

# Source object location; note that `bucket` and `key` are (re)assigned here
# and stay visible to later cells.
bucket = "sagemaker-us-east-2-917456409349"
key = "sagemaker/adoption/golden_record/df_cat_dog_harmonized.csv"

s3_client = boto3.client("s3")
obj = s3_client.get_object(Bucket=bucket, Key=key)
# read() pulls the whole response body; BytesIO gives read_csv a file-like buffer.
df = pd.read_csv(io.BytesIO(obj['Body'].read()))



In [None]:
#Length prededupe

np.shape(df)

In [None]:
#Deduping

# Rows sharing a primary_key are collapsed to the last occurrence.
df_deduped = df.drop_duplicates(subset=['primary_key'], keep='last')

print(f"Original rows: {len(df)}")
print(f"After deduplication: {len(df_deduped)}")



In [None]:
#Setting to Deduped so rest of code works

df = df_deduped.copy()


In [None]:
#See all columns
all_columns = df.columns.tolist()
print(all_columns)
print("Total columns:", len(all_columns))

In [None]:
#Setting outcome_type_harmonized_grouped to be first for Sagemaker Training

column = 'outcome_type_harmonized_grouped'
# Label column first, every other column afterwards in its existing order.
cols = [column]
cols.extend(col for col in df.columns if col != column)
df = df[cols]

#Check new order
df.head()


In [None]:
#Adding Custom Train/Test/Split
def assign_split(row):
    """Return the dataset split for one record based on its ``outcome_year``.

    Args:
        row: mapping-like record that supports ``row['outcome_year']``.

    Returns:
        "train" for years <= 2022, "validate" for 2023 or 2024,
        "test" for 2025, and "exclude" for anything else.
    """
    year = row['outcome_year']
    if year <= 2022:
        return "train"
    if year in (2023, 2024):
        return "validate"
    # Anything that is not exactly 2025 at this point is an unexpected year.
    return "test" if year == 2025 else "exclude"

# Row-wise apply: each row is passed to assign_split and the returned label
# is stored in the new 'split' column.
df['split'] = df.apply(assign_split, axis=1)



In [None]:
#See all columns
all_columns = df.columns.tolist()
print(all_columns)
print("Total columns:", len(all_columns))



In [None]:

#Naming features to keep and drop if needed, but won't as keeping standard format of xlsx.
features_to_keep = ['outcome_type_harmonized_grouped','animal_type', 'primary_breed_harmonized', 'primary_color_harmonized',
    'sex', 'intake_type_harmonized',
    'Is_returned', 'has_name', 'is_mix', 'Num_returned', 'age_months','stay_length_days', 'min_height', 'max_height',
    'min_weight', 'max_weight', 'min_expectancy', 'max_expectancy',
    'grooming_frequency_value', 'shedding_value', 'energy_level_value',
    'trainability_value', 'demeanor_value'
]

# # Trim the DataFrame to only those columns
# df = df[features_to_keep].copy()



In [None]:
#See all columns
all_columns = df.columns.tolist()
print(all_columns)
print("Total columns:", len(all_columns))

In [None]:
df.head(10)

In [None]:
#Pre-encoding Adoption
# Binary target: 1 where the outcome equals 'adopted', 0 for every other value.
# The guard keeps the cell idempotent: once the column is numeric it no longer
# contains the string 'adopted', so re-running the comparison would silently
# turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['outcome_type_harmonized_grouped']):
    df['outcome_type_harmonized_grouped'] = (df['outcome_type_harmonized_grouped'] == 'adopted').astype(int)


In [None]:
df.head(10)

In [None]:
#Encoding
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the un-encoded frame stays available
encoded_df = df.copy()

# Specific columns you want to encode
columns_to_encode = [
    'animal_type', 'primary_breed_harmonized', 'primary_color_harmonized',
    'sex', 'intake_type_harmonized',
    'Is_returned', 'has_name', 'is_mix'
]

# Dictionary to store label encoders (optional: for inverse transform or saving later)
le_dict = {}

# Apply label encoding to specified columns, save in new "Encoded-<col>" columns.
# Values are cast to str first, so every column is encoded from its string form.
for col in columns_to_encode:
    le = LabelEncoder()
    encoded_col_name = f"Encoded-{col}"
    encoded_df[encoded_col_name] = le.fit_transform(encoded_df[col].astype(str))
    le_dict[col] = le

# Fill missing age_months with the median of the TRAINING split only, so the
# imputation value is not influenced by validation/test rows (no leakage).
median_age = encoded_df.loc[encoded_df['split'] == 'train', 'age_months'].median()
if pd.isna(median_age):
    # No usable training ages: fall back to the overall median.
    median_age = encoded_df['age_months'].median()
encoded_df['age_months'] = encoded_df['age_months'].fillna(median_age)




In [None]:
np.shape(encoded_df)

In [None]:
#Seeing encoded table
encoded_df.head(10)

In [None]:
# Saving Encoding
# Persist le_dict (column name -> fitted LabelEncoder) to a local pickle file so
# the same category-to-integer mapping can be loaded again later.
import pickle

with open("label_encoders.pkl", "wb") as f:
    pickle.dump(le_dict, f)




In [None]:
#See all columns
all_columns = encoded_df.columns.tolist()
print(all_columns)
print("Total columns:", len(all_columns))



In [None]:
encoded_df.head()

In [None]:
np.shape(encoded_df)

In [None]:
# Splitting for training

# Rows are selected by their 'split' label; rows with any other label
# (e.g. "exclude") end up in none of the three frames.
split_labels = encoded_df['split']
df_train = encoded_df[split_labels == 'train']
df_test = encoded_df[split_labels == 'test']
df_validate = encoded_df[split_labels == 'validate']

# Save each to CSV (no index)
for split_frame, csv_name in (
    (df_train, "train.csv"),
    (df_test, "test.csv"),
    (df_validate, "validate.csv"),
):
    split_frame.to_csv(csv_name, index=False)

# Output sizes
print(f"Train rows: {len(df_train)}")
print(f"Test rows: {len(df_test)}")
print(f"Validate rows: {len(df_validate)}")




In [None]:
df_train.head()


In [None]:
df_test.head()

In [None]:
df_validate.head()

In [None]:
bucket

In [None]:
prefix

In [None]:
key

## Training Model Locally


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import pandas as pd
import matplotlib.pyplot as plt

# Define feature columns
feature_columns = [
'Encoded-animal_type', 
    'Encoded-primary_breed_harmonized', 
    'Encoded-primary_color_harmonized', 
    'Encoded-sex', 
    'Encoded-intake_type_harmonized', 
    'Encoded-Is_returned', 
    'Encoded-has_name', 
    'Encoded-is_mix',
    'age_months',    
    'Num_returned', 
    'stay_length_days', 
    'min_height', 
    'max_height',
    'min_weight', 
    'max_weight', 
    'min_expectancy', 
    'max_expectancy',
    'grooming_frequency_value', 
    'shedding_value', 
    'energy_level_value',
    'trainability_value', 
    'demeanor_value'
]

# Feature matrix and target vector per split, restricted to the selected features
X_train, y_train = df_train[feature_columns], df_train['outcome_type_harmonized_grouped']
X_val, y_val = df_validate[feature_columns], df_validate['outcome_type_harmonized_grouped']
X_test, y_test = df_test[feature_columns], df_test['outcome_type_harmonized_grouped']

# Baseline classifier with fixed hyperparameters (no tuning in this cell)
xgb_model = XGBClassifier(**{
    "max_depth": 5,
    "learning_rate": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.7,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "n_estimators": 50,
    "n_jobs": -1,
    "random_state": 42,
})

# Fit on the training split only
xgb_model.fit(X_train, y_train)

# Predict and evaluate
def evaluate_performance(X, y_true, dataset_name):
    """Report how the global ``xgb_model`` performs on one split.

    Prints a classification report and shows a confusion-matrix plot.

    Args:
        X: feature matrix passed to ``xgb_model.predict``.
        y_true: true labels for the rows of ``X``.
        dataset_name: label used in the printed heading and the plot title.
    """
    predicted = xgb_model.predict(X)
    print(f"\n{dataset_name} Set Performance:")
    print(classification_report(y_true, predicted))

    matrix_plot = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_true, predicted))
    matrix_plot.plot(cmap='Blues')
    plt.title(f"{dataset_name} confusion matrix")
    plt.show()

# Run evaluations
evaluate_performance(X_train, y_train, "Training")
evaluate_performance(X_val, y_val, "Validation")
evaluate_performance(X_test, y_test, "Test")





In [None]:
#Running a local hyperparameter grid search (scikit-learn GridSearchCV, not SageMaker AMT) to see if can improve Test performance

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import pandas as pd

feature_columns = [
'Encoded-animal_type', 
    'Encoded-primary_breed_harmonized', 
    'Encoded-primary_color_harmonized', 
    'Encoded-sex', 
    'Encoded-intake_type_harmonized', 
    'Encoded-Is_returned', 
    'Encoded-has_name', 
    'Encoded-is_mix',
    'age_months',    
    'Num_returned', 
    'stay_length_days', 
    'min_height', 
    'max_height',
    'min_weight', 
    'max_weight', 
    'min_expectancy', 
    'max_expectancy',
    'grooming_frequency_value', 
    'shedding_value', 
    'energy_level_value',
    'trainability_value', 
    'demeanor_value'
]


# prepare datasets using only selected features
X_train = df_train[feature_columns]
y_train = df_train['outcome_type_harmonized_grouped']

X_val = df_validate[feature_columns]
y_val = df_validate['outcome_type_harmonized_grouped']

X_test = df_test[feature_columns]
y_test = df_test['outcome_type_harmonized_grouped']

# Hyperparameter search that respects the temporal train/validate split.
# Plain k-fold CV over train+val mixes outcome years between folds, and the
# default refit trains the final model on the validation rows too, which makes
# the "Validation" report below in-sample. Instead: score every candidate on
# the validation split only, then fit the winner on the training split only.
from sklearn.base import clone
from sklearn.model_selection import PredefinedSplit

# Combine train and val so GridSearchCV sees both; the fold vector tells it
# which rows are fit-only (-1) and which form the single validation fold (0).
X_trainval = pd.concat([X_train, X_val])
y_trainval = pd.concat([y_train, y_val])
validation_fold = [-1] * len(X_train) + [0] * len(X_val)

# Define parameter grid
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1, 0.2],
    'gamma': [0, 2, 4],
    'min_child_weight': [1, 4, 6],
    'subsample': [0.6, 0.8, 1.0],
    'n_estimators': [50, 100],
}

# Initialize model
xgb_base = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_jobs=-1,
    random_state=42
)

# Grid search scored by F1 on the validation split (one predefined fold)
grid_search = GridSearchCV(
    estimator=xgb_base,
    param_grid=param_grid,
    scoring='f1',
    cv=PredefinedSplit(validation_fold),
    refit=False,  # the final fit is done explicitly below, on train only
    verbose=1,
    n_jobs=-1
)

# Fit grid search
grid_search.fit(X_trainval, y_trainval)

print("Best parameters found:", grid_search.best_params_)

# Final model: best parameters, trained on the training split only, so the
# Validation and Test evaluations are both out-of-sample.
best_model = clone(xgb_base).set_params(**grid_search.best_params_)
best_model.fit(X_train, y_train)

# Evaluation function
def evaluate_performance(X, y_true, dataset_name):
    """Report how the global ``best_model`` performs on one split.

    Prints a classification report and shows a confusion-matrix plot.
    Note: this redefines the function of the same name from the earlier cell.

    Args:
        X: feature matrix passed to ``best_model.predict``.
        y_true: true labels for the rows of ``X``.
        dataset_name: label used in the printed heading and the plot title.
    """
    predicted = best_model.predict(X)
    print(f"\n{dataset_name} Set Performance:")
    print(classification_report(y_true, predicted))

    matrix_plot = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_true, predicted))
    matrix_plot.plot(cmap='Blues')
    plt.title(f"{dataset_name} Confusion Matrix")
    plt.show()

# Run evaluations
evaluate_performance(X_train, y_train, "Training")
evaluate_performance(X_val, y_val, "Validation")
evaluate_performance(X_test, y_test, "Test")


In [None]:
#Outputting Best Parameters, saving model 

# Winning parameter combination selected by grid_search
best_params = grid_search.best_params_
print(best_params)

# Write the tuned model to a JSON file in the working directory
best_model.save_model("best_xgb_model_local.json") 



## Sagemaker Model Parameter and Training

In [None]:

#Uploading to S3, with header and index

import io
import boto3

bucket_name = bucket
base_prefix = prefix
s3_client = boto3.client("s3")

# Reorder columns to ensure label is first
def reorder_columns(df, label_col):
    """Return ``df`` with ``label_col`` as the first column.

    All other columns keep their relative order; the input frame is not modified.
    """
    remaining = [name for name in df.columns if name != label_col]
    return df[[label_col] + remaining]

# Helper to upload a DataFrame as CSV to S3 (with header and index)
def upload_df_to_s3(df, path, label_col):
    """Serialize ``df`` to CSV in memory and store it in S3.

    Args:
        df: DataFrame to upload; its label column is moved to the front first.
        path: object key inside ``bucket_name``.
        label_col: name of the label column.

    The CSV keeps both the header row and the index column.
    """
    ordered = reorder_columns(df, label_col)

    buffer = io.StringIO()
    ordered.to_csv(buffer, index=True, header=True)  #  keep index and header
    s3_client.put_object(Bucket=bucket_name, Key=path, Body=buffer.getvalue())
    print(f"Uploaded to s3://{bucket_name}/{path} (with header and index)")

# Upload each split (train, test, validate - in that order)
label_col = 'outcome_type_harmonized_grouped'

for split_frame, split_key in (
    (df_train, f"{base_prefix}/train/train_data.csv"),
    (df_test, f"{base_prefix}/test/test_data.csv"),
    (df_validate, f"{base_prefix}/val/validate_data.csv"),
):
    upload_df_to_s3(split_frame, split_key, label_col)



In [None]:
# Set Parameters for Binary Classification
# NOTE(review): this assignment rebinds the name `hyperparameters`, which the
# import cell also imports from `sagemaker`; from here on the name is this dict.
# NOTE(review): scale_pos_weight is set here but not in the local XGBClassifier
# cells; presumably 2.16 is the negative/positive class ratio - confirm.

hyperparameters = {
    "max_depth": "5",
    "eta": "0.2",                   
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "num_round": "50",      
    "verbosity": "2",
    "seed": "42",
    "scale_pos_weight": "2.16"  
}
# Set output
instance_type = "ml.m5.4xlarge"
# S3 URI passed to the estimator as its output location; built from `bucket` and `prefix`.
output_path = "s3://{}/{}/output".format(bucket, prefix)
content_type = "csv"




In [None]:
import boto3
import pandas as pd
import io

# S3 path
bucket = "sagemaker-us-east-2-917456409349"
# Read back exactly the object the upload cell writes
# (f"{base_prefix}/train/train_data.csv" with base_prefix = prefix). A separate
# hard-coded key would point at a different, possibly stale or missing, object.
input_key = f"{prefix}/train/train_data.csv"
output_key = "sagemaker/adoption/train/train_data_filtered/train_data.csv" #EDIT-Remember to update this if simplifying pipeline

# Define target and features
target_column = 'outcome_type_harmonized_grouped'
feature_columns = [
'Encoded-animal_type', 
    'Encoded-primary_breed_harmonized', 
    'Encoded-primary_color_harmonized', 
    'Encoded-sex', 
    'Encoded-intake_type_harmonized', 
    'Encoded-Is_returned', 
    'Encoded-has_name', 
    'Encoded-is_mix',
    'age_months',    
    'Num_returned', 
    'stay_length_days', 
    'min_height', 
    'max_height',
    'min_weight', 
    'max_weight', 
    'min_expectancy', 
    'max_expectancy',
    'grooming_frequency_value', 
    'shedding_value', 
    'energy_level_value',
    'trainability_value', 
    'demeanor_value'
]

columns_to_upload = [target_column] + feature_columns  # label first

# Load CSV from S3
# NOTE: this rebinds the global `df` to the downloaded training CSV.
s3 = boto3.client("s3")
response = s3.get_object(Bucket=bucket, Key=input_key)
body = response["Body"].read()
df = pd.read_csv(io.BytesIO(body))

#drop the first column if it's an unnamed index
# (a column whose name is empty or starts with "unnamed", case-insensitive)
if df.columns[0].lower().startswith('unnamed') or df.columns[0] == '':
    df = df.iloc[:, 1:]

# Filter and drop rows with missing target
# Keeps only the label plus the model features, in that order.
df_filtered = df[columns_to_upload].dropna(subset=[target_column])

# Write to memory (no header, no index)
csv_buffer = io.StringIO()
df_filtered.to_csv(csv_buffer, index=False, header=False)

# Upload to S3
s3.put_object(Bucket=bucket, Key=output_key, Body=csv_buffer.getvalue())
print(f" Uploaded clean CSV to s3://{bucket}/{output_key} (no index, no header)")



In [None]:
#Training using predefined hyperparameters
#Edit: Referenced code from example XGBoost Model Abalone
#https://sagemaker-examples.readthedocs.io/en/latest/introduction_to_amazon_algorithms/xgboost_abalone/xgboost_managed_spot_training.html
#7/4/25

import time
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
import sagemaker

# Job name = fixed prefix + UTC timestamp
job_timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.gmtime())
job_name = f"adoption-xgboost-{job_timestamp}"
print("Training job:", job_name)

# Spot instance configuration
use_spot_instances = True
max_run = 3600  # max time for actual training
if use_spot_instances:
    max_wait = 7200  # max total time (includes waiting for spot)
    # Checkpoint path (optional for spot)
    checkpoint_s3_uri = f"s3://{bucket}/{prefix}/checkpoints/{job_name}"
else:
    max_wait = None
    checkpoint_s3_uri = None
print("Checkpoint path:", checkpoint_s3_uri)

# Estimator settings collected in one place, then passed as keyword arguments
estimator_settings = {
    "image_uri": container,
    "role": role,
    "instance_count": 1,
    "instance_type": instance_type,
    "volume_size": 5,  # GB
    "output_path": output_path,
    "sagemaker_session": sagemaker.Session(),
    "use_spot_instances": use_spot_instances,
    "max_run": max_run,
    "max_wait": max_wait,
    "checkpoint_s3_uri": checkpoint_s3_uri,
    "hyperparameters": hyperparameters,
}
estimator = Estimator(**estimator_settings)

# Training input from S3 (folder-style path)
train_input = TrainingInput(
    s3_data=f"s3://{bucket}/sagemaker/adoption/train/train_data_filtered",
    content_type="text/csv"
)

# Launch training job (only a "train" channel is supplied)
estimator.fit({"train": train_input}, job_name=job_name)


## Testing Sagemaker Endpoint Model - ONLY DO THIS TO TEST MODEL IS WORKING BEFORE TESTING ENDPOINT

In [None]:
import boto3

# Download a trained model archive from S3 into the working directory.
# NOTE: `key` is hard-coded to the output of one specific training job
# (adoption-xgboost-2025-07-17-00-53-45); update it to test a different job.
s3 = boto3.client('s3')
bucket = 'sagemaker-us-east-2-917456409349'
key = 'sagemaker/adoption/output/adoption-xgboost-2025-07-17-00-53-45/output/model.tar.gz' 
local_path = 'model.tar.gz'

# Download model.tar.gz
s3.download_file(bucket, key, local_path)
print(" model.tar.gz downloaded locally.")



In [None]:
import tarfile

# Extract model.tar.gz
with tarfile.open("model.tar.gz") as tar:
    tar.extractall("model_dir")

print(" Extracted model files:")
!ls model_dir


In [None]:
import matplotlib.pyplot as plt
import xgboost as xgb

# Load trained model
booster = xgb.Booster()
booster.load_model("model_dir/xgboost-model")  # path from SageMaker tar.gz

# Manually assign feature names 
# Uses whatever `feature_columns` currently holds in the kernel.
# NOTE(review): presumably needed because the training CSV had no header row - confirm.
booster.feature_names = feature_columns

# Plot feature importance
# importance_type='weight' is what the title below calls "by frequency".
xgb.plot_importance(booster, importance_type='weight', show_values=True)
plt.title("Feature Importance (by frequency)")
plt.tight_layout()
plt.show()




In [None]:
#Seeing all columns so I know which ones to drop

df_train.columns.tolist()


In [None]:
## Local Testing of tar.gz model

import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np
import matplotlib.pyplot as plt

# Load trained XGBoost model
booster = xgb.Booster()
booster.load_model("model_dir/xgboost-model")

# Define feature columns (must match training)
feature_columns = ['Encoded-animal_type', 
    'Encoded-primary_breed_harmonized', 
    'Encoded-primary_color_harmonized', 
    'Encoded-sex', 
    'Encoded-intake_type_harmonized', 
    'Encoded-Is_returned', 
    'Encoded-has_name', 
    'Encoded-is_mix',
    'age_months',    
    'Num_returned', 
    'stay_length_days', 
    'min_height', 
    'max_height',
    'min_weight', 
    'max_weight', 
    'min_expectancy', 
    'max_expectancy',
    'grooming_frequency_value', 
    'shedding_value', 
    'energy_level_value',
    'trainability_value', 
    'demeanor_value'
] 

target_column = 'outcome_type_harmonized_grouped'

# Prepare datasets (features + integer labels for each split)
X_train, y_train = df_train[feature_columns], df_train[target_column].astype(int)
X_val, y_val = df_validate[feature_columns], df_validate[target_column].astype(int)
X_test, y_test = df_test[feature_columns], df_test[target_column].astype(int)

# Convert to DMatrix with feature names
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_columns)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=feature_columns)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feature_columns)

# Predict probabilities
y_train_pred = booster.predict(dtrain)
y_val_pred = booster.predict(dval)
y_test_pred = booster.predict(dtest)

# Convert to binary predictions (score >= 0.5 counts as the positive class)
y_train_pred_labels = (y_train_pred >= 0.5).astype(int)
y_val_pred_labels = (y_val_pred >= 0.5).astype(int)
y_test_pred_labels = (y_test_pred >= 0.5).astype(int)

# Evaluate: classification report + AUC for each split, in train/val/test order
for _split_title, _split_true, _split_proba, _split_labels in (
    ("Training", y_train, y_train_pred, y_train_pred_labels),
    ("Validation", y_val, y_val_pred, y_val_pred_labels),
    ("Test", y_test, y_test_pred, y_test_pred_labels),
):
    print(f"{_split_title} Set Performance:")
    print(classification_report(_split_true, _split_labels, zero_division=0))
    print(f"AUC: {roc_auc_score(_split_true, _split_proba):.4f}")

# Distributions
print("\nPredicted labels distribution:")
print("Train:", np.bincount(y_train_pred_labels))
print("Val:  ", np.bincount(y_val_pred_labels))
print("Test: ", np.bincount(y_test_pred_labels))

print("\nTrue labels:")
print("Train:", np.bincount(y_train))
print("Val:  ", np.bincount(y_val))
print("Test: ", np.bincount(y_test))

# Feature Importance
booster.feature_names = feature_columns
xgb.plot_importance(booster, importance_type='gain', show_values=True)
plt.title("Feature Importance (Gain)")
plt.tight_layout()
plt.show()


## Setting up Endpoint - ONLY DO WHEN SETTING UP ENDPOINT OTHERWISE SKIP OVER

In [None]:
#EDIT/NOTE: Do this after locally testing Sagemaker Model for effectiveness.
# Deploys the trained model artifact to a serverless endpoint.
# Requires `estimator` (training cell), `container` and `role` to exist in the kernel.

from sagemaker.serverless import ServerlessInferenceConfig
from sagemaker.model import Model

# Serverless sizing: memory per invocation and maximum concurrent invocations
serverless_config = ServerlessInferenceConfig(
    memory_size_in_mb=6144,
    max_concurrency=3,
)

# Model = inference container image + the artifact produced by the training job
model = Model(
    image_uri=container,
    model_data=estimator.model_data,
    role=role,
    sagemaker_session=sagemaker.Session()
)

predictor = model.deploy(
    serverless_inference_config=serverless_config
)


## Endpoint Testing


In [None]:
#Note: Make sure to manually update endpoint name as needed
endpoint_name = 'sagemaker-xgboost-2025-07-17-01-08-23-168'

In [None]:
# #Note: Files sourced in S3 Have no Headers but columns are: feature_columns = ['Encoded-animal_type', 
#     'Encoded-primary_breed_harmonized', 
#     'Encoded-primary_color_harmonized', 
#     'Encoded-sex', 
#     'Encoded-intake_type_harmonized', 
#     'Encoded-Is_returned', 
#     'Encoded-has_name', 
#     'Encoded-is_mix',
#     'age_months',    
#     'Num_returned', 
#     'stay_length_days', 
#     'min_height', 
#     'max_height',
#     'min_weight', 
#     'max_weight', 
#     'min_expectancy', 
#     'max_expectancy',
#     'grooming_frequency_value', 
#     'shedding_value', 
#     'energy_level_value',
#     'trainability_value', 
#     'demeanor_value'
# ] 


In [None]:
prefix

In [None]:
X_train

In [None]:
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score, RocCurveDisplay
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os


# CONFIGURATION

MODEL_PATH = "model_dir/xgboost-model"  # path to SageMaker model
OUTPUT_PREDICTIONS_CSV = "model_predictions.csv"  # output predictions file

# Feature list (must match model training)
feature_columns = ['Encoded-animal_type', 
    'Encoded-primary_breed_harmonized', 
    'Encoded-primary_color_harmonized', 
    'Encoded-sex', 
    'Encoded-intake_type_harmonized', 
    'Encoded-Is_returned', 
    'Encoded-has_name', 
    'Encoded-is_mix',
    'age_months',    
    'Num_returned', 
    'stay_length_days', 
    'min_height', 
    'max_height',
    'min_weight', 
    'max_weight', 
    'min_expectancy', 
    'max_expectancy',
    'grooming_frequency_value', 
    'shedding_value', 
    'energy_level_value',
    'trainability_value', 
    'demeanor_value'
] 
target_column = 'outcome_type_harmonized_grouped'

# LOAD MODEL
# Loads the booster from MODEL_PATH (the extracted SageMaker artifact).

booster = xgb.Booster()
booster.load_model(MODEL_PATH)


# PREPARE DATASETS
# Features and integer labels for each in-memory split.

X_train = df_train[feature_columns]
y_train = df_train[target_column].astype(int)

X_val = df_validate[feature_columns]
y_val = df_validate[target_column].astype(int)

X_test = df_test[feature_columns]
y_test = df_test[target_column].astype(int)

# Convert to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_columns)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=feature_columns)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feature_columns)


# PREDICTIONS
# booster.predict output is treated as a probability and thresholded below.

y_train_proba = booster.predict(dtrain)
y_val_proba = booster.predict(dval)
y_test_proba = booster.predict(dtest)

# Binary classification threshold
# A score >= threshold is labelled 1, otherwise 0.
threshold = 0.5
y_train_pred = (y_train_proba >= threshold).astype(int)
y_val_pred = (y_val_proba >= threshold).astype(int)
y_test_pred = (y_test_proba >= threshold).astype(int)


# PERFORMANCE REPORTS

def evaluate_performance(y_true, y_pred, y_proba, name):
    """Print the classification report and ROC AUC for one split.

    Note: this redefines ``evaluate_performance`` from earlier cells with a
    different signature (labels/probabilities instead of a feature matrix).

    Args:
        y_true: true labels.
        y_pred: predicted class labels.
        y_proba: predicted scores used for the AUC.
        name: heading printed before the report.

    Returns:
        The ROC AUC computed from ``y_true`` and ``y_proba``.
    """
    print(f"\n{name} Performance:")
    print(classification_report(y_true, y_pred, zero_division=0))
    split_auc = roc_auc_score(y_true, y_proba)
    print(f"AUC: {split_auc:.4f}")
    return split_auc

auc_train = evaluate_performance(y_train, y_train_pred, y_train_proba, "TRAIN")
auc_val = evaluate_performance(y_val, y_val_pred, y_val_proba, "VALIDATION")
auc_test = evaluate_performance(y_test, y_test_pred, y_test_proba, "TEST")


# ROC CURVES
# All three curves are drawn on ONE shared Axes. Without `ax=`, every
# from_predictions call opens its own figure, so the curves are never overlaid
# and the title/legend would only apply to the last one.

fig, ax = plt.subplots()
RocCurveDisplay.from_predictions(y_train, y_train_proba, name="Train", ax=ax)
RocCurveDisplay.from_predictions(y_val, y_val_proba, name="Validation", ax=ax)
RocCurveDisplay.from_predictions(y_test, y_test_proba, name="Test", ax=ax)
ax.set_title("ROC Curves (Train / Val / Test)")
ax.legend()
plt.show()


# FEATURE IMPORTANCE

booster.feature_names = feature_columns
xgb.plot_importance(booster, importance_type='gain', show_values=True)
plt.title("Feature Importance (Gain)")
plt.tight_layout()
plt.show()


# SAVE PREDICTIONS TO CSV
# One row per sample, stacked in Train -> Validation -> Test order.

dataset_tags = ["Train"] * len(y_train) + ["Validation"] * len(y_val) + ["Test"] * len(y_test)
predictions_df = pd.DataFrame({
    "Dataset": dataset_tags,
    "True_Label": np.concatenate((y_train, y_val, y_test)),
    "Predicted_Label": np.concatenate((y_train_pred, y_val_pred, y_test_pred)),
    "Predicted_Probability": np.concatenate((y_train_proba, y_val_proba, y_test_proba)),
})
predictions_df.to_csv(OUTPUT_PREDICTIONS_CSV, index=False)
print(f"\n Predictions saved to {OUTPUT_PREDICTIONS_CSV}")




In [None]:
#Testing on Train Data

import boto3
import pandas as pd
import io
import numpy as np
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
from sklearn.metrics import classification_report

# Configuration
# train_key follows the same f"{prefix}/train/train_data.csv" pattern the upload cell writes.
train_key = f"{prefix}/train/train_data.csv"
batch_size = 500  # adjust for performance (rows sent per endpoint request)

# Predictor setup
# Requests are serialized as CSV; responses are parsed as JSON.
predictor = Predictor(
    endpoint_name=endpoint_name,
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer()
)

# Load train data from S3
s3_client = boto3.client("s3")
obj = s3_client.get_object(Bucket=bucket, Key=train_key)
df_full = pd.read_csv(io.BytesIO(obj["Body"].read()), header=0)  # assumes CSV has header

# Define features used by model
feature_columns =['Encoded-animal_type', 
    'Encoded-primary_breed_harmonized', 
    'Encoded-primary_color_harmonized', 
    'Encoded-sex', 
    'Encoded-intake_type_harmonized', 
    'Encoded-Is_returned', 
    'Encoded-has_name', 
    'Encoded-is_mix',
    'age_months',    
    'Num_returned', 
    'stay_length_days', 
    'min_height', 
    'max_height',
    'min_weight', 
    'max_weight', 
    'min_expectancy', 
    'max_expectancy',
    'grooming_frequency_value', 
    'shedding_value', 
    'energy_level_value',
    'trainability_value', 
    'demeanor_value'
] 

# Label
target_column = 'outcome_type_harmonized_grouped'

# Extract label and features
y_true = df_full[target_column].values
X = df_full[feature_columns]

# Batch inference: send batch_size rows per request as headerless CSV lines
y_pred = []
for i in range(0, len(X), batch_size):
    batch = X.iloc[i:i + batch_size]
    payload = "\n".join([",".join(map(str, row)) for row in batch.values])
    response = predictor.predict(payload)
    scores = [p['score'] for p in response['predictions']]
    # Same decision rule as the local evaluation cells (score >= 0.5 -> 1).
    # round() is not equivalent: it rounds half to even, so round(0.5) == 0.
    y_pred.extend([int(score >= 0.5) for score in scores])

# Append predictions
df_full["predicted_outcome"] = y_pred

# Evaluate
print("Train Set Performance:")
print(classification_report(y_true, y_pred, zero_division=0))

# (Optional) Save locally or to S3
#df_full.to_csv("train_with_predictions.csv", index=False)


## Endpoint Testing sending a Payload

In [None]:
import pandas as pd
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# Config
local_csv_path = "test.csv"
batch_size = 500
# Define features used by model
feature_columns =['Encoded-animal_type', 
    'Encoded-primary_breed_harmonized', 
    'Encoded-primary_color_harmonized', 
    'Encoded-sex', 
    'Encoded-intake_type_harmonized', 
    'Encoded-Is_returned', 
    'Encoded-has_name', 
    'Encoded-is_mix',
    'age_months',    
    'Num_returned', 
    'stay_length_days', 
    'min_height', 
    'max_height',
    'min_weight', 
    'max_weight', 
    'min_expectancy', 
    'max_expectancy',
    'grooming_frequency_value', 
    'shedding_value', 
    'energy_level_value',
    'trainability_value', 
    'demeanor_value'
] 


# Label
target_column = 'outcome_type_harmonized_grouped'


# Predictor: CSV requests in, JSON responses out
predictor = Predictor(
    endpoint_name=endpoint_name,
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer()
)

# Load and prepare data
df_full = pd.read_csv(local_csv_path)
X = df_full[feature_columns]

# Predict in batches of batch_size rows, sent as headerless CSV lines
preds = []
for i in range(0, len(X), batch_size):
    batch = X.iloc[i:i + batch_size]
    payload = "\n".join([",".join(map(str, row)) for row in batch.values])
    response = predictor.predict(payload)
    scores = [p["score"] for p in response["predictions"]]
    # Same decision rule as the local evaluation cells (score >= 0.5 -> 1).
    # round() is not equivalent: it rounds half to even, so round(0.5) == 0.
    preds.extend([int(score >= 0.5) for score in scores])

# Append predictions to full data
df_with_preds = df_full.copy()
df_with_preds["predicted_outcome"] = preds

# Save to file
df_with_preds.to_csv("test_with_predictions_endpointpayload.csv", index=False)
print(" Saved test_with_predictions_endpointpayload.csv with headers and predictions.")



In [None]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
import matplotlib.pyplot as plt

# Load CSV
# NOTE: this rebinds the global `df` to the predictions file.
df = pd.read_csv("test_with_predictions_endpointpayload.csv")

# True and predicted columns
y_true = df['outcome_type_harmonized_grouped']
y_pred = df['predicted_outcome']

#If have probabilities, load them into an array (adjust column names)
#For binary classification:
#y_prob = df['prob_class1']

# For multiclass, extract all probability columns
# NOTE(review): y_prob is not used by any active line in this cell; if the file
# has no "prob_*" columns it is an empty (n_samples, 0) array - confirm it is still needed.
prob_cols = [col for col in df.columns if col.startswith("prob_")]
y_prob = df[prob_cols].values  # shape: (n_samples, n_classes)

# Compute metrics
print(" Classification Report:")
print(classification_report(y_true, y_pred, zero_division=0))

# Confusion Matrix
# Axis tick labels are the sorted distinct values of the true labels.
cm = confusion_matrix(y_true, y_pred)
labels = sorted(y_true.unique())
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap="Blues", values_format="d")
plt.title("Confusion Matrix")
plt.show()

# # AUC Score (multiclass)
# from sklearn.preprocessing import label_binarize

# classes = sorted(y_true.unique())
# y_true_bin = label_binarize(y_true, classes=classes)
# auc = roc_auc_score(y_true_bin, y_prob, multi_class='ovr')
# print(f"ROC AUC (One-vs-Rest): {auc:.4f}")




## Encoding Standardization for use on new csv.

In [None]:
import pandas as pd
import pickle


# Load saved encoders
# NOTE: pickle.load can execute code embedded in the file - only load encoder
# files that this notebook produced.
with open("label_encoders.pkl", "rb") as f:
    le_dict = pickle.load(f)

# load csv
df_test = pd.read_csv("test.csv")

# handle missing age months as above
# NOTE(review): this median is computed from the file being encoded, not from
# the training data - confirm which value new data should be imputed with.
median_age = df_test["age_months"].median()
df_test["age_months"] = df_test["age_months"].fillna(median_age)

#applying encodings
# Only columns that have a saved encoder and are present in the file.
categorical_cols = [col for col in le_dict if col in df_test.columns]

for col in categorical_cols:
    # A LabelEncoder's code for a class is its position in classes_, so a
    # plain dict lookup replaces one le.transform() call per row.
    class_to_code = {cls: code for code, cls in enumerate(le_dict[col].classes_)}
    # Write to "Encoded-<col>" (the column names the model features use) and
    # leave the raw column untouched; categories unseen at fit time become -1.
    df_test[f"Encoded-{col}"] = (
        df_test[col].astype(str).map(class_to_code).fillna(-1).astype(int)
    )


##  Set up CSV Lambda ingestion and encoding

### Only do this once!!

In [None]:
#Creating a dictionary as lambda layer won't run with scikit
import pickle
import boto3
import io

# Load the sklearn-based encoders
with open("label_encoders.pkl", "rb") as f:
    le_dict = pickle.load(f)

# Convert each LabelEncoder to a plain dictionary {class string: integer code}.
# A class's code is its position in classes_. Keys and values are cast to the
# built-in str/int types: numpy scalars (as returned by le.transform) would make
# the pickle require numpy when it is loaded.
clean_dict = {
    col: {str(cls): int(code) for code, cls in enumerate(le.classes_)}
    for col, le in le_dict.items()
}

# Serialize the clean dict (no sklearn)
pkl_buffer = io.BytesIO()
pickle.dump(clean_dict, pkl_buffer)
pkl_buffer.seek(0)

# Upload to S3
s3 = boto3.client("s3")
s3.put_object(
    Bucket="sagemaker-us-east-2-917456409349",
    Key="sagemaker/adoption/encoders_model_files/clean_label_encoders_dict.pkl",
    Body=pkl_buffer.getvalue()
)

print("Saved clean dictionary to S3 — no scikit-learn dependency.")


In [None]:
#This ingests the sample data and applies encoding for testing

import boto3
import pandas as pd
import io

s3 = boto3.client('s3')

# Sample file to encode; `bucket`, `key` and `df` are rebound here.
bucket = "sagemaker-us-east-2-917456409349"
key = "sagemaker/adoption/landing_zone_sample/df_cat_dog_harmonized_Sample_No_Known_Outcome.csv"

# Download the object and parse it as CSV
response = s3.get_object(Bucket=bucket, Key=key)
df = pd.read_csv(io.BytesIO(response['Body'].read()))

print(" Loaded sample data")
df.head()


In [None]:
# Fetch the pickled {column: {label: code}} dictionary from S3.
# Relies on `s3`, `bucket`, `io` and `pickle` from earlier cells.
# NOTE(review): pickle.load can execute arbitrary code; only load objects
# that this project wrote to the bucket.

key_dict = "sagemaker/adoption/encoders_model_files/clean_label_encoders_dict.pkl"

response = s3.get_object(Bucket=bucket, Key=key_dict)
dict_stream = io.BytesIO(response['Body'].read())
clean_dict = pickle.load(dict_stream)

print("Loaded clean label encoder dictionary")


In [None]:
# Dry run of the dictionary-based encoding in the notebook, before the same
# logic is run inside the Lambda function.

columns_to_encode = [
    'animal_type',
    'primary_breed_harmonized',
    'primary_color_harmonized',
    'sex',
    'intake_type_harmonized',
    'Is_returned',
    'has_name',
    'is_mix',
]

for col in columns_to_encode:
    encoded_col = f"Encoded-{col}"
    # The raw column is overwritten with its string form so that its values
    # can be looked up in the dictionary.
    as_text = df[col].astype(str).fillna('nan')
    df[col] = as_text
    # Labels that are absent from the dictionary come back as NaN -> -1.
    looked_up = df[col].map(clean_dict[col])
    df[encoded_col] = looked_up.fillna(-1).astype(int)

# Missing ages are replaced with the fixed value 48.
df['age_months'] = df['age_months'].fillna(48)

print(" Applied encodings and filled age_months")
df.head(300)


In [None]:
# Write the notebook-encoded sample back to S3 as CSV (the "NONLAMBDA" copy,
# i.e. the one produced here rather than by the Lambda function).

encoded_sample_key = "sagemaker/adoption/encoded_zone_sample/df_cat_dog_harmonized_Sample_With_No_Known_Outcome_ENCODED_NONLAMBDA.csv"

# Serialise in memory; nothing is written to local disk.
csv_buffer = io.StringIO()
df.to_csv(csv_buffer, index=False)

s3.put_object(Body=csv_buffer.getvalue(), Bucket=bucket, Key=encoded_sample_key)

print(" Uploaded encoded file to S3")


In [None]:
# Verify an encoded CSV against the encoder dictionary: for every categorical
# column, the (raw value -> code) pairs observed in the CSV must agree with
# the dictionary that produced them.
import boto3
import pandas as pd
import pickle
import io

# S3 paths
bucket = "sagemaker-us-east-2-917456409349"
encoding_key = "sagemaker/adoption/encoders_model_files/clean_label_encoders_dict.pkl"
output_csv_key = "sagemaker/adoption/encoded_zone_sample/df_cat_dog_harmonized_Sample_With_No_Known_Outcome_ENCODED.csv"

# Initialize S3 client
s3 = boto3.client("s3")

# --- Load encoder dict ---
# NOTE(review): pickle.load can execute arbitrary code; only load objects
# that this project wrote to the bucket.
encoding_obj = s3.get_object(Bucket=bucket, Key=encoding_key)
clean_dict = pickle.load(io.BytesIO(encoding_obj['Body'].read()))

# --- Load encoded CSV ---
csv_obj = s3.get_object(Bucket=bucket, Key=output_csv_key)
df_encoded = pd.read_csv(io.BytesIO(csv_obj['Body'].read()))

# Columns to check
columns_to_check = [
    'animal_type', 'primary_breed_harmonized', 'primary_color_harmonized',
    'sex', 'intake_type_harmonized', 'Is_returned', 'has_name', 'is_mix'
]

print(" Checking columns...")
print("CSV Columns:", df_encoded.columns.tolist())
print("Encoder Dict Keys:", list(clean_dict.keys()))

# One (column, status) tuple per checked column, in columns_to_check order.
summary = []

for col in columns_to_check:
    encoded_col = f"Encoded-{col}"
    print(f"\n🔍 Checking column: {col}")

    if col not in df_encoded.columns:
        print(f" Raw column '{col}' missing in CSV")
        summary.append((col, "Raw column missing"))
        continue

    if encoded_col not in df_encoded.columns:
        print(f" Encoded column '{encoded_col}' missing in CSV")
        summary.append((col, "Encoded column missing"))
        continue

    # Build mapping from CSV: raw value -> encoded value.
    # Rows where either side is missing are left out of the check.
    inferred_map = (
        df_encoded[[col, encoded_col]]
        .dropna()
        .drop_duplicates()
        .set_index(col)[encoded_col]
        .to_dict()
    )

    # Encoder mapping
    encoder_map = clean_dict.get(col, {})
    if not encoder_map:
        print(f" No encoder mapping found for '{col}'")
        summary.append((col, "Encoder missing"))
        continue

    # Compare
    mismatches = []
    for val, encoded in inferred_map.items():
        # The encoding step looks values up by their string form, and
        # read_csv may have parsed the raw column as bool/int/float, so the
        # lookup here must use str(val) as well. Looking up the parsed value
        # directly would report a mismatch for every non-string value.
        expected = encoder_map.get(str(val))
        if expected != encoded:
            mismatches.append((val, encoded, expected))

    if mismatches:
        print(f" {len(mismatches)} mismatches found:")
        # Only the first ten are printed to keep the output readable.
        for val, actual, expected in mismatches[:10]:
            print(f"    Value: '{val}' → CSV: {actual}, Encoder: {expected}")
        summary.append((col, f"{len(mismatches)} mismatches"))
    else:
        print(" All values match")
        summary.append((col, "OK"))

# Summary
print("\n=== SUMMARY ===")
for col, status in summary:
    print(f"{col}: {status}")



In [None]:
# Encodability diagnostic: for each categorical column, list the raw values
# of the sample file that have no entry in the encoder dictionary.
# The comparison is done on stripped, lower-cased strings.
import io
import pickle

import boto3
import pandas as pd

# Setup
bucket = "sagemaker-us-east-2-917456409349"
raw_csv_key = "sagemaker/adoption/landing_zone_sample/df_cat_dog_harmonized_Sample_No_Known_Outcome.csv"
dict_key = "sagemaker/adoption/encoders_model_files/clean_label_encoders_dict.pkl"

s3 = boto3.client("s3")

# Raw (un-encoded) sample
csv_obj = s3.get_object(Bucket=bucket, Key=raw_csv_key)
df_raw = pd.read_csv(io.BytesIO(csv_obj['Body'].read()))

# Encoder dictionary
# NOTE(review): pickle.load can execute arbitrary code; only load objects
# that this project wrote to the bucket.
dict_obj = s3.get_object(Bucket=bucket, Key=dict_key)
clean_dict = pickle.load(io.BytesIO(dict_obj['Body'].read()))

# Columns to test
columns = [
    'animal_type',
    'primary_breed_harmonized',
    'primary_color_harmonized',
    'sex',
    'intake_type_harmonized',
    'Is_returned',
    'has_name',
    'is_mix',
]

# column -> {"total_unique_values", "unmapped_count", "unmapped_values"}
summary = {}

for col in columns:
    if col not in df_raw.columns:
        print(f" Column '{col}' not found in input.")
        continue

    # Normalise the raw values: missing -> "Unknown", then trim and lower-case.
    raw_vals = df_raw[col].fillna("Unknown").astype(str).str.strip().str.lower()
    distinct_raw = set(raw_vals)

    # Lower-cased dictionary labels for this column (empty if the column has no entry).
    dict_keys = {str.lower(label) for label in clean_dict.get(col, {}).keys()}

    unmapped = sorted(distinct_raw - dict_keys)

    summary[col] = {
        "total_unique_values": len(distinct_raw),
        "unmapped_count": len(unmapped),
        "unmapped_values": unmapped[:10],  # sample only
    }

# --- Report ---
for col, result in summary.items():
    print(f"\n Column: {col}")
    print(f"Total unique raw values: {result['total_unique_values']}")
    print(f" Unmapped values: {result['unmapped_count']}")
    if result["unmapped_count"] > 0:
        print("   Sample unmapped values:", result["unmapped_values"])


In [None]:
#Based on the above, we have no unmapped values and can proceed to layers

## Test end-to-end with Lambda sample and original sample output, confusion matrix

In [None]:
#Checking Lambda Function works for Encoding by comparing local and lambda encoded tables
import boto3
import pandas as pd
import io

# S3 details
bucket = "sagemaker-us-east-2-917456409349"
file1 = "sagemaker/adoption/encoded_zone_sample/df_cat_dog_harmonized_Sample_With_No_Known_Outcome_ENCODED_NONLAMBDA.csv"
file2 = "sagemaker/adoption/encoded_zone_sample/df_cat_dog_harmonized_Sample_With_No_Known_Outcome_ENCODED.csv"

# Initialize S3 client
s3 = boto3.client('s3')

def load_csv_from_s3(bucket, key):
    """Download the object at s3://bucket/key and parse it as a CSV DataFrame."""
    obj = s3.get_object(Bucket=bucket, Key=key)
    return pd.read_csv(io.BytesIO(obj['Body'].read()))

# Load both files
print(" Loading files from S3...")
df1 = load_csv_from_s3(bucket, file1)
df2 = load_csv_from_s3(bucket, file2)

# Compare shape
same_shape = df1.shape == df2.shape
if not same_shape:
    print(f" Shape mismatch: {df1.shape} vs {df2.shape}")
else:
    print(f" Shape matches: {df1.shape}")

# Compare columns
same_columns = list(df1.columns) == list(df2.columns)
if not same_columns:
    print(" Column mismatch")
    print("File 1 columns:", df1.columns.tolist())
    print("File 2 columns:", df2.columns.tolist())
else:
    print(" Columns match")

# Compare data
if df1.equals(df2):
    print(" Tables are identical")
elif not (same_shape and same_columns):
    # An element-wise `!=` raises on differently-labelled frames, so the
    # cell-level diff is only attempted when the layouts agree.
    print(" Skipping cell-level comparison because the shape or columns differ")
else:
    # NaN != NaN evaluates to True, so cells that are missing in BOTH files
    # are excluded; otherwise they would be counted as differences.
    both_missing = df1.isna() & df2.isna()
    diff_mask = (df1 != df2) & ~both_missing
    diff_count = diff_mask.sum().sum()
    print(f" Tables differ in {diff_count} cells")
    if diff_count == 0:
        # equals() also compares dtypes, so it can fail with equal values.
        print(" All values match; only column dtypes differ")

    # Optionally, save diff rows
    diff_rows = df1[diff_mask.any(axis=1)]
    diff_rows.to_csv("differences.csv", index=False)
    print("Differences saved to differences.csv")


In [None]:
# Score the Lambda-encoded sample directly against the SageMaker endpoint
# from this notebook and store the result in S3, so it can later be compared
# with the file scored by the Lambda/pipeline path.
import io

import boto3
import pandas as pd
from sagemaker.deserializers import JSONDeserializer
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer

# --- Config ---
bucket = "sagemaker-us-east-2-917456409349"
input_key = "sagemaker/adoption/encoded_zone_sample/df_cat_dog_harmonized_Sample_With_No_Known_Outcome_ENCODED.csv"
output_key = "sagemaker/adoption/output_zone_sample/sample_data_output_IPYNBDTESTEDREMOTE.csv"
endpoint_name = "sagemaker-xgboost-2025-07-17-01-08-23-168"
batch_size = 500  # rows sent per endpoint call

# --- Features sent to the model, in this column order ---
encoded_features = [
    'Encoded-animal_type',
    'Encoded-primary_breed_harmonized',
    'Encoded-primary_color_harmonized',
    'Encoded-sex',
    'Encoded-intake_type_harmonized',
    'Encoded-Is_returned',
    'Encoded-has_name',
    'Encoded-is_mix',
]
numeric_features = [
    'age_months',
    'Num_returned',
    'stay_length_days',
    'min_height',
    'max_height',
    'min_weight',
    'max_weight',
    'min_expectancy',
    'max_expectancy',
    'grooming_frequency_value',
    'shedding_value',
    'energy_level_value',
    'trainability_value',
    'demeanor_value',
]
feature_columns = encoded_features + numeric_features

# --- Initialize S3 and Predictor ---
s3 = boto3.client("s3")
predictor = Predictor(
    endpoint_name=endpoint_name,
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer(),
)

# --- Load input file from S3 ---
print("Loading input file from S3...")
obj = s3.get_object(Bucket=bucket, Key=input_key)
df_full = pd.read_csv(io.BytesIO(obj["Body"].read()))
print(f"Loaded {len(df_full)} rows from {input_key}")

# --- Prepare features: missing values are sent to the endpoint as 0 ---
X = df_full[feature_columns].fillna(0)

# --- Predict in batches ---
pred_probs = []
for batch_number, i in enumerate(range(0, len(X), batch_size), start=1):
    batch = X.iloc[i:i + batch_size]
    # One comma-separated line per row, no header.
    csv_lines = [",".join(str(value) for value in row) for row in batch.values]
    payload = "\n".join(csv_lines)
    response = predictor.predict(payload)
    # The JSON response carries one {"score": ...} entry per submitted row.
    pred_probs.extend(pred["score"] for pred in response["predictions"])
    print(f"Processed batch {batch_number} ({len(batch)} rows)")

# --- Convert to hard labels with a 0.5 cut-off ---
pred_labels = [int(p >= 0.5) for p in pred_probs]

# --- Append predictions ---
df_full["predicted_proba"] = pred_probs
df_full["predicted_label"] = pred_labels

# --- Save output to S3 ---
csv_buffer = io.StringIO()
df_full.to_csv(csv_buffer, index=False)
s3.put_object(Bucket=bucket, Key=output_key, Body=csv_buffer.getvalue())
print(f" Predictions saved to s3://{bucket}/{output_key}")



In [None]:
#Finally, comparing output tables and they're identical!!!


import boto3
import pandas as pd
import io

# --- Config ---
bucket = "sagemaker-us-east-2-917456409349"
file_a = "sagemaker/adoption/output_zone_sample/sample_data_output_IPYNBDTESTEDREMOTE.csv"  # Notebook-scored
file_b = "sagemaker/adoption/output_zone_sample/sample_data_output.csv"  # Lambda/pipeline-scored

s3 = boto3.client('s3')

# --- Function to load CSV from S3 ---
def load_csv(bucket, key):
    """Download the object at s3://bucket/key and parse it as a CSV DataFrame."""
    obj = s3.get_object(Bucket=bucket, Key=key)
    return pd.read_csv(io.BytesIO(obj['Body'].read()))

# --- Load both files ---
print(" Loading files from S3...")
df_a = load_csv(bucket, file_a)
df_b = load_csv(bucket, file_b)

print(f" File A ({file_a}): {df_a.shape} rows & columns")
print(f" File B ({file_b}): {df_b.shape} rows & columns")

# --- Compare shape ---
same_shape = df_a.shape == df_b.shape
if not same_shape:
    print(f" Shape mismatch: {df_a.shape} vs {df_b.shape}")
else:
    print(" Shape matches")

# --- Compare column names ---
same_columns = list(df_a.columns) == list(df_b.columns)
if not same_columns:
    print(" Column mismatch!")
    print("File A columns:", df_a.columns.tolist())
    print("File B columns:", df_b.columns.tolist())
else:
    print(" Columns match")

# --- Compare entire DataFrame ---
if df_a.equals(df_b):
    print(" Tables are completely identical")
elif not (same_shape and same_columns):
    # An element-wise `!=` raises on differently-labelled frames, so the
    # cell-level diff is only attempted when the layouts agree.
    print(" Skipping cell-level comparison because the shape or columns differ")
else:
    # Detect differences. NaN != NaN evaluates to True, so cells that are
    # missing in BOTH files are excluded from the count.
    both_missing = df_a.isna() & df_b.isna()
    diff_mask = (df_a != df_b) & ~both_missing
    diff_count = diff_mask.sum().sum()
    print(f" Tables differ in {diff_count} cells")
    if diff_count == 0:
        # equals() also compares dtypes, so it can fail with equal values.
        print(" All values match; only column dtypes differ")

    # Find rows with any difference
    diff_rows = df_a[diff_mask.any(axis=1)]
    print(f" {len(diff_rows)} rows have differences")

    # Save differences locally and upload to S3
    diff_file = "differences.csv"
    diff_rows.to_csv(diff_file, index=False)
    # Open the file in a context manager so the handle is closed after the upload.
    with open(diff_file, "rb") as diff_fh:
        s3.put_object(Bucket=bucket, Key="sagemaker/adoption/output_zone_sample/differences.csv", Body=diff_fh)
    print(f" Differences saved to s3://{bucket}/sagemaker/adoption/output_zone_sample/differences.csv")



In [None]:
# Evaluate the pipeline-scored sample against the golden record: join on
# primary_key, then report classification metrics, ROC AUC and a confusion matrix.
import io

import boto3
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score

# Config
bucket = "sagemaker-us-east-2-917456409349"
predicted_key = "sagemaker/adoption/output_zone_sample/sample_data_output.csv"
golden_key = "sagemaker/adoption/golden_record/df_cat_dog_harmonized_Sample_With_Outcome.csv"

s3 = boto3.client('s3')


def load_csv(bucket, key):
    """Read the CSV object stored at s3://bucket/key into a DataFrame."""
    body = s3.get_object(Bucket=bucket, Key=key)['Body']
    return pd.read_csv(io.BytesIO(body.read()))


def _adopted_flag(outcome):
    """Return 1 when the outcome reads "adopted" (any letter case), otherwise 0."""
    return 1 if str(outcome).lower() == "adopted" else 0


# Load data
print(" Loading predicted and golden files...")
df_pred = load_csv(bucket, predicted_key)
df_golden = load_csv(bucket, golden_key)

print(f" Predicted file: {df_pred.shape}")
print(f" Golden file: {df_golden.shape}")

# Both files must carry the join key.
for frame in (df_pred, df_golden):
    if "primary_key" not in frame.columns:
        raise ValueError(" primary_key column missing in one of the files.")

# Ground truth as a binary label
df_golden["actual_label"] = df_golden["outcome_type_harmonized_grouped"].apply(_adopted_flag)

# Inner join: only keys present in both files are evaluated.
truth = df_golden[["primary_key", "actual_label"]]
df_joined = df_pred.merge(truth, on="primary_key", how="inner")
print(f" Joined data: {df_joined.shape}")

# The scored file must provide both the hard label and the probability.
required_messages = {
    "predicted_label": " predicted_label column not found in predicted file.",
    "predicted_proba": " predicted_proba column missing. Add probabilities for AUC calculation.",
}
for required_col, message in required_messages.items():
    if required_col not in df_joined.columns:
        raise ValueError(message)

y_true = df_joined["actual_label"]
y_pred = df_joined["predicted_label"]
y_proba = df_joined["predicted_proba"]

# Per-class precision / recall / F1
print("\n Classification Report:")
print(classification_report(y_true, y_pred, zero_division=0))

# AUC is computed from the probabilities, not the hard labels.
auc = roc_auc_score(y_true, y_proba)
print(f" ROC AUC Score: {auc:.4f}")

# Confusion matrix plot
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Not Adopted", "Adopted"])
disp.plot(cmap="Blues", values_format="d")
plt.title(f"Confusion Matrix (AUC={auc:.3f})")
plt.show()



## Need to draw out prime features from XGBOOST or other model

In [None]:
#Build SHAP explainer .py
#Built Lambda Function


In [None]:
# Download the trained model archive from S3 and unpack it into ./model.
import boto3
import tarfile

bucket = 'sagemaker-us-east-2-917456409349'
key = 'sagemaker/adoption/output/adoption-xgboost-2025-07-17-00-53-45/output/model.tar.gz'
local_path = './model.tar.gz'  # Download into current directory

# Download the model tar from S3
s3 = boto3.client('s3')
s3.download_file(bucket, key, local_path)

# Extract locally
with tarfile.open(local_path) as tar:
    # Use the "data" extraction filter where this Python provides it: it
    # refuses archive members that would be written outside the target
    # directory. Older interpreters fall back to the unfiltered extraction.
    if hasattr(tarfile, "data_filter"):
        tar.extractall(path='./model', filter='data')
    else:
        tar.extractall(path='./model')

print("Model extracted to ./model directory")



In [None]:
#Pulling down scored sample from S3
!aws s3 cp s3://sagemaker-us-east-2-917456409349/sagemaker/adoption/output_zone_sample/sample_data_output.csv scored.csv


In [None]:
#Testing Locally
# Compute per-row SHAP contributions for the scored sample with the trained
# booster, and append them (plus the top 3 positive / negative drivers per row)
# to the scored file.
import tarfile
import pandas as pd
import xgboost as xgb


# Paths

model_tar_path = 'model.tar.gz'  # Downloaded from S3
input_csv_path = 'scored.csv'    # Downloaded from S3
output_csv_path = 'final_with_shap.csv'


# Full Feature List (22 columns)

feature_cols = [
    'Encoded-animal_type',
    'Encoded-primary_breed_harmonized',
    'Encoded-primary_color_harmonized',
    'Encoded-sex',
    'Encoded-intake_type_harmonized',
    'Encoded-Is_returned',
    'Encoded-has_name',
    'Encoded-is_mix',
    'age_months',
    'Num_returned',
    'stay_length_days',
    'min_height',
    'max_height',
    'min_weight',
    'max_weight',
    'min_expectancy',
    'max_expectancy',
    'grooming_frequency_value',
    'shedding_value',
    'energy_level_value',
    'trainability_value',
    'demeanor_value'
]


def _top_contributions(sorted_contribs, n=3):
    """Format the first ``n`` entries of an already-sorted SHAP Series.

    Returns a list of exactly ``n`` strings of the form "Feature (value)",
    padded with "None" when fewer than ``n`` entries are available.
    """
    labels = [f"{name} ({value:.4f})" for name, value in sorted_contribs.head(n).items()]
    return labels + ["None"] * (n - len(labels))


# Extract Model

with tarfile.open(model_tar_path) as tar:
    # Use the "data" extraction filter where this Python provides it: it
    # refuses archive members that would be written outside the target
    # directory. Older interpreters fall back to the unfiltered extraction.
    if hasattr(tarfile, "data_filter"):
        tar.extractall(path='./model', filter='data')
    else:
        tar.extractall(path='./model')
model_file = './model/xgboost-model'


# Load Model

booster = xgb.Booster()
booster.load_model(model_file)


# Load Scored Data

df = pd.read_csv(input_csv_path)
print(f"Loaded scored data with shape: {df.shape}")

# Ensure all required features exist
missing = [col for col in feature_cols if col not in df.columns]
if missing:
    raise ValueError(f"Missing required features in scored file: {missing}")


# Compute SHAP values

# The scoring cell sends missing feature values to the endpoint as 0
# (fillna(0)). The explanation must use the same inputs, otherwise the SHAP
# values describe a different prediction than the stored predicted_proba.
dmatrix = xgb.DMatrix(df[feature_cols].fillna(0))
# pred_contribs=True returns one contribution per feature plus a final bias column.
shap_values = booster.predict(dmatrix, pred_contribs=True)

# Create SHAP DataFrame
shap_df = pd.DataFrame(shap_values, columns=feature_cols + ['bias']).drop(columns=['bias'])

# Rename SHAP columns for clarity
rename_map = {col: f"SHAP-{col.replace('Encoded-', '').replace('_', ' ').title()}" for col in feature_cols}
shap_df = shap_df.rename(columns=rename_map)

# Append SHAP columns to original DataFrame
df = pd.concat([df, shap_df], axis=1)


# Compute Top Positive & Negative Features with SHAP values

pos1, pos2, pos3, neg1, neg2, neg3 = [], [], [], [], [], []

for _, row in shap_df.iterrows():
    # Largest positive first; most negative first.
    pos_sorted = row[row > 0].sort_values(ascending=False)
    neg_sorted = row[row < 0].sort_values(ascending=True)

    p1, p2, p3 = _top_contributions(pos_sorted)
    n1, n2, n3 = _top_contributions(neg_sorted)

    pos1.append(p1)
    pos2.append(p2)
    pos3.append(p3)
    neg1.append(n1)
    neg2.append(n2)
    neg3.append(n3)

# Add to DataFrame
df['Positive_Feature_1'] = pos1
df['Positive_Feature_2'] = pos2
df['Positive_Feature_3'] = pos3
df['Negative_Feature_1'] = neg1
df['Negative_Feature_2'] = neg2
df['Negative_Feature_3'] = neg3


#  Save Enriched File

df.to_csv(output_csv_path, index=False)
print(f" SHAP file saved as: {output_csv_path}")



In [None]:
#Showing performance
# Metrics, confusion matrix and ROC curve for the locally enriched
# (SHAP-augmented) scored file against the golden record.
import pandas as pd
import boto3
from io import BytesIO
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    roc_curve, confusion_matrix, ConfusionMatrixDisplay
)
import matplotlib.pyplot as plt

# Load ground truth from S3
bucket = "sagemaker-us-east-2-917456409349"
key = "sagemaker/adoption/golden_record/df_cat_dog_harmonized_Sample_With_Outcome.csv"

s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucket, Key=key)
df_true = pd.read_csv(BytesIO(obj['Body'].read()))

# Convert ground truth to binary: adopted=1 else 0.
# The match ignores letter case, the same rule the earlier performance cell
# uses for its actual_label column, so both cells derive identical labels
# from the golden record.
df_true['true_label'] = (
    df_true['outcome_type_harmonized_grouped']
    .map(lambda outcome: str(outcome).lower() == 'adopted')
    .astype(int)
)

# Load predicted local file
df_pred = pd.read_csv("final_with_shap.csv")

# Merge on primary_key (inner join: only keys present in both files are evaluated)
df_merged = pd.merge(df_true[['primary_key', 'true_label']], df_pred[['primary_key', 'predicted_label', 'predicted_proba']], on='primary_key')

# Extract arrays
y_true = df_merged['true_label']
y_pred = df_merged['predicted_label']
y_proba = df_merged['predicted_proba']

# Calculate metrics (zero_division=0 avoids warnings when a class is never predicted)
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
auc = roc_auc_score(y_true, y_proba)

print(f" Scored file shape: {df_pred.shape}")
print(f" Golden file shape: {df_true.shape}")
print(f" Merged shape: {df_merged.shape}\n")

print(" PERFORMANCE METRICS")
print(f" Accuracy:  {accuracy:.4f}")
print(f" Precision: {precision:.4f}")
print(f" Recall:    {recall:.4f}")
print(f" F1 Score:  {f1:.4f}")
print(f" AUC:       {auc:.4f}")

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.title("Confusion Matrix")
plt.show()

# ROC Curve (the dashed diagonal is the chance line)
fpr, tpr, thresholds = roc_curve(y_true, y_proba)
plt.plot(fpr, tpr, label=f'AUC = {auc:.4f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()
