In [None]:
### XGBoost_Multi Build and Testing Local - Length of Stay


## Setup

In [None]:
#Importing key packages
import io
import os
import boto3
import sagemaker
import time
import botocore
from sagemaker import get_execution_role, image_uris, model_uris, script_uris, hyperparameters
from sagemaker.estimator import Estimator
from sagemaker.utils import name_from_base
from sagemaker.tuner import HyperparameterTuner
from time import gmtime, strftime
# `plt` must be the pyplot module: the top-level matplotlib package has no
# figure()/scatter()/title()/show(), which the plotting cells call through `plt`.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [None]:
#Setting up

# One boto3 session drives both the region lookup and the SageMaker session
boto_session = boto3.Session()
region = boto_session.region_name
sess = sagemaker.Session(boto_session=boto_session)

# Execution role resolved by the SageMaker SDK
role = get_execution_role()

print("Role:", role)
print("Region:", region)
print("SageMaker Session Region:", sess.boto_region_name)

In [None]:
#Setting up Bucket Links/Info
bucket = 'xgb-los-multi'
s3_bucket_prefix = "xgb-los-model-code/"
# "<bucket>/<key prefix>"; the key prefix already carries its trailing slash
prefix = "/".join((bucket, s3_bucket_prefix))

In [None]:
# Display the combined "<bucket>/<key prefix>" string as a sanity check
prefix

In [None]:
##Set XGB Container

# Look up the XGBoost 1.7-1 image URI for the current region
container = image_uris.retrieve(framework="xgboost", region=region, version="1.7-1")


## Train/Test/Split

In [None]:
#Load Source file to do encoding and split train/test

# Object loaded below:
#   s3://sagemaker-us-east-2-917456409349/sagemaker/adoption/golden_record/df_cat_dog_harmonized.csv
# (the path previously noted here named df_cat_dog_harmonized_Sample_With_Outcome.csv,
#  which is not the key that is read)

# Dedicated names for the *source* location, so the model bucket stored in
# `bucket` by the bucket-setup cell is not silently overwritten.
source_bucket = "sagemaker-us-east-2-917456409349"
source_key = "sagemaker/adoption/golden_record/df_cat_dog_harmonized.csv"

# Initialize S3 client
s3_client = boto3.client("s3")

# Fetch the object from S3
obj = s3_client.get_object(Bucket=source_bucket, Key=source_key)

# Read the whole object body into memory and parse it as CSV
df = pd.read_csv(io.BytesIO(obj['Body'].read()))

# Show the first few rows
df.head()

In [None]:
#Deduping

# Keep only the last record seen for each primary_key
rows_before = len(df)
df_deduped = df.drop_duplicates(subset=['primary_key'], keep='last')

print("Original rows:", rows_before)
print("After deduplication:", len(df_deduped))

# From here on `df` is an independent copy of the deduplicated rows
df = df_deduped.copy()
print("New rows for df:", len(df))

In [None]:
#Adding Custom Train/Test/Split
def assign_split(row):
    """Map one record to its dataset split using ``row['outcome_year']``.

    Returns "train" for years up to and including 2022, "validate" for 2023
    and 2024, "test" for 2025, and "exclude" for anything else (later years,
    NaN, ...).
    """
    year = row['outcome_year']
    if year <= 2022:
        return "train"
    if year == 2023 or year == 2024:
        return "validate"
    if year == 2025:
        return "test"
    # fallback for unexpected years
    return "exclude"

# Tag every record with its split label (row-wise apply of assign_split)
df['split'] = df.apply(assign_split, axis=1)


In [None]:

# Label column plus the raw feature columns of interest. The list is kept for reference:
# the trim below stays disabled so the frame keeps the standard xlsx column layout.
features_to_keep = ['outcome_type_harmonized_grouped','animal_type', 'primary_breed_harmonized', 'primary_color_harmonized',
    'sex', 'intake_type_harmonized',
    'Is_returned', 'has_name', 'is_mix', 'Num_returned', 'age_months','stay_length_days', 'min_height', 'max_height',
    'min_weight', 'max_weight', 'min_expectancy', 'max_expectancy',
    'grooming_frequency_value', 'shedding_value', 'energy_level_value',
    'trainability_value', 'demeanor_value'
]

# Disabled: trim the DataFrame to only those columns
# df = df[features_to_keep].copy()

# EDIT: models are trained only on the features listed above (in encoded form)

In [None]:
#See all columns
# Materialize the column index as a plain list for printing and counting
all_columns = list(df.columns)
print(all_columns)
print("Total columns:", len(all_columns))

In [None]:
#Pre-encoding Adoption
# Binarize the outcome label: 1 = 'adopted', 0 = every other outcome.
# Guarded so re-running the cell is a no-op: once the column is already
# integer-coded, comparing it to 'adopted' again would silently turn every
# label into 0.
if not pd.api.types.is_integer_dtype(df['outcome_type_harmonized_grouped']):
    df['outcome_type_harmonized_grouped'] = (
        df['outcome_type_harmonized_grouped'] == 'adopted'
    ).astype(int)

In [None]:
#Encoding
from sklearn.preprocessing import LabelEncoder

# Work on a copy so `df` keeps only its original columns
encoded_df = df.copy()

# Categorical columns that each get an integer-coded twin named "Encoded-<column>"
columns_to_encode = [
    'animal_type', 'primary_breed_harmonized', 'primary_color_harmonized',
    'sex', 'intake_type_harmonized',
    'Is_returned', 'has_name', 'is_mix'
]

# One encoder per source column, kept so the codes can be decoded or reused later
le_dict = {col: LabelEncoder() for col in columns_to_encode}

for col, le in le_dict.items():
    # Values are cast to str before fitting, so every cell (including missing ones) gets a code
    encoded_df[f"Encoded-{col}"] = le.fit_transform(encoded_df[col].astype(str))

# Fill missing age_months with the median of the whole frame
# NOTE(review): encoders and median are fitted on all splits together — confirm this is acceptable for evaluation
median_age = encoded_df['age_months'].median()
encoded_df['age_months'] = encoded_df['age_months'].fillna(median_age)



In [None]:
# (rows, columns) of the encoded frame
encoded_df.shape

In [None]:
# Preview the first rows, including the new "Encoded-*" columns
encoded_df.head()

In [None]:
# Saving Encoding locally JIC
import pickle

# Serialize the {column name: fitted LabelEncoder} mapping to a local file
with open("label_encoders.pkl", mode="wb") as encoder_file:
    pickle.dump(le_dict, encoder_file)



In [None]:
# Splitting for training

# Partition the encoded frame on its 'split' label
split_label = encoded_df['split']
df_train = encoded_df[split_label == 'train']
df_test = encoded_df[split_label == 'test']
df_validate = encoded_df[split_label == 'validate']

# Save each to CSV (no index)
for frame, csv_name in ((df_train, "train.csv"), (df_test, "test.csv"), (df_validate, "validate.csv")):
    frame.to_csv(csv_name, index=False)

# Output sizes
for title, frame in (("Train rows:", df_train), ("Test rows:", df_test), ("Validate rows:", df_validate)):
    print(title, len(frame))

## Training the adoption XGBoost model locally so we can test LOS only on animals with predicted probability >= 0.5 (only needs to be done once)

In [None]:
# Train the model so we can get adoption predictions; if our predictions are similar to the endpoint testing in the other files, we will use them to subset on
# adoption prediction, then do the XGBoost grid search

In [None]:
#Running a local grid search (scikit-learn GridSearchCV) to see if we can improve Test performance
# of the adoption classifier.

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import pandas as pd

# 22 classifier inputs: the 8 encoded categoricals plus the numeric columns
feature_columns = [
'Encoded-animal_type', 
    'Encoded-primary_breed_harmonized', 
    'Encoded-primary_color_harmonized', 
    'Encoded-sex', 
    'Encoded-intake_type_harmonized', 
    'Encoded-Is_returned', 
    'Encoded-has_name', 
    'Encoded-is_mix',
    'age_months',    
    'Num_returned', 
    'stay_length_days', #Kept as a classifier input here; it is removed from the features when LOS itself is predicted
    'min_height', 
    'max_height',
    'min_weight', 
    'max_weight', 
    'min_expectancy', 
    'max_expectancy',
    'grooming_frequency_value', 
    'shedding_value', 
    'energy_level_value',
    'trainability_value', 
    'demeanor_value'
]


# prepare datasets using only selected features; the label is the binarized adoption outcome
X_train = df_train[feature_columns]
y_train = df_train['outcome_type_harmonized_grouped']

X_val = df_validate[feature_columns]
y_val = df_validate['outcome_type_harmonized_grouped']

X_test = df_test[feature_columns]
y_test = df_test['outcome_type_harmonized_grouped']

# Combine train and val for GridSearchCV, which does its own 3-fold splitting
# NOTE(review): the selected model is fitted on this combined set (GridSearchCV refits by default),
# so the "Training" and "Validation" reports further down are in-sample; only Test is held out — confirm intended
X_trainval = pd.concat([X_train, X_val])
y_trainval = pd.concat([y_train, y_val])

# Define parameter grid: 3*3*3*3*3*2 = 486 combinations
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1, 0.2],
    'gamma': [0, 2, 4],
    'min_child_weight': [1, 4, 6],
    'subsample': [0.6, 0.8, 1.0],
    'n_estimators': [50, 100],
}

# Initialize model: binary classifier with a fixed seed
xgb_base = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_jobs=-1,
    random_state=42
)

# Grid search with 3-fold CV, selecting on F1 (486 combinations x 3 folds = 1458 fits)
grid_search = GridSearchCV(
    estimator=xgb_base,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Fit grid search and keep the best estimator for evaluation and later prediction
grid_search.fit(X_trainval, y_trainval)
best_model = grid_search.best_estimator_

print("Best parameters found:", grid_search.best_params_)

# Evaluation function
def evaluate_performance(X, y_true, dataset_name):
    """Report how ``best_model`` classifies one dataset.

    Prints a classification report for the predictions on ``X`` against
    ``y_true`` and shows the confusion matrix, both labelled with
    ``dataset_name``. Returns nothing.
    """
    predicted = best_model.predict(X)

    header = f"\n{dataset_name} Set Performance:"
    print(header)
    print(classification_report(y_true, predicted))

    # Build and draw the confusion matrix in one go
    matrix_display = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_true, predicted)
    )
    matrix_display.plot(cmap='Blues')
    plt.title(f"{dataset_name} Confusion Matrix")
    plt.show()

# Run evaluations
for features, labels, split_name in (
    (X_train, y_train, "Training"),
    (X_val, y_val, "Validation"),
    (X_test, y_test, "Test"),
):
    evaluate_performance(features, labels, split_name)

In [None]:
#Outputting Best Parameters, saving model 

# Winning hyperparameter combination from the classifier grid search
best_params = grid_search.best_params_
print(best_params)

# Persist the best adoption classifier to a local JSON file
best_model.save_model("best_xgb_model_local.json") 



In [None]:
#We get the exact same parameters, so it is good to train on this new data set for our length of stay.

In [None]:
#Now we run whole encoded data set through to add our prediction and probability 
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Classifier inputs (includes stay_length_days, which is dropped later when LOS becomes the target)
feature_columns_los = [
    'Encoded-animal_type',
    'Encoded-primary_breed_harmonized',
    'Encoded-primary_color_harmonized',
    'Encoded-sex',
    'Encoded-intake_type_harmonized',
    'Encoded-Is_returned',
    'Encoded-has_name',
    'Encoded-is_mix',
    'age_months',
    'Num_returned',
    'stay_length_days', #We comment this out in our prediction for LOS
    'min_height',
    'max_height',
    'min_weight',
    'max_weight',
    'min_expectancy',
    'max_expectancy',
    'grooming_frequency_value',
    'shedding_value',
    'energy_level_value',
    'trainability_value',
    'demeanor_value'
]

# Feature matrix for every row of encoded_df
X_encoded = encoded_df[feature_columns_los]

# Ground-truth labels when the column exists, otherwise None
label_column = 'outcome_type_harmonized_grouped'
y_encoded = encoded_df[label_column] if label_column in encoded_df.columns else None

# Hard labels plus the probability of the positive class (adopted)
y_pred = best_model.predict(X_encoded)
y_proba = best_model.predict_proba(X_encoded)[:, 1]

# Attach both to the frame so later cells can filter on them
encoded_df['predicted_label'] = y_pred
encoded_df['predicted_proba'] = y_proba

# Evaluate only when there is something to compare against
if y_encoded is None:
    print("No ground truth labels found; predictions added to DataFrame.")
else:
    print("Classification Report on encoded_df:")
    print(classification_report(y_encoded, y_pred))

    cm = confusion_matrix(y_encoded, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title("Confusion Matrix on encoded_df")
    plt.show()


print(encoded_df[['predicted_label', 'predicted_proba']].head())

encoded_df

In [None]:
#See all columns
# Column names of the encoded frame, now including the prediction columns
all_columns = list(encoded_df.columns)
print(all_columns)
print("Total columns:", len(all_columns))

In [None]:
#Next We see how many length of stay is NAN/Null and decide if we drop
missing_stay_flags = encoded_df['stay_length_days'].isnull()
null_count = missing_stay_flags.sum()
print(f"Number of null values in 'stay_length_days': {null_count}")


In [None]:
#Given small number we'll take average

# Mean of the observed stay lengths (pandas skips NaN by default)
mean_stay_length = encoded_df['stay_length_days'].mean()

# Fill NaNs with that mean
# NOTE(review): stay_length_days later becomes the regression target, so these rows carry an imputed label — confirm acceptable
stay_length_filled = encoded_df['stay_length_days'].fillna(value=mean_stay_length)
encoded_df['stay_length_days'] = stay_length_filled

print(f"Filled NaNs in 'stay_length_days' with mean value: {mean_stay_length:.2f}")


## Training for Length of Stay as Numeric using XGBoost

In [None]:
# Now we'll train locally to find our best fit but first we split again for train/test/validate
# Splitting for training

split_label = encoded_df['split']
# Only the test partition is restricted to animals with predicted adoption probability >= 0.5
likely_adopted = encoded_df['predicted_proba'] >= 0.5

los_predicted_df_train = encoded_df[split_label == 'train']
los_predicted_df_test = encoded_df[(split_label == 'test') & likely_adopted]
los_predicted_df_validate = encoded_df[split_label == 'validate']

# Persist each partition locally (no index column)
for frame, csv_name in (
    (los_predicted_df_train, "los_train.csv"),
    (los_predicted_df_test, "los_test.csv"),
    (los_predicted_df_validate, "los_validate.csv"),
):
    frame.to_csv(csv_name, index=False)

# Output sizes
for title, frame in (
    ("Train rows:", los_predicted_df_train),
    ("Test rows:", los_predicted_df_test),
    ("Validate rows:", los_predicted_df_validate),
):
    print(title, len(frame))



In [None]:
# Preview the LOS training partition
los_predicted_df_train.head()

In [None]:
#Then we train locally with a grid search to find our best model fit and ensure our Test is only on animals
#that are predicted adoption >=50%

#Running a local grid search (scikit-learn GridSearchCV) to see if we can improve Test performance
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error #Switched to numeric rather than categorical
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# 21 regression inputs. This rebinds feature_columns_los, which previously also held stay_length_days.
feature_columns_los = [
    'Encoded-animal_type', 
    'Encoded-primary_breed_harmonized', 
    'Encoded-primary_color_harmonized', 
    'Encoded-sex', 
    'Encoded-intake_type_harmonized', 
    'Encoded-Is_returned', 
    'Encoded-has_name', 
    'Encoded-is_mix',
    'age_months',    
    'Num_returned', 
    #'stay_length_days', #edit: dropped from the features because it is now the variable being predicted
    'min_height', 
    'max_height',
    'min_weight', 
    'max_weight', 
    'min_expectancy', 
    'max_expectancy',
    'grooming_frequency_value', 
    'shedding_value', 
    'energy_level_value',
    'trainability_value', 
    'demeanor_value'
]

# Prepare datasets using selected features; the target is stay_length_days
X_train = los_predicted_df_train[feature_columns_los]
y_train = los_predicted_df_train['stay_length_days']  

X_val = los_predicted_df_validate[feature_columns_los]
y_val = los_predicted_df_validate['stay_length_days']

X_test = los_predicted_df_test[feature_columns_los]
y_test = los_predicted_df_test['stay_length_days']

# Train and validation are concatenated because GridSearchCV does its own 3-fold splitting
# NOTE(review): the selected model is fitted on this combined set (GridSearchCV refits by default),
# so the "Training" and "Validation" metrics further down are in-sample — confirm intended
X_trainval = pd.concat([X_train, X_val])
y_trainval = pd.concat([y_train, y_val])

# Define grid, kept as above for consistency (3*3*3*3*3*2 = 486 combinations)
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1, 0.2],
    'gamma': [0, 2, 4],
    'min_child_weight': [1, 4, 6],
    'subsample': [0.6, 0.8, 1.0],
    'n_estimators': [50, 100],
}

# Initialize regression model
xgb_base = XGBRegressor(
    objective='reg:squarederror',#Switched to numeric rather than categorical
    n_jobs=-1,
    random_state=42
)

# Grid search with 3-fold CV, scoring R^2
grid_search = GridSearchCV(
    estimator=xgb_base,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Fit grid search; note this rebinds `grid_search`, replacing the classifier search object
grid_search.fit(X_trainval, y_trainval)
best_model_los = grid_search.best_estimator_

print("Best parameters fond:", grid_search.best_params_)

# Evaluation function for regression
def evaluate_regression_performance(X, y_true, dataset_name):
    """Print R² and RMSE of ``best_model_los`` on one dataset.

    ``X`` is passed to ``best_model_los.predict``; the predictions are
    compared with ``y_true`` and the two metrics are printed under a header
    built from ``dataset_name``. Returns nothing.
    """
    predicted = best_model_los.predict(X)
    mse = mean_squared_error(y_true, predicted)

    report_lines = (
        f"\n{dataset_name} Set Performance:",
        f"R²: {r2_score(y_true, predicted):.4f}",
        f"RMSE: {np.sqrt(mse):.4f}",
    )
    for line in report_lines:
        print(line)

# Run evaluations
for features, targets, split_name in (
    (X_train, y_train, "Training"),
    (X_val, y_val, "Validation"),
    (X_test, y_test, "Test"),
):
    evaluate_regression_performance(features, targets, split_name)


In [None]:
#Outputting Best Parameters, saving model 

# Winning hyperparameter combination from the LOS regression grid search
best_params_los = grid_search.best_params_
print(best_params_los)

# Save the LOS regressor. `best_model` is the adoption classifier from the
# earlier grid search, so saving it here would write the wrong model under the LOS filename.
best_model_los.save_model("best_xgb_model_los_local.json")


In [None]:
#Show scatterplot of Test

# Predict on test set
y_test_pred = best_model_los.predict(X_test)

# Keep only the points where both actual and predicted stays are under 100 days
mask = (y_test < 100) & (y_test_pred < 100)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test[mask], y_test_pred[mask], alpha=0.5)
ax.plot([0, 100], [0, 100], 'r--', linewidth=2)  # y = x reference line
ax.set_xlabel("Actual stay_length_days (<100)")
ax.set_ylabel("Predicted stay_length_days (<100)")
ax.set_title("Actual vs Predicted Stay Length (Test Set, <100 days)")
ax.grid(True)
plt.show()



## Training for XGBoost Endpoint Length of Stay

In [None]:
#Seeing our DFs: LOS training partition (split == 'train')
los_predicted_df_train

In [None]:
# LOS validation partition (split == 'validate')
los_predicted_df_validate

In [None]:
# LOS test partition (split == 'test' and predicted_proba >= 0.5)
los_predicted_df_test

In [None]:
#Remembering our columns so we can pass the right features to XGboost

all_columns = list(los_predicted_df_test.columns)
print(all_columns)
print("Total columns for los_predicted structure:", len(all_columns))



In [None]:
#Data Prep from above work to S3 so sagemaker can access

#setup
import boto3
import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.serializers import CSVSerializer
import pandas as pd

# sagemaker setup
# NOTE(review): `role` is pinned to a literal ARN here, replacing the value the first
# setup cell obtained from get_execution_role() — confirm this is intended.
session = sagemaker.Session()
role = "arn:aws:iam::917456409349:role/Sagemaker_Execution_Role"

# Destination bucket and key prefix; data files go under "<prefix>/data"
bucket = "xgb-los-multi"
prefix = "xgb-los-model-code"
data_prefix = f"{prefix}/data"

#data prep
# The 21 model inputs, in the column order used for the CSV files (stay_length_days excluded)
feature_columns = [
    'Encoded-animal_type',
    'Encoded-primary_breed_harmonized',
    'Encoded-primary_color_harmonized',
    'Encoded-sex',
    'Encoded-intake_type_harmonized',
    'Encoded-Is_returned',
    'Encoded-has_name',
    'Encoded-is_mix',
    'age_months',
    'Num_returned',
    'min_height',
    'max_height',
    'min_weight',
    'max_weight',
    'min_expectancy',
    'max_expectancy',
    'grooming_frequency_value',
    'shedding_value',
    'energy_level_value',
    'trainability_value',
    'demeanor_value'
]
# Regression label
target_column = "stay_length_days"

#dataframe prep function
def prepare_for_sagemaker(df):
    """Return ``df`` reduced to the label and feature columns, label first.

    The result holds ``target_column`` followed by ``feature_columns`` in
    their listed order; every other column of ``df`` is dropped.
    """
    ordered_columns = [target_column, *feature_columns]  # move target to front for sagemaker
    return df[ordered_columns]

# Prepare datasets
train_df = prepare_for_sagemaker(los_predicted_df_train)
val_df = prepare_for_sagemaker(los_predicted_df_validate)
test_df = prepare_for_sagemaker(los_predicted_df_test)

# Save as CSV (no header/index)
# NOTE: "train.csv" and "test.csv" are the same local names the earlier train/test split cell
# wrote (with headers), so those files are overwritten here.
local_exports = {"train.csv": train_df, "validation.csv": val_df, "test.csv": test_df}
for filename, frame in local_exports.items():
    frame.to_csv(filename, header=False, index=False)

#S3 Upload for sagemaker to access
s3 = boto3.client("s3")
for filename in local_exports:
    s3.upload_file(filename, bucket, f"{data_prefix}/{filename}")

print(" Uploaded to:")
for filename in local_exports:
    print(f"s3://{bucket}/{data_prefix}/{filename}")

# Training channels consumed by the estimator cell
train_s3_path = f"s3://{bucket}/{data_prefix}/train.csv"
val_s3_path = f"s3://{bucket}/{data_prefix}/validation.csv"


In [None]:
#Now we access S3 to run our endpoint and deployment training

#container image for XGBoost
# NOTE(review): this requests version 1.5-1 while the setup section resolved 1.7-1 — confirm which is intended
container = sagemaker.image_uris.retrieve(
    framework="xgboost", region=session.boto_region_name, version="1.5-1"
)

# Create estimator
estimator_settings = dict(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size=5,
    max_run=3600,
    output_path=f"s3://{bucket}/{prefix}/output",
    sagemaker_session=session,
)
xgb_estimator_los = Estimator(**estimator_settings)

# Set hyperparameters for training
los_hyperparameters = {
    "objective": "reg:squarederror",
    "gamma": 0,
    "eta": 0.05,               # learning_rate
    "max_depth": 5,
    "min_child_weight": 6,
    "subsample": 0.6,
    "num_round": 50,           # n_estimators equivalent
}
xgb_estimator_los.set_hyperparameters(**los_hyperparameters)

print(" Starting training job...")
training_channels = {
    channel: sagemaker.inputs.TrainingInput(s3_path, content_type="text/csv")
    for channel, s3_path in (("train", train_s3_path), ("validation", val_s3_path))
}
xgb_estimator_los.fit(training_channels)





In [None]:
#Last, we deploy

from sagemaker.serializers import CSVSerializer


print(" Deploying endpoint...")
endpoint_settings = {
    "initial_instance_count": 1,
    "instance_type": "ml.m5.large",
    "endpoint_name": "xgb-los-endpoint",
}
xgb_predictor_los = xgb_estimator_los.deploy(**endpoint_settings)

# Configure predictor so request payloads are sent as CSV
xgb_predictor_los.serializer = CSVSerializer()
print ("endpoitn is live:",xgb_predictor_los.endpoint_name)



In [None]:
#Now we test the endpoint using test_df (the frame that was written to test.csv)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sagemaker.serializers import CSVSerializer

#Prepare features
# Same label and 21 inputs, in the same order, as the data-prep cell
target_column = "stay_length_days"
feature_columns = [
    'Encoded-animal_type',
    'Encoded-primary_breed_harmonized',
    'Encoded-primary_color_harmonized',
    'Encoded-sex',
    'Encoded-intake_type_harmonized',
    'Encoded-Is_returned',
    'Encoded-has_name',
    'Encoded-is_mix',
    'age_months',
    'Num_returned',
    'min_height',
    'max_height',
    'min_weight',
    'max_weight',
    'min_expectancy',
    'max_expectancy',
    'grooming_frequency_value',
    'shedding_value',
    'energy_level_value',
    'trainability_value',
    'demeanor_value'
]

# NOTE(review): missing feature values are replaced with 0 for inference, whereas the training
# CSVs were written without any fill — confirm the two treatments are meant to differ.
X_test = test_df[feature_columns].fillna(0)
y_test = test_df[target_column].values

print(f" Using {len(X_test)} rows and {len(feature_columns)} features for inference")

#Send batched requests to the endpoint
def _parse_prediction_payload(raw):
    """Convert one endpoint response into a list of floats.

    Accepts bytes or text and tolerates predictions separated by newlines,
    commas or a mix of both. An empty or whitespace-only response yields [].
    """
    text = raw.decode("utf-8") if isinstance(raw, (bytes, bytearray)) else str(raw)
    return [float(token) for token in text.replace(",", "\n").split()]


xgb_predictor_los.serializer = CSVSerializer()
predictions = []
batch_size = 100

for i in range(0, X_test.shape[0], batch_size):
    batch = X_test.iloc[i:i+batch_size]
    # One CSV line per row, features only (no label, no header)
    payload = "\n".join([",".join(map(str, row)) for row in batch.to_numpy()])

    if i == 0:
        print("Sample payload being sent:", payload.split("\n")[0])

    response = xgb_predictor_los.predict(payload)
    batch_predictions = _parse_prediction_payload(response)

    if not batch_predictions:
        print(f" Empty response for batch {i}-{i+batch_size}")
    elif len(batch_predictions) != len(batch):
        # A short or long batch would shift every later prediction against its actual value
        raise ValueError(
            f"Endpoint returned {len(batch_predictions)} predictions for "
            f"{len(batch)} rows in batch {i}-{i+batch_size}"
        )

    predictions.extend(batch_predictions)

print(f" Predictions received: {len(predictions)} rows")

# Fail here with a clear message instead of an opaque length error when the results are assembled
if len(predictions) != len(X_test):
    raise ValueError(
        f"Expected {len(X_test)} predictions but received {len(predictions)}; "
        "at least one batch came back empty"
    )

#combine and evaluate
results_df = pd.DataFrame({"Actual": y_test, "Predicted": predictions})
actual = results_df["Actual"]
predicted = results_df["Predicted"]

rmse = np.sqrt(mean_squared_error(actual, predicted))
r2 = r2_score(actual, predicted)
print(f" RMSE: {rmse:.4f}")
print(f" R²: {r2:.4f}")

#Scatter plot
# Keep only the points where both actual and predicted stays are under 100 days
mask = (actual < 100) & (predicted < 100)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(actual[mask], predicted[mask], alpha=0.5)
ax.plot([0, 100], [0, 100], 'r--', linewidth=2)  # y = x reference line
ax.set_xlabel("Actual stay_length_days (<100)")
ax.set_ylabel("Predicted stay_length_days (<100)")
ax.set_title("Actual vs Predicted Stay Length (Endpoint Inference, <100 days)")
ax.grid(True)
plt.show()


In [None]:
#We get very similar results, so model is operating as expected locally and via endpoint

In [None]:
# `checkpoint_s3_uri` is never assigned anywhere in this notebook (the estimator is
# created without a checkpoint location), so evaluating the bare name raised
# NameError on Restart & Run All. Left disabled until a checkpoint URI is configured.
# checkpoint_s3_uri

In [None]:
# Final look at the frame used for endpoint testing (label first, then the 21 features)
test_df