In [None]:
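#This notebook runs our SHAP process once over the training data to get the mean SHAP value of each feature, using the same SHAP process.
#The resulting summary is then stored in S3 (s3://dockerevalcontainer/processing/input/code/) for use in our Lambda functions.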

In [None]:
#The training file was moved over from the other notebook; it does not include predictions yet.
#File: train.csv


In [None]:
# Inspect the column names of the training file so we can confirm they fit
# the model packaged in model.tar.gz.
import pandas as pd

train_df = pd.read_csv('train.csv')

# Materialise the column index as a plain Python list
columns_list = list(train_df.columns)

print(f"Total columns: {len(columns_list)}")
print(columns_list)
# Observation: we have encoded columns


In [None]:
# Preview the first five rows of the training frame
train_df.head(n=5)

In [None]:
# Score the training set with the trained XGBoost model and persist the result
import pandas as pd
import xgboost as xgb

# Predicted probabilities at or above this value are labelled 1
DECISION_THRESHOLD = 0.5

# Training data straight from disk
train_df = pd.read_csv('train.csv')

# Model artifact pulled from the training XGBoost_Adoption.ipynb folder to ensure alignment
booster = xgb.Booster()
booster.load_model("model/xgboost-model")

# Feature columns (must match training); the "Encoded-" columns come first
encoded_columns = [
    'Encoded-animal_type',
    'Encoded-primary_breed_harmonized',
    'Encoded-primary_color_harmonized',
    'Encoded-sex',
    'Encoded-intake_type_harmonized',
    'Encoded-Is_returned',
    'Encoded-has_name',
    'Encoded-is_mix',
]
unprefixed_columns = [
    'age_months',
    'Num_returned',
    'stay_length_days',
    'min_height',
    'max_height',
    'min_weight',
    'max_weight',
    'min_expectancy',
    'max_expectancy',
    'grooming_frequency_value',
    'shedding_value',
    'energy_level_value',
    'trainability_value',
    'demeanor_value',
]
feature_columns = encoded_columns + unprefixed_columns

# Wrap the selected features in a DMatrix for the booster
X_train = train_df[feature_columns]
dtrain = xgb.DMatrix(X_train, feature_names=feature_columns)

# One predicted probability per row
pred_proba = booster.predict(dtrain)

# Append the probability and the thresholded label to the original columns
train_df = train_df.assign(
    predicted_proba=pred_proba,
    predicted_label=(pred_proba >= DECISION_THRESHOLD).astype(int),
)

# Write the enriched frame out for the following cells
train_df.to_csv('train_with_predictions.csv', index=False)

print("Predictions added and saved to train_with_predictions.csv")


In [None]:
# Sanity check that the loaded model is still good: compare the stored
# predictions against the known ground-truth column using AUC and other metrics.
# NOTE: these are in-sample metrics (the rows come from train.csv), so they
# confirm model/feature alignment rather than generalisation.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

# Training rows together with the predictions written by the previous cell
df = pd.read_csv('train_with_predictions.csv')

# Ground truth, predicted probability and thresholded label
y_true = df['outcome_type_harmonized_grouped'].astype(int)
y_pred_proba = df.loc[:, 'predicted_proba']
y_pred_label = df.loc[:, 'predicted_label']

# Precision / recall / F1 per class
print(" Classification Report:")
print(classification_report(y_true, y_pred_label, zero_division=0))

# Ranking quality of the raw probabilities
auc_score = roc_auc_score(y_true, y_pred_proba)
print(f"AUC Score: {auc_score:.4f}")

# Confusion matrix rendered as a heat map
cm = confusion_matrix(y_true, y_pred_label)
disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap='Blues')
disp.ax_.set_title("Confusion Matrix")
plt.show()


In [None]:
# Great, so our model holds up well and we're using the right model.
# Now we run train_with_predictions.csv through our SHAP process to get, for the
# training set, the average SHAP value of each feature. We can compare that in our
# model pipeline for our dashboarding later.
#
# Output: global_shap_summary_2rows.csv with one column per feature and three rows:
#   Key_Final_with_Shap - display label used to map each feature
#   Mean_SHAP           - signed mean contribution
#   Abs_Mean_SHAP       - mean absolute contribution

import pandas as pd
import xgboost as xgb
import numpy as np

df = pd.read_csv('train_with_predictions.csv')

# Feature columns (same list and order as the prediction cell)
feature_columns = [
    'Encoded-animal_type',
    'Encoded-primary_breed_harmonized',
    'Encoded-primary_color_harmonized',
    'Encoded-sex',
    'Encoded-intake_type_harmonized',
    'Encoded-Is_returned',
    'Encoded-has_name',
    'Encoded-is_mix',
    'age_months',
    'Num_returned',
    'stay_length_days',
    'min_height',
    'max_height',
    'min_weight',
    'max_weight',
    'min_expectancy',
    'max_expectancy',
    'grooming_frequency_value',
    'shedding_value',
    'energy_level_value',
    'trainability_value',
    'demeanor_value'
]

# Load model
booster = xgb.Booster()
booster.load_model("model/xgboost-model")

# Prepare data
X_train = df[feature_columns]
dtrain = xgb.DMatrix(X_train, feature_names=feature_columns)

# Compute per-row feature contributions (SHAP values plus a trailing bias column)
contribs = booster.predict(dtrain, pred_contribs=True)

# Fail fast if the output is not the 2-D (rows, features + 1) matrix the slice
# below assumes; a multi-class booster returns a 3-D array, for which
# `[:, :-1]` would drop a class instead of the bias column.
if contribs.ndim != 2 or contribs.shape[1] != len(feature_columns) + 1:
    raise ValueError(
        "Unexpected pred_contribs shape "
        f"{contribs.shape}; expected (n_rows, {len(feature_columns) + 1})"
    )

shap_values = contribs[:, :-1]  # remove bias column

# Compute metrics
mean_shap = shap_values.mean(axis=0)  # positive/negative mean
abs_mean_shap = np.abs(shap_values).mean(axis=0)  # absolute mean

# Combine into dataframe: one row per metric, one column per feature
global_shap_df = pd.DataFrame([mean_shap, abs_mean_shap],
                              index=['Mean_SHAP', 'Abs_Mean_SHAP'],
                              columns=feature_columns)

# Key_Final_with_Shap row: feature column -> label used for mapping
mapping_row = {
    'Encoded-animal_type': 'SHAP-Animal Type',
    'Encoded-primary_breed_harmonized': 'SHAP-Primary Breed Harmonized',
    'Encoded-primary_color_harmonized': 'SHAP-Primary Color Harmonized',
    'Encoded-sex': 'SHAP-Sex',
    'Encoded-intake_type_harmonized': 'SHAP-Intake Type Harmonized',
    'Encoded-Is_returned': 'SHAP-Is Returned',
    'Encoded-has_name': 'SHAP-Has Name',
    'Encoded-is_mix': 'SHAP-Is Mix',
    'age_months': 'SHAP-Age Months',
    'Num_returned': 'SHAP-Num Returned',
    'stay_length_days': 'SHAP-Stay Length Days',
    'min_height': 'SHAP-Min Height',
    'max_height': 'SHAP-Max Height',
    'min_weight': 'SHAP-Min Weight',
    'max_weight': 'SHAP-Max Weight',
    'min_expectancy': 'SHAP-Min Expectancy',
    'max_expectancy': 'SHAP-Max Expectancy',
    'grooming_frequency_value': 'SHAP-Grooming Frequency Value',
    'shedding_value': 'SHAP-Shedding Value',
    'energy_level_value': 'SHAP-Energy Level Value',
    'trainability_value': 'SHAP-Trainability Value',
    'demeanor_value': 'SHAP-Demeanor Value'
}

# The row assignment below aligns the dict to the frame's columns by key, so a
# feature without a label would silently end up as NaN in the exported CSV.
# Check the two lists agree before writing anything.
unlabelled = [col for col in feature_columns if col not in mapping_row]
unknown = [col for col in mapping_row if col not in feature_columns]
if unlabelled or unknown:
    raise ValueError(
        "mapping_row and feature_columns are out of sync; "
        f"features without a label: {unlabelled}; labels without a feature: {unknown}"
    )

# Insert Key_Final_with_Shap row (the columns become object dtype once the
# string labels are added)
global_shap_df.loc['Key_Final_with_Shap'] = mapping_row

# Reorder rows so Key_Final_with_Shap is right after the header
global_shap_df = global_shap_df.reindex(['Key_Final_with_Shap', 'Mean_SHAP', 'Abs_Mean_SHAP'])

# Save (the row labels are written as the first CSV column)
global_shap_df.to_csv('global_shap_summary_2rows.csv', index=True)

print("File saved with Key_Final_with_Shap as second row:")
print(global_shap_df.head(3))

In [None]:
# Display the global SHAP summary table (the Key_Final_with_Shap label row
# followed by the Mean_SHAP and Abs_Mean_SHAP rows) for a visual check.
global_shap_df

In [None]:
# Final step: publish the summary CSV to S3 for use in our Lambda functions.
# Destination prefix: s3://dockerevalcontainer/processing/input/code/
import boto3

# Local source file and its S3 destination
local_file = 'global_shap_summary_2rows.csv'
bucket = 'dockerevalcontainer'
key = 'processing/input/code/global_shap_summary_2rows.csv'

# Credentials and region are resolved by boto3's default configuration chain
s3 = boto3.client('s3')
s3.upload_file(Filename=local_file, Bucket=bucket, Key=key)

print(f"Uploaded to S3: s3://{bucket}/{key}")