# Data Loading and Cleaning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from typing import Dict, List, Optional, Set, Tuple, Optional, Union
from datetime import datetime
!pip install fuzzywuzzy
from fuzzywuzzy import fuzz, process
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.inspection import permutation_importance
from xgboost import plot_importance
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier



In [None]:
# File paths
lb_path = "LB_animal-shelter-intakes-and-outcomes.csv"
sj_path = "combined_SJ_animalshelterdata.csv"
bloomington_path = "Bloomington_Animal_Shelter_Animals.csv"

# Try reading the files, detect separator if needed (assuming comma for now)
df_lb = pd.read_csv(lb_path)
df_sj = pd.read_csv(sj_path)
df_bloom = pd.read_csv(bloomington_path)

# Filter and standardize
df_lb = df_lb[
    df_lb['Animal Type'].astype(str).str.strip().str.lower().isin(['dog', 'cat'])
].copy()

df_sj = df_sj[
    df_sj['AnimalType'].astype(str).str.strip().str.lower().isin(['dog', 'cat'])
].copy()

df_bloom = df_bloom[
    df_bloom['speciesname'].astype(str).str.strip().str.lower().isin(['dog', 'cat'])
].copy()

# Normalize the values to lowercase
df_lb['Animal Type'] = df_lb['Animal Type'].astype(str).str.strip().str.lower()
df_sj['AnimalType'] = df_sj['AnimalType'].astype(str).str.strip().str.lower()
df_bloom['speciesname'] = df_bloom['speciesname'].astype(str).str.strip().str.lower()


# Check their shapes and first few columns
lb_info = (df_lb.shape, list(df_lb.columns))
sj_info = (df_sj.shape, list(df_sj.columns))
bloom_info = (df_bloom.shape, list(df_bloom.columns))

lb_info, sj_info, bloom_info

(((25263, 23),
  ['Animal ID',
   'Animal Name',
   'Animal Type',
   'Primary Color',
   'Secondary Color',
   'Sex',
   'DOB',
   'Intake Date',
   'Intake Condition',
   'Intake Type',
   'Intake Subtype',
   'Reason for Intake',
   'Outcome Date',
   'Crossing',
   'Jurisdiction',
   'Outcome Type',
   'Outcome Subtype',
   'latitude',
   'longitude',
   'intake_is_dead',
   'outcome_is_dead',
   'was_outcome_alive',
   'geopoint']),
 ((107685, 21),
  ['AnimalID',
   'AnimalName',
   'AnimalType',
   'PrimaryColor',
   'SecondaryColor',
   'PrimaryBreed',
   'Sex',
   'DOB',
   'Age',
   'IntakeDate',
   'IntakeCondition',
   'IntakeType',
   'IntakeSubtype',
   'IntakeReason',
   'OutcomeDate',
   'OutcomeType',
   'OutcomeSubtype',
   'OutcomeCondition',
   'Crossing',
   'Jurisdiction',
   'LastUpdate']),
 ((38255, 23),
  ['id',
   'intakedate',
   'intakereason',
   'istransfer',
   'sheltercode',
   'identichipnumber',
   'animalname',
   'breedname',
   'basecolour',
   'spec

In [None]:
df_lb['Animal Type'].value_counts()

Unnamed: 0_level_0,count
Animal Type,Unnamed: 1_level_1
cat,14960
dog,10303


In [None]:
df_lb['Outcome Date'] = pd.to_datetime(df_lb['Outcome Date'], errors='coerce')
df_lb['outcome_year'] = df_lb['Outcome Date'].dt.year
outcome_year_counts_lb = df_lb['outcome_year'].value_counts().sort_index()

df_sj['OutcomeDate'] = pd.to_datetime(df_sj['OutcomeDate'], errors='coerce')
df_sj['outcome_year'] = df_sj['OutcomeDate'].dt.year
outcome_year_counts_sj = df_sj['outcome_year'].value_counts().sort_index()

df_bloom['movementdate'] = pd.to_datetime(df_bloom['movementdate'], errors='coerce')
df_bloom['outcome_year'] = df_bloom['movementdate'].dt.year
outcome_year_counts_bloom = df_bloom['outcome_year'].value_counts().sort_index()

print(outcome_year_counts_lb)
print(outcome_year_counts_sj)
print(outcome_year_counts_bloom)

  df_bloom['movementdate'] = pd.to_datetime(df_bloom['movementdate'], errors='coerce')


outcome_year
2017.0    3975
2018.0    3674
2019.0    3313
2020.0    2587
2021.0    2127
2022.0    2411
2023.0    2965
2024.0    3058
2025.0    1016
Name: count, dtype: int64
outcome_year
2018.0     8987
2019.0    17388
2020.0    12964
2021.0    16284
2022.0    10956
2023.0    15987
2024.0    17930
2025.0     6171
2034.0        1
Name: count, dtype: int64
outcome_year
2013.0       1
2016.0       1
2017.0    3037
2018.0    3810
2019.0    4326
2020.0    3312
2021.0    3632
2022.0    3516
2023.0    4226
2024.0    4249
2025.0    1716
Name: count, dtype: int64


In [None]:
primary_colors_counts_lb = df_lb['Primary Color'].value_counts()
print(primary_colors_counts_lb.head(30))

primary_colors_counts_sj = df_sj['PrimaryColor'].value_counts()
print(primary_colors_counts_sj.head(30))

Primary Color
BLACK         6840
WHITE         3226
BRN TABBY     2365
GRAY          2335
BROWN         1977
TAN           1472
GRAY TABBY    1237
ORG TABBY      928
CALICO         544
ORANGE         540
TORTIE         492
BR BRINDLE     337
TRICOLOR       305
CREAM          258
SEAL PT        242
BLUE           230
RED            167
TORTIE DIL     123
LYNX PT        123
FAWN           113
CALICO DIL     112
GOLD           102
UNKNOWN         92
CALICO TAB      86
CRM TABBY       82
BLONDE          81
CHOCOLATE       77
BLK TABBY       72
BL BRINDLE      56
CHOC PT         55
Name: count, dtype: int64
PrimaryColor
BLACK         30665
TABBY-BRN     16092
WHITE          9982
GRAY           9181
TABBY-ORG      6898
TABBY-GRAY     5602
BROWN          5534
TAN            4682
TORTIE-B\O     2333
CALICO-TRI     2063
TORBI-BRN      1336
TRICOLOR       1269
ORANGE         1214
PT-LYNX        1066
BRINDLE-BN     1055
CREAM           936
CALICO-DIL      808
TABBY-BUFF      713
PT-SEAL         7

In [None]:
secondary_colors_counts_lb = df_lb['Secondary Color'].value_counts()
print(secondary_colors_counts_lb.head(30))

secondary_colors_counts_sj = df_sj['SecondaryColor'].value_counts()
print(secondary_colors_counts_sj.head(30))

Secondary Color
WHITE         7972
BLACK         1373
BROWN         1201
TAN            797
GRAY           501
BRN TABBY      118
ORANGE          90
CREAM           89
CALICO          45
BR BRINDLE      43
GRAY TABBY      42
RED             35
TRICOLOR        32
SILVER          26
ORG TABBY       23
BLUE            20
TORTIE          20
GOLD            19
FAWN             7
BL BRINDLE       5
LYNX PT          5
YELLOW           4
FLAME PT         4
SABLE            3
BLUE TICK        2
BUFF             2
BLK TABBY        2
TORTIE DIL       2
TORBI            2
WHEAT            2
Name: count, dtype: int64
SecondaryColor
WHITE         32180
BLACK          4171
BROWN          2807
TAN            2440
GRAY           1409
TABBY-BRN       387
ORANGE          260
CREAM           240
TABBY-GRAY      130
TABBY-ORG       109
RED              89
BRINDLE-BN       82
OTHER            61
TRICOLOR         49
GOLD             42
SABLE            33
TORBI-BRN        30
CALICO-TRI       24
CHOCOLATE    

In [None]:
sex_counts_lb = df_lb['Sex'].value_counts()
print(sex_counts_lb.head(30))

sex_count_sj = df_sj['Sex'].value_counts()
print(sex_count_sj.head(30))

Sex
Male        6673
Female      6099
Neutered    5685
Spayed      4824
Unknown     1982
Name: count, dtype: int64
Sex
NEUTERED    33245
SPAYED      32002
UNKNOWN     18126
MALE        13198
FEMALE      11114
Name: count, dtype: int64


In [None]:
intakecondition_counts_lb = df_lb['Intake Condition'].value_counts()
print(intakecondition_counts_lb.head(30))

intakecondition_counts_sj = df_sj['IntakeCondition'].value_counts()
print(intakecondition_counts_sj.head(30))

Intake Condition
NORMAL                13189
UNDER AGE/WEIGHT       5631
ILL MILD               1063
INJURED  SEVERE         918
ILL SEVERE              783
FRACTIOUS               743
ILL MODERATETE          727
INJURED  MILD           615
INJURED  MODERATE       552
FERAL                   301
I/I REPORT              242
AGED                    157
BEHAVIOR  MODERATE      129
BEHAVIOR  MILD           74
BEHAVIOR  SEVERE         72
WELFARE SEIZURES         67
Name: count, dtype: int64
IntakeCondition
NORMAL        35510
HEALTHY       15828
OTHER MED     14865
DEAD          10950
NURSING        9865
MED R          7079
FERAL          3319
MED SEV        2552
INJURED        1898
MED M          1683
SICK           1470
PREGNANT        868
MED EMERG       460
BEH M           378
UNHEALTHY       338
BEH U           253
AGGRESSIVE      162
BEH R           133
FEARFUL          39
AGED             27
MANAGE            4
REHAB             4
Name: count, dtype: int64


In [None]:
intaketype_counts_lb = df_lb['Intake Type'].value_counts()
print(intaketype_counts_lb.head(30))

intaketype_counts_sj = df_sj['IntakeType'].value_counts()
print(intaketype_counts_sj.head(30))


Intake Type
STRAY                    21129
OWNER SURRENDER           2512
WELFARE SEIZED             511
CONFISCATE                 407
RETURN                     337
QUARANTINE                 171
SAFE KEEP                  102
TRAP, NEUTER, RETURN        87
WILDLIFE                     2
FOSTER                       2
Adopted Animal Return        2
Euthenasia Required          1
Name: count, dtype: int64
IntakeType
STRAY         75914
DISPO REQ     10723
FOSTER         6252
OWNER SUR      4934
S/N CLINIC     3784
CONFISCATE     2435
RETURN         1505
TRANSFER       1380
EUTH REQ        668
DISASTER         37
NEUTER           26
SPAY             24
WILDLIFE          3
Name: count, dtype: int64


In [None]:
intakesubtype_counts_lb = df_lb['Intake Subtype'].value_counts()
print(intakesubtype_counts_lb.head(30))

intakesubtype_counts_sj = df_sj['IntakeSubtype'].value_counts()
print(intakesubtype_counts_sj.head(30))


Intake Subtype
OTC           14682
FIELD          8009
TRAP            522
POLICE          437
ADOPTION        223
HOSPITAL        165
ABAN FIELD      151
OWNER DIED      122
ABAN SHLTR      112
CRUELTY          95
BORN@SHELT       58
AT SHELTER       57
BITE             41
EVICTION         39
RTF              25
SPCALA           22
VICIOUS          18
PUB SAFETY       13
EMERGENCY        12
ABANDON           7
RESCUE            5
INVESTIGAT        2
RED CROSS         1
Name: count, dtype: int64
IntakeSubtype
OTC           72494
FIELD         14651
MEDVET         1990
FERAL          1633
OTC OWNED      1387
FOUND          1365
ASO             918
POLICE          890
OTHER AGEN      846
SVVS            643
EAC             596
HOSPITAL        188
RESCUE AS       178
COUNTY          150
BITE            134
OWNER DIED      130
CRUELTY         126
OWNED           124
SVACA           117
FIELD OWN       105
RESCUE NON       51
EVICTION         46
HSSV             37
FOSTER           18
STRAY

In [None]:
intakereason_counts_lb = df_lb['Reason for Intake'].value_counts()
print(intakereason_counts_lb.head(30))

intakereason_counts_sj = df_sj['IntakeReason'].value_counts()
print(intakereason_counts_sj.head(30))


Reason for Intake
OWNER PROB    410
MOVE          226
LANDLORD      167
COST          115
NO HOME       109
NO TIME       108
BITES          96
OWNER DIED     89
AGG ANIMAL     87
TOO MANY       75
ILL            73
ALLERGIC       63
OTHER PET      56
AGG PEOPLE     52
ABANDON        34
POOR HELTH     27
HYPER          27
INJURED        22
ESCAPES        19
UNKNOWN        17
CHILD PROB     12
NEW BABY       11
DIVORCE         9
HOUSE SOIL      8
RESPONSIBL      8
NOFRIENDLY      8
AFRAID          8
DESTRUC IN      6
JUMPS UP        6
ATTENTION       5
Name: count, dtype: int64
IntakeReason
IP ADOPT      2429
IP EUTH       1033
LANDLORD       346
DOA            334
MOVE           317
OWNER PROB     220
NO TIME        218
ALLERGIC       165
OTHER PET      146
NO HOME        136
TOO MANY       131
RESPONSIBL     117
BITES           98
COST            89
AGG PEOPLE      70
AGG ANIMAL      68
OWNER DIED      53
HOUSE SOIL      47
HYPER           45
CHILD PROB      37
EUTH ILL        36
ESCA

In [None]:
outcometype_counts_lb = df_lb['Outcome Type'].value_counts()
print(outcometype_counts_lb.head(30))

outcometype_counts_sj = df_sj['OutcomeType'].value_counts()
print(outcometype_counts_sj.head(30))


Outcome Type
ADOPTION                   6104
RESCUE                     5857
TRANSFER                   3810
EUTHANASIA                 3405
RETURN TO OWNER            3374
SHELTER, NEUTER, RETURN     955
COMMUNITY CAT               475
DIED                        452
FOSTER TO ADOPT             232
TRANSPORT                   129
HOMEFIRST                    75
TRAP, NEUTER, RELEASE        71
MISSING                      66
RETURN TO RESCUE             39
DISPOSAL                     31
DUPLICATE                    26
RETURN TO WILD HABITAT       11
FOSTER                        7
Name: count, dtype: int64
OutcomeType
ADOPTION      22661
RESCUE        20119
TRANSFER      18755
DISPOSAL      10911
RTO           10153
EUTH           7618
FOSTER         6497
RTF            5469
DIED           1863
FOUND EXP       701
FOUND ANIM      660
SPAY            524
NEUTER          418
LOST EXP        146
S/N UNABLE       73
MISSING          51
REQ EUTH         39
Name: count, dtype: int64


In [None]:
outcomesubtype_count_lb = df_lb['Outcome Subtype'].value_counts()
print(outcomesubtype_count_lb.head(30))

outcomesubtype_count_sj = df_sj['OutcomeSubtype'].value_counts()
print(outcomesubtype_count_sj.head(30))


Outcome Subtype
SPCALA        4039
WALKIN        2974
WEB           2854
ILL SEVERE    1335
LITTLELION    1104
STRAYCATAL    1029
LITTLEPAWS     955
CATPAWS        685
AT VET         613
FNDANIFOUN     574
UNDRAGE/WT     549
FRE RID HM     512
PFE/PAWSHP     493
OTHER RESC     479
INJ SEVERE     432
MICROCHIP      270
LIVELOVE       236
SBACC          207
BEH SEVERE     203
IN KENNEL      196
CAPAWS         185
ILL MODERA     151
FRIENDLY       146
COMMCAT        127
ENROUTE        112
BEH MODERA     102
KITTYBUNGA     101
SPEC EVT        89
SPARKYGANG      88
OTHER SHEL      81
Name: count, dtype: int64
Series([], Name: count, dtype: int64)


## Standarize and align columns across datasets

In [None]:
# Step 1: Standardize and align columns across datasets
def standardize_columns(df, mapping, shelter_name):
    df = df.rename(columns=mapping)
    df['shelter'] = shelter_name
    return df

# Define mappings from each dataset to a common schema
# We'll define a core set of columns that make sense across all datasets
common_columns = {
    'animal_id': 'animal_id',
    'animal_name': 'animal_name',
    'animal_type': 'animal_type',
    'primary_color': 'primary_color',
    'secondary_color': 'secondary_color',
    'primary_breed': 'primary_breed',
    'sex': 'sex',
    'dob': 'dob',
    'intake_date': 'intake_date',
    'intake_condition': 'intake_condition',
    'intake_type': 'intake_type',
    'intake_subtype': 'intake_subtype',
    'intake_reason': 'intake_reason',
    'outcome_date': 'outcome_date',
    'outcome_type': 'outcome_type',
    'outcome_subtype': 'outcome_subtype'
}

# Long Beach mapping
lb_mapping = {
    'Animal ID': 'animal_id',
    'Animal Name': 'animal_name',
    'Animal Type': 'animal_type',
    'Primary Color': 'primary_color',
    'Secondary Color': 'secondary_color',
    'Sex': 'sex',
    'DOB': 'dob',
    'Intake Date': 'intake_date',
    'Intake Condition': 'intake_condition',
    'Intake Type': 'intake_type',
    'Intake Subtype': 'intake_subtype',
    'Reason for Intake': 'intake_reason',
    'Outcome Date': 'outcome_date',
    'Outcome Type': 'outcome_type',
    'Outcome Subtype': 'outcome_subtype'
}

# San Jose mapping
sj_mapping = {
    'AnimalID': 'animal_id',
    'AnimalName': 'animal_name',
    'AnimalType': 'animal_type',
    'PrimaryColor': 'primary_color',
    'SecondaryColor': 'secondary_color',
    'PrimaryBreed': 'primary_breed',
    'Sex': 'sex',
    'DOB': 'dob',
    'IntakeDate': 'intake_date',
    'IntakeCondition': 'intake_condition',
    'IntakeType': 'intake_type',
    'IntakeSubtype': 'intake_subtype',
    'IntakeReason': 'intake_reason',
    'OutcomeDate': 'outcome_date',
    'OutcomeType': 'outcome_type',
    'OutcomeSubtype': 'outcome_subtype'
}

# Bloomington mapping
bloom_mapping = {
    'id': 'animal_id',
    'animalname': 'animal_name',
    'speciesname': 'animal_type',
    'basecolour': 'primary_color',
    'breedname': 'primary_breed',
    'sexname': 'sex',
    'intakedate': 'intake_date',
    'intakereason': 'intake_reason',
    'movementdate': 'outcome_date',
    'movementtype': 'outcome_type',
    'deceaseddate': 'deceased_date',
    'returndate': 'return_date',
    'diedoffshelter': 'diedoffshelter',
    'puttosleep': 'puttosleep',
    'isdoa': 'isdoa',
}

# Apply standardization
df_lb_std = standardize_columns(df_lb, lb_mapping, 'Long Beach')
df_sj_std = standardize_columns(df_sj, sj_mapping, 'San Jose')
df_bloom_std = standardize_columns(df_bloom, bloom_mapping, 'Bloomington')


# Align all to common set of columns
final_columns = set(df_lb_std.columns) | set(df_sj_std.columns) | set(df_bloom_std.columns)
df_lb_std = df_lb_std.reindex(columns=final_columns)
df_sj_std = df_sj_std.reindex(columns=final_columns)
df_bloom_std = df_bloom_std.reindex(columns=final_columns)

### Separate Data to address intakedate is always the same in Bloomtington dataset ###
# Separate rows with duplicated IDs and without duplicated IDs
duplicated_bloom_mask = df_bloom_std.duplicated(subset="animal_id", keep=False)

# DataFrame with duplicated IDs
duplicated_bloom_df = df_bloom_std[duplicated_bloom_mask].copy()

# DataFrame without duplicated IDs
unique_bloom_df = df_bloom_std[~duplicated_bloom_mask].copy()

# Convert date columns to datetime
duplicated_bloom_df["intake_date"] = pd.to_datetime(duplicated_bloom_df["intake_date"], errors="coerce")
duplicated_bloom_df["return_date"] = pd.to_datetime(duplicated_bloom_df["return_date"], errors="coerce")
duplicated_bloom_df["outcome_date"] = pd.to_datetime(duplicated_bloom_df["outcome_date"], errors="coerce")

# Sort to ensure consistent ordering
duplicated_bloom_df.sort_values(by=["animal_id", "intake_date", "outcome_date"], inplace=True)

# Create a new column for adjusted intakedate
adjusted_intakedate = []

# Track previous returndate for each id
prev_returndate = {}

# Adjust intakedate for repeated IDs
for _, row in duplicated_bloom_df.iterrows():
    animal_id = row["animal_id"]
    if animal_id in prev_returndate:
        new_intakedate = prev_returndate[animal_id]
    else:
        new_intakedate = row["intake_date"]

    adjusted_intakedate.append(new_intakedate)

    # Update returndate if present
    if pd.notnull(row["return_date"]):
        prev_returndate[animal_id] = row["return_date"]

# Assign the adjusted intakedate
duplicated_bloom_df["intake_date"] = adjusted_intakedate

### Combine dataset ###
# Convert intakedate to datetime to match duplicated_df
unique_bloom_df["intake_date"] = pd.to_datetime(unique_bloom_df["intake_date"], errors="coerce")

# Add adjusted_intakedate column with original intakedate for unique entries
#unique_bloom_df["intake_date"] = unique_bloom_df["intake_date"]

# Combine both DataFrames
df_bloom_std = pd.concat([unique_bloom_df, duplicated_bloom_df], ignore_index=True)

# Sort for better readability
df_bloom_std.sort_values(by=["animal_id", "intake_date"], inplace=True)
### Finish Bloomtington "intake_date" correction ###

### Fill Bloomington 'outcome_type' blanks based on 'diedoffshelter', 'puttosleep', and 'isdoa' ###
def fill_bloomington_outcome(row):
    if pd.notna(row['outcome_type']):
        return row['outcome_type']
    if row.get('diedoffshelter') is True or row.get('isdoa') is True:
        return 'DIED'
    if row.get('puttosleep') is True:
        return 'EUTH'
    return 'UNKNOWN'

df_bloom_std['outcome_type'] = df_bloom_std.apply(fill_bloomington_outcome, axis=1)
df_bloom_std['outcome_type'] = df_bloom_std['outcome_type'].astype(str).str.upper()

### Finished filling blanks for Bloomington 'outcome_type'


# Combine all datasets
df_cat_dog = pd.concat([df_lb_std, df_sj_std, df_bloom_std], ignore_index=True)

# Show result
df_cat_dog.shape, list(df_cat_dog.columns), df_cat_dog['animal_type'].value_counts()

  duplicated_bloom_df["intake_date"] = pd.to_datetime(duplicated_bloom_df["intake_date"], errors="coerce")
  duplicated_bloom_df["return_date"] = pd.to_datetime(duplicated_bloom_df["return_date"], errors="coerce")
  unique_bloom_df["intake_date"] = pd.to_datetime(unique_bloom_df["intake_date"], errors="coerce")


((171203, 42),
 ['location',
  'animalage',
  'intake_date',
  'animal_name',
  'OutcomeCondition',
  'puttosleep',
  'latitude',
  'outcome_type',
  'secondary_color',
  'primary_breed',
  'outcome_year',
  'Age',
  'intake_type',
  'dob',
  'outcome_is_dead',
  'intake_is_dead',
  'sex',
  'LastUpdate',
  'istrial',
  'sheltercode',
  'animal_id',
  'outcome_subtype',
  'was_outcome_alive',
  'returnedreason',
  'deceased_date',
  'intake_subtype',
  'intake_condition',
  'return_date',
  'diedoffshelter',
  'animal_type',
  'longitude',
  'Jurisdiction',
  'primary_color',
  'geopoint',
  'outcome_date',
  'istransfer',
  'intake_reason',
  'Crossing',
  'isdoa',
  'identichipnumber',
  'shelter',
  'deceasedreason'],
 animal_type
 cat    114763
 dog     56440
 Name: count, dtype: int64)

In [None]:
# Define a function to extract primary, secondary breed and mix flag
def extract_breed_parts(breed):
    if pd.isna(breed):
        return pd.Series(["unknown", None, "no"])

    parts = breed.split("/")
    primary = parts[0].strip()

    if len(parts) > 1:
        secondary = parts[1].strip()
        is_mix = "yes"
    else:
        secondary = None
        is_mix = "no"

    return pd.Series([primary, secondary, is_mix])

# Apply to df_cat_dog using 'primary_breed'
df_cat_dog[["primary_breed_clean", "secondary_breed", "is_mix"]] = df_cat_dog["primary_breed"].apply(extract_breed_parts)


In [None]:
# Export the adjusted DataFrame to CSV
#export_path = "df_cat_dog.csv"
#df_cat_dog.to_csv(export_path, index=False)

In [None]:
print(df_cat_dog[df_cat_dog["animal_id"] == 64598].head(20))

              location          animalage          intake_date animal_name  \
143497  Adoptable Dogs  5 years 4 months.  2019-08-28 12:46:21        Tank   
143498  Adoptable Dogs  5 years 4 months.  2021-04-13 00:00:00        Tank   
143499  Adoptable Dogs  5 years 4 months.  2023-07-10 00:00:00        Tank   
143500  Adoptable Dogs  5 years 4 months.  2023-09-16 00:00:00        Tank   
143501  Adoptable Dogs  5 years 4 months.  2023-11-17 00:00:00        Tank   
143502  Adoptable Dogs  5 years 4 months.  2023-12-11 00:00:00        Tank   
143503  Adoptable Dogs  5 years 4 months.  2024-02-11 00:00:00        Tank   
143504  Adoptable Dogs  5 years 4 months.  2024-02-11 00:00:00        Tank   

       OutcomeCondition puttosleep  latitude outcome_type secondary_color  \
143497              NaN        0.0       NaN     ADOPTION             NaN   
143498              NaN        0.0       NaN     ADOPTION             NaN   
143499              NaN        0.0       NaN       FOSTER         

In [None]:
print(df_cat_dog.columns)


Index(['location', 'animalage', 'intake_date', 'animal_name',
       'OutcomeCondition', 'puttosleep', 'latitude', 'outcome_type',
       'secondary_color', 'primary_breed', 'outcome_year', 'Age',
       'intake_type', 'dob', 'outcome_is_dead', 'intake_is_dead', 'sex',
       'LastUpdate', 'istrial', 'sheltercode', 'animal_id', 'outcome_subtype',
       'was_outcome_alive', 'returnedreason', 'deceased_date',
       'intake_subtype', 'intake_condition', 'return_date', 'diedoffshelter',
       'animal_type', 'longitude', 'Jurisdiction', 'primary_color', 'geopoint',
       'outcome_date', 'istransfer', 'intake_reason', 'Crossing', 'isdoa',
       'identichipnumber', 'shelter', 'deceasedreason', 'primary_breed_clean',
       'secondary_breed', 'is_mix'],
      dtype='object')


## Continue Preprocessing and Cleaning

### a. Process NAs

#### i. Covert None/Null/blank to NaN for columns: intake_subtype, intake_reason, secondary_color, age

In [None]:
# List of columns to clean
cols_to_clean = ['intake_subtype', 'intake_reason', 'secondary_color', 'Age']

# Apply replacement
for col in cols_to_clean:
    df_cat_dog[col] = (
        df_cat_dog[col]
        .replace(r'^\s*$', np.nan, regex=True)  # blank or whitespace -> NaN
        .replace(r'(?i)^none$', np.nan, regex=True)  # 'None' (any case) -> NaN
    )


In [None]:
nan_count_intake_reason = df_cat_dog['intake_reason'].isna().sum()
print(f"Total NaN in intake_reason: {nan_count_intake_reason}")
nan_count_Age = df_cat_dog['Age'].isna().sum()
print(f"Total NaN in Age: {nan_count_Age}")

Total NaN in intake_reason: 124515
Total NaN in Age: 63518


#### ii. Remove "outcome_type" is blank

In [None]:
# Drop rows where outcome_type is NaN
df_cat_dog_final = df_cat_dog[df_cat_dog['outcome_type'].notna()]

# Show the new shape
print(f"New shape after removing rows with outcome_type NaN: {df_cat_dog_final.shape}")

New shape after removing rows with outcome_type NaN: (170032, 45)


### b. Remove Duplicates

#### i. Drop Exact Duplicates

In [None]:
# Check for exact duplicates across all columns
duplicate_rows = df_cat_dog_final[df_cat_dog_final.duplicated(keep=False)]

# Show how many duplicates found
print(f"Number of exact duplicate rows: {duplicate_rows.shape[0]}")

# Drop exact duplicates (keep the first occurrence)
df_cat_dog_no_duplicates = df_cat_dog_final.drop_duplicates(keep='first')

# Show new shape after dropping duplicates
print(f"New shape after removing duplicates: {df_cat_dog_no_duplicates.shape}")


Number of exact duplicate rows: 5091
New shape after removing duplicates: (166291, 45)


In [None]:
print(duplicate_rows.head(20))

      location animalage intake_date animal_name OutcomeCondition puttosleep  \
26326      NaN       NaN  2019-03-14  ASAP FLAKO              NaN        NaN   
26327      NaN       NaN  2019-03-14  ASAP FLAKO              NaN        NaN   
31715      NaN       NaN  2018-09-21     LUCIFER             DEAD        NaN   
31716      NaN       NaN  2018-09-21     LUCIFER             DEAD        NaN   
33741      NaN       NaN  2019-05-15         NaN             DEAD        NaN   
33742      NaN       NaN  2019-05-15         NaN             DEAD        NaN   
43579      NaN       NaN  2020-02-25         NaN        UNHEALTHY        NaN   
43580      NaN       NaN  2020-02-25         NaN        UNHEALTHY        NaN   
47583      NaN       NaN  2019-09-01         NaN             DEAD        NaN   
47584      NaN       NaN  2019-09-01         NaN             DEAD        NaN   
68845      NaN       NaN  2021-05-01      BUSTER        UNHEALTHY        NaN   
68846      NaN       NaN  2021-05-01    

In [None]:
print(df_cat_dog_final[df_cat_dog_final['animal_id'] == 'A0998965'])

      location animalage intake_date animal_name OutcomeCondition puttosleep  \
26326      NaN       NaN  2019-03-14  ASAP FLAKO              NaN        NaN   
26327      NaN       NaN  2019-03-14  ASAP FLAKO              NaN        NaN   
26328      NaN       NaN  2019-04-05  ASAP FLAKO          HEALTHY        NaN   

       latitude outcome_type secondary_color primary_breed  ...  istransfer  \
26326       NaN          RTO             NaN    POODLE MIN  ...         NaN   
26327       NaN          RTO             NaN    POODLE MIN  ...         NaN   
26328       NaN          RTO             NaN    POODLE MIN  ...         NaN   

      intake_reason                     Crossing isdoa identichipnumber  \
26326           NaN                  BARBERRY LN   NaN              NaN   
26327           NaN                  BARBERRY LN   NaN              NaN   
26328      IP ADOPT  BARBERRY LN. / STALLION WAY   NaN              NaN   

        shelter deceasedreason primary_breed_clean secondary_

In [None]:
print(df_cat_dog_no_duplicates[df_cat_dog_no_duplicates['animal_id'] == 'A0998965'])

      location animalage intake_date animal_name OutcomeCondition puttosleep  \
26326      NaN       NaN  2019-03-14  ASAP FLAKO              NaN        NaN   
26328      NaN       NaN  2019-04-05  ASAP FLAKO          HEALTHY        NaN   

       latitude outcome_type secondary_color primary_breed  ...  istransfer  \
26326       NaN          RTO             NaN    POODLE MIN  ...         NaN   
26328       NaN          RTO             NaN    POODLE MIN  ...         NaN   

      intake_reason                     Crossing isdoa identichipnumber  \
26326           NaN                  BARBERRY LN   NaN              NaN   
26328      IP ADOPT  BARBERRY LN. / STALLION WAY   NaN              NaN   

        shelter deceasedreason primary_breed_clean secondary_breed is_mix  
26326  San Jose            NaN          POODLE MIN            None     no  
26328  San Jose            NaN          POODLE MIN            None     no  

[2 rows x 45 columns]


#### ii. Drop non exact duplictes based on ID+INTAKETIME+OUTCOMETYPE+shelter

In [None]:
# Ensure intake_date is string for grouping (in case it's datetime)
df_cat_dog_no_duplicates['intake_date_str'] = df_cat_dog_no_duplicates['intake_date'].astype(str)

# Define the subset of columns to check duplicates on
subset_cols = ['animal_id', 'intake_date_str', 'outcome_type', 'shelter']

# Find duplicates based on these fields
duplicate_subset = df_cat_dog_no_duplicates[df_cat_dog_no_duplicates.duplicated(subset=subset_cols, keep=False)]

print(f"Number of non-exact duplicates based on key fields: {duplicate_subset.shape[0]}")

# Drop duplicates, keep first occurrence
df_cat_dog_clean = df_cat_dog_no_duplicates.drop_duplicates(subset=subset_cols, keep='first')

print(f"New shape after removing non-exact duplicates: {df_cat_dog_clean.shape}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat_dog_no_duplicates['intake_date_str'] = df_cat_dog_no_duplicates['intake_date'].astype(str)


Number of non-exact duplicates based on key fields: 512
New shape after removing non-exact duplicates: (166028, 46)


In [None]:
print(df_cat_dog_clean[df_cat_dog_clean['animal_id'] == 'A670669'])
print(df_cat_dog_no_duplicates[df_cat_dog_no_duplicates['animal_id'] == 'A670669'])

     location animalage intake_date animal_name OutcomeCondition puttosleep  \
7096      NaN       NaN  2022-02-14       *NATE              NaN        NaN   

       latitude outcome_type secondary_color primary_breed  ...  \
7096  33.817104     ADOPTION             NaN           NaN  ...   

      intake_reason                                     Crossing isdoa  \
7096     OWNER PROB  3300 BLK SANTA FE AVE, LONG BEACH, CA 90810   NaN   

     identichipnumber     shelter deceasedreason primary_breed_clean  \
7096              NaN  Long Beach            NaN             unknown   

     secondary_breed is_mix intake_date_str  
7096            None     no      2022-02-14  

[1 rows x 46 columns]
     location animalage intake_date animal_name OutcomeCondition puttosleep  \
7096      NaN       NaN  2022-02-14       *NATE              NaN        NaN   
7109      NaN       NaN  2022-02-14       *NATE              NaN        NaN   

       latitude outcome_type secondary_color primary_breed 

In [None]:
print(duplicate_subset.head(20))

      location animalage intake_date    animal_name OutcomeCondition  \
4476       NaN       NaN  2024-05-01          *TORI              NaN   
7096       NaN       NaN  2022-02-14          *NATE              NaN   
7109       NaN       NaN  2022-02-14          *NATE              NaN   
7197       NaN       NaN  2018-05-27         *ALANI              NaN   
8086       NaN       NaN  2023-04-11           HUNI              NaN   
8088       NaN       NaN  2023-04-11           HUNI              NaN   
10084      NaN       NaN  2018-05-27         *ALANI              NaN   
11250      NaN       NaN  2024-05-01          *TORI              NaN   
25288      NaN       NaN  2018-09-27           KING             DEAD   
25289      NaN       NaN  2018-09-27           KING             DEAD   
31260      NaN       NaN  2018-08-29  B - VASCHETTE        UNHEALTHY   
31261      NaN       NaN  2018-08-29  B - VASCHETTE        UNHEALTHY   
33803      NaN       NaN  2018-11-06           KIRA            R

# Feature Engineered Columns:
  a. Has name \
  b. Age \
  c. Lenght of stay \
  d. is_return \
  e. Num_returned \
  f. Tag Primary Breed for mixed breed \

## Feature "Has Name" from column "animal_name"

#### Calculated Field for 'has_name'

In [None]:
# Create a binary field: 1 = named, 0 = unnamed (empty or 'nan' or similar treated as unnamed)
def has_name(value):
    value = str(value).strip()
    if value.lower() in ['nan', '', 'none', 'unknown']:
        return 0
    else:
        return 1

df_cat_dog_clean['has_name'] = df_cat_dog_clean['animal_name'].apply(has_name)

# Show distribution of named vs unnamed
name_counts = df_cat_dog_clean['has_name'].value_counts().rename(index={1: 'named', 0: 'unnamed'})
name_counts


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat_dog_clean['has_name'] = df_cat_dog_clean['animal_name'].apply(has_name)


Unnamed: 0_level_0,count
has_name,Unnamed: 1_level_1
named,102478
unnamed,63550


### Feature "Stay Length" from column "intakedate" and "outcomedate"

In [None]:
# First ensure dates are converted to datetime if not already
df_cat_dog_clean['intake_date'] = pd.to_datetime(df_cat_dog_clean['intake_date'], errors='coerce')
df_cat_dog_clean['outcome_date'] = pd.to_datetime(df_cat_dog_clean['outcome_date'], errors='coerce')

# Calculate stay length in days
df_cat_dog_clean['stay_length_days'] = (df_cat_dog_clean['outcome_date'] - df_cat_dog_clean['intake_date']).dt.total_seconds() / (60*60*24)

# Show basic stats for stay length
stay_length_summary = df_cat_dog_clean['stay_length_days'].describe()
stay_length_summary


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat_dog_clean['intake_date'] = pd.to_datetime(df_cat_dog_clean['intake_date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat_dog_clean['outcome_date'] = pd.to_datetime(df_cat_dog_clean['outcome_date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ca

Unnamed: 0,stay_length_days
count,163299.0
mean,12.457057
std,38.607423
min,-1141.651262
25%,1.0
50%,4.0
75%,11.424074
max,3653.0


### Feature "Is_returned" and 'Num_returned'

In [None]:
# Step 1: Ensure the DataFrame is sorted by animal_id and intake_date
df_cat_dog_clean = df_cat_dog_clean.sort_values(by=['animal_id', 'intake_date'])

# Step 2: Use groupby().cumcount() to efficiently count returns per animal_id
df_cat_dog_clean['Num_returned'] = df_cat_dog_clean.groupby('animal_id').cumcount()

# Step 3: Create Is_returned column based on Num_returned
df_cat_dog_clean['Is_returned'] = df_cat_dog_clean['Num_returned'].apply(lambda x: 'no' if x == 0 else 'yes')


In [None]:
#df_cat_dog_clean['Is_returned'] = df_cat_dog_clean['Num_returned'].apply(lambda x: 'Yes' if x > 0 else 'No')

In [None]:
print(df_cat_dog_clean.shape)
print(df_cat_dog_clean.columns)

(166028, 50)
Index(['location', 'animalage', 'intake_date', 'animal_name',
       'OutcomeCondition', 'puttosleep', 'latitude', 'outcome_type',
       'secondary_color', 'primary_breed', 'outcome_year', 'Age',
       'intake_type', 'dob', 'outcome_is_dead', 'intake_is_dead', 'sex',
       'LastUpdate', 'istrial', 'sheltercode', 'animal_id', 'outcome_subtype',
       'was_outcome_alive', 'returnedreason', 'deceased_date',
       'intake_subtype', 'intake_condition', 'return_date', 'diedoffshelter',
       'animal_type', 'longitude', 'Jurisdiction', 'primary_color', 'geopoint',
       'outcome_date', 'istransfer', 'intake_reason', 'Crossing', 'isdoa',
       'identichipnumber', 'shelter', 'deceasedreason', 'primary_breed_clean',
       'secondary_breed', 'is_mix', 'intake_date_str', 'has_name',
       'stay_length_days', 'Num_returned', 'Is_returned'],
      dtype='object')


In [None]:
df_cat_dog_clean[df_cat_dog_clean['animal_id'] == 'A1083706'].sort_values(by='intake_date')

Unnamed: 0,location,animalage,intake_date,animal_name,OutcomeCondition,puttosleep,latitude,outcome_type,secondary_color,primary_breed,...,shelter,deceasedreason,primary_breed_clean,secondary_breed,is_mix,intake_date_str,has_name,stay_length_days,Num_returned,Is_returned
27032,,,2018-09-24,ROCKY,HEALTHY,,,RTO,WHITE,SIBERIAN HUSKY,...,San Jose,,SIBERIAN HUSKY,,no,2018-09-24,1,2.0,0,no
27033,,,2019-05-07,ROCKY,HEALTHY,,,RTO,WHITE,SIBERIAN HUSKY,...,San Jose,,SIBERIAN HUSKY,,no,2019-05-07,1,0.0,1,yes
43799,,,2019-09-17,ROCKY,UNHEALTHY,,,RTO,WHITE,SIBERIAN HUSKY,...,San Jose,,SIBERIAN HUSKY,,no,2019-09-17,1,1.0,2,yes
43800,,,2019-09-24,ROCKY,UNHEALTHY,,,RTO,WHITE,SIBERIAN HUSKY,...,San Jose,,SIBERIAN HUSKY,,no,2019-09-24,1,3.0,3,yes
43802,,,2019-10-31,ROCKY,MANAGE,,,RTO,WHITE,SIBERIAN HUSKY,...,San Jose,,SIBERIAN HUSKY,,no,2019-10-31,1,5.0,4,yes
43804,,,2019-11-15,ROCKY,HEALTHY,,,RTO,WHITE,SIBERIAN HUSKY,...,San Jose,,SIBERIAN HUSKY,,no,2019-11-15,1,0.0,5,yes
43803,,,2019-11-18,ROCKY,HEALTHY,,,RTO,WHITE,SIBERIAN HUSKY,...,San Jose,,SIBERIAN HUSKY,,no,2019-11-18,1,0.0,6,yes
43801,,,2019-12-03,ROCKY,UNHEALTHY,,,ADOPTION,WHITE,SIBERIAN HUSKY,...,San Jose,,SIBERIAN HUSKY,,no,2019-12-03,1,45.0,7,yes
54523,,,2020-06-02,ROCKY,HEALTHY,,,RTO,WHITE,SIBERIAN HUSKY,...,San Jose,,SIBERIAN HUSKY,,no,2020-06-02,1,0.0,8,yes
58233,,,2020-07-01,ROCKY,,,,RTO,WHITE,SIBERIAN HUSKY,...,San Jose,,SIBERIAN HUSKY,,no,2020-07-01,1,0.0,9,yes


In [None]:
#df_cat_dog_clean.to_csv("df_cat_dog_clean.csv", index=False)

### Feature 'age_months'

In [None]:
# Define the conversion function
def parse_age_to_months(age_str):
    if pd.isna(age_str):
        return np.nan
    age_str = str(age_str).strip().lower()
    if age_str == 'no age':
        return np.nan

    match = re.match(r'(\d+)\s*(year|years|month|months|week|weeks|day|days)', age_str)
    if not match:
        return np.nan

    value = int(match.group(1))
    unit = match.group(2)

    if 'year' in unit:
        return value * 12
    elif 'month' in unit:
        return value
    elif 'week' in unit:
        return round(value / 4.345, 2)  # approx weeks to months
    elif 'day' in unit:
        return round(value / 30, 2)  # approx days to months
    else:
        return np.nan

# Apply to create new column
df_cat_dog_clean['age_months'] = df_cat_dog_clean['Age'].apply(parse_age_to_months)

In [None]:
print(df_cat_dog_clean.shape)
print(df_cat_dog_clean.columns)

(166028, 51)
Index(['location', 'animalage', 'intake_date', 'animal_name',
       'OutcomeCondition', 'puttosleep', 'latitude', 'outcome_type',
       'secondary_color', 'primary_breed', 'outcome_year', 'Age',
       'intake_type', 'dob', 'outcome_is_dead', 'intake_is_dead', 'sex',
       'LastUpdate', 'istrial', 'sheltercode', 'animal_id', 'outcome_subtype',
       'was_outcome_alive', 'returnedreason', 'deceased_date',
       'intake_subtype', 'intake_condition', 'return_date', 'diedoffshelter',
       'animal_type', 'longitude', 'Jurisdiction', 'primary_color', 'geopoint',
       'outcome_date', 'istransfer', 'intake_reason', 'Crossing', 'isdoa',
       'identichipnumber', 'shelter', 'deceasedreason', 'primary_breed_clean',
       'secondary_breed', 'is_mix', 'intake_date_str', 'has_name',
       'stay_length_days', 'Num_returned', 'Is_returned', 'age_months'],
      dtype='object')


### From data sources that don’t have that column, i.e. breed List as “Unknown”


In [None]:
# Replace NaN in primary_breed with "Unknown"
df_cat_dog_clean['primary_breed_clean'] = df_cat_dog_clean['primary_breed_clean'].fillna("Unknown")
df_cat_dog_cleaned = df_cat_dog_clean.copy()


In [None]:
# Check how many "Unknown" values now exist
unknown_count = (df_cat_dog_cleaned['primary_breed_clean'] == "Unknown").sum()
print(f'Number of records with primary_breed = "Unknown": {unknown_count}')


Number of records with primary_breed = "Unknown": 105


In [None]:
df_cat_dog_cleaned.to_csv("df_cat_dog_cleaned.csv", index=False)


## Prepare Run First Baseline model df_cat_dog_cleaned

### a. Regroup 'outcome_type' by creating binary column 'outcome_type_grouped'.

In [None]:
# Define outcome type groupings
adoption_outcomes = ['FOSTER TO ADOPT', 'ADOPTION']

# Function to classify outcome
def classify_outcome(outcome):
    if pd.isna(outcome):
        return 'other_outcome'
    elif outcome.upper() in [o.upper() for o in adoption_outcomes]:
        return 'adoption'
    else:
        return 'other_outcome'

# Apply classification
df_cat_dog_cleaned['outcome_type_grouped'] = df_cat_dog_cleaned['outcome_type'].apply(classify_outcome)


### b. Break by year for train/val and Bloomington for test.

In [None]:
# Split based on year and shelter
train_df = df_cat_dog_cleaned[
    (df_cat_dog_cleaned['outcome_year'].between(2018, 2023)) &
    (df_cat_dog_cleaned['shelter'].isin(['Long Beach', 'San Jose']))
].dropna(subset=['outcome_type_grouped']).copy()

val_df = df_cat_dog_cleaned[
    (df_cat_dog_cleaned['outcome_year'].between(2024, 2025)) &
    (df_cat_dog_cleaned['shelter'].isin(['Long Beach', 'San Jose']))
].dropna(subset=['outcome_type_grouped']).copy()

test_df = df_cat_dog_cleaned[
    df_cat_dog_cleaned['shelter'] == 'Bloomington'
].dropna(subset=['outcome_type_grouped']).copy()

#Define initial feature list
features = [
    'animal_type', 'primary_breed_clean', 'primary_color', 'sex', 'intake_type',
    'intake_subtype', 'intake_condition', 'age_months', 'shelter'
]

target = 'outcome_type_grouped'

# Output the sizes of the new splits
train_df.shape, val_df.shape, test_df.shape

((99437, 52), (28083, 52), (34532, 52))

### c. Perform XGBoost regression with unharmonized dataset

In [None]:
X = df_cat_dog_cleaned[features].copy()
y = df_cat_dog_cleaned[target].copy()

# Fill missing values
for col in X.columns:
    if X[col].dtype == 'O':
        X[col] = X[col].fillna('Missing')
    else:
        X[col] = X[col].fillna(-1)

# Encode categorical features
X_encoded = pd.get_dummies(X)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [None]:
# Train XGBoost with default params and limited depth to avoid long runtime
xgb_model = XGBClassifier(
    num_class=len(np.unique(y_encoded)),
    max_depth=6,
    n_estimators=100,
    objective="multi:softmax",
    learning_rate=0.1,
    use_label_encoder=False,
    eval_metric='mlogloss',
    n_jobs=-1,
    random_state=42
)

# Fit model
xgb_model.fit(X_train, y_train)

# Predict
y_pred = xgb_model.predict(X_val)
xgb_report = classification_report(y_val, y_pred, target_names=label_encoder.classes_, output_dict=True)

# Convert report to DataFrame
xgb_report_df = pd.DataFrame(xgb_report).transpose()

Parameters: { "use_label_encoder" } are not used.



In [None]:
xgb_report_df.head()

Unnamed: 0,precision,recall,f1-score,support
adoption,0.590062,0.710318,0.64463,9362.0
other_outcome,0.876368,0.806241,0.839843,23844.0
accuracy,0.779197,0.779197,0.779197,0.779197
macro avg,0.733215,0.758279,0.742236,33206.0
weighted avg,0.795648,0.779197,0.784805,33206.0


# Data Harmonization

## Column Harmonizer Class Definitions

In [None]:
class ColumnHarmonizer:
    """
    A class to standardize, harmonize, and evaluate a specific column
    in a DataFrame across different shelters.
    """
    def __init__(self, df: pd.DataFrame, column_name: str, shelter_column: str = 'shelter'):
        """
        Initializes the Harmonizer with the DataFrame and column to process.

        Args:
            df (pd.DataFrame): The input DataFrame containing shelter data.
            column_name (str): The name of the column to harmonize.
            shelter_column (str, optional): The column identifying the shelter. Defaults to 'shelter'.
        """
        if column_name not in df.columns or shelter_column not in df.columns:
            raise ValueError("Specified columns do not exist in the DataFrame.")

        self.df = df.copy()
        self.column_name = column_name
        self.shelter_column = shelter_column
        self.shelters = self.df[self.shelter_column].unique()

        # Store original and create a harmonized series placeholder
        self.df['original_values'] = self.df[column_name].astype(str)
        self.df['harmonized_values'] = self._normalize_series(self.df['original_values'])

        # Initialize component extraction settings for color harmonization
        self._init_color_components()

    def _init_color_components(self):
        """Initialize abbreviations and patterns for color harmonization."""
        self.abbreviations = {
            # Color abbreviations
            'brn': 'brown', 'br': 'brown', 'bn': 'brown',
            'blk': 'black', 'bl': 'blue',
            'org': 'orange', 'orn': 'orange', 'gry': 'gray', 'gr': 'gray',
            'crm': 'cream', 'cr': 'cream', 'slvr': 'silver', 'slv': 'silver',
            'choc': 'chocolate', 'ch': 'chocolate', 'lc': 'lilac', 'li': 'lilac',
            'sl': 'seal', 'st': 'seal', 'rd': 'red', 'lv': 'liver',
            'tn': 'tan', 'ln': 'lean', 'wh': 'white', 'yw': 'yellow', 'yl': 'yellow',
            # Pattern abbreviations
            'pt': 'point', 'dil': 'dilute', 'tab': 'tabby', 'brind': 'brindle',
            'smk': 'smoke', 'mut': 'mute',
            # Special combinations
            'b\\o': 'black orange', 'b/o': 'black orange', 'tri': 'tricolor'
        }

        self.patterns = ['calico', 'tortie', 'torbi', 'tabby', 'tiger', 'point',
                        'brindle', 'merle', 'lynx', 'smoke', 'shade', 'tick',
                        'snowshoe', 'dapple']

        self.colors = ['black', 'white', 'brown', 'gray', 'grey', 'orange', 'cream',
                      'blue', 'red', 'chocolate', 'seal', 'lilac', 'silver', 'gold',
                      'fawn', 'buff', 'tan', 'yellow', 'liver', 'apricot', 'wheat',
                      'peach', 'pink', 'beige', 'blonde', 'tricolor', 'flame']

        self.modifiers = ['dilute', 'smoke', 'shade', 'mute']

    def _normalize_series(self, series: pd.Series) -> pd.Series:
        """
        (Step 2) Applies a standard normalization process to a text series.
        """
        # Fill NaN, convert to uppercase, and strip whitespace
        normalized = series.fillna('unknown').str.upper().str.strip()
        # Replace common separators with a single space
        normalized = normalized.str.replace(r'[\-/_\\,.]', ' ', regex=True)
        # Replace multiple spaces with a single space
        normalized = normalized.str.replace(r'\s+', ' ', regex=True).str.strip()
        return normalized

    def _expand_abbreviations(self, text: str) -> str:
        """Expand known abbreviations in text."""
        expanded = text.lower()
        for abbr, full in sorted(self.abbreviations.items(), key=lambda x: len(x[0]), reverse=True):
            pattern = r'\b' + re.escape(abbr) + r'\b'
            expanded = re.sub(pattern, full, expanded)
        return expanded

    def _extract_components(self, text: str) -> Dict[str, List[str]]:
        """Extract semantic components from text (for colors)."""
        components = {'pattern': [], 'color': [], 'modifier': []}

        # First normalize and expand the text
        normalized = text.lower().strip()
        # Replace separators with spaces for processing
        normalized = re.sub(r'[-/\\,.]', ' ', normalized)
        normalized = re.sub(r'\s+', ' ', normalized).strip()

        # Expand abbreviations
        expanded = self._expand_abbreviations(normalized)

        # Split by spaces to get all words
        words = expanded.split()

        # Extract components
        used_words = set()

        # Extract patterns first
        for word in words:
            if word in self.patterns and word not in used_words:
                components['pattern'].append(word)
                used_words.add(word)

        # Extract modifiers
        for word in words:
            if word in self.modifiers and word not in used_words:
                components['modifier'].append(word)
                used_words.add(word)

        # Extract colors
        for word in words:
            if word in self.colors and word not in used_words:
                components['color'].append(word)
                used_words.add(word)

        # Special handling for compound colors that weren't expanded
        remaining = ' '.join(w for w in words if w not in used_words)
        if 'black orange' in remaining:
            components['color'].extend(['black', 'orange'])

        return components

    def _format_components(self, components: Dict[str, List[str]]) -> str:
        """Format components in standard order with alphabetical sorting."""
        parts = []

        # Sort and add each component type
        for comp_type in ['pattern', 'color', 'modifier']:
            if components.get(comp_type):
                sorted_items = sorted(components[comp_type])
                parts.append(' '.join(sorted_items).upper())

        return '-'.join(parts) if parts else 'UNKNOWN'

        """
        Applies a user-defined mapping to the normalized values.

        Args:
            mapping (Dict[str, str]): A dictionary where keys are the values to be
                                     replaced and values are the new harmonized values.
        """
        # Ensure mapping keys are normalized for matching
        normalized_mapping = {self._normalize_series(pd.Series([k])).iloc[0]: v for k, v in mapping.items()}
        self.df['harmonized_values'] = self.df['harmonized_values'].replace(normalized_mapping)

    def _get_shared_value_percentage(self, column_name: str) -> float:
        """
        Calculates the percentage of the most common value in a column.
        This is a proxy for how 'harmonized' the column is.
        """
        if column_name not in self.df.columns:
            return 0.0
        value_counts = self.df[column_name].value_counts()
        if value_counts.empty:
            return 0.0
        most_common = value_counts.iloc[0]
        total = len(self.df)
        return (most_common / total) * 100 if total > 0 else 0.0

    def evaluate_progress(self) -> Dict[str, float]:
        """
        Evaluates harmonization progress, comparing shared value percentages
        and the total number of unique values before and after.
        """
        # Calculate shared value progress
        before_progress = self._get_shared_value_percentage('original_values')
        after_progress = self._get_shared_value_percentage('harmonized_values')

        # Calculate unique value counts
        unique_before = self.df['original_values'].nunique()
        unique_after = self.df['harmonized_values'].nunique()

        # Calculate percentage of records with shared harmonized values
        # Get harmonized values that are shared across all shelters
        shared_values = []
        for shelter in self.shelters:
            shelter_values = set(self.df[self.df[self.shelter_column] == shelter]['harmonized_values'].unique())
            if not shared_values:
                shared_values = shelter_values
            else:
                shared_values = shared_values.intersection(shelter_values)

        # Count records that have these shared values
        records_with_shared_values = self.df['harmonized_values'].isin(shared_values).sum()
        total_records = len(self.df)
        pct_records_harmonized = (records_with_shared_values / total_records * 100) if total_records > 0 else 0

        return {
            "unique_values_before": unique_before,
            "unique_values_after": unique_after,
            "progress_before (%)": round(before_progress, 2),
            "progress_after (%)": round(after_progress, 2),
            "improvement (%)": round(after_progress - before_progress, 2),
            "records_harmonized (%)": round(pct_records_harmonized, 2),
            "records_harmonized_count": records_with_shared_values
        }

    def get_harmonization_report(self) -> pd.DataFrame:
        """
        Returns a DataFrame showing how original values were mapped.
        """
        report = self.df.groupby(['original_values', 'harmonized_values']).size().reset_index(name='count')
        return report.sort_values(['harmonized_values', 'count'], ascending=[False, False])

    def apply_mapping(self, mapping: Dict[str, str]):
        """
        Applies a user-defined mapping to the normalized values.

        Args:
            mapping (Dict[str, str]): A dictionary where keys are the values to be
                                      replaced and values are the new harmonized values.
        """
        # Ensure mapping keys are normalized for matching
        normalized_mapping = {
            self._normalize_series(pd.Series([k])).iloc[0]: v for k, v in mapping.items()
        }
        self.df['harmonized_values'] = self.df['harmonized_values'].replace(normalized_mapping)

    def apply_color_harmonization(self):
        """
        Applies semantic parsing (color, pattern, modifier) to harmonize color-related values.
        This should be called if working with color columns.
        """
        self.df['harmonized_values'] = self.df['harmonized_values'].apply(
            lambda x: self._format_components(self._extract_components(x))
        )


In [None]:
# Step 1: Analyze what's new
def analyze_new_shelter(df, shelter_name, column_name):
    shelter_data = df[df['shelter'] == shelter_name][column_name].value_counts()
    print(f"\n{shelter_name} - {column_name} distribution:")
    return shelter_data

# Step 2: Update mappings if needed
def update_mappings_for_shelter(df, shelter_name, column_name, existing_mappings):
    harmonizer = ColumnHarmonizer(df, column_name)
    report = harmonizer.get_harmonization_report()

    # Filter for new shelter
    new_shelter_values = df[df['shelter'] == shelter_name][column_name].unique()

    # Check which values need mapping
    needs_mapping = []
    for val in new_shelter_values:
        if pd.notna(val) and val.upper() not in existing_mappings:
            needs_mapping.append(val)

    return print(f"Values in new shelter that still needs mapping: \n{needs_mapping}")

# Step 3: Re-harmonize all data
def harmonize_all_shelters(df, column_name, mappings):
    harmonizer = ColumnHarmonizer(df, column_name)
    harmonizer.apply_mapping(mappings)

    # Show progress
    progress = harmonizer.evaluate_progress()
    print(f"\nHarmonization Progress for {column_name}:")
    print(f"Unique values: {progress['unique_values_before']} → {progress['unique_values_after']}")
    print(f"Shared across all shelters: {progress['progress_before (%)']}% → {progress['progress_after (%)']}%")

    return harmonizer

In [None]:
def compare_shelter_counts(
    df: pd.DataFrame,
    column_to_compare: str,
    shelter_names: Optional[List[str]] = None,
    shelter_column: str = 'shelter',
    min_count: int = 0,
    show_percentages: bool = False
) -> pd.DataFrame:
    """
    Compares the value counts of a specified column across multiple shelters.

    Args:
        df (pd.DataFrame): The input DataFrame containing data from multiple shelters.
        column_to_compare (str): The name of the column whose value counts are to be compared.
        shelter_names (List[str], optional): List of shelter names to compare.
            If None, compares all shelters in the dataset.
        shelter_column (str, optional): The name of the column identifying the shelter.
            Defaults to 'shelter'.
        min_count (int, optional): Minimum count threshold to include in results.
            Defaults to 0 (show all).
        show_percentages (bool, optional): If True, adds percentage columns for each shelter.
            Defaults to False.

    Returns:
        pd.DataFrame: A DataFrame comparing the value counts across all specified shelters.
    """
    # If no specific shelters specified, use all unique shelters
    if shelter_names is None:
        shelter_names = sorted(df[shelter_column].unique())

    # Validate shelter names exist in data
    available_shelters = set(df[shelter_column].unique())
    invalid_shelters = set(shelter_names) - available_shelters
    if invalid_shelters:
        raise ValueError(f"Shelters not found in data: {invalid_shelters}")

    # Start with the first shelter as base
    comparison_df = None

    # Process each shelter
    for shelter_name in shelter_names:
        # Get value counts for this shelter
        shelter_counts = df[
            df[shelter_column] == shelter_name
        ][column_to_compare].value_counts().reset_index()

        shelter_counts.columns = [column_to_compare, f'{shelter_name} Count']

        # Add percentage column if requested
        if show_percentages:
            total = shelter_counts[f'{shelter_name} Count'].sum()
            shelter_counts[f'{shelter_name} %'] = (
                shelter_counts[f'{shelter_name} Count'] / total * 100
            ).round(2)

        # Merge with existing comparison
        if comparison_df is None:
            comparison_df = shelter_counts
        else:
            comparison_df = pd.merge(
                comparison_df,
                shelter_counts,
                on=column_to_compare,
                how='outer'
            )

    # Fill NaN values with 0 for counts
    count_columns = [col for col in comparison_df.columns if 'Count' in col]
    comparison_df[count_columns] = comparison_df[count_columns].fillna(0).astype(int)

    # Fill NaN values with 0.0 for percentages
    if show_percentages:
        pct_columns = [col for col in comparison_df.columns if '%' in col]
        comparison_df[pct_columns] = comparison_df[pct_columns].fillna(0.0)

    # Calculate total count across all shelters
    comparison_df['Total Count'] = comparison_df[count_columns].sum(axis=1)

    # Apply minimum count filter
    if min_count > 0:
        comparison_df = comparison_df[comparison_df['Total Count'] >= min_count]

    # Sort by total count descending
    comparison_df = comparison_df.sort_values('Total Count', ascending=False)

    # Reorder columns to put Total Count after the value column
    cols = comparison_df.columns.tolist()
    cols.remove('Total Count')
    cols.insert(1, 'Total Count')
    comparison_df = comparison_df[cols]

    return comparison_df.reset_index(drop=True)

### Mapping dictionary to harmonize "primary_breed_final"

In [None]:
breed_mapping = {
    # A
    'affenpinscher': 'Affenpinscher',
    'airedale terr': 'Airedale Terrier',
    'airedale terrier': 'Airedale Terrier',
    'akita': 'Akita',
    'akita inu': 'Akita',
    'alask malamute': 'Alaskan Malamute',
    'alaskan malamute': 'Alaskan Malamute',
    'alaskan husky': 'Siberian Husky',
    'amer bulldog': 'American Bulldog',
    'amer eskimo': 'American Eskimo Dog',
    'amer foxhound': 'American Foxhound',
    'amer staff': 'American Staffordshire Terrier',
    'american staffordshire terrier': 'American Staffordshire Terrier',
    'anatol shepherd': 'Anatolian Shepherd Dog',
    'anatolian shepherd': 'Anatolian Shepherd Dog',
    'aust cattle dog': 'Australian Cattle Dog',
    'aust kelpie': 'Australian Kelpie',
    'aust shepherd': 'Australian Shepherd',
    'aust terrier': 'Australian Terrier',

    # B
    'belg malinois': 'Belgian Malinois',
    'belg sheepdog': 'Belgian Sheepdog',
    'belg tervuren': 'Belgian Tervuren',
    'bichon frise': 'Bichon Frise',
    'black and tan coonhound': 'Black and Tan Coonhound',
    'bloodhound': 'Bloodhound',
    'bluetick coonhound': 'Bluetick Coonhound',
    'border collie': 'Border Collie',
    'border terrier': 'Border Terrier',
    'boston terrier': 'Boston Terrier',
    'boxer': 'Boxer',
    'brittany': 'Brittany',
    'bull terrier': 'Bull Terrier',
    'bulldog': 'Bulldog',
    'bullmastiff': 'Bullmastiff',

    # C
    'cairn terrier': 'Cairn Terrier',
    'canaan dog': 'Canaan Dog',
    'cane corso': 'Cane Corso',
    'carolina dog': 'Carolina Dog',
    'catahoula': 'Catahoula Leopard Dog',
    'catahoula leopard dog': 'Catahoula Leopard Dog',
    'cavalier king charles spaniel': 'Cavalier King Charles Spaniel',
    'chesapeake bay retr': 'Chesapeake Bay Retriever',
    'chihuahua': 'Chihuahua',
    'chinese crested': 'Chinese Crested',
    'chinese sharpei': 'Chinese Shar-Pei',
    'chow chow': 'Chow Chow',
    'clumber span': 'Clumber Spaniel',
    'cocker spaniel': 'Cocker Spaniel',
    'collie': 'Collie',
    'coton de tulear': 'Coton de Tulear',
    'curly- coated retriever': 'Curly-Coated Retriever',

    # D
    'dachshund': 'Dachshund',
    'dalmatian': 'Dalmatian',
    'dandie dinmont': 'Dandie Dinmont Terrier',
    'doberman pinsch': 'Doberman Pinscher',
    'doberman pinscher': 'Doberman Pinscher',
    'dogo argentino': 'Dogo Argentino',
    'dogue de bordx': 'Dogue de Bordeaux',
    'dutch shepherd': 'Dutch Shepherd',

    # E
    'eng bulldog': 'Bulldog',
    'eng coonhound': 'English Foxhound',
    'eng foxhound': 'English Foxhound',
    'eng sprngr span': 'English Springer Spaniel',
    'english bulldog': 'Bulldog',
    'english coonhound': 'American English Coonhound',
    'english setter': 'English Setter',
    'english springer spaniel': 'English Springer Spaniel',
    'entlebucher': 'Entlebucher Mountain Dog',
    'eskimo dog': 'American Eskimo Dog',

    # F
    'field spaniel': 'Field Spaniel',
    'finnish spitz': 'Finnish Spitz',
    'flat coat retr': 'Flat-Coated Retriever',
    'flat-coated retriever': 'Flat-Coated Retriever',
    'french bulldog': 'French Bulldog',

    # G
    'german shepherd': 'German Shepherd Dog',
    'german shepherd dog': 'German Shepherd Dog',
    'german shorthaired pointer': 'German Shorthaired Pointer',
    'german wirehaired pointer': 'German Wirehaired Pointer',
    'giant schnauzer': 'Giant Schnauzer',
    'glen of imaal': 'Glen of Imaal Terrier',
    'golden retriever': 'Golden Retriever',
    'gordon setter': 'Gordon Setter',
    'great dane': 'Great Dane',
    'great pyrenees': 'Great Pyrenees',
    'greater swiss mountain dog': 'Greater Swiss Mountain Dog',
    'greyhound': 'Greyhound',

    # H
    'harrier': 'Harrier',
    'havanese': 'Havanese',
    'husky': 'Siberian Husky',

    # I
    'ibizan hound': 'Ibizan Hound',
    'irish setter': 'Irish Setter',
    'irish terrier': 'Irish Terrier',
    'irish wolfhound': 'Irish Wolfhound',
    'ital greyhound': 'Italian Greyhound',
    'italian greyhound': 'Italian Greyhound',

    # J
    'japanese chin': 'Japanese Chin',
    'jindo': 'Jindo',

    # K
    'karelian bear': 'Karelian Bear Dog',
    'keeshond': 'Keeshond',
    'kuvasz': 'Kuvasz',

    # L
    'labrador retriever': 'Labrador Retriever',
    'lakeland terr': 'Lakeland Terrier',
    'leonberger': 'Leonberger',
    'lhasa apso': 'Lhasa Apso',
    'lowchen': 'Löwchen',

    # M
    'maltese': 'Maltese',
    'manchester terr': 'Manchester Terrier (Standard)',
    'mastiff': 'Mastiff',
    'min pinscher': 'Miniature Pinscher',
    'miniature pinscher': 'Miniature Pinscher',
    'mountain cur': 'Mountain Cur',
    'neapolitan mast': 'Neapolitan Mastiff',
    'newfoundland': 'Newfoundland',
    'norfolk terrier': 'Norfolk Terrier',
    'norw buhund': 'Norwegian Buhund',
    'norw elkhound': 'Norwegian Elkhound',
    'norwegian elkhound': 'Norwegian Elkhound',
    'norwich terrier': 'Norwich Terrier',

    # O
    'old english sheepdog': 'Old English Sheepdog',

    # P
    'papillon': 'Papillon',
    'parson russ ter': 'Parson Russell Terrier',
    'patterdale terr': 'Patterdale Terrier',
    'pekingese': 'Pekingese',
    'pharaoh hound': 'Pharaoh Hound',
    'pit bull': 'American Staffordshire Terrier',
    'plott hound': 'Plott Hound',
    'pointer': 'Pointer',
    'pomeranian': 'Pomeranian',
    'poodle': 'Poodle (Standard)',
    'poodle min': 'Poodle (Miniature)',
    'poodle stnd': 'Poodle (Standard)',
    'poodle toy': 'Poodle (Toy)',
    'puli': 'Puli',
    'pumi': 'Pumi',

    # R
    'rat terrier': 'Rat Terrier',
    'redbone coonhound': 'Redbone Coonhound',
    'rhod ridgeback': 'Rhodesian Ridgeback',
    'rhodesian ridgeback': 'Rhodesian Ridgeback',
    'rottweiler': 'Rottweiler',

    # S
    'saint bernard': 'Saint Bernard',
    'saluki': 'Saluki',
    'samoyed': 'Samoyed',
    'schipperke': 'Schipperke',
    'schnauzer min': 'Miniature Schnauzer',
    'schnauzer stand': 'Standard Schnauzer',
    'schnauzer giant': 'Giant Schnauzer',
    'scot terrier': 'Scottish Terrier',
    'scottish terrier': 'Scottish Terrier',
    'sealyham terr': 'Sealyham Terrier',
    'shetld sheepdog': 'Shetland Sheepdog',
    'shiba inu': 'Shiba Inu',
    'shih tzu': 'Shih Tzu',
    'siberian husky': 'Siberian Husky',
    'silky terrier': 'Silky Terrier',
    'skye terrier': 'Skye Terrier',
    'smooth fox terrier': 'Smooth Fox Terrier',
    'soft coated wheaton terrier': 'Soft Coated Wheaten Terrier',
    'staffordshire': 'Staffordshire Bull Terrier',
    'staffordshire bull terrier': 'Staffordshire Bull Terrier',

    # T
    'tibetan mastiff': 'Tibetan Mastiff',
    'tibetan span': 'Tibetan Spaniel',
    'tibetan terr': 'Tibetan Terrier',
    'toy fox terrier': 'Toy Fox Terrier',

    # V
    'vizsla': 'Vizsla',

    # W
    'weimaraner': 'Weimaraner',
    'welsh corgi car': 'Cardigan Welsh Corgi',
    'welsh corgi pem': 'Pembroke Welsh Corgi',
    'welsh terrier': 'Welsh Terrier',
    'west highland': 'West Highland White Terrier',
    'whippet': 'Whippet',
    'wire fox terrier': 'Wire Fox Terrier',
    'wirehaired pointing griffon': 'Wirehaired Pointing Griffon',

    # Y
    'yorkshire terr': 'Yorkshire Terrier',
    'yorkshire terrier': 'Yorkshire Terrier',

    # Cat
    'domestic sh': 'domestic short hair',
    'domestic short hair': 'domestic short hair',
    'domestic mh': 'domestic medium hair',
    'domestic medium hair': 'domestic medium hair',
    'domestic lh': 'domestic long hair',
    'domestic long hair': 'domestic long hair',
}


In [None]:
def map_breed(breed):
    breed = str(breed).strip().lower()
    return breed_mapping.get(breed, breed.title())

df_cat_dog_cleaned['primary_breed_harmonized'] = (
    df_cat_dog_cleaned['primary_breed_clean']
    .fillna('unknown')
    .apply(map_breed)
)

In [None]:
df_cat_dog_cleaned.columns

Index(['return_date', 'identichipnumber', 'latitude', 'outcome_date',
       'animal_name', 'primary_color', 'intake_date', 'outcome_is_dead',
       'intake_reason', 'diedoffshelter', 'Jurisdiction', 'animalage',
       'primary_breed', 'outcome_year', 'geopoint', 'animal_id', 'sheltercode',
       'istrial', 'secondary_color', 'was_outcome_alive', 'isdoa',
       'outcome_subtype', 'intake_is_dead', 'Age', 'location',
       'intake_subtype', 'animal_type', 'returnedreason', 'intake_condition',
       'OutcomeCondition', 'Crossing', 'intake_type', 'sex', 'deceasedreason',
       'deceased_date', 'outcome_type', 'puttosleep', 'shelter', 'LastUpdate',
       'dob', 'longitude', 'istransfer', 'primary_breed_clean',
       'secondary_breed', 'is_mix', 'intake_date_str', 'has_name',
       'stay_length_days', 'Num_returned', 'Is_returned', 'age_months',
       'outcome_type_grouped', 'primary_breed_harmonized'],
      dtype='object')

In [None]:
# See top 20 harmonized breed names
df_cat_dog_cleaned['primary_breed_harmonized'].value_counts().head(20)


Unnamed: 0_level_0,count
primary_breed_harmonized,Unnamed: 1_level_1
domestic short hair,83642
Unknown,25221
domestic medium hair,9134
Chihuahua Sh,5442
American Staffordshire Terrier,4353
domestic long hair,3427
Germ Shepherd,2639
Siberian Husky,2586
Bully Breed Mix,1936
Labrador Retriever,1514


### Mapping dictionary to harmonize "intake_reason"

In [None]:
# intake reason harmonization dictionary
intake_reason_mapping = {
    'owner surrender': 'owner surrender',
    'surrender': 'owner surrender',
    'stray': 'stray',
    'lost': 'stray',
    'found': 'stray',
    'abandon': 'stray',
    'confiscated': 'confiscated',
    'seized': 'confiscated',
    'euthanasia request': 'euthanasia request',
    'medical': 'medical',
    'injured': 'medical',
    'sick': 'medical',
    'behavior': 'behavioral',
    'aggressive': 'behavioral',
    'biting': 'behavioral',
    'court case': 'legal',
    'legal case': 'legal',
    'quarantine': 'quarantine',
    'transfer': 'transfer',
    'born in care': 'born in care',
    'other': 'other'
}


In [None]:
# Harmonize intake_reason
intake_reason_harmonizer = ColumnHarmonizer(
    df=df_cat_dog_cleaned,
    column_name='intake_reason',
    shelter_column='shelter'
)

# Apply the mapping
intake_reason_harmonizer.apply_mapping(intake_reason_mapping)

# Evaluate progress
intake_reason_report = intake_reason_harmonizer.evaluate_progress()
print("Intake Reason Harmonization Report:")
print(intake_reason_report)

# Optional: Save result back to main DataFrame
df_cat_dog_cleaned['intake_reason_harmonized'] = intake_reason_harmonizer.df['harmonized_values']


Intake Reason Harmonization Report:
{'unique_values_before': 102, 'unique_values_after': 98, 'progress_before (%)': np.float64(74.15), 'progress_after (%)': np.float64(74.15), 'improvement (%)': np.float64(0.0), 'records_harmonized (%)': np.float64(85.21), 'records_harmonized_count': np.int64(141467)}


### Mapping dictionary to harmonize "primary_color"

In [None]:
# Define primary color harmonization mapping
primary_color_harmonization = {
    'brn': 'brown',
    'br': 'brown',
    'blk': 'black',
    'bl': 'blue',
    'gry': 'gray',
    'gr': 'gray',
    'org': 'orange',
    'orn': 'orange',
    'crm': 'cream',
    'cr': 'cream',
    'slvr': 'silver',
    'slv': 'silver',
    'choc': 'chocolate',
    'ch': 'chocolate',
    'lc': 'lilac',
    'li': 'lilac',
    'rd': 'red',
    'lv': 'liver',
    'tn': 'tan',
    'wh': 'white',
    'yw': 'yellow',
    'yl': 'yellow',
    'tri': 'tricolor',
    'pt-choc': 'chocolate point',
    'pt-lilac': 'lilac point',
    'pt-lynx': 'lynx point',
    'merle-red': 'red merle',
    'brindle-bn': 'brown brindle',
    'brindle-bl': 'blue brindle',
    'tabby-brn': 'brown tabby',
    'tabby-buff': 'buff tabby',
    'tabby-gray': 'gray tabby',
    'tabby-org': 'orange tabby',
    'gray tabby': 'gray tabby',
    'calico-dil': 'calico',
    'calico-tri': 'calico',
    'calico tabby': 'calico',
    'tortie-b\\o': 'tortie',
    'tortie-dil': 'tortie'
}


In [None]:
# Apply harmonization to primary_color
df_cat_dog_cleaned['primary_color_harmonized'] = (
    df_cat_dog_cleaned['primary_color']
    .fillna('unknown')
    .astype(str)
    .str.strip()
    .str.lower()
    .map(primary_color_harmonization)
    .fillna(df_cat_dog_cleaned['primary_color'].astype(str).str.strip().str.lower())
)


In [None]:
df_cat_dog_cleaned['primary_color_harmonized']

Unnamed: 0,primary_color_harmonized
132948,tortoiseshell
132949,tan
132950,black and tan
132951,black and tan
132952,black and tan
...,...
7004,brn tabby
4003,tan
14858,gray
3890,black


### Mapping dictionary to harmonize "secondary_color"

In [None]:
# First, list all unique secondary colors (lowercase, stripped) for review
sec_color_series = df_cat_dog['secondary_color'].dropna().astype(str).str.strip().str.lower()
unique_sec_colors = sorted(sec_color_series.unique())

unique_sec_colors


['apricot',
 'beige',
 'bl brindle',
 'black',
 'blk smoke',
 'blk tabby',
 'blue',
 'blue tabby',
 'blue tick',
 'br brindle',
 'brindle',
 'brindle-bn',
 'brindle-ln',
 'brindle-tn',
 'brn merle',
 'brn tabby',
 'brn tiger',
 'brn-gry',
 'brown',
 'brown tips',
 'buff',
 'calico',
 'calico tabby',
 'calico-dil',
 'calico-tri',
 'choc pt',
 'chocolate',
 'cream',
 'crm tabby',
 'fawn',
 'flame pt',
 'gold',
 'gray',
 'gray tabby',
 'grey',
 'light',
 'liver',
 'lynx pt',
 'marbled tabby',
 'merle-blue',
 'merle-red',
 'orange',
 'org tabby',
 'other',
 'pepper',
 'pt-choc',
 'pt-flame',
 'pt-lilac',
 'pt-lynx',
 'pt-seal',
 'purple',
 'red',
 'red tick',
 'sable',
 'seal pt',
 'shade-slv',
 'shade-smk',
 'silver',
 'slvr tabby',
 'tabby-brn',
 'tabby-buff',
 'tabby-gray',
 'tabby-org',
 'tan',
 'tick-black',
 'tick-yel',
 'torbi',
 'torbi-brn',
 'torbi-dil',
 'tortie',
 'tortie dil',
 'tortie-b\\o',
 'tortie-dil',
 'tricolor',
 'wheat',
 'white',
 'wht',
 'yellow']

In [None]:
# Define secondary color harmonization mapping
sec_color_harmonization = {
    'bl brindle': 'blue brindle',
    'blk tabby': 'black tabby',
    'br brindle': 'brown brindle',
    'brn tabby': 'brown tabby',
    'org tabby': 'orange tabby',
    'pt-choc': 'chocolate point',
    'pt-lilac': 'lilac point',
    'pt-lynx': 'lynx point',
    'merle-red': 'red merle',
    'brindle-bn': 'brown brindle',
    'brindle-bl': 'blue brindle',

    'tabby-brn': 'brown tabby',
    'tabby-buff': 'buff tabby',
    'tabby-gray': 'gray tabby',
    'tabby-org': 'orange tabby',
    'gray tabby': 'gray tabby',

    'calico-dil': 'calico',
    'calico-tri': 'calico',
    'calico tabby': 'calico',

    'tortie-b\\o': 'tortie',
    'tortie-dil': 'tortie'
}

In [None]:
# Apply harmonization
df_cat_dog_cleaned['secondary_color_harmonized'] = (
    df_cat_dog_cleaned['secondary_color']
    .fillna('unknown')
    .astype(str)
    .str.strip()
    .str.lower()
    .map(sec_color_harmonization)
    .fillna(df_cat_dog_cleaned['secondary_color'].astype(str).str.strip().str.lower())
)

In [None]:
# Show the top 5 harmonized secondary colors
final_sec_color_counts = df_cat_dog_cleaned['secondary_color_harmonized'].value_counts()
final_sec_color_counts.head()

Unnamed: 0_level_0,count
secondary_color_harmonized,Unnamed: 1_level_1
,109380
white,39707
black,5482
brown,3955
tan,3208


### Mapping dictionary to harmonize "outcome_type"

In [None]:
# First inspect unique values in outcome_type
outcome_series = df_cat_dog['outcome_type'].dropna().astype(str).str.strip().str.lower()
unique_outcomes = sorted(outcome_series.unique())

unique_outcomes


['adoption',
 'community cat',
 'died',
 'disposal',
 'duplicate',
 'escaped',
 'euth',
 'euthanasia',
 'foster',
 'foster to adopt',
 'found anim',
 'found exp',
 'homefirst',
 'lost exp',
 'missing',
 'neuter',
 'reclaimed',
 'released to wild',
 'req euth',
 'rescue',
 'retailer',
 'return to owner',
 'return to rescue',
 'return to wild habitat',
 'rtf',
 'rto',
 's/n unable',
 'shelter, neuter, return',
 'spay',
 'stolen',
 'transfer',
 'transport',
 'trap, neuter, release',
 'unknown']

In [None]:
# Define outcome type harmonization mapping
outcome_harmonization = {
    'euth': 'euthanasia',
    'euthanasia': 'euthanasia',
    'req euth': 'euthanasia',

    'return to owner': 'return to owner',
    'reclaimed': 'return to owner',
    'rto': 'return to owner',

    'spay': 'neutered/spayed',
    'neuter': 'neutered/spayed',

    'found exp': 'lost/missing/escaped',
    'lost exp': 'lost/missing/escaped',
    'escaped': 'lost/missing/escaped',
    'stolen': 'lost/missing/escaped',
    'missing': 'lost/missing/escaped',

    'released to wild': 'released to wild',
    'community cat': 'released to wild',

    'foster': 'foster',
    'homefirst': 'foster',

    'transfer': 'transfer',
    'transport': 'transfer'

}

# Apply mapping
df_cat_dog_cleaned['outcome_type_harmonized'] = (
    df_cat_dog_cleaned['outcome_type']
    .fillna('unknown')
    .astype(str)
    .str.strip()
    .str.lower()
    .map(outcome_harmonization)
    .fillna(df_cat_dog_cleaned['outcome_type'].astype(str).str.strip().str.lower())
)

# Show cleaned outcome distribution
outcome_final_counts = df_cat_dog_cleaned['outcome_type_harmonized'].value_counts()
outcome_final_counts


Unnamed: 0_level_0,count
outcome_type_harmonized,Unnamed: 1_level_1
adoption,46577
rescue,25963
transfer,23318
return to owner,17535
foster,15296
euthanasia,11117
disposal,10923
rtf,5451
unknown,2648
died,2328


In [None]:
df_cat_dog_cleaned['outcome_type_harmonized'].nunique()

22

### Hmrnoized dataframe

In [None]:
# Create a deep copy of the cleaned DataFrame
df_cat_dog_harmonized = df_cat_dog_cleaned.copy()
print(df_cat_dog_harmonized.columns)

Index(['return_date', 'identichipnumber', 'latitude', 'outcome_date',
       'animal_name', 'primary_color', 'intake_date', 'outcome_is_dead',
       'intake_reason', 'diedoffshelter', 'Jurisdiction', 'animalage',
       'primary_breed', 'outcome_year', 'geopoint', 'animal_id', 'sheltercode',
       'istrial', 'secondary_color', 'was_outcome_alive', 'isdoa',
       'outcome_subtype', 'intake_is_dead', 'Age', 'location',
       'intake_subtype', 'animal_type', 'returnedreason', 'intake_condition',
       'OutcomeCondition', 'Crossing', 'intake_type', 'sex', 'deceasedreason',
       'deceased_date', 'outcome_type', 'puttosleep', 'shelter', 'LastUpdate',
       'dob', 'longitude', 'istransfer', 'primary_breed_clean',
       'secondary_breed', 'is_mix', 'intake_date_str', 'has_name',
       'stay_length_days', 'Num_returned', 'Is_returned', 'age_months',
       'outcome_type_grouped', 'primary_breed_harmonized',
       'intake_reason_harmonized', 'primary_color_harmonized',
       'seconda

In [None]:
output_path = "df_cat_dog_harmonized.csv"
df_cat_dog_harmonized.to_csv(output_path, index=False)

## Prepare to Run Comparaion model df_cat_dog_harmonized

In [None]:
# Define adopted conditions
adopted_conditions = {'adoption', 'foster to adoption', 'return to owner'}

# Create the grouped outcome column based on the rule
df_cat_dog_harmonized['outcome_type_harmonized_grouped'] = df_cat_dog_harmonized['outcome_type_harmonized'].apply(
    lambda x: 'adopted' if isinstance(x, str) and x.strip().lower() in adopted_conditions else 'unadopted'
)

# Check distribution for confirmation
df_cat_dog_harmonized['outcome_type_harmonized_grouped'].value_counts()


Unnamed: 0_level_0,count
outcome_type_harmonized_grouped,Unnamed: 1_level_1
unadopted,101916
adopted,64112


In [None]:
# Update the target column to use the new grouped column
target_harmonized = 'outcome_type_harmonized_grouped'

# Redefine data splits with updated target
train_df_harmonized = df_cat_dog_harmonized[
    (df_cat_dog_harmonized['outcome_year'].between(2018, 2023)) &
    (df_cat_dog_harmonized['shelter'].isin(['Long Beach', 'San Jose']))
].dropna(subset=[target_harmonized]).copy()

val_df_harmonized = df_cat_dog_harmonized[
    (df_cat_dog_harmonized['outcome_year'].between(2024, 2025)) &
    (df_cat_dog_harmonized['shelter'].isin(['Long Beach', 'San Jose']))
].dropna(subset=[target_harmonized]).copy()

test_df_harmonized = df_cat_dog_harmonized[
    df_cat_dog_harmonized['shelter'] == 'Bloomington'
].dropna(subset=[target_harmonized]).copy()

# Feature columns remain the same
features_harmonized = [
    'animal_type', 'primary_breed_harmonized', 'primary_color_harmonized',
    'sex', 'intake_type', 'intake_subtype', 'intake_condition',
    'age_months', 'shelter'
]

# Combine train and val for joint encoding
combined_df = pd.concat([train_df_harmonized, val_df_harmonized], axis=0)
df_encoded = combined_df[features_harmonized + [target_harmonized]].copy()

# Encode categorical columns
label_encoders = {}
for col in df_encoded.columns:
    if df_encoded[col].dtype == 'object':
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))
        label_encoders[col] = le

# Split back to train and val
X_all = df_encoded[features_harmonized]
y_all = df_encoded[target_harmonized]

X_train = X_all.iloc[:len(train_df_harmonized)]
X_val = X_all.iloc[len(train_df_harmonized):]
y_train = y_all.iloc[:len(train_df_harmonized)]
y_val = y_all.iloc[len(train_df_harmonized):]

# Train XGBoost classifier
xgb_model_final = xgb.XGBClassifier(
    num_class=len(np.unique(y_all)),
    max_depth=4,
    n_estimators=50,
    objective="multi:softmax",
    learning_rate=0.1,
    use_label_encoder=False,
    eval_metric='mlogloss',
    n_jobs=-1,
    random_state=42
)

xgb_model_final.fit(X_train, y_train)
y_pred_val = xgb_model_final.predict(X_val)

# Generate classification report
target_names = label_encoders[target_harmonized].classes_
xgb_val_report = classification_report(y_val, y_pred_val, target_names=target_names, output_dict=True)
xgb_val_report_df = pd.DataFrame(xgb_val_report).transpose()

Parameters: { "use_label_encoder" } are not used.



In [None]:
xgb_val_report_df.head()

Unnamed: 0,precision,recall,f1-score,support
adopted,0.642455,0.806475,0.715181,9761.0
unadopted,0.88067,0.760889,0.816409,18322.0
accuracy,0.776733,0.776733,0.776733,0.776733
macro avg,0.761562,0.783682,0.765795,28083.0
weighted avg,0.797872,0.776733,0.781225,28083.0


# Data Harmonization -- NOT USED

## Mapping Dog Characteristic Data Based on AKC Data Source

In [None]:
# Load the AKC breed characteristics data
akc_path = "akc-data-latest.csv"
df_akc = pd.read_csv(akc_path)

# Check its structure
akc_info = (df_akc.shape, list(df_akc.columns))
akc_info


((277, 21),
 ['Unnamed: 0',
  'description',
  'temperament',
  'popularity',
  'min_height',
  'max_height',
  'min_weight',
  'max_weight',
  'min_expectancy',
  'max_expectancy',
  'group',
  'grooming_frequency_value',
  'grooming_frequency_category',
  'shedding_value',
  'shedding_category',
  'energy_level_value',
  'energy_level_category',
  'trainability_value',
  'trainability_category',
  'demeanor_value',
  'demeanor_category'])

In [None]:
# Rename Unnamed: 0 to dog_breed_name
df_akc = df_akc.rename(columns={'Unnamed: 0': 'dog_breed_name'})
df_akc['dog_breed_name']

In [None]:
# Check its structure
akc_info = (df_akc.shape, list(df_akc.columns))
akc_info

### Update the Merge between Dataset and AKC source file

In [None]:
# Generate primary_breed_final_clean
df_cat_dog['primary_breed_final_clean'] = (
    df_cat_dog['primary_breed_final']
    .astype(str)
    .str.strip()
    .str.lower()
    .map(breed_mapping)
    .fillna(df_cat_dog['primary_breed_final'].astype(str).str.strip().str.lower())
)

In [None]:
pd.set_option('display.max_columns', None)

# Show first 20 rows with all columns
print(df_cat_dog.head(20))

In [None]:
# Generate primary_breed_final_clean
df_cat_dog['primary_breed_final_clean'] = (
    df_cat_dog['primary_breed_final']
    .astype(str)
    .str.strip()
    .str.lower()
    .map(breed_mapping)
    .fillna(df_cat_dog['primary_breed_final'].astype(str).str.strip().str.lower())
)

# Similarly clean AKC breed names
df_akc['dog_breed_name_clean'] = df_akc['dog_breed_name'].astype(str).str.strip().str.lower()

# Perform the merge
df_cat_dog_merged = df_cat_dog.merge(
    df_akc,
    how='left',
    left_on='primary_breed_final_clean',
    right_on='dog_breed_name_clean'
)

# Check merge success
print("Merged sample:")
print(df_cat_dog_merged[['primary_breed_final', 'primary_breed_final_clean', 'dog_breed_name', 'description']].head(10))



In [None]:
# Filter out rows where primary_breed_final_clean is not nan
valid_rows = df_cat_dog_merged[df_cat_dog_merged['primary_breed_final_clean'].notna()]
print(valid_rows[['primary_breed_final', 'primary_breed_final_clean', 'dog_breed_name', 'description']].head(10))


In [None]:
print("Number of rows with a match to AKC breeds:", df_cat_dog_merged['dog_breed_name'].notna().sum())
print("Number of rows with no match to AKC breeds:", df_cat_dog_merged['dog_breed_name'].isna().sum())


In [None]:
# Standardize the breed names for matching
df_akc['dog_breed_name_clean'] = df_akc['dog_breed_name'].astype(str).str.strip().str.lower()
df_cat_dog['primary_breed_final_clean'] = df_cat_dog['primary_breed_final'].astype(str).str.strip().str.lower()

# Merge AKC data into df_cat_dog where dog breeds match
# Only apply to dog records
df_cat_dog_merged = df_cat_dog.merge(
    df_akc,
    how='left',
    left_on='primary_breed_final_clean',
    right_on='dog_breed_name_clean'
)

# Show the result: shape and columns added
merged_info = (
    df_cat_dog_merged.shape,
    [col for col in df_cat_dog_merged.columns if col in df_akc.columns]
)



### Updated Mapping for 'outcome_type'

In [None]:
# Mappings are defined based on the analysis from the notebook
outcome_mappings = {
    # ADOPTIONS
    'ADOPTION': 'ADOPTION',
    'FOSTER TO ADOPT': 'ADOPTION',

    # RETURNS TO OWNER
    'RTO': 'RETURN TO OWNER',
    'RETURN TO OWNER': 'RETURN TO OWNER',
    'RECLAIMED': 'RETURN TO OWNER', # Bloomington

    # EUTHANASIA
    'EUTH': 'EUTHANASIA',
    'EUTHANASIA': 'EUTHANASIA',
    'REQ EUTH': 'EUTHANASIA',  # REQUESTED EUTHANASIA

    # TRANSFERS AND TRANSPORTS
    'TRANSFER': 'TRANSFER',
    'TRANSPORT': 'TRANSFER',

    # SPAY/NEUTER PROGRAMS
    'SPAY': 'SPAY/NEUTER',
    'NEUTER': 'SPAY/NEUTER',
    'S N UNABLE': 'SPAY/NEUTER',

    # TNR AND COMMUNITY CAT PROGRAMS
    'SHELTER  NEUTER  RETURN': 'SPAY/NEUTER',
    'SHELTER, NEUTER, RETURN': 'SPAY/NEUTER',
    'TRAP  NEUTER  RELEASE': 'SPAY/NEUTER',
    'COMMUNITY CAT': 'SPAY/NEUTER',
    'RTF': 'SPAY/NEUTER',
    'S N CLINIC': 'SPAY/NEUTER', # Spay / Neuter Clinic
    # A shelter community cat program, often called Community Cat Program (CCP), focuses on managing and reducing the population of free-roaming or feral cats through humane methods, primarily using Trap-Neuter-Vaccinate-Return (TNVR).
    # Return-to-field (RTF) humane management models get stray and feral cats fixed and returned to their neighborhoods, bypassing the shelter

    # RESCUE ORGANIZATIONS
    'RESCUE': 'RESCUE',
    'RETURN TO RESCUE': 'RESCUE',

    # DEATH/DISPOSAL
    'DIED': 'DIED',
    'DISPOSAL': 'DIED',  # "DISPOSAL" TYPICALLY MEANS THE ANIMAL DIED

    # FOSTER CARE
    'FOSTER': 'FOSTER',
    'HOMEFIRST': 'FOSTER',  # HOMEFIRST IS A FOSTER PROGRAM

    # LOST/FOUND/MISSING
    'MISSING': 'MISSING/STOLEN/ESCAPED',
    'LOST EXP': 'MISSING/STOLEN/ESCAPED',
    'FOUND EXP': 'FOUND',
    'FOUND ANIM': 'FOUND',
    'STOLEN': 'MISSING/STOLEN/ESCAPED',
    'ESCAPED': 'MISSING/STOLEN/ESCAPED',

    # WILDLIFE
    'RETURN TO WILD HABITAT': 'RETURN TO WILD',
    'RELEASED TO WILD': 'RETURN TO WILD', # Bloomington

    # OTHER
    'DUPLICATE': 'DUPLICATE'
}

In [None]:
# Intake Condition Harmonization Mappings

intake_condition_mappings = {
    # Normal/Healthy conditions
    'NORMAL': 'NORMAL',
    'HEALTHY': 'NORMAL',

    # Illness and Medical Conditions
    'SICK': 'MEDICAL',
    'UNHEALTHY': 'MEDICAL',
    'ILL MODERATETE': 'MEDICAL',  # Note: typo in original
    'MED R': 'MEDICAL',
    'MED M': 'MEDICAL',
    'MED SEV': 'MEDICAL',
    'MED EMERG': 'MEDICAL',
    'I/I REPORT': 'MEDICAL', # Assumed Injured/Ill Reported.
    'PREGNANT': 'MEDICAL',
    'OTHER MED': 'MEDICAL',
    'ILL MILD': 'MEDICAL',
    'ILL SEVERE': 'MEDICAL',

    # Behavioral conditions by severity
    'AGGRESSIVE': 'BEHAVIOR',
    'FEARFUL': 'BEHAVIOR',
    'FRACTIOUS': 'BEHAVIOR',
    'BEH U': 'BEHAVIOR',
    'BEH M': 'BEHAVIOR',
    'BEH R': 'BEHAVIOR',
    'BEHAVIOR MILD': 'BEHAVIOR',
    'BEHAVIOR MODERATE': 'BEHAVIOR',
    'BEHAVIOR SEVERE': 'BEHAVIOR',

    # Under age/weight = nursing
    'UNDER AGE/WEIGHT': 'UNDER AGE, WEIGHT, NURSING',
    'NURSING': 'UNDER AGE, WEIGHT, NURSING',

    # Special conditions
    'MANAGE': 'OTHER',
    'REHAB': 'OTHER',
    'WELFARE SEIZURES': 'OTHER',

    # Injured
    'INJURED SEVERE': 'INJURED',
    'INJURED MODERATE': 'INJURED',
    'INJURED MILD': 'INJURED'
}

# Breed Harmonization with AKC (7/4/2025)

## Load AKC

In [None]:
akc_path = "akc-data-latest.csv"
df_akc = pd.read_csv(akc_path)

df_akc = df_akc.rename(columns={'Unnamed: 0': 'dog_breed_name'})

In [None]:
df_akc.sample()

Unnamed: 0,dog_breed_name,description,temperament,popularity,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,...,grooming_frequency_value,grooming_frequency_category,shedding_value,shedding_category,energy_level_value,energy_level_category,trainability_value,trainability_category,demeanor_value,demeanor_category
259,Toy Fox Terrier,A surefire recipe for fun: Take the lovability...,"Friendly, Alert, Intelligent",111,21.59,29.21,1.587573,3.175147,13.0,15.0,...,0.4,Weekly Brushing,0.4,Occasional,0.8,Energetic,1.0,Eager to Please,0.8,Friendly


In [None]:
pd.set_option('display.max_rows', None)


## Breed Harmonization function

In [None]:
def harmonize_dog_breeds(df_source: pd.DataFrame, df_akc: pd.DataFrame, threshold: int = 70) -> pd.DataFrame:
    """
    Performs end-to-end breed harmonization on a DataFrame in a single, self-contained function.

    Args:
        df_source: The original DataFrame (e.g., df_cat_dog_cleaned).
        df_akc: The DataFrame containing official AKC breed names.
        threshold: The minimum weighted fuzzy score (0-100) to consider a match valid.

    Returns:
        A new DataFrame with the 'primary_breed_harmonized' column.
    """
    # --- 1. Define a nested helper function for consistent cleaning ---
    def _clean_breed(breed_name: str) -> str:
        if not isinstance(breed_name, str) or pd.isna(breed_name) or breed_name == '0':
            return "UNKNOWN"
        cleaned = breed_name.upper()
        # Remove colors, special characters, and normalize whitespace
        cleaned = re.sub(r'\b(BLACK|WHITE|BROWN|YELLOW|CHOCOLATE|RED|BLUE|BRINDLE)\b', '', cleaned)
        cleaned = re.sub(r'[.,\'"()]', '', cleaned) # Also remove parentheses
        cleaned = cleaned.replace('-', ' ')
        cleaned = ' '.join(cleaned.split())
        return cleaned if cleaned else "UNKNOWN"

    # --- 2. Prepare Cleaned Source and Target Breed Lists ---
    print("Step 1: Cleaning source and AKC breed lists...")
    # Clean source breeds (dogs only)
    dog_breeds_source = df_source[df_source['animal_type'] == 'dog']['primary_breed_clean'].dropna().unique()
    cleaned_source_breeds = sorted({_clean_breed(b) for b in dog_breeds_source})

    # Clean AKC target breeds
    akc_breeds_target = df_akc['dog_breed_name'].dropna().unique()
    cleaned_akc_targets = sorted({_clean_breed(b) for b in akc_breeds_target})

    # --- 3. Perform Fuzzy Matching Logic ---
    print("Step 2: Calculating fuzzy matches for each unique breed...")
    weights = {'ratio': 0.20, 'token_sort_ratio': 0.40, 'token_set_ratio': 0.40}
    # This map will store the translation from a cleaned source breed to its harmonized AKC name
    cleaned_to_harmonized_map = {}

    for source_breed in tqdm(cleaned_source_breeds):
        if source_breed == 'UNKNOWN':
            cleaned_to_harmonized_map[source_breed] = 'UNKNOWN'
            continue

        best_match = source_breed
        best_score = 0

        # Find the best match from the cleaned AKC list
        for akc_target in cleaned_akc_targets:
            scores = {
                'ratio': fuzz.ratio(source_breed, akc_target),
                'token_sort_ratio': fuzz.token_sort_ratio(source_breed, akc_target),
                'token_set_ratio': fuzz.token_set_ratio(source_breed, akc_target)
            }
            weighted_score = sum(scores[metric] * weight for metric, weight in weights.items())
            if weighted_score > best_score:
                best_score = weighted_score
                best_match = akc_target

        # Apply the threshold to decide if the match is valid
        if best_score >= threshold:
            cleaned_to_harmonized_map[source_breed] = best_match
        else:
            cleaned_to_harmonized_map[source_breed] = source_breed # No change if below threshold

    # --- 4. Apply Harmonization to the Full DataFrame ---
    print("Step 3: Applying final harmonization to the DataFrame...")
    df_final = df_source.copy()

    # Create a temporary column of cleaned original breeds
    df_final['temp_cleaned'] = df_final['primary_breed_clean'].apply(_clean_breed)

    # Map the cleaned breeds to their final harmonized names
    df_final['primary_breed_harmonized'] = df_final['temp_cleaned'].map(cleaned_to_harmonized_map)

    # Ensure non-dogs are not affected by setting their harmonized name to their original name
    dog_mask = df_final['animal_type'] == 'dog'
    df_final.loc[~dog_mask, 'primary_breed_harmonized'] = df_final.loc[~dog_mask, 'primary_breed_clean']

    # Handle any remaining NaNs and drop the temporary column
    df_final['primary_breed_harmonized'].fillna(df_final['primary_breed_clean'], inplace=True)
    df_final.drop(columns=['temp_cleaned'], inplace=True)

    print("Harmonization complete.")
    return df_final

In [None]:
df_final_harmonized = harmonize_dog_breeds(df_cat_dog_cleaned, df_akc, threshold=70)

Step 1: Cleaning source and AKC breed lists...
Step 2: Calculating fuzzy matches for each unique breed...


  0%|          | 0/332 [00:00<?, ?it/s]

Step 3: Applying final harmonization to the DataFrame...
Harmonization complete.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_final['primary_breed_harmonized'].fillna(df_final['primary_breed_clean'], inplace=True)


In [None]:
df_final_harmonized[['shelter', 'primary_breed', 'primary_breed_clean', 'primary_breed_harmonized']].sample(20)

Unnamed: 0,shelter,primary_breed,primary_breed_clean,primary_breed_harmonized
11613,Long Beach,,unknown,unknown
58974,San Jose,PIT BULL,PIT BULL,PIT BULL
53276,San Jose,DOMESTIC SH,DOMESTIC SH,DOMESTIC SH
59704,San Jose,DOMESTIC SH,DOMESTIC SH,DOMESTIC SH
21370,Long Beach,,unknown,UNKNOWN
60195,San Jose,DOMESTIC SH,DOMESTIC SH,DOMESTIC SH
47021,San Jose,DOMESTIC SH,DOMESTIC SH,DOMESTIC SH
134380,Bloomington,Shep Mix,Shep Mix,SHEP MIX
57580,San Jose,DOMESTIC SH,DOMESTIC SH,DOMESTIC SH
64133,San Jose,DOMESTIC SH,DOMESTIC SH,DOMESTIC SH


## Add harmonized column to dataframe

In [None]:
df_cat_dog_cleaned = harmonize_dog_breeds(df_cat_dog_cleaned, df_akc, threshold=70)

Step 1: Cleaning source and AKC breed lists...
Step 2: Calculating fuzzy matches for each unique breed...


  0%|          | 0/332 [00:00<?, ?it/s]

Step 3: Applying final harmonization to the DataFrame...
Harmonization complete.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_final['primary_breed_harmonized'].fillna(df_final['primary_breed_clean'], inplace=True)


Examine harmonization results

In [None]:
df_cat_dog_cleaned[(df_cat_dog_cleaned['animal_type']=='dog')&(df_cat_dog_cleaned['primary_breed_harmonized']!='UNKNOWN')].sample(100)[['primary_breed', 'primary_breed_clean', 'primary_breed_harmonized']]

Unnamed: 0,primary_breed,primary_breed_clean,primary_breed_harmonized
170502,Boxer/Poodle,Boxer,BOXER
99633,SIBERIAN HUSKY,SIBERIAN HUSKY,SIBERIAN HUSKY
136397,Pitbull/Labrador Retriever,Pitbull,PITBULL
136184,Pitbull,Pitbull,PITBULL
109965,DOBERMAN PINSCH,DOBERMAN PINSCH,DOBERMAN PINSCHER
104019,SIBERIAN HUSKY,SIBERIAN HUSKY,SIBERIAN HUSKY
71061,CHIHUAHUA SH,CHIHUAHUA SH,CHIHUAHUA
49425,PIT BULL,PIT BULL,PIT BULL
169940,American Staffordshire Terrier,American Staffordshire Terrier,AMERICAN STAFFORDSHIRE TERRIER
100914,PIT BULL,PIT BULL,PIT BULL


## Evaluation analysis (do not include in production).

#### Merge with AKC.

In [None]:
def clean_breed(breed_name: str) -> str:
    if not isinstance(breed_name, str) or pd.isna(breed_name) or breed_name == '0':
        return "UNKNOWN"
    cleaned = breed_name.upper()
    # Remove colors, special characters, and normalize whitespace
    cleaned = re.sub(r'\b(BLACK|WHITE|BROWN|YELLOW|CHOCOLATE|RED|BLUE|BRINDLE)\b', '', cleaned)
    cleaned = re.sub(r'[.,\'"()]', '', cleaned) # Also remove parentheses
    cleaned = cleaned.replace('-', ' ')
    cleaned = ' '.join(cleaned.split())
    return cleaned if cleaned else "UNKNOWN"

In [None]:
# Apply the function to the 'dog_breed_name' column
df_akc['dog_breed_name_clean'] = df_akc['dog_breed_name'].apply(clean_breed)

# Display the original and new cleaned column to verify
display(df_akc[['dog_breed_name', 'dog_breed_name_clean']].head())

Unnamed: 0,dog_breed_name,dog_breed_name_clean
0,Affenpinscher,AFFENPINSCHER
1,Afghan Hound,AFGHAN HOUND
2,Airedale Terrier,AIREDALE TERRIER
3,Akita,AKITA
4,Alaskan Malamute,ALASKAN MALAMUTE


In [None]:
df_merged_akc = pd.merge(
    df_final_harmonized,
    df_akc,
    how='left',
    left_on='primary_breed_harmonized',
    right_on='dog_breed_name_clean'
)

In [None]:
df_merged_akc.sample(20).T

Unnamed: 0,84359,113878,147862,57704,31336,144401,73804,61948,62046,41464,87883,52265,49041,7323,27050,82703,70453,66974,131933,98738
location,,,,,Cat room C,,,,,,,,,Cat room C,Adoptable window colony,,,,,
animalage,,,,,8 months.,,,,,,,,,6 years 3 months.,8 months.,,,,,
intake_date,2021-07-03 00:00:00,2023-08-12 00:00:00,2018-08-18 00:00:00,2019-07-20 00:00:00,2024-07-30 12:28:08,2017-07-23 00:00:00,2020-09-23 00:00:00,2019-10-07 00:00:00,2019-10-09 00:00:00,2018-07-25 00:00:00,2021-09-03 00:00:00,2019-04-17 00:00:00,2019-01-09 00:00:00,2018-12-12 13:44:21,2023-10-06 00:00:00,2021-06-06 00:00:00,2020-07-10 00:00:00,2020-03-15 00:00:00,2024-09-09 00:00:00,2022-07-02 00:00:00
animal_name,,,STITCH,,Butter,,,,CARLOS,TANYA,KIMBO,,SCOOTER,Merlin,Cookie,,,,KAYLA,
OutcomeCondition,UNHEALTHY,DEAD,,UNHEALTHY,,,UNHEALTHY,UNHEALTHY,HEALTHY,REHAB,HEALTHY,DEAD,HEALTHY,,,UNHEALTHY,UNHEALTHY,DEAD,HEALTHY,DEAD
puttosleep,,,,,0.0,,,,,,,,,0.0,0.0,,,,,
latitude,,,33.874214,,,33.791689,,,,,,,,,,,,,,
outcome_type,RESCUE,DISPOSAL,RETURN TO OWNER,TRANSFER,ADOPTION,EUTHANASIA,RESCUE,RESCUE,ADOPTION,RESCUE,ADOPTION,DISPOSAL,ADOPTION,ADOPTION,ADOPTION,RESCUE,RESCUE,DISPOSAL,RTO,DISPOSAL
secondary_color,WHITE,BLACK,WHITE,,,,,,WHITE,BROWN,WHITE,,BROWN,,,WHITE,,,WHITE,BROWN
primary_breed,DOMESTIC SH,DOMESTIC MH,,DOMESTIC SH,Domestic Short Hair,,DOMESTIC SH,DOMESTIC SH,DOMESTIC SH,YORKSHIRE TERR,DOMESTIC SH,DOMESTIC SH,PIT BULL,Domestic Short Hair,Domestic Short Hair,DOMESTIC SH,DOMESTIC SH,DOMESTIC SH,SIBERIAN HUSKY,SIBERIAN HUSKY


#### Results: 18% -> 83% harmonized (exclude cats, unkowns)

In [None]:
df_base = df_merged_akc[(df_merged_akc['primary_breed_harmonized'] != 'UNKNOWN') & (df_merged_akc['animal_type']!= 'cat')]

In [None]:
# Identify the number of unique animal_id that have non-NaN dog_breed_name
total_unique_dogs = df_base['animal_id'].nunique()
dogs_with_breed_info = df_base[df_base['dog_breed_name_clean'].notna()]['animal_id'].nunique()

# Calculate the percentage
percentage_with_breed_info = (dogs_with_breed_info / total_unique_dogs) * 100 if total_unique_dogs > 0 else 0

print(f"Total unique dog animal_id in df_base: {total_unique_dogs}")
print(f"Unique dog animal_id with non-NaN dog_breed_name: {dogs_with_breed_info}")
print(f"Percentage of unique dog animal_id with breed info: {percentage_with_breed_info:.2f}%")

Total unique dog animal_id in df_base: 32581
Unique dog animal_id with non-NaN dog_breed_name: 27181
Percentage of unique dog animal_id with breed info: 83.43%


Results before

In [None]:
df_merged_akc_original = pd.merge(
    df_cat_dog_cleaned,
    df_akc,
    how='left',
    left_on='primary_breed_clean',
    right_on='dog_breed_name_clean'
)

In [None]:
# Filter out specific values from the 'primary_breed_clean' column
df_merged_akc_filtered = df_merged_akc_original[
    (~df_merged_akc_original['primary_breed_clean'].isin(['0', 'UNKNOWN'])) &
    (df_merged_akc_original['primary_breed_clean'].notna())
]

# Apply the animal_type filter
df_base_original = df_merged_akc_filtered[df_merged_akc_filtered['animal_type'] != 'cat']

In [None]:
# Identify the number of unique animal_id that have non-NaN dog_breed_name
total_unique_dogs_original = df_base_original['animal_id'].nunique()
dogs_with_breed_info_original = df_base_original[df_base_original['dog_breed_name'].notna()]['animal_id'].nunique()

# Calculate the percentage
percentage_with_breed_info_original = (dogs_with_breed_info_original / total_unique_dogs_original) * 100 if total_unique_dogs_original > 0 else 0

print(f"Total unique dog animal_id in df_base: {total_unique_dogs_original}")
print(f"Unique dog animal_id with non-NaN dog_breed_name: {dogs_with_breed_info_original}")
print(f"Percentage of unique dog animal_id with breed info: {percentage_with_breed_info_original:.2f}%")

Total unique dog animal_id in df_base: 42081
Unique dog animal_id with non-NaN dog_breed_name: 7802
Percentage of unique dog animal_id with breed info: 18.54%


#### Examine non-AKC matched breeds:
**Insight** predominantly mixed and should be categorized as such as that can ben an imoprtant feature.
TODO: confirm these are categorized as is_mix
1. Pitbull: https://www.reddit.com/r/pitbulls/comments/y1johk/unpopular_opinion_pitbull_is_not_a_recognized/
2. Bully breed mix: https://www.reddit.com/r/pitbulls/comments/y1johk/unpopular_opinion_pitbull_is_not_a_recognized/
3. SHEP MIX
4. MIX
5. PITBULL

In [None]:
df_base[df_base['dog_breed_name'].isna()]['primary_breed_harmonized'].value_counts()


Unnamed: 0_level_0,count
primary_breed_harmonized,Unnamed: 1_level_1
PIT BULL,4157
BULLY BREED MIX,1936
SHEP MIX,385
MIX,342
PITBULL,249
ALASKAN HUSKY,195
AMERICAN STAFF,106
AM PIT BULL TER,94
QUEENSLAND HEEL,57
CAVALIER SPAN,55
