# Austin Animal Shelter
The goal of this project is to determine the important factors that lead to an outcome (adoption, transfer, return to owner, death, euthanasia) of an animal in the shelter and to make a model that predicts outcomes based on these features.

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from xgboost import XGBClassifier

import os
# Walk the Kaggle input directory and list every data file that is available
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# Load the competition data sets (pandas decompresses the .gz files itself)
train = pd.read_csv("/kaggle/input/shelter-animal-outcomes/train.csv.gz")
test = pd.read_csv("/kaggle/input/shelter-animal-outcomes/test.csv.gz")
sample_submission = pd.read_csv("/kaggle/input/shelter-animal-outcomes/sample_submission.csv.gz")

# First look at the training data: summary statistics, then the leading rows
for preview in (train.describe(), train.head()):
    print(preview)


# Data Cleaning

In [None]:
# Change name variable to binary feature detecting if there's a name or not
def binarize_name(df):
    """Replace the ``Name`` column with a binary ``HasName`` indicator.

    An animal counts as named when its ``Name`` value is present and not
    blank. Names containing spaces, digits or punctuation still count as
    names, which a ``str.isalpha()`` test would wrongly reject.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Name`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which ``Name`` is renamed to ``HasName`` and holds
        1 for named animals and 0 otherwise.
    """
    df = df.rename(columns={'Name': 'HasName'})
    names = df['HasName']
    # Missing values and whitespace-only strings both mean "no name"
    has_name = names.notna() & names.astype(str).str.strip().ne('')
    df['HasName'] = has_name.astype(int)
    return df

# Swap the raw names for the HasName indicator in both data sets
train, test = binarize_name(train), binarize_name(test)

# Missing values: sex becomes the explicit category 'Unknown'; age gets a
# temporary placeholder that is replaced by the column average later, once
# the age strings have been converted to numbers
missing_value_fill = {'SexuponOutcome': 'Unknown', 'AgeuponOutcome': '9999 days'}
train = train.fillna(missing_value_fill)
test = test.fillna(missing_value_fill)

# convert 'AgeuponOutcome' to days
def string_to_num_days(series):
    """Convert age strings such as ``'2 years'`` or ``'3 weeks'`` to days.

    The result keeps the index of ``series`` so it can be assigned straight
    back to the originating frame. Missing or unparseable entries, and
    entries with an unknown time unit, yield NaN instead of raising.

    Parameters
    ----------
    series : pandas.Series
        Strings of the form ``'<integer> <unit>'`` where unit is one of
        day(s), week(s), month(s) or year(s).

    Returns
    -------
    pandas.Series
        Age in days, named ``'total_days'``. A month counts as 30.5 days and
        a year as 365 days.
    """
    convert_days = {'year': 365, 'years': 365, 'months': 30.5, 'month': 30.5, 'week': 7, 'weeks': 7, 'days': 1, 'day': 1}

    # Split each entry into its quantity and time unit; rows that do not
    # match (including missing values) come back as NaN in both columns
    parts = series.str.extract(r'^\s*(\d+)\s+([A-Za-z]+)\s*$')
    multiplier = pd.to_numeric(parts[0])
    days_per_unit = parts[1].map(convert_days)

    # Multiply quantity by unit length to get the age in number of days
    return (multiplier * days_per_unit).rename('total_days')

# Turn the age strings into a number of days for both data sets
train['AgeuponOutcome'] = string_to_num_days(train['AgeuponOutcome'])
test['AgeuponOutcome'] = string_to_num_days(test['AgeuponOutcome'])

# The 9999-day placeholder marks ages that were missing: turn it back into
# NaN, then fill those gaps with the average of the remaining ages
placeholder_to_nan = {'AgeuponOutcome': {9999: np.nan}}

train = train.replace(placeholder_to_nan)
train = train.fillna({'AgeuponOutcome': train['AgeuponOutcome'].mean()})

test = test.replace(placeholder_to_nan)
test = test.fillna({'AgeuponOutcome': test['AgeuponOutcome'].mean()})


In [None]:
# Count how many distinct values Color and Breed take (a missing value, if
# present, is counted as one distinct entry)
for label, column in (('Distinct Colors: ', 'Color'), ('Distinct Breeds: ', 'Breed')):
    print(label, train[column].nunique(dropna=False))

There are far too many distinct breeds and colors for them to be useful to our model directly. We will attempt to reduce their cardinality while retaining their predictive power (if any). First, we will extract an IsSolidColor feature: 1 when the animal has a single solid color, 0 when it has multiple/mixed colors. Second, we will extract binary features describing the coat markings (tabby, point, brindle, etc.). Breeds will be handled likewise further below, with a binary feature separating purebreds from mixed breeds.

In [None]:
# Extract solid color feature
def extract_color(df):
    """Add ``IsSolidColor`` and reduce ``Color`` to its first component.

    ``IsSolidColor`` is 0 when ``Color`` lists several colors separated by
    ``/`` and 1 otherwise. Everything from the first ``/`` onwards is then
    removed from ``Color``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Color`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, for convenient reassignment.
    """
    # Literal match: the slash is a plain separator, no regex needed
    mixed_entries = df['Color'].str.contains('/', regex=False)
    df['IsSolidColor'] = 1
    df.loc[mixed_entries, 'IsSolidColor'] = 0
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave Color untouched
    df['Color'] = df['Color'].str.replace(r'/.*', '', regex=True)
    return df

# Derive the solid-color flag for both data sets (train first, then test)
train, test = extract_color(train), extract_color(test)

# Extract other color markings
def extract_markings(df):
    """Add one 0/1 column per coat marking mentioned in ``Color``.

    For each of Tabby, Point, Tiger, Brindle, Smoke, Tick and Merle a column
    of that name is added to ``df`` (in place), holding 1 where the marking
    name occurs in ``Color`` and 0 elsewhere. The frame is returned.
    """
    marking_names = ('Tabby', 'Point', 'Tiger', 'Brindle', 'Smoke', 'Tick', 'Merle')
    colors = df['Color']
    for name in marking_names:
        has_marking = colors.str.contains(name)
        df[name] = has_marking.astype(int)
    return df
    
# Add the marking indicator columns to both data sets
train, test = extract_markings(train), extract_markings(test)

# Show how often each of the new features takes each value in the training set
new_color_features = ['IsSolidColor', 'Tabby', 'Point', 'Tiger', 'Brindle', 'Smoke', 'Tick', 'Merle']
for column in new_color_features:
    print(train[column].value_counts())

It is questionable whether these color/marking features will have any impact. The markings don't have enough variance and it's TBD whether an animal being one color or multiple will affect outcomes at all. We'll keep IsSolidColor for now and drop the others.

In [None]:
# Drop markings columns from both data sets so train and test keep the same
# engineered columns (previously only train was cleaned up)
marking_columns = ['Tabby', 'Point', 'Tiger', 'Brindle', 'Smoke', 'Tick', 'Merle']
train = train.drop(columns=marking_columns)
test = test.drop(columns=marking_columns)

For the Breed feature, first we will extract a binary Purebred feature, detecting whether the animal is a single breed or a mix of breeds. Then, we will check the breed against a list of breeds deemed to be particularly aggressive to establish an AggroBreed feature. Finally, we will flag potentially desirable hypoallergenic breeds.

In [None]:
# Extract purebred feature
def extract_purebred(df):
    """Add a binary ``Purebred`` column derived from ``Breed``.

    A breed is considered mixed (``Purebred`` = 0) when it lists several
    breeds separated by ``/`` or contains the word ``Mix``; every other
    entry is marked purebred (1).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Breed`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, for convenient reassignment.
    """
    breed = df['Breed']
    # Plain substring tests: avoids the invalid '\/' escape sequence and
    # does not need the regex engine at all
    mixed_entries = breed.str.contains('/', regex=False) | breed.str.contains('Mix', regex=False)
    df['Purebred'] = 1
    df.loc[mixed_entries, 'Purebred'] = 0
    return df

# Flag purebred animals in both data sets (train first, then test)
train, test = extract_purebred(train), extract_purebred(test)

# Extract aggressive breeds
def extract_aggression(df):
    """Add a binary ``AggroBreed`` column derived from ``Breed``.

    ``AggroBreed`` is 1 when ``Breed`` contains any of the listed breed
    names (case-sensitive pattern search) and 0 otherwise. The frame is
    modified in place and returned.
    """
    aggro_breeds = '|'.join((
        'Staffordshire', 'Pit', 'Doberman', 'Chow', 'Rottweiler',
        'German Shepherd', 'American Bulldog', 'Mastiff', 'Bullmastiff',
        'Husky', 'Malamute', 'Akita', 'Boxer',
    ))
    is_aggro = df['Breed'].str.contains(aggro_breeds)
    df['AggroBreed'] = 0
    df.loc[is_aggro, 'AggroBreed'] = 1
    return df

# Flag the listed aggressive breeds in both data sets (train first, then test)
train, test = extract_aggression(train), extract_aggression(test)

# Extract Hypoallergenic breeds
def extract_hypoallergenic(df):
    """Add a binary ``HypoallergenicBreed`` column derived from ``Breed``.

    The column is 1 when ``Breed`` contains any of the listed breed names
    (case-sensitive pattern search) and 0 otherwise. The frame is modified
    in place and returned.
    """
    hypoallergenic_breeds = '|'.join((
        'Affenpinscher', 'Afghan', 'Hairless', 'Barbet', 'Bedlington',
        'Bichon', 'Bolognese', 'Crested', 'Schnauzer', 'Water Spaniel',
        'Kerry', 'Maltese', 'Poodle', 'Portuguese Water', 'Yorkshire',
    ))
    is_hypoallergenic = df['Breed'].str.contains(hypoallergenic_breeds)
    df['HypoallergenicBreed'] = 0
    df.loc[is_hypoallergenic, 'HypoallergenicBreed'] = 1
    return df

# Flag the listed hypoallergenic breeds in both data sets (train first, then test)
train, test = extract_hypoallergenic(train), extract_hypoallergenic(test)

We need to convert the time feature to a format the model can use (a format that tracks the cyclical nature of hours of the day).

In [None]:
# Derive the time-based features for both data sets
HOURS_PER_DAY = 24
WEEKDAY_NAMES = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}

for frame in (train, test):
    # Convert DateTime to datetime dtype (the infer_datetime_format flag is
    # deprecated in recent pandas and not needed for parsing)
    frame['DateTime'] = pd.to_datetime(frame['DateTime'])

    # Round time of day to nearest hour; times from 23:30 on wrap to hour 0
    frame['Hour'] = frame['DateTime'].dt.round("h").dt.hour

    # Encode 'Hour' in two dimensions to account for its cyclical nature.
    # The period is 24 hours: dividing by 23 would map hour 0 and hour 23
    # to the very same point instead of placing them one hour apart.
    hour_angle = 2 * np.pi * frame['Hour'] / HOURS_PER_DAY
    frame['HourSin'] = np.sin(hour_angle)
    frame['HourCos'] = np.cos(hour_angle)

    # Add day of the week column (DateTime is already parsed at this point)
    frame['DayOfWeek'] = frame['DateTime'].dt.weekday.map(WEEKDAY_NAMES)


In [None]:
# Binary species flag derived from AnimalType: 1 for dog, 0 for cat
for frame in (train, test):
    is_dog = frame['AnimalType'].str.contains('Dog')
    frame['Dog'] = is_dog.astype(int)

# divide SexuponOutcome to two features
def extract_sex_info(df):
    """Split ``SexuponOutcome`` into three binary columns.

    Adds ``Male`` (entry contains "Male"), ``Intact`` (entry contains
    "Intact") and ``SexUnknown`` (entry contains "Unknown"). A missing entry
    is treated as "Unknown" instead of raising during the integer cast. The
    ``SexuponOutcome`` column itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``SexuponOutcome`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, for convenient reassignment.
    """
    # The former ``df[col].dropna(inplace=True)`` acted on a column
    # selection rather than on the frame, so it never removed rows; missing
    # values are handled explicitly here instead
    sex = df['SexuponOutcome'].fillna('Unknown')
    for column, keyword in (('Male', 'Male'), ('Intact', 'Intact'), ('SexUnknown', 'Unknown')):
        # Case-sensitive literal match, so "Female" does not count as "Male"
        df[column] = sex.str.contains(keyword, regex=False).astype(int)
    return df

# Add the Male / Intact / SexUnknown flags to both data sets (train first)
train, test = extract_sex_info(train), extract_sex_info(test)
    

# Data Exploration

In [None]:
# Stacked bar chart of outcome counts per (rounded) hour of day
hourly_outcomes = (
    train.groupby('Hour')['OutcomeType']
    .value_counts()
    .unstack(0)
    .fillna(0)  # hour/outcome combinations that never occur count as zero
)

fig, ax = plt.subplots(figsize=(8, 6))

ind = hourly_outcomes.columns
width = .75

# One row of counts per outcome type
adoption, died, euthanasia, return_to_owner, transfer = (
    hourly_outcomes.loc[outcome].values
    for outcome in ('Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer')
)

# Draw the layers bottom-up; each layer sits on the running total of the
# layers below it (the first layer uses the default baseline)
baseline = None
for layer in (adoption, transfer, return_to_owner, euthanasia, died):
    ax.bar(ind, layer, width, bottom=baseline)
    baseline = layer if baseline is None else baseline + layer

ax.legend(['adoption','transfer','return_to_owner','euthanasia','died'])
ax.set_title("Outcome by Hour of Day")
plt.ylabel('Number of Outcomes')
plt.xticks(range(24), ["%d" % hour for hour in range(24)])

plt.show()

The plot shows that midnight and 9am don't align with the surrounding hours: both have unusually high activity. Let's look into these two hours more closely; there could be more information available.

In [None]:
# Show the full timestamps of every entry whose rounded hour is midnight or 9am
timestamp_format = "%Y-%m-%d %H:%M:%S"
for hour_of_interest in (0, 9):
    entries = train[train['Hour'] == hour_of_interest]
    print(entries['DateTime'].dt.strftime(timestamp_format))

The vast majority of these entries are exactly on the hour, 0:00:00 or 9:00:00. Perhaps the staff is getting caught up on data entry for the day or night and those are the default values. Still, this could be valuable info if there's a clear pattern. Let's create a special OnTheHour feature to track this for all times.

In [None]:
# Add 'OnTheHour' variable that tracks whether the time is exactly on the hour.
# The whole condition is parenthesised before the cast: without the outer
# parentheses .astype(int) binds to the seconds comparison only and the
# column ends up boolean instead of 0/1.
for frame in (train, test):
    timestamps = frame['DateTime']
    exactly_on_hour = (timestamps.dt.minute == 0) & (timestamps.dt.second == 0)
    frame['OnTheHour'] = exactly_on_hour.astype(int)

# Compare the outcome distribution of on-the-hour entries with the overall one
all_times = train['OutcomeType'].value_counts(normalize=True)
on_the_hour = train[train['OnTheHour'] == 1]['OutcomeType'].value_counts(normalize=True)
comparison = pd.concat([all_times, on_the_hour], axis=1)
comparison.columns = ['All Times', 'Exactly On The Hour']

comparison.plot.bar(title='Outcome Based on OnTheHour Feature')

The plot compares entries with timestamps of X:00:00 vs all of the other normally varied times and shows that times exactly on the hour are much more likely to result in a transfer than anything else. Let's revisit the special cases of midnight and 9am.

In [None]:
# Outcome counts for the entries stamped exactly 0:00:00 and exactly 9:00:00
exactly_on_hour = train['OnTheHour'] == 1
midnight_counts = train[exactly_on_hour & (train['Hour'] == 0)]['OutcomeType'].value_counts()
nine_am_counts = train[exactly_on_hour & (train['Hour'] == 9)]['OutcomeType'].value_counts()

print('Outcomes for 0:00:00: \n\n', midnight_counts, '\n')
print('Outcomes for 9:00:00: \n\n', nine_am_counts)

Midnight entries are very likely to result in a transfer and 9am entries are almost certain to be a transfer. Let's move on to the sex of the animal and what effect that has on outcome.

In [None]:
# Get data on sex of animal: outcome counts per SexuponOutcome category
sex_outcomes = train.groupby('SexuponOutcome')['OutcomeType'].value_counts().unstack(0)
sex_outcomes = sex_outcomes.fillna(0)

fig, ax = plt.subplots(figsize=(8, 6))

ind = sex_outcomes.columns
width = .75

adoption = sex_outcomes.loc['Adoption'].values
died = sex_outcomes.loc['Died'].values
euthanasia = sex_outcomes.loc['Euthanasia'].values
return_to_owner = sex_outcomes.loc['Return_to_owner'].values
transfer = sex_outcomes.loc['Transfer'].values

# Stack the layers on a running total. Accumulating the baseline in a loop
# guarantees that every layer rests on exactly the layers drawn before it;
# the 'died' layer used to be offset by its own height instead of by the
# euthanasia layer.
bottom = np.zeros(len(ind))
for layer in (adoption, transfer, return_to_owner, euthanasia, died):
    ax.bar(ind, layer, width, bottom=bottom)
    bottom = bottom + layer

# Plot
ax.legend(['adoption','transfer','return_to_owner','euthanasia','died'])
plt.title('Outcome Based on Sex')
plt.ylabel('Number of Outcomes')
plt.show()

Intact/unknown animals have much lower rates of adoption, understandably, as that is another hurdle for would-be owners. Let's look into breed attributes and get a better idea of their usefulness.

In [None]:
# One horizontal bar chart per breed attribute. Outcome shares are normalized
# within each attribute value so the relative impact is comparable.
breed_features = ['HypoallergenicBreed', 'AggroBreed', 'Purebred']

fig, ax = plt.subplots(3, 1, figsize=(5, 12))

for axis, feature_name in zip(ax, breed_features):
    outcome_shares = train.groupby(feature_name)['OutcomeType'].value_counts(normalize=True)
    outcome_shares.unstack(0).plot.barh(ax=axis)

fig.suptitle('Outcome Based on Different Breed Characteristics')
plt.show()

# Print how balanced each binary attribute is, with a blank line in between
*leading_features, final_feature = breed_features
for feature_name in leading_features:
    print(train[feature_name].value_counts(),'\n')
print(train[final_feature].value_counts())

AggroBreed may be useful, there's enough variance in the outcomes, and unlike the other 2 features, the binary column isn't as lopsided towards 0.

In [None]:
# Stacked bar chart of outcome counts per day of the week
day_order = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']

day_outcomes = (
    train.groupby('DayOfWeek')['OutcomeType']
    .value_counts()
    .unstack(0)
    .fillna(0)  # day/outcome combinations that never occur count as zero
)
# Put the columns in calendar order rather than alphabetical order
day_outcomes = day_outcomes[day_order]

fig, ax = plt.subplots(figsize=(8, 6))

ind = day_outcomes.columns
width = .75

# One row of counts per outcome type
adoption, died, euthanasia, return_to_owner, transfer = (
    day_outcomes.loc[outcome].values
    for outcome in ('Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer')
)

# Draw the layers bottom-up; each layer sits on the running total of the
# layers below it (the first layer uses the default baseline)
baseline = None
for layer in (adoption, transfer, return_to_owner, euthanasia, died):
    ax.bar(ind, layer, width, bottom=baseline)
    baseline = layer if baseline is None else baseline + layer

ax.legend(['adoption','transfer','return_to_owner','euthanasia','died'])
plt.title('Outcome by Day of Week')
plt.show()

Most of the day of week variation occurs between weekdays and weekends, so our model will probably do better by reducing cardinality to weekend or weekday.

In [None]:
# Extract weekend variable from DateTime
def is_weekend(df):
    """Add a binary ``weekend`` column and drop ``DayOfWeek``.

    ``weekend`` is 1 when the ``DateTime`` value falls on a Saturday or
    Sunday (weekday number 5 or 6) and 0 otherwise. The new column is added
    to ``df`` itself; the returned frame is ``df`` without ``DayOfWeek``.
    """
    weekday_number = df['DateTime'].dt.weekday  # Monday=0 ... Sunday=6
    on_weekend = weekday_number >= 5
    df['weekend'] = on_weekend.astype(int)
    return df.drop(columns=['DayOfWeek'])

# Replace DayOfWeek by the weekend flag in both data sets (train first)
train, test = is_weekend(train), is_weekend(test)

In [None]:
# Group outcome data by age (converted from days to years)
max_age = train['AgeuponOutcome'].max() / 365
bins = [0, 1, 3, 7, max_age]
# include_lowest=True keeps animals whose age is exactly 0; with the default
# half-open first interval they would fall outside every bin
train['binned_age'] = pd.cut(train['AgeuponOutcome'] / 365, bins, include_lowest=True)

# observed=False keeps the long-standing grouping behaviour for categorical
# keys explicit instead of relying on a default that differs between
# pandas versions
age_outcomes = train.groupby('binned_age', observed=False)['OutcomeType'].value_counts().unstack(0)
# Age/outcome combinations that never occur must count as zero, otherwise
# NaN would propagate into the stacked baselines below
age_outcomes = age_outcomes.fillna(0)

fig, ax = plt.subplots(figsize=(8, 6))

# Bar labels follow the bin edges, so the last label tracks the real maximum
ind = [f'{low:g} to {high:g}' for low, high in zip(bins[:-1], bins[1:])]
width = .75

adoption = age_outcomes.loc['Adoption'].values
died = age_outcomes.loc['Died'].values
euthanasia = age_outcomes.loc['Euthanasia'].values
return_to_owner = age_outcomes.loc['Return_to_owner'].values
transfer = age_outcomes.loc['Transfer'].values

# Plot
ax.bar(ind, adoption, width)
ax.bar(ind, transfer, width, bottom = adoption)
ax.bar(ind, return_to_owner, width, bottom = adoption + transfer)
ax.bar(ind, euthanasia, width, bottom = adoption + transfer + return_to_owner)
ax.bar(ind, died, width, bottom = adoption + transfer + return_to_owner + euthanasia)
ax.legend(['adoption','transfer','return_to_owner','euthanasia','died'])

plt.title('Outcome by Age')
plt.xlabel('Age in Years')
plt.ylabel('Number of Outcomes')
plt.show()

Looking at the data for age, we see that the vast majority of animals are puppy/kitten aged. There's enough variance here that age should provide some predictive power.

# Build Classifier

In [None]:
# Feature selection: the same engineered columns are used for train and test
feature_columns = [
    'HasName', 'AgeuponOutcome', 'AggroBreed', 'HourSin', 'HourCos',
    'OnTheHour', 'Dog', 'Male', 'Intact', 'SexUnknown', 'weekend',
]

# Target: the outcome label of each training animal
y_train = train['OutcomeType']

X_train = train[feature_columns]
X_test = test[feature_columns]


In [None]:
# Gradient-boosted tree classifier: 45 boosting rounds, fixed seed so that
# repeated runs give the same model
xgb_model = XGBClassifier(n_estimators=45, random_state=42)

# Train on the engineered features.
# NOTE(review): y_train holds the raw outcome strings; recent xgboost releases
# may require integer-encoded class labels - confirm against the installed version
xgb_model.fit(X_train, y_train)

In [None]:
# Horizontal bar chart of the gain-based importance of each feature
gain_by_feature = xgb_model.get_booster().get_score(importance_type="gain")

# Sort ascending by score so the most important feature ends up at the top
feature_importance = sorted(gain_by_feature.items(), key=lambda item: item[1])

# Split the (name, score) pairs into two parallel tuples
feature, score = zip(*feature_importance)
y_pos = np.arange(len(feature))

plt.barh(y_pos, score, align='center')
plt.yticks(y_pos, feature)
plt.title('Feature Importance')
# Strip the frame, tick marks and x-axis labels: only the relative bar
# lengths matter here
plt.box(False)
plt.tick_params(left=False, bottom=False, labelbottom=False)
plt.show()

Intactness appears to be a strong predictor, along with the SexUnknown feature, which is clear from the previous plot based on sex. Our manufactured OnTheHour feature is confirmed as another good indicator, while the binary Male feature, AggroBreed, and two time features are among the less helpful. We are ready to make our predictions!

In [None]:
# Predicted class probabilities for every animal in the test set
xgb_pred = xgb_model.predict_proba(X_test)

# Build the submission table: one column per outcome class, keyed by the
# test ID, which reset_index() turns back into a regular first column
results = pd.DataFrame(xgb_pred, columns=xgb_model.classes_, index=test['ID'])
results = results.reset_index()

# Export to csv for submission
results.to_csv('submission.csv', index=False)