# Shelter Animal Outcomes

## Load Training Data

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
train = pd.read_csv('data/train.csv')

In [3]:
train.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26729 entries, 0 to 26728
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   AnimalID        26729 non-null  object
 1   Name            19038 non-null  object
 2   DateTime        26729 non-null  object
 3   OutcomeType     26729 non-null  object
 4   OutcomeSubtype  13117 non-null  object
 5   AnimalType      26729 non-null  object
 6   SexuponOutcome  26728 non-null  object
 7   AgeuponOutcome  26711 non-null  object
 8   Breed           26729 non-null  object
 9   Color           26729 non-null  object
dtypes: object(10)
memory usage: 2.0+ MB


In [5]:
train.isnull().sum()

AnimalID              0
Name               7691
DateTime              0
OutcomeType           0
OutcomeSubtype    13612
AnimalType            0
SexuponOutcome        1
AgeuponOutcome       18
Breed                 0
Color                 0
dtype: int64

## Clean Training Data

In [6]:
def get_sex(string):
    string = str(string)
    if string.find('Male') >= 0: return 'male'
    if string.find('Female') >= 0: return 'female'
    return 'unknown'

def get_neutered(string):
    string = str(string)
    if string.find('Spayed') >= 0: return 'neutered'
    if string.find('Neutered') >= 0: return 'neutered'
    if string.find('Intact') >= 0: return 'intact'
    return 'unknown'

In [7]:
def calculate_age_years(age_string):

    age_string = str(age_string)
    if age_string == 'nan':
        return 0

    age = int(age_string.split()[0])

    if age_string.find('year') > -1:
        return age
    elif age_string.find('month') > -1:
        return age / 12
    elif age_string.find('week') > -1:
        return age / 52
    elif age_string.find('day') > -1:
        return age / 365

In [8]:
def age_category(age):
    if age < 3:
        return 'young'
    elif age >= 3 and age < 5:
        return 'middle'
    elif age >= 5 and age < 10:
        return 'adult'
    elif age >= 10:
        return 'old'

In [9]:
def clean_data(dataframe, drop_columns):
    dataframe['sex'] = dataframe['SexuponOutcome'].apply(get_sex)
    dataframe['neutered'] = dataframe['SexuponOutcome'].apply(get_neutered)

    dataframe['age_years'] = dataframe['AgeuponOutcome'].apply(calculate_age_years)

    dataframe['age_category'] = dataframe['age_years'].apply(age_category)

    dataframe['DateTime'] = pd.to_datetime(dataframe['DateTime'])

    dataframe.drop(drop_columns, axis=1, inplace=True)

In [10]:
drop_columns = ['SexuponOutcome', 'AgeuponOutcome', 'OutcomeSubtype']

clean_data(train, drop_columns)

In [11]:
train.isnull().sum()

AnimalID           0
Name            7691
DateTime           0
OutcomeType        0
AnimalType         0
Breed              0
Color              0
sex                0
neutered           0
age_years          0
age_category       0
dtype: int64

In [12]:
train.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,AnimalType,Breed,Color,sex,neutered,age_years,age_category
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,Dog,Shetland Sheepdog Mix,Brown/White,male,neutered,1.0,young
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Cat,Domestic Shorthair Mix,Cream Tabby,female,neutered,1.0,young
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Dog,Pit Bull Mix,Blue/White,male,neutered,2.0,young
3,A683430,,2014-07-11 19:09:00,Transfer,Cat,Domestic Shorthair Mix,Blue Cream,male,intact,0.057692,young
4,A667013,,2013-11-15 12:52:00,Transfer,Dog,Lhasa Apso/Miniature Poodle,Tan,male,neutered,2.0,young


In [13]:
train['DateTime'].max()

Timestamp('2016-02-21 19:17:00')

## Visualizations

### Overall

In [14]:
category_orders=dict(OutcomeType=['Return_to_owner', 'Adoption', 'Transfer', 'Euthanasia', 'Died'])

In [15]:
fig = px.histogram(train, x="AnimalType")
fig.show()

In [16]:
fig = px.histogram(train, x="OutcomeType", color='AnimalType', category_orders=category_orders)
fig.show()

In [17]:
fig = px.histogram(train[train['AnimalType'] == 'Cat'], x="DateTime", color='OutcomeType', category_orders=category_orders)
fig.show()

### Breed

In [18]:
fig = px.histogram(train[train['AnimalType'] == 'Cat'], x="OutcomeType", color='Breed', category_orders=category_orders)
fig.show()

In [19]:
top_breeds = train[train['AnimalType'] == 'Cat']['Breed'].value_counts()[:5].index.to_list()

fig = px.histogram(train[train['Breed'].isin(top_breeds)], x="Breed")
fig.show()

### Color

In [20]:
fig = px.histogram(train[train['AnimalType'] == 'Cat'], x="OutcomeType", color='Color', category_orders=category_orders)
fig.show()

In [21]:
top_colors = train[train['AnimalType'] == 'Cat']['Color'].value_counts()[:5].index.to_list()

fig = px.histogram(train[train['Color'].isin(top_colors)], x="Color")
fig.show()

### Sex

In [22]:
fig = px.histogram(train[train['AnimalType'] == 'Cat'], x="OutcomeType", color='sex', category_orders=category_orders)
fig.show()

In [23]:
fig = px.histogram(train[train['AnimalType'] == 'Cat'], x="sex")
fig.show()

### Neutered

In [24]:
fig = px.histogram(train[train['AnimalType'] == 'Cat'], x="OutcomeType", color='neutered', category_orders=category_orders)
fig.show()

In [25]:
fig = px.histogram(train[train['AnimalType'] == 'Cat'], x="neutered")
fig.show()

### Age

In [26]:
fig = px.histogram(train[train['AnimalType'] == 'Cat'], x="OutcomeType", color='age_category', category_orders=category_orders)
fig.show()

In [27]:
fig = px.histogram(train[train['AnimalType'] == 'Dog'], x='age_category')
fig.show()

## Machine Learning

### Data Preparation

In [80]:
def get_quarter(timestamp):
    month = timestamp.month

    if month <= 3:
        return 1
    elif month <= 6:
        return 2
    elif month <= 9:
        return 3
    elif month <= 12:
        return 4

def get_month(timestamp):
    return timestamp.month

In [81]:
def get_breed_mix(string):
    string = str(string)
    if string.find('Mix') >= 0 or string.find('/') >= 0:
        return 1
    else:
        return 0

def get_single_color(string):
    string = str(string)
    if string.find('/') >= 0:
        return 0
    else:
        return 1

In [82]:
def data_preparation(df, drop_extra_columns):

    dataframe = df.copy()

    #dataframe['has_name'] = dataframe['Name']
    #dataframe['has_name'].fillna(0, inplace=True)
    #dataframe['has_name'] = dataframe['has_name'].apply(lambda x: 1 if x != 0 else x)

    #dataframe['quarter'] = dataframe['DateTime'].apply(get_quarter)

    dataframe['month'] = dataframe['DateTime'].apply(get_month)

    dataframe['breed_mix'] = dataframe['Breed'].apply(get_breed_mix)

    dataframe['color_single'] = dataframe['Color'].apply(get_single_color)

    dataframe.drop(drop_extra_columns, axis=1, inplace=True)

    dataframe = pd.get_dummies(dataframe, columns=['AnimalType', 'sex', 'neutered'], drop_first=True)

    dataframe.drop(['sex_unknown', 'neutered_unknown'], axis=1, inplace=True)

    return dataframe

In [83]:
drop_extra_columns = ['AnimalID', 'Name', 'DateTime', 'Breed', 'Color', 'age_category']
train_prepared = data_preparation(train, drop_extra_columns)

In [84]:
train_prepared.head()

Unnamed: 0,OutcomeType,age_years,month,breed_mix,color_single,AnimalType_Dog,sex_male,neutered_neutered
0,Return_to_owner,1.0,2,1,0,1,1,1
1,Euthanasia,1.0,10,1,1,0,0,1
2,Adoption,2.0,1,1,0,1,1,1
3,Transfer,0.057692,7,1,1,0,1,0
4,Transfer,2.0,11,1,1,1,1,1


In [85]:
from sklearn.model_selection import train_test_split

X = train_prepared.drop('OutcomeType', axis=1)
y = train_prepared['OutcomeType']

X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [86]:
y_train.value_counts()

Adoption           6448
Transfer           5644
Return_to_owner    2869
Euthanasia          947
Died                129
Name: OutcomeType, dtype: int64

### Decision Tree

In [87]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()

dt.fit(X_train, y_train)

In [88]:
y_pred = dt.predict(x_test)

In [89]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))

0.5904414515525627
0.5766090453674129
                 precision    recall  f1-score   support

       Adoption       0.62      0.80      0.70      4321
           Died       0.04      0.03      0.03        68
     Euthanasia       0.22      0.17      0.20       608
Return_to_owner       0.38      0.29      0.33      1917
       Transfer       0.70      0.58      0.63      3778

       accuracy                           0.59     10692
      macro avg       0.39      0.37      0.38     10692
   weighted avg       0.58      0.59      0.58     10692



In [90]:
importances = dt.feature_importances_
feature_names = dt.feature_names_in_

feat_importances = pd.DataFrame({'feature_name':feature_names, 'importance':importances}).sort_values(by='importance', ascending=False)
feat_importances

Unnamed: 0,feature_name,importance
6,neutered_neutered,0.354485
0,age_years,0.309512
1,month,0.182031
3,color_single,0.056163
5,sex_male,0.042512
4,AnimalType_Dog,0.03375
2,breed_mix,0.021547


### Decision Tree with Resampling

In [91]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier


oversample = SMOTE(random_state=42)

undersample = RandomUnderSampler(random_state=42)

model = DecisionTreeClassifier()

steps = [('o', oversample), ('u', undersample)]
pipeline = Pipeline(steps=steps)
X_train_rus, y_train_rus = pipeline.fit_resample(X_train, y_train)


In [92]:
y_train_rus.value_counts()

Adoption           6448
Died               6448
Euthanasia         6448
Return_to_owner    6448
Transfer           6448
Name: OutcomeType, dtype: int64

In [93]:
dt_rus = DecisionTreeClassifier()

dt_rus.fit(X_train_rus, y_train_rus)

y_pred_rus = dt_rus.predict(x_test)

print(accuracy_score(y_test, y_pred_rus))
print(f1_score(y_test, y_pred_rus, average='weighted'))
print(classification_report(y_test, y_pred_rus))

0.5406846240179574
0.5584339819913325
                 precision    recall  f1-score   support

       Adoption       0.68      0.66      0.67      4321
           Died       0.06      0.31      0.11        68
     Euthanasia       0.17      0.28      0.21       608
Return_to_owner       0.35      0.47      0.40      1917
       Transfer       0.71      0.48      0.57      3778

       accuracy                           0.54     10692
      macro avg       0.40      0.44      0.39     10692
   weighted avg       0.60      0.54      0.56     10692



In [94]:
importances = dt_rus.feature_importances_
feature_names = dt_rus.feature_names_in_

feat_importances = pd.DataFrame({'feature_name':feature_names, 'importance':importances}).sort_values(by='importance', ascending=False)
feat_importances

Unnamed: 0,feature_name,importance
0,age_years,0.47275
1,month,0.194946
6,neutered_neutered,0.164349
5,sex_male,0.052627
3,color_single,0.046625
4,AnimalType_Dog,0.042064
2,breed_mix,0.026639


### Gradient Boosting

In [95]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(random_state=42, n_estimators=500)

gbc.fit(X_train_rus, y_train_rus)

y_pred_gbc = gbc.predict(x_test)

print(accuracy_score(y_test, y_pred_gbc))
print(f1_score(y_test, y_pred_gbc, average='weighted'))
print(classification_report(y_test, y_pred_gbc))

0.5701459034792368
0.590145430149425
                 precision    recall  f1-score   support

       Adoption       0.73      0.65      0.69      4321
           Died       0.06      0.29      0.10        68
     Euthanasia       0.22      0.34      0.27       608
Return_to_owner       0.38      0.60      0.47      1917
       Transfer       0.74      0.50      0.60      3778

       accuracy                           0.57     10692
      macro avg       0.43      0.48      0.42     10692
   weighted avg       0.64      0.57      0.59     10692



In [96]:
importances = gbc.feature_importances_
feature_names = gbc.feature_names_in_

feat_importances = pd.DataFrame({'feature_name':feature_names, 'importance':importances}).sort_values(by='importance', ascending=False)
feat_importances

Unnamed: 0,feature_name,importance
0,age_years,0.562112
6,neutered_neutered,0.203887
4,AnimalType_Dog,0.131292
1,month,0.053098
3,color_single,0.025456
5,sex_male,0.012603
2,breed_mix,0.011553


In [97]:
from joblib import dump

dump(gbc, 'gbc_model.joblib')

['gbc_model.joblib']

### Predictions on Unseen Data

In [98]:
test = pd.read_csv('data/test.csv')
test.head()

Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan
2,3,Gus,2016-01-13 12:20:00,Cat,Neutered Male,1 year,Domestic Shorthair Mix,Brown Tabby
3,4,Pongo,2013-12-28 18:12:00,Dog,Intact Male,4 months,Collie Smooth Mix,Tricolor
4,5,Skooter,2015-09-24 17:59:00,Dog,Neutered Male,2 years,Miniature Poodle Mix,White


In [99]:
drop_columns = ['SexuponOutcome', 'AgeuponOutcome']

clean_data(test, drop_columns)

In [100]:
drop_extra_columns = ['ID', 'Name', 'DateTime', 'Breed', 'Color', 'age_category']

test_prepared = data_preparation(test, drop_extra_columns)

In [101]:
test_prepared.head()

Unnamed: 0,age_years,month,breed_mix,color_single,AnimalType_Dog,sex_male,neutered_neutered
0,0.833333,10,1,0,1,0,0
1,2.0,7,1,0,1,0,1
2,1.0,1,1,1,0,1,1
3,0.333333,12,1,1,1,1,0
4,2.0,9,1,1,1,1,1


In [102]:
from joblib import load

model = load('gbc_model.joblib')

predictions = model.predict(test_prepared)

In [103]:
results = test_prepared.copy()
results['prediction'] = predictions
results.head()

Unnamed: 0,age_years,month,breed_mix,color_single,AnimalType_Dog,sex_male,neutered_neutered,prediction
0,0.833333,10,1,0,1,0,0,Transfer
1,2.0,7,1,0,1,0,1,Adoption
2,1.0,1,1,1,0,1,1,Transfer
3,0.333333,12,1,1,1,1,0,Transfer
4,2.0,9,1,1,1,1,1,Return_to_owner
