In [None]:
# Sources
# https://andraszsom.wordpress.com/2016/07/27/kaggle-for-the-paws/
# https://www.kaggle.com/andraszsom/age-gender-breed-and-name-vs-outcome
# https://www.kaggle.com/andraszsom/dog-breeds-dog-groups
# https://www.kaggle.com/andraszsom/uncertainty-estimates-of-outcome-types
# https://www.kaggle.com/c/shelter-animal-outcomes/discussion/22538


import numpy as np
import pandas as pd 
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold, train_test_split
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import xgboost as xgb
from sklearn.metrics import confusion_matrix
import shap

import os
import warnings

# Show every file Kaggle mounted under the input directory
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Both files are read with the same options; the column at position 2 is parsed as a datetime
csv_options = dict(infer_datetime_format=True, parse_dates=[2])
train = pd.read_csv('/kaggle/input/shelter-animal-outcomes/train.csv.gz', **csv_options)
test = pd.read_csv('/kaggle/input/shelter-animal-outcomes/test.csv.gz', **csv_options)

# Silence two known warnings (matplotlib tick formatter, XGBoost label encoder)
for ignored_message in (
    "FixedFormatter should only be used together with FixedLocator",
    "The use of label encoder in XGBClassifier is deprecated and will be removed in a future release",
):
    warnings.filterwarnings("ignore", message=ignored_message)

# First look at the data

In [None]:
train.head()

# The dataset includes a DateTime column; read_csv() in the loading cell is called with parse_dates so it is read as a datetime value

In [None]:
train.info()

# All columns are object type, except DateTime. Some columns contain missing values (NAs) that must be investigated further

In [None]:
train.duplicated().sum()

# No duplicated rows in the dataset

In [None]:
train.describe(datetime_is_numeric=True)

# Values ranging from 01-10-2013 to 02-21-2016
# NOTE(review): the two dates above appear to use different day/month orders - confirm against the describe() output

In [None]:
train.describe(datetime_is_numeric=True, include=['O'])

# There are 5 unique types of Outcome, 2 types of animals, and a huge variety of breeds and colors
# The colors and breeds must be reworked and grouped together to reduce the number of unique elements
# AgeuponOutcome must be transformed into a numeric variable

In [None]:
# Converting AgeuponOutcome	into a numeric value

def age_to_years(item):
    """Convert shelter age descriptions such as ``'2 years'`` into years.

    Parameters
    ----------
    item : str or sequence
        A single description, or any sequence of them (list, NumPy array,
        pandas Series). Each description looks like ``'<count> <unit>'``
        where the unit contains ``day``, ``week``, ``month`` or ``year``.

    Returns
    -------
    numpy.ndarray
        One float per entry, in years. Entries that are not strings
        (missing values) or that name no known unit are reported as 0.
    """
    # A lone string is treated as a one-element sequence
    if isinstance(item, str):
        item = [item]

    # How many of each unit make up one year
    units_per_year = (('day', 365), ('week', 52.1429), ('month', 12), ('year', 1))

    # Walk over the values themselves instead of indexing with item[i]:
    # on a pandas Series item[i] is a label lookup, which raises KeyError or
    # reads the wrong row whenever the index is not exactly 0..n-1
    ages_in_years = np.zeros(len(item))
    for position, text in enumerate(item):
        if not isinstance(text, str):
            # not a string but a nan: keep the 0 default
            continue
        for unit, per_year in units_per_year:
            if unit in text:
                ages_in_years[position] = int(text.split(' ')[0]) / per_year
                break
    return ages_in_years

# Replace the textual ages by numbers (years, rounded to one decimal) in both frames
for frame in (train, test):
    frame['AgeuponOutcome'] = np.round(age_to_years(frame['AgeuponOutcome']), 1)

train.describe()

In [None]:
# Plot the age of both cats and dogs at the moment they leave the shelter
# (stacked histogram with 20 bins, one colour per animal type)

sns.displot(data=train, x='AgeuponOutcome', hue='AnimalType', bins=20, multiple="stack")
plt.xticks(range(20))
plt.show()

# Around 16000 animals left the shelter before turning 1 year old, which represents around 45% of all animals
# NOTE(review): 16000 is ~45% only of a frame with roughly 35500 rows - check both figures against len(train)

In [None]:
# Cats and dogs differ a lot in their characteristics, so the exploratory analysis looks at them separately
# The split is only used for EDA; the prediction model is trained on both species together

is_cat = train['AnimalType'] == 'Cat'
is_dog = train['AnimalType'] == 'Dog'
train_cat, train_dog = train.loc[is_cat], train.loc[is_dog]

train.head()

In [None]:
# Fraction of each outcome per whole year of age, one table per species.
# Each table is built with a single crosstab instead of growing a DataFrame with
# DataFrame.append inside a loop (deprecated in pandas 1.4 and removed in 2.0)
outcome_columns = ['Adoption','Died','Euthanasia','Return_to_owner','Transfer']


def outcome_share_by_year(animals):
    """Return the outcome fractions of ``animals`` per whole year of age.

    Row ``i`` covers ages in ``[i, i + 1)`` years; its columns hold the share
    of each OutcomeType among the animals of that age (0 where there are none).
    Rows run from 0 to ``int(max age) - 1``.
    """
    max_year = int(animals['AgeuponOutcome'].max())
    # Whole years reached by each animal, used as the grouping key
    year_bin = np.floor(animals['AgeuponOutcome']).astype(int)
    shares = pd.crosstab(year_bin, animals['OutcomeType'], normalize='index')
    # Fixed row/column layout; years or outcomes that never occur become 0.
    # NOTE(review): as before, range(max_year) leaves out animals whose age equals the maximum
    shares = shares.reindex(index=range(max_year), columns=outcome_columns, fill_value=0)
    shares.index.name = None
    shares.columns.name = None
    return shares


df_plot1 = outcome_share_by_year(train_cat)
df_plot2 = outcome_share_by_year(train_dog)

print(df_plot1.head())
print('-'*60)
print(df_plot2.head())

In [None]:
def plot_frac(df, x_label, ganimal, gtype):
    """Draw a stacked bar chart of outcome fractions.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per bar, with the columns 'Adoption', 'Died', 'Euthanasia',
        'Return_to_owner' and 'Transfer' holding the fraction of each outcome.
        The index gives the x position of each bar.
    x_label : sequence
        Tick labels; label ``k`` is drawn at x position ``k``.
    ganimal : str
        Animal group named in the title (e.g. 'Cats').
    gtype : str
        Grouping variable named in the title (e.g. 'Age').

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    outcomes = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']

    fig, ax = plt.subplots()
    # Stack the bars: every outcome starts where the ones below it ended
    bottom = np.zeros(len(df))
    for outcome in outcomes:
        heights = np.asarray(df[outcome], dtype=float)
        ax.bar(df.index, heights, label=outcome, bottom=bottom)
        bottom = bottom + heights
    ax.legend()

    # Fix the tick positions first and attach the labels afterwards, one tick per label.
    # Setting labels before the locator (and using a hard-coded 50-tick locator) is what
    # triggered matplotlib's "FixedFormatter should only be used together with FixedLocator" warning
    labels = list(x_label)
    ax.set_xticks(range(len(labels)))
    ax.set_xticklabels(labels)
    plt.xticks(rotation=30)
    plt.yticks(np.linspace(0,1,11))
    plt.title('Outcome of ' + ganimal + ' grouped by ' + gtype)
    plt.ylabel('Fraction of outcomes')
    plt.show()
    return None

In [None]:
# Age upon outcome certainly has a high correlation with the outcome type of the animal. Lets check that next

plot_frac(df_plot1, range(20), 'Cats', 'Age')

plot_frac(df_plot2, range(20), 'Dogs', 'Age')

# Its important to remember that only 1% of the animals in the shelter remained there after they were 14 years or older
#
# Cats (first plot):
# There isn't a clear correlation between Adoption Rate and Age in the Cats dataframe
# On the other hand, we can see a clear correlation between Age and Transfer, Return to Owner and mainly Euthanasia
# We can see a high Transfer Rate on the Cats dataframe when compared to the Dogs dataframe
# The Euthanasia rate increases with age alarmingly fast for cats when compared to dogs
#
# Dogs (second plot):
# The preference for younger dogs is very clear. So we can assume there's a high linear correlation between Age and Adoption Rate
# A linear correlation can be easily seen in all outcomes, except 'Died', since there are too few examples
# It's way more common for a dog to be returned to a previous owner than a cat.
#
# Both:
# The average adoption rates of dogs and cats are similar

In [None]:
# Lets analyse the genders in the dataset: counts per SexuponOutcome value, as a table and as a bar chart

print(train['SexuponOutcome'].value_counts())

sns.countplot(data=train, x='SexuponOutcome')
plt.xticks(rotation=30)
plt.show()

# The SexuponOutcome feature combines both the gender and the Neutered/Spayed state of the animal
# Both gender and N/S state certainly play a big role

In [None]:
# Lets see if we can see some pattern on the unknown gender animals

unk_train = train[train['SexuponOutcome'] == 'Unknown']
separator = '-'*60

print(unk_train.describe())
print(separator)
print(unk_train.describe(include='O'))
print(separator)
# info() prints its report itself and returns None, so it is not wrapped in print()
# (wrapping it added a stray "None" line to the output)
unk_train.info()
print(separator)
for column in ('OutcomeType', 'OutcomeSubtype', 'AgeuponOutcome', 'AnimalType'):
    print(unk_train[column].value_counts())
    print(separator)

# Outcomes of cats younger than one year, broken down by age.
# size() counts rows; counting the OutcomeSubtype column instead would leave out
# every animal whose subtype is missing and so under-report those outcomes
train_test = train_cat[(train_cat['AgeuponOutcome'] >= 0) & (train_cat['AgeuponOutcome'] < 1)]
val_test = train_test.groupby(['OutcomeType', 'AgeuponOutcome']).size()

print(val_test)

# Most unknown gender animals were also unnamed
# Most unknown gender animals were transferred
# Over 60% of unknown gender animals left the shelter before they were 2 months old
# Almost all of them were cats
# ------------------------------------------------------------------------------------------------------------
# So the unknown gender animals were mostly really young nameless cats that were transferred from the shelter
# High correlation with the target feature
# The zero adoption rate may be explained by some internal rule that forbids adoption of cats below a certain age
# The data shows that the minimum age for adoptions is somewhere between 2 and 3 months old
# NOTE(review): re-check the two adoption remarks above against the row counts printed by val_test

In [None]:
# Fraction of each outcome per SexuponOutcome value, one table for cats (df_plot1) and one for dogs (df_plot2).
# Built with crosstab rather than DataFrame.append in a loop (deprecated in pandas 1.4, removed in 2.0)
outcome_columns = ['Adoption','Died','Euthanasia','Return_to_owner','Transfer']

# Row order of both tables; row k describes total_sex[k]
total_sex = ['Neutered Male','Spayed Female','Intact Male','Intact Female','Unknown']

df_plot1, df_plot2 = (
    pd.crosstab(animals['SexuponOutcome'], animals['OutcomeType'], normalize='index')
    # fixed row/column layout; combinations that never occur become 0
    .reindex(index=total_sex, columns=outcome_columns, fill_value=0)
    # rows are numbered 0..len(total_sex)-1 so they can be used as bar positions
    .reset_index(drop=True)
    .rename_axis(columns=None)
    for animals in (train_cat, train_dog)
)

print(df_plot1.head())
print('-'*60)
print(df_plot2.head())

In [None]:
plot_frac(df_plot1, total_sex, 'Cats', 'Gender')

plot_frac(df_plot2, total_sex, 'Dogs', 'Gender')

# Cats (first plot):
# We can see there's a huge difference in the outcome type between neutered/spayed and intact animals
# We can only see a slightly higher adoption rate for female cats, but that will hardly make a difference in the prediction model
# Once again, we see a higher transfer rate and lower return_to_owner rate among cats compared to dogs
# As mentioned before, the cats with unknown gender have a very specific OutcomeType proportion, with 0 adoption rate
#
# Dogs (second plot):
# We can see there's a huge difference in the outcome type between neutered/spayed and intact animals
# We can only see a slightly higher adoption rate for female dogs, but that will hardly make a difference in the prediction model
# We can see around 50% adoption rate for neutered/spayed dogs, a smaller rate compared to cats' 65%
# As mentioned before, the dogs with unknown gender have a very specific OutcomeType proportion, with 0 adoption rate

In [None]:
# Adding the new features to the main dataframes
#
# SexuponOutcome is split into two numeric columns:
#   IsIntact:  1 = intact, 0 = neutered/spayed, -1 = unknown (including missing)
#   IsFemale:  1 = female, 0 = male,            -1 = unknown (including missing)
# IsIntact is appended as a new column; SexuponOutcome is overwritten and renamed to IsFemale.
# The same steps run on both frames, and the values are computed from one clean copy of the
# column instead of chained in-place fillna() calls and a temporary '1' string marker

for frame in (train, test):
    if 'SexuponOutcome' not in frame.columns:
        # already encoded by an earlier run of this cell
        continue
    sex = frame['SexuponOutcome'].fillna('Unknown')
    frame['IsIntact'] = np.select(
        [sex == 'Unknown', sex.str.contains('Intact')], [-1, 1], default=0
    )
    # 'Female' is tested first; the match is case-sensitive, so 'Female' never counts as 'Male'
    frame['SexuponOutcome'] = np.select(
        [sex.str.contains('Female'), sex.str.contains('Male')], [1, 0], default=-1
    )
    frame.rename(columns={'SexuponOutcome' : 'IsFemale'}, inplace=True)

In [None]:
# We've analysed the Age and Gender features in relation to OutcomeType. Now lets take a closer look at the Name feature

print(train['Name'].describe())
print('-'*50)
print(train['Name'].isna().sum())
print('-'*50)
print(train['Name'].value_counts())

# We see a big variety of names and there isn't any name that occurs way too often
# It doesn't make sense to search for patterns in specific names, since there isn't a clear way to group them
# There are plenty of unnamed animals though, so comparing named and unnamed animals may be useful

In [None]:
name_test = train[train['Name'] == 'DummyName']
print(name_test)
# No name in the dataframe corresponding to DummyName, as expected,
# so 'DummyName' can safely mark the animals without a name

train_cat, train_dog = train_cat.fillna('DummyName'), train_dog.fillna('DummyName')


# Row order of both tables; row k describes total_name[k]
total_name = ['Named', 'Unnamed']
outcome_columns = ['Adoption','Died','Euthanasia','Return_to_owner','Transfer']

# Fraction of each outcome for named and unnamed animals, cats in df_plot1 and dogs in df_plot2.
# Built with crosstab rather than repeated DataFrame.append calls (deprecated in pandas 1.4, removed in 2.0)
df_plot1, df_plot2 = (
    pd.crosstab(
        animals['Name'].ne('DummyName').map({True: 'Named', False: 'Unnamed'}),
        animals['OutcomeType'],
        normalize='index',
    )
    # fixed row/column layout; combinations that never occur become 0
    .reindex(index=total_name, columns=outcome_columns, fill_value=0)
    .reset_index(drop=True)
    .rename_axis(columns=None)
    for animals in (train_cat, train_dog)
)


print(df_plot1.head())
print('-'*60)
print(df_plot2.head())

In [None]:
plot_frac(df_plot1, total_name, 'Cats', 'Name')

plot_frac(df_plot2, total_name, 'Dogs', 'Name')

# Cats (first plot):
# We see a huge difference in adoption of named and unnamed cats
# As we saw before, there's a group of young unnamed cats with unknown gender that had zero adoption rate and high transfer rate
# This group represents around 1000 cats though. The difference in adoption rate of named and unnamed cats is 46.4%
# The total of cats in the dataset is 11134. So there's a difference of over 5000 animals
# This indicates that the simple act of naming a cat may increase the chance of a cat being adopted
# We also see a considerable increase in the euthanasia rate of unnamed cats as well
# If we assume naming cats makes them more likely to be adopted, it indirectly results in an increase of unnamed cats going through euthanasia
# Since if a cat doesn't get adopted, it increases its chance of staying in the shelter long enough to get sick and need the procedure
# It is always good to remember though: Correlation doesn't imply causation
#
# Dogs (second plot):
# There's too little difference between the adoption rates of named and unnamed dogs
# This graph makes more evident the great difference between the return_to_owner rates of named and unnamed dogs
# Here we see as well an increase of euthanasia and died rates
# Once again, this may be a consequence of the lower return_to_owner rate, since it means the animal stays longer in the shelter

In [None]:
# Adding the edited feature to the main dataframes
#
# Name becomes HasName: 1 when the animal has a name, 0 when the name is missing.
# The flag is computed in one expression and assigned back, instead of relying on a chained
# in-place fillna() on the column, and the same steps run on both frames.
# Running the cell again leaves the 0/1 values unchanged

for frame in (train, test):
    frame.rename(columns={'Name' : 'HasName'}, inplace=True)
    frame['HasName'] = (frame['HasName'].fillna(0) != 0).astype('int64')

In [None]:
# And, why not, lets check the difference of OutcomeType ratio of Cats vs Dogs

# Row order of the table; row k describes total_type[k]
total_type = ['Cat', 'Dog']
outcome_columns = ['Adoption','Died','Euthanasia','Return_to_owner','Transfer']

# Fraction of each outcome per animal type.
# Built with crosstab rather than repeated DataFrame.append calls (deprecated in pandas 1.4, removed in 2.0)
df_plot1 = (
    pd.crosstab(train['AnimalType'], train['OutcomeType'], normalize='index')
    # fixed row/column layout; combinations that never occur become 0
    .reindex(index=total_type, columns=outcome_columns, fill_value=0)
    .reset_index(drop=True)
    .rename_axis(columns=None)
)

print(df_plot1.head())

In [None]:
plot_frac(df_plot1, total_type, 'Animals', 'Animal Type')

# Cats and Dogs have similar adoption rates
# Dogs are more likely to be returned to their owner and cats to be transferred
# Cats have a slightly higher euthanasia ratio and a higher died ratio

In [None]:
# Adding the edited feature to the main dataframes
#
# AnimalType becomes IsCat: 1 for cats, 0 for everything else

for frame in (train, test):
    frame.rename(columns={'AnimalType' : 'IsCat'}, inplace=True)
    # 'Cat' on the first run, 1 if the cell is run again on already encoded values
    frame['IsCat'] = frame['IsCat'].isin(['Cat', 1]).astype('int64')

In [None]:
# Both OutcomeSubtype and Datetime features are understood as data leakage, since in practical scenarios the model would be used
# before an animal has a defined outcome, and not after. Therefore it would not be possible to know the subtype of outcome or
# the exact minute the outcome happens before making the predictions
# In this case, the only thing left to do is feature engineer Breed and Color into new features and then drop the four columns
# DateTime, OutcomeSubtype, Breed and Color from both the train and test dataframes

print(train['Color'].value_counts())
print('-'*10)
print(train['Color'].isna().sum())


# We can try to check if unicolor/multicolor has any correlation with the outcometype
# Multicolored animals seem to have a / separating the different colors
# So that's how we'll identify them


print(train_cat['Color'].tail())


def color_group(color):
    """Return 'Multi Colored' when ``color`` lists several colors separated by '/', else 'Single Colored'.

    Values that already are one of the two group labels are returned unchanged, so
    running this cell a second time does not turn every animal into 'Single Colored'.
    """
    if color in ('Multi Colored', 'Single Colored'):
        return color
    return 'Multi Colored' if '/' in color else 'Single Colored'


train_cat['Color'] = train_cat['Color'].apply(color_group)
train_dog['Color'] = train_dog['Color'].apply(color_group)

print(train_cat['Color'].tail())

In [None]:
# Row order of both tables; row k describes total_color[k]
total_color = ['Single Colored', 'Multi Colored']
outcome_columns = ['Adoption','Died','Euthanasia','Return_to_owner','Transfer']

# Fraction of each outcome for single and multi colored animals, cats in df_plot1 and dogs in df_plot2.
# The rows are ordered by total_color itself. Previously the multi colored row was built first
# while the labels start with 'Single Colored', so the two bars were plotted under swapped labels.
# Built with crosstab rather than repeated DataFrame.append calls (deprecated in pandas 1.4, removed in 2.0)
df_plot1, df_plot2 = (
    pd.crosstab(animals['Color'], animals['OutcomeType'], normalize='index')
    # fixed row/column layout; combinations that never occur become 0
    .reindex(index=total_color, columns=outcome_columns, fill_value=0)
    .reset_index(drop=True)
    .rename_axis(columns=None)
    for animals in (train_cat, train_dog)
)


print(df_plot1.head())
print('-'*60)
print(df_plot2.head())

In [None]:
plot_frac(df_plot1, total_color, 'Cats', 'Number of Colors')

plot_frac(df_plot2, total_color, 'Dogs', 'Number of Colors')

# We see no significant difference between single colored and multi colored animals, for cats or for dogs. So the feature will be dropped

In [None]:
# Now for breed, lets analyse what we already have

print(train_cat['Breed'].value_counts())

# A possible grouping is based on hair length.

def hair_group(breed):
    """Map a cat breed description to its hair-length group ('Other' when none is mentioned)."""
    # checked in this order; the first marker found in the text decides the group
    markers = (('Shorthair', 'Shorthaired'), ('Medium Hair', 'Medium Haired'), ('Longhair', 'Longhaired'))
    for marker, group in markers:
        if marker in breed:
            return group
    return 'Other'

train_cat['Breed'] = train_cat['Breed'].apply(hair_group)

print('-'*30)
train_cat['Breed'].value_counts()


In [None]:
# Fraction of each outcome per hair-length group of the cats.
# Built with crosstab rather than DataFrame.append in a loop (deprecated in pandas 1.4, removed in 2.0)
outcome_columns = ['Adoption','Died','Euthanasia','Return_to_owner','Transfer']

# Row order of the table; row k describes total_breed[k]
total_breed = ['Longhaired','Medium Haired','Shorthaired','Other']

df_plot1 = (
    pd.crosstab(train_cat['Breed'], train_cat['OutcomeType'], normalize='index')
    # fixed row/column layout; combinations that never occur become 0
    .reindex(index=total_breed, columns=outcome_columns, fill_value=0)
    .reset_index(drop=True)
    .rename_axis(columns=None)
)

print(df_plot1.head())

In [None]:
plot_frac(df_plot1, total_breed, 'Cats', 'HairType')

# We can see some difference between the hair types, but it may be too small to be considered in the model.

In [None]:
# Now for the dogs

#print(train_dog['Breed'])

# Number of dogs per breed description; the 30 most frequent ones are displayed
df_test123 = train_dog['Breed'].value_counts()
df_test123.head(30)

# There are 1320 unique entries for dog breeds
# Most entries are combinations of different breeds
# A categorization in smaller groups is possible, but it would certainly take a lot of work
# Since the breed of a dog has a considerable impact when deciding which animal to adopt, it may be useful to revisit this in the future
# Dividing dog breeds into dog groups will be future work
# For now, the feature will be dropped

In [None]:
# Preparing the data to be modeled: drop the identifiers, the leaking columns and the unused raw features
divider = '-'*80

print('Before Dropping')
print(train.columns)
print(divider)
print(test.columns)

train.drop(columns=['AnimalID', 'DateTime', 'OutcomeSubtype', 'Breed', 'Color'], inplace=True)
test.drop(columns=['ID', 'DateTime', 'Breed', 'Color'], inplace=True)

print(divider)
print(divider)
print('After Dropping')
print(train.columns)
print(divider)
print(test.columns)

In [None]:
# Separate the features (X) from the target (y)
# Some outcomes such as died and euthanasia are rare, so every split of the data should be stratified:
# a Stratified K-Fold keeps the same percentage of each outcome in every fold

X = train.drop(columns='OutcomeType')
y = train['OutcomeType']
print(y)
print(X)

In [None]:
# Untuned instances of every model that will be tested
kneigh = KNeighborsClassifier()
dectree = DecisionTreeClassifier(random_state=42)
forest = RandomForestClassifier(random_state=42)
adab = AdaBoostClassifier(random_state=42)
gb = xgb.XGBClassifier(eval_metric='mlogloss', random_state=42)

# Stratified 80/20 hold-out split used for model evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

# Keep the hold-out labels as they are for the confusion matrix,
# then turn y_test into a one-hot float matrix so it can be compared with predicted probabilities
y_test_conf = y_test.to_numpy()
y_test = pd.get_dummies(y_test).to_numpy().astype(np.float32)

# Defining a logloss function that accepts float values
def logloss(y_true, y_pred, eps=1e-15):
    """Multi-class logarithmic loss for float-valued (probability-like) targets.

    Parameters
    ----------
    y_true : array-like, one row per sample and one column per class
        Target weights, typically a one-hot matrix.
    y_pred : array-like with the same layout
        Predicted probabilities.
    eps : float
        Predictions are limited to ``[eps, 1 - eps]`` so the logarithm stays finite.

    Returns
    -------
    float
        Mean over the samples of ``-sum(y_true * log(y_pred))``.
    """
    bounded = np.clip(y_pred, eps, 1 - eps)
    per_sample = np.sum(y_true * np.log(bounded), axis=1)
    return -np.mean(per_sample)

In [None]:
def bayes_search(model, param_grid, n_iter=5, n_splits=5, random_state=42):
    """Run a Bayesian hyper-parameter search and print the best score and parameters.

    Parameters
    ----------
    model : estimator
        Classifier to tune.
    param_grid : dict
        Search space passed to ``BayesSearchCV``.
    n_iter : int
        Number of parameter settings the search samples.
    n_splits : int
        Number of stratified cross-validation folds. This used to share one
        variable with ``n_iter``, so the two could not be changed independently.
    random_state : int
        Seed for both the fold shuffling and the search, so a run can be repeated.

    Returns
    -------
    None
        The best ``neg_log_loss`` score and the best parameters are printed.

    Notes
    -----
    The search is fitted on the module-level ``X_train`` / ``y_train`` only.
    Tuning on the full ``X`` / ``y`` would let the hold-out rows influence the
    chosen parameters and make the hold-out log loss look better than it is.
    """

    # Initialize the cross validation method
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Execute the bayes search
    bsearch = BayesSearchCV(
        model,
        param_grid,
        n_iter=n_iter,
        scoring='neg_log_loss',
        cv=cv,
        verbose=True,
        random_state=random_state,
    ).fit(X_train, y_train)
    # Print the values to be used in each parameter for best result in the final fitting
    print(' ',bsearch.best_score_)
    print(' ',bsearch.best_params_)

    return None

In [None]:
# Hyper-parameter search for KNeighborsClassifier (kept disabled inside the string below)

'''
# Define the parameters to be tested in the bayes search
param_grid = {'n_neighbors': Integer(2, 20),
              'weights': Categorical(['uniform','distance']),
              'leaf_size': Integer(10, 100)}

bayes_search(kneigh, param_grid)
'''

# Best parameters found by the search:
# ('leaf_size', 69), ('n_neighbors', 18), ('weights', 'distance')


# Fit with those parameters on the training split and score the hold-out split
kneigh = KNeighborsClassifier(leaf_size=69, n_neighbors=18, weights='distance').fit(X_train, y_train)
y_pred = kneigh.predict_proba(X_test)
result = logloss(y_test, y_pred)
print(result)

# Result:
# logloss 1.9320097854834941

In [None]:
# Hyper-parameter search for DecisionTreeClassifier (kept disabled inside the string below)

'''
# Define the parameters to be tested in the bayes search
param_grid = {'criterion': Categorical(['gini','entropy']),
              'splitter': Categorical(['best','random']),
              'max_depth': Integer(10, 200),
              'min_samples_split': Integer(5, 50),
              'max_leaf_nodes': Integer(10, 200),
              }

bayes_search(dectree, param_grid)
'''

# Best parameters found by the search:
# ('criterion', 'gini'), ('max_depth', 69), ('min_samples_split', 43), ('splitter', 'random')
# NOTE(review): max_leaf_nodes is part of the search space but is neither reported here nor set below

# Fit with those parameters on the training split and score the hold-out split
dectree = DecisionTreeClassifier(
    criterion='gini', max_depth=69, min_samples_split=43, splitter='random', random_state=42
).fit(X_train, y_train)
y_pred = dectree.predict_proba(X_test)
result = logloss(y_test, y_pred)
print(result)

# Result:
# logloss 1.097150037822769

In [None]:
# Hyper-parameter search for RandomForestClassifier (kept disabled inside the string below)
# NOTE(review): the search space lists 'max_leaf_nodes' twice; in a dict literal the later entry
# (Integer(10, 200)) replaces the earlier one - confirm which parameter the first entry was meant for

'''
# Define the parameters to be tested in the bayes search
param_grid = {'n_estimators': Integer(100, 2000),
              'criterion': Categorical(['gini','entropy']),
              'max_leaf_nodes': Integer(20, 500),
              'min_samples_split': Integer(5, 50),
              'max_leaf_nodes': Integer(10, 200),
              }

bayes_search(forest, param_grid)
'''

# Best parameters found by the search:
# ('criterion', 'entropy'), ('max_leaf_nodes', 83), ('min_samples_split', 10), ('n_estimators', 173)

# Fit with those parameters on the training split and score the hold-out split
forest = RandomForestClassifier(
    criterion='entropy', max_leaf_nodes=83, min_samples_split=10, n_estimators=173, random_state=42
).fit(X_train, y_train)
y_pred = forest.predict_proba(X_test)
result = logloss(y_test, y_pred)
print(result)

# Result
# logloss 0.8632633267

In [None]:
# Searching for AdaBoostClassifier
# Both the search and the fit/evaluation are inside the string below, so this cell trains nothing;
# only the recorded result is kept for comparison with the other models

'''
# Define the parameters to be tested in the bayes search
param_grid = {'n_estimators': Integer(50, 1000),
              'learning_rate': Real(0.01, 1, prior='log-uniform')
              }

bayes_search(adab, param_grid)


# Results:
# ('learning_rate', 0.010311092067728185), ('n_estimators', 640)

adab = AdaBoostClassifier(learning_rate=0.01, n_estimators=640 ,random_state=42)
adab.fit(X_train, y_train)
y_pred = adab.predict_proba(X_test)
result = logloss(y_test, y_pred)
print(result)
'''
# Result (from an earlier run)
# logloss 1.3654060077388854

In [None]:
# Hyper-parameter search for XGBClassifier (kept disabled inside the string below)

'''
# Define the parameters to be tested in the bayes search
param_grid = {'max_depth': Integer(1, 90),
              'learning_rate': Real(0.01, 1, prior='log-uniform'),
              'reg_alpha': Real(0.01, 100),
              'colsample_bytree': Real(0.2e0, 0.8e0),
              'subsample': Real(0.2e0, 0.8e0),
              'n_estimators': Integer(50, 200)}

bayes_search(gb, param_grid)
'''

# Best parameters found by the search (rounded in the model below):
# ('colsample_bytree', 0.7922734554018536), ('learning_rate', 0.1826943790109235), ('max_depth', 10), ('n_estimators', 81), ('reg_alpha', 42.02553629393182), ('subsample', 0.536177294437312)


# Fit with those parameters on the training split and score the hold-out split
gb = xgb.XGBClassifier(
    colsample_bytree=0.8,
    learning_rate=0.18,
    max_depth=10,
    n_estimators=81,
    reg_alpha=42,
    subsample=0.54,
    eval_metric='mlogloss',
    random_state=42,
).fit(X_train, y_train)
y_pred = gb.predict_proba(X_test)
result = logloss(y_test, y_pred)
print(result)

# Result:
# logloss 0.8880572

In [None]:
# Out of all models tried, RandomForestClassifier got the best result, with XGBClassifier a little behind

# The submission model is refit on all labelled data under its own name.
# `forest` therefore stays the model fitted on X_train only, so the confusion matrix on X_test
# and the SHAP analysis further down are not computed with a model that has already seen X_test
final_forest = RandomForestClassifier(criterion='entropy', max_leaf_nodes=83, min_samples_split=10, n_estimators=173, random_state=42)
final_forest.fit(X, y)
# Select the test columns in the training order so features cannot be silently swapped
y_pred = final_forest.predict_proba(test[X.columns])

In [None]:
# Lets analyse the results of the randomforest model on the hold-out split

labels_cm = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
y_pred_conf = forest.predict(X_test)
cm = confusion_matrix(y_test_conf, y_pred_conf)

# Rows are the true outcomes, columns the predicted ones
df_cm = pd.DataFrame(cm, index=labels_cm, columns=labels_cm)
plt.figure(figsize = (8,6))
sns.heatmap(df_cm, annot=True, fmt="d", cmap='YlGnBu')

# The main issue detected is that the model didn't predict any died outcome
# Even though there are very few deaths in the database, the model shouldn't discard the outcome altogether
# The model also predicted wrongly too many times between the return_to_owner and adoption outcomes
# It's good to remember though that the model is supposed to give a percentage for each outcome
# So the model wasn't built to predict which single outcome will happen, which makes it somewhat weaker at this

In [None]:
# Lets see the relevancy the model gave to each feature

# load the JS visualization code into the notebook
shap.initjs()

# use Tree SHAP (shap.TreeExplainer) to explain the random forest's predictions on the training split (X_train)
explainer = shap.TreeExplainer(forest)
shap_values = explainer.shap_values(X_train)

In [None]:
# Summarize the effects of all the features, one summary plot per outcome class
# (the banners follow the order in which shap_values is indexed)
class_banners = [
    '------------------------------------ADOPTION-------------------------------------',
    '--------------------------------------DIED---------------------------------------',
    '-----------------------------------EUTHANASIA------------------------------------',
    '--------------------------------RETURN TO OWNER----------------------------------',
    '------------------------------------TRANSFER-------------------------------------',
]
for class_index, banner in enumerate(class_banners):
    print(banner)
    shap.summary_plot(shap_values[class_index], X_train)

# In these plots we can see how the features affect the different outcomes
# We can see how different features become more important depending on the outcome the model is trying to predict
# Overall AgeuponOutcome and IsIntact show more importance
# While the type of animal (cat/dog) and gender have less importance

In [None]:
# Plot how the model's output for one outcome depends on a single feature (here AgeuponOutcome, first outcome class)
print('------------------------ADOPTION-------------------------')
shap.dependence_plot('AgeuponOutcome', shap_values[0], X_train, interaction_index=None)

# In this plot we can see more clearly how age affects the adoption rate

In [None]:
# One probability column per outcome, preceded by the ID column
submission = pd.DataFrame(data=y_pred, columns=['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer'])
# assumes the test IDs are 1..n in file order (the ID column itself was dropped earlier) - TODO confirm
submission.insert(0, 'ID', submission.index + 1)
# to_csv() returns None when given a path, so its result is not assigned back:
# `submission` stays available as a DataFrame for inspection
submission.to_csv('submission.csv', index=False)

# Submission score: 0.85150