In [None]:
import pandas as pd
import numpy as np

import os
import math

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Render up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50

# Data Fields
**PetID** - Unique hash ID of pet profile   

**AdoptionSpeed** - Categorical speed of adoption. Lower is faster. This is the value to predict. See below section for more info.

**Type** - Type of animal  *(1 = Dog, 2 = Cat)*

**Name** - Name of pet  *(Empty if not named)*

**Age** - Age of pet when listed, in months

**Breed1** - Primary breed of pet  *(Refer to BreedLabels dictionary)*

**Breed2** - Secondary breed of pet, if pet is of mixed breed  *(Refer to BreedLabels dictionary)*

**Gender** - Gender of pet  *(1 = Male, 2 = Female, 3 = Mixed, if profile represents group of pets)*

**Color1** - Color 1 of pet  *(Refer to ColorLabels dictionary)*

**Color2** - Color 2 of pet  *(Refer to ColorLabels dictionary)*

**Color3** - Color 3 of pet  *(Refer to ColorLabels dictionary)*

**MaturitySize** - Size at maturity  *(1 = Small, 2 = Medium, 3 = Large, 4 = Extra Large, 0 = Not Specified)*

**FurLength** - Fur length  *(1 = Short, 2 = Medium, 3 = Long, 0 = Not Specified)*

**Vaccinated** - Pet has been vaccinated  *(1 = Yes, 2 = No, 3 = Not Sure)*

**Dewormed** - Pet has been dewormed  *(1 = Yes, 2 = No, 3 = Not Sure)*

**Sterilized** - Pet has been spayed / neutered   *(1 = Yes, 2 = No, 3 = Not Sure)*

**Health** - Health Condition  *(1 = Healthy, 2 = Minor Injury, 3 = Serious Injury, 0 = Not Specified)*

**Quantity** - Number of pets represented in profile

**Fee** - Adoption fee  *(0 = Free)*

**State** - State location in Malaysia  *(Refer to StateLabels dictionary)*

**RescuerID** - Unique hash ID of rescuer

**VideoAmt** - Total uploaded videos for this pet

**PhotoAmt** - Total uploaded photos for this pet

**Description** - Profile write-up for this pet. The primary language used is English, with some in Malay or Chinese.

#### AdoptionSpeed
Contestants are required to predict this value. The value is determined by how quickly, if at all, a pet is adopted. The values are determined in the following way:    

0 - Pet was adopted on the same day as it was listed.   

1 - Pet was adopted between 1 and 7 days (1st week) after being listed.   

2 - Pet was adopted between 8 and 30 days (1st month) after being listed. 

3 - Pet was adopted between 31 and 90 days (2nd - 3rd month) after being listed.    

4 - No adoption after 100 days of being listed. (There are no pets in this dataset that waited between 90 and 100 days).

In [None]:
# Load the training table; each row is identified by its PetID.
df_orig = pd.read_csv(
    '../input/train/train.csv',
    index_col='PetID',
)

In [None]:
# Preview the first five raw rows.
df_orig.head(n=5)

In [None]:
# Report which raw columns hold at least one missing value.
print("Columns that contains NA:", df_orig.columns[df_orig.isna().any(axis=0)].tolist())

In [None]:
# Work on an independent copy so the raw frame stays untouched.
df = df_orig.copy(deep=True)

### Labels mapping

In [None]:
# Lookup tables for breeds, colors and states; the cells below turn them
# into ID -> name dictionaries.
breeds = pd.read_csv('../input/breed_labels.csv')
colors = pd.read_csv('../input/color_labels.csv')
states = pd.read_csv('../input/state_labels.csv')

In [None]:
# Preview the first rows of each lookup table.
display(breeds.head(), colors.head(), states.head())

In [None]:
# ID -> name dictionaries built from the lookup tables.
breed_labels = breeds.set_index('BreedID')['BreedName'].to_dict()

color_labels = colors.set_index('ColorID')['ColorName'].to_dict()

state_labels = states.set_index('StateID')['StateName'].to_dict()

In [None]:
# Code -> label dictionaries for the categorical columns, as listed in the
# "Data Fields" section above.
type_labels = {1: 'Dog', 2: 'Cat'}

gender_labels = {1: 'Male', 2: 'Female', 3: 'Mixed (group of pets)'}

maturity_size_labels = {
    1: 'Small',
    2: 'Medium',
    3: 'Large',
    4: 'Extra Large',
    0: 'Not Specified',
}

fur_length_labels = {
    1: 'Short',
    2: 'Medium',
    3: 'Long',
    0: 'Not Specified',
}

# Shared by the 'Vaccinated', 'Dewormed' and 'Sterilized' columns.
treatment_labels = {1: 'Yes', 2: 'No', 3: 'Not Sure'}

health_labels = {
    1: 'Healthy',
    2: 'Minor Injury',
    3: 'Serious Injury',
    0: 'Not Specified',
}

In [None]:
# Replace numeric codes with readable labels, column by column.
# Codes that are missing from a dictionary become NaN.
column_labels = {
    'Breed1': breed_labels,
    'Breed2': breed_labels,
    'Color1': color_labels,
    'Color2': color_labels,
    'Color3': color_labels,
    'State': state_labels,
    'Type': type_labels,
    'Gender': gender_labels,
    'MaturitySize': maturity_size_labels,
    'FurLength': fur_length_labels,
    'Vaccinated': treatment_labels,
    'Dewormed': treatment_labels,
    'Sterilized': treatment_labels,
    'Health': health_labels,
}

for col, labels in column_labels.items():
    df[col] = df[col].map(labels)

In [None]:
# Preview the first five rows after label mapping.
df.head(n=5)

### Dataset exploration

In [None]:
# Size of the working frame.
print("Number of rows:", len(df))
print("Number of columns:", len(df.columns))

In [None]:
# Column dtypes and non-null counts; used below to spot sparsely filled columns.
df.info()

From this information we can already see that some features won't be relevant in our exploratory analysis because they have too many missing values - we will omit Breed2, Color2 and Color3 due to their low number of non-null values.

In [None]:
# Report which columns of the working frame hold missing values after label
# mapping. The mask and the column index both come from `df`, so the two
# always line up.
print("Columns that contains NA:", list(df.columns[df.isna().any()]))

Breed1 is the primary breed of the pet, so it is not expected to have missing values. However, some Breed1 IDs are absent from the CSV file with breed labels, so mapping them to names produced missing values.

In [None]:
# Rows whose Breed1 has no label after mapping.
missing_Breed1 = df.loc[df['Breed1'].isnull()]
print("Number of rows that have missing values in Breed1: ", len(missing_Breed1))

Removing those 5 rows.

In [None]:
# Drop the rows without a Breed1 label (modifies df in place).
df.drop(index=missing_Breed1.index, inplace=True)

Columns 'Name', 'Breed2', 'Color2', 'Color3' and 'Description' won't be used in further exploration, so I leave them as they are. But those columns may be used for new features creation, so I don't remove them.

In [None]:
# Treat AdoptionSpeed as an ordered categorical with levels 0 < 1 < 2 < 3 < 4.
df['AdoptionSpeed'] = df['AdoptionSpeed'].astype(
    pd.api.types.CategoricalDtype(categories=[0, 1, 2, 3, 4], ordered=True)
)

#### Adoption Speed (target)

In [None]:
# Summarise the target, then count the listings in each adoption-speed class.
print(df.AdoptionSpeed.describe())
plt.figure(figsize=(9, 8))
# Name the column via `x=`: newer seaborn releases read the first positional
# argument as `data`, so a positional Series is no longer plotted on the x axis.
sns.countplot(x='AdoptionSpeed', data=df, palette='winter');

The most frequent outcome is no adoption after 100 days of being listed; slightly less frequent is adoption in the 1st month after being listed. The least frequent outcome is adoption on the same day as the listing.

#### Type

In [None]:
# Summarise the pet type, then count dogs and cats.
print(df.Type.describe())
plt.figure(figsize=(8, 5))
# Name the column via `x=`: newer seaborn releases read the first positional
# argument as `data`, so a positional Series is no longer plotted on the x axis.
sns.countplot(x='Type', data=df, palette='winter');

There are more dogs in the dataset than cats.

In [None]:
# Listings per pet type, split by adoption-speed class.
sns.catplot(data=df, x="Type", hue="AdoptionSpeed", kind='count', palette='winter');

In [None]:
# Spread of the adoption-speed codes (cast to int) for each pet type.
sns.boxplot(data=df, x="Type", y=df['AdoptionSpeed'].astype(int), palette='winter');

Looks like cats in general are adopted faster than dogs - cats' AdoptionSpeed is lower (median=2) than dogs' (median=3).

#### Gender 

In [None]:
# Summarise the gender column, then count the listings per gender.
print(df.Gender.describe())
plt.figure(figsize=(8, 5))
# Name the column via `x=`: newer seaborn releases read the first positional
# argument as `data`, so a positional Series is no longer plotted on the x axis.
sns.countplot(x='Gender', data=df, palette='winter');

There are more female animals in the dataset. 
The least frequent gender type is mixed.

In [None]:
# Listings per gender, split by adoption-speed class.
sns.catplot(data=df, x="Gender", hue="AdoptionSpeed", kind='count', palette='winter');

In [None]:
# Spread of the adoption-speed codes (cast to int) for each gender.
sns.boxplot(data=df, x="Gender", y=df['AdoptionSpeed'].astype(int), palette='winter');

The adoption speed of females and mixed group animals is similar (median=3), while males have better adoption speed (median=2). 

#### Breed1

Let's look at cat and dog breeds.

In [None]:
# Keep only cats and collect their 20 most common primary breeds.
cats = df.loc[df['Type'] == 'Cat']
top20_cat_breeds = list(
    cats['Breed1'].value_counts().sort_values(ascending=False).index[:20]
)

print(cats['Breed1'].describe())
plt.figure(figsize=(13, 10))
# Horizontal bars: one per breed, restricted to the top 20.
ax1 = sns.countplot(
    data=cats.loc[cats['Breed1'].isin(top20_cat_breeds)],
    y='Breed1',
    palette='winter',
)
ax1.set_ylabel('Cat breeds')
ax1.set_title('Top 20 cat breeds')
plt.show()

The most frequent cat breeds are all domestic breeds, Tabby, Siamese and Persian. 
Domestic Short Hair is the most frequent cat breed.

In [None]:
# Keep only dogs and collect their 20 most common primary breeds.
dogs = df.loc[df['Type'] == 'Dog']
top20_dog_breeds = list(
    dogs['Breed1'].value_counts().sort_values(ascending=False).index[:20]
)

print(dogs['Breed1'].describe())
plt.figure(figsize=(13, 10))
# Horizontal bars: one per breed, restricted to the top 20.
ax2 = sns.countplot(
    data=dogs.loc[dogs['Breed1'].isin(top20_dog_breeds)],
    y='Breed1',
    palette='winter',
)
ax2.set_ylabel('Dog breeds')
ax2.set_title('Top 20 dog breeds')

plt.show()

Mixed Breed is the most frequent dog breed.

#### PureBreed

In [None]:
# create the feature PureBreed
# Primary-breed labels that denote a generic, non-pedigree animal.
breedless_labels = ['Mixed Breed',
                    'Domestic Medium Hair',
                    'Domestic Long Hair',
                    'Domestic Short Hair']

# A pet is breedless when it has a secondary breed that differs from the
# primary one, or when its primary breed is one of the generic labels.
# Whole-column masks replace the former row-by-row apply with the same result.
has_other_second_breed = df['Breed2'].notna() & (df['Breed1'] != df['Breed2'])
has_generic_breed = df['Breed1'].isin(breedless_labels)

# NOTE(review): "Breedlees" looks like a typo for "Breedless"; the label is kept
# unchanged because other cells may compare against it - confirm before renaming.
df['PureBreed'] = np.where(has_other_second_breed | has_generic_breed,
                           "Breedlees", "Pure")

In [None]:
# Summarise the PureBreed feature, then count pets per type split by it.
print(df.PureBreed.describe())
plt.figure(figsize=(8, 5))
# Name the columns via `x=` / `hue=` with `data=df`: newer seaborn releases read
# the first positional argument as `data`, not as the x variable.
sns.countplot(x='Type', hue='PureBreed', data=df, palette='winter');

There are many more breedless animals among both dogs and cats.

In [None]:
plt.figure(figsize=(8, 5))

# Adoption-speed codes per pet type, split into pure-breed and breedless pets.
sns.boxplot(
    data=df,
    x="Type",
    y=df['AdoptionSpeed'].astype(int),
    hue='PureBreed',
    palette='winter',
);

Purity of breed does not really matter for cats, but it does for dogs. Pure breed dogs (median = 2) are adopted faster than breedless ones (median = 3).

#### Color1

In [None]:
# Number of pets for each primary colour (horizontal bars).
sns.countplot(data=df, y='Color1', palette='winter');

Most animals' primary color is black.

In [None]:
plt.figure(figsize=(8, 5))

# Spread of the adoption-speed codes (cast to int) for each primary colour.
sns.boxplot(data=df, x="Color1", y=df['AdoptionSpeed'].astype(int), palette='winter');

Cream, Gray, Golden and White colored animals are adopted faster. Those colors are rarer among animals, so dogs and cats of these colors attract more attention. Also, those rare colors are likely to be more frequent among pure breed animals.

#### Age

In [None]:
# Summary statistics and distribution of the age at listing (in months).
print(df['Age'].describe())
plt.figure(figsize=(10, 5))
# NOTE(review): distplot is deprecated in recent seaborn releases - confirm the
# installed version before replacing it.
sns.distplot(df.Age, kde=True);

75% of animals are aged 12 months or less. The distribution plot is hard to read, so I'll create age bins.

In [None]:
# Half-open age ranges in months, [low, high), mapped to readable labels.
age_bins = {
    (0, 6): '0 to 5 months',
    (6, 12): '6 to 11 months',
    (12, 36): '1 to 2 years',
    (36, 60): '3 to 4 years',
    (60, 96): '5 to 7 years',
    (96, np.inf): '8 and more years',
}

# Start with no label, then fill each range in turn.
df['Age_bins'] = None
for (low, high), label in age_bins.items():
    in_range = (df['Age'] >= low) & (df['Age'] < high)
    df.loc[in_range, 'Age_bins'] = label


In [None]:
fig, axs = plt.subplots(ncols=2, figsize=(17, 7))

# Left: listings per age bin. Right: adoption-speed codes per age bin.
ax1 = sns.countplot(data=df, x='Age_bins', palette='winter', ax=axs[0])
ax2 = sns.boxplot(data=df, x='Age_bins', y=df['AdoptionSpeed'].astype(int),
                  palette='winter', ax=axs[1])

# Rotate the tick labels on both panels.
for panel in (ax1, ax2):
    plt.sca(panel)
    plt.xticks(rotation=45)

Most animals are less than 6 months old. Kittens and puppies are adopted faster than adult animals.

#### MaturitySize

In [None]:
print(df['MaturitySize'].describe())
fig, axs = plt.subplots(ncols=2, figsize=(20, 7))
# Left: listings per size, split by pet type.
sns.countplot(data=df, x='MaturitySize', hue='Type', palette='winter', ax=axs[0])
# Right: adoption-speed codes per pet type, split by size.
sns.boxplot(data=df, x='Type', y=df['AdoptionSpeed'].astype(int), hue='MaturitySize',
            palette='winter', ax=axs[1])
plt.show()

Most dogs and cats are medium-sized. There are too few extra large animals to say whether they are adopted faster or slower. For cats size does not really matter. Small dogs are adopted faster than medium ones.

#### FurLength

In [None]:
print(df['FurLength'].describe())
fig, axs = plt.subplots(ncols=2, figsize=(20, 7))
# Left: listings per fur length, split by pet type.
sns.countplot(data=df, x='FurLength', hue='Type', palette='winter', ax=axs[0])
# Right: adoption-speed codes per pet type, split by fur length.
sns.boxplot(data=df, x='Type', y=df['AdoptionSpeed'].astype(int), hue='FurLength',
            palette='winter', ax=axs[1])
plt.show()

Most animals' fur is short or medium-length. Long fur is not as common as short or medium fur, so I doubt fur length is important.

#### Vaccinated, Dewormed, Sterilized

In [None]:
# Count, number of distinct values and most frequent value of 'Vaccinated'.
df['Vaccinated'].describe()

In [None]:
# Count, number of distinct values and most frequent value of 'Dewormed'.
df['Dewormed'].describe()

In [None]:
# Count, number of distinct values and most frequent value of 'Sterilized'.
df['Sterilized'].describe()

In [None]:
fig, axs = plt.subplots(ncols=2, nrows=3, figsize=(20, 20))
# One row per treatment: counts on the left, adoption-speed codes on the right.
for row, treatment in enumerate(['Vaccinated', 'Dewormed', 'Sterilized']):
    sns.countplot(data=df, x=treatment, palette='winter', ax=axs[row, 0])
    sns.boxplot(data=df, x=treatment, y=df['AdoptionSpeed'].astype(int),
                palette='winter', ax=axs[row, 1])

Looks like all these things make adoption period longer. Maybe there are some connections between age of the animal and those features - very young animals are not vaccinated, dewormed or sterilized. 

In [None]:
fig, axs = plt.subplots(ncols=1, nrows=3, figsize=(20, 20))
# One panel per treatment, with bars split by age bin.
for row, treatment in enumerate(['Vaccinated', 'Dewormed', 'Sterilized']):
    sns.countplot(data=df, x=treatment, hue='Age_bins', palette='winter', ax=axs[row])

Most animals less than 6 months old are not vaccinated or sterilized.

#### Health

In [None]:
print(df['Health'].describe())
fig, axs = plt.subplots(ncols=2, figsize=(20, 7))
# Left: listings per health condition. Right: adoption-speed codes per condition.
sns.countplot(data=df, x='Health', palette='winter', ax=axs[0])
sns.boxplot(data=df, x='Health', y=df['AdoptionSpeed'].astype(int),
            palette='winter', ax=axs[1])
plt.show()

Almost all animals are healthy (14473 out of 14988).

#### Quantity 

In [None]:
print(df['Quantity'].describe())
fig, axs = plt.subplots(ncols=2, figsize=(13, 5))
# Left: listings per group size. Right: adoption-speed codes per group size.
sns.countplot(data=df, x='Quantity', palette='winter', ax=axs[0])
sns.stripplot(data=df, x='Quantity', y=df.AdoptionSpeed.astype(int),
              palette='winter', ax=axs[1])
plt.show()

Large number of animals is likely to have worse AdoptionSpeed.

#### Fee

In [None]:
# Summary statistics, fee histogram and fee per adoption-speed class.
print(df.Fee.describe())
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(13, 5))
sns.distplot(df['Fee'], color='blue', kde=False, ax=axs[0]);
# Use the axes-level stripplot here. The figure-level catplot does not draw on a
# supplied `ax`; it opens its own figure, which left this panel empty and needed
# a hard-coded plt.close(2) to hide the extra figure.
sns.stripplot(x='AdoptionSpeed', y='Fee', data=df, palette='winter', ax=axs[1]);
plt.show()

Looks like there are 2 outliers; let's look at them and remove them.

In [None]:
# Index labels of the listings with a fee above 1500, followed by those rows.
fee_outliers_IDs = df.index[df['Fee'] > 1500].tolist()
df.loc[df.index.isin(fee_outliers_IDs)]

In [None]:
# Remove the fee outliers identified above (modifies df in place).
df.drop(index=fee_outliers_IDs, inplace=True)

Let's look at these plots one more time after outlier removal.

In [None]:
# Same fee summary and plots, now computed after the outliers were dropped.
print(df.Fee.describe())
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(13, 5))
sns.distplot(df['Fee'], color='blue', kde=False, ax=axs[0]);
# Use the axes-level stripplot here. The figure-level catplot does not draw on a
# supplied `ax`; it opens its own figure, which left this panel empty and needed
# a hard-coded plt.close(2) to hide the extra figure.
sns.stripplot(x='AdoptionSpeed', y='Fee', data=df, palette='winter', ax=axs[1]);
plt.show()

Nothing special.

#### State

In [None]:
# Print the summary explicitly: a bare expression that is not the last line of
# a cell produces no output.
print(df.State.describe())
plt.figure(figsize=(13, 5))
# Number of listings per state, with rotated tick labels.
ax1 = sns.countplot(x='State', palette='winter', data=df);
plt.xticks(rotation=45)
plt.show()

In [None]:
# Box plot of adoption speed for each state (states on the y axis).
sns.catplot(data=df, x='AdoptionSpeed', y='State', kind='box', palette='winter');

#### RescuerID

In [None]:
# Count, number of distinct rescuers and the most frequent RescuerID.
df['RescuerID'].describe()

Let's look at the number of rescued animals for each RescuerID.

In [None]:
# One row per rescuer with the number of listings they posted, most active first.
rescuers = (
    df['RescuerID']
    .value_counts()
    .rename_axis('RescuerID')
    .reset_index(name='Number of rescued animals')
)
rescuers.head()

In [None]:
# Distribution of the number of listings per rescuer.
rescuers.loc[:, 'Number of rescued animals'].describe()

At least 75% of rescuers saved 2 or fewer animals.

In [None]:
# Show the column names of the rescuers table; the next cell refers to them.
# NOTE(review): looks like a leftover inspection cell - confirm whether it is still needed.
rescuers.columns

In [None]:
# Attach to every listing the total number of listings posted by its rescuer.
df['RescuerNumber'] = df['RescuerID'].map(
    rescuers.set_index('RescuerID')['Number of rescued animals']
)

In [None]:
# Summary statistics of the per-listing rescuer activity feature.
df.RescuerNumber.describe()

In [None]:
# Histogram (30 bins, no density curve) of the rescuer activity feature.
sns.distplot(df.RescuerNumber, bins=30, kde=False);

#### VideoAmt and PhotoAmt

In [None]:
# Summary statistics of the number of uploaded videos.
print(df.VideoAmt.describe())

In [None]:
# Share of listings that have no video at all.
print("{0:.2%} of pets don't have videos".format(
    int((df['VideoAmt'] == 0).sum()) / float(len(df))))

In [None]:
# Summary statistics, then the number of listings per photo count.
print(df['PhotoAmt'].describe())
plt.figure(figsize=(10, 5))
# Pass the integer-cast photo counts via `x=`: newer seaborn releases read the
# first positional argument as `data`, not as the x variable.
sns.countplot(x=df['PhotoAmt'].astype(int), palette='winter_r');

Most animals have from 1 to 5 photos.