In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Running this cell (click Run or press Shift+Enter) lists the files in that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

This competition asks us to predict the adoptability of pets from their "cuteness" and other attributes. The available datasets include text, tabular data and images, so it is a really good competition for working with various types of data. 
First we will do an EDA and then move on to modeling.


In [None]:
import numpy as np 
import pandas as pd 
import os
import json
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import lightgbm as lgb
import xgboost as xgb
import time
import datetime
from PIL import Image
from wordcloud import WordCloud
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import gc
from catboost import CatBoostClassifier
from tqdm import tqdm_notebook
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import random
import warnings
warnings.filterwarnings("ignore")
from functools import partial
# Widen pandas' display limits so wide frames and long text columns are shown in full.
# The option names are fully qualified: bare names such as 'max_columns' are treated
# as patterns and are rejected as ambiguous when they match more than one option.
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)
import os
import scipy as sp
from math import sqrt
from collections import Counter
from sklearn.metrics import confusion_matrix as sk_cmatrix

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer
from sklearn.ensemble import RandomForestClassifier
import langdetect
import eli5
from IPython.display import display 

from sklearn.metrics import cohen_kappa_score
def kappa(y_true, y_pred):
    """Return Cohen's kappa between two label sequences using quadratic weights.

    Thin wrapper around ``cohen_kappa_score`` called with ``weights='quadratic'``.
    """
    score = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return score

In [None]:
# Read the datasets.
# Label files for breeds, colors and states first ...
breed, color, state = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/breed_labels.csv',
        '../input/color_labels.csv',
        '../input/state_labels.csv',
    )
)

# ... then the train and test tables and the sample submission.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/train/train.csv',
        '../input/test/test.csv',
        '../input/test/sample_submission.csv',
    )
)




In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Tag each frame with its origin so the rows can still be told apart once the two
# frames are stacked into a single one for exploration.
train['dataset_type'] = 'train'
test['dataset_type'] = 'test'
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# row labels of train and test are both kept and therefore repeat.
all_data = pd.concat([train, test], ignore_index=True)

In [None]:
# Preview the first rows of the combined train + test frame.
all_data.head()

In [None]:
# Same preview without the Description column (the result is only displayed, not stored).
all_data.drop(columns='Description').head()

In [None]:
# Column names, dtypes and non-null counts of the training data.
train.info()

Target variable is the AdoptionSpeed that is coded as:
0 - Pet was adopted on the same day as it was listed.
1 - Pet was adopted between 1 and 7 days (1st week) after being listed.
2 - Pet was adopted between 8 and 30 days (1st month) after being listed.
3 - Pet was adopted between 31 and 90 days (2nd & 3rd month) after being listed.
4 - No adoption after 100 days of being listed. (There are no pets in this dataset that waited between 90 and 100 days).

In [None]:
# Take a look at the number of pets in each adoption speed class.
# value_counts() ignores missing values by default.
# `kind` is passed by keyword: recent pandas versions reject positional
# arguments to Series.plot().
all_data['AdoptionSpeed'].value_counts().sort_index().plot(kind='bar', color='green');
plt.title('AdoptionSpeed counts');

It is really sad that very few of them get adopted fast and a lot of them do not get adopted at all.

We know that in the dataset dogs are coded as 1 and cats as 2, so we can see how many there are of each.

In [None]:
# Turn the numeric Type codes into readable labels for the plots (1 -> Dog, 2 -> Cat).
# replace() only touches the listed codes, so re-running this cell leaves the
# already-converted labels alone instead of turning every row into 'Cat'.
all_data['Type'] = all_data['Type'].replace({1: 'Dog', 2: 'Cat'})
sns.countplot(x='dataset_type', data=all_data, hue='Type');
plt.title('Number of cats and dogs in train and test data');

In [None]:
# Adoption speed distribution, split by pet type.
(
    sns.countplot(data=all_data, x='AdoptionSpeed', hue='Type')
    .set_title('Number of cats and dogs within each AdoptionSpeed Class')
);

We can say that it seems like cats tend to be adopted earlier than dogs. 

In [None]:
# Adoption speed distribution with one bar per raw Age value.
age_speed_ax = sns.countplot(data=all_data, hue='Age', x='AdoptionSpeed')
age_speed_ax.set_title('Age within each AdoptionSpeed Class');

Obviously, there is something wrong with the age field. No cat or dog will ever be 255 years old, so we will dismiss the entries over 25 to get rid of inaccurate results. (Note: this reads Age as years; it is worth checking the competition's data description for the actual unit before relying on this cut-off.) Besides, there are too many distinct values, which makes the analysis less clear. So we will group the age values to make the trends easier to see. We can change this back for the modeling part, since we do not want to lose this information.

In [None]:
# Keep only rows with Age below 26. This filter is intentionally permanent, because
# larger values are treated as entries that cannot be true in this analysis.
# NOTE(review): the competition's data dictionary reportedly gives Age in months --
# confirm, because in that case this cut-off removes many valid pets.
# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells act on a real frame rather than on a flagged slice.
all_data = all_data.loc[all_data['Age'] < 26].copy()

In [None]:
def _age_group(age):
    """Return the display label of the age bracket that `age` falls into."""
    if age > 10:
        return '>10'
    if age > 7:
        return '8-10'
    if age > 4:
        return '5-7'
    if age > 1:
        return '2-4'
    return 'Young'


# Work on a real copy: a plain `all_data_toy = all_data` only creates a second name
# for the same frame, so binning Age would overwrite all_data['Age'] as well.
# The binned column is meant for clearer visualizations only, not as a permanent change.
all_data_toy = all_data.copy()
all_data_toy['Age'] = all_data_toy['Age'].apply(_age_group)

In [None]:
# Number of pets in each age group, ordered by group label.
# `kind` is passed by keyword: recent pandas versions reject positional
# arguments to Series.plot().
all_data_toy['Age'].value_counts().sort_index().plot(kind='bar', color='green');
plt.title('Age Class Counts');

We can see that the majority of pets that are up for adoption belong to the 2-4 age group. We can also see this when cats and dogs are looked at separately.

In [None]:
# Number of cats and dogs inside every age group.
(
    sns.countplot(data=all_data_toy, x='Age', hue='Type')
    .set_title('Cat and Dog counts for age groups')
);

Dogs live longer than cats on average. This would explain why they are put up for adoption at later ages, such as over 10 years. It is interesting to see that even though there are fewer cats in total, in the 5-7 age group they surpass the dogs.

In [None]:
# Adoption speed distribution, split by the grouped Age labels.
sns.countplot(hue='Age', x='AdoptionSpeed', data=all_data_toy)
plt.gca().set_title('Age within each AdoptionSpeed Class');

What we need to look at in the graph above is how the counts of the different age groups change between adoption speeds; the total numbers alone would give a misleading picture.
For example, we can see that the 2-4 age group has the highest counts in all adoption speed groups except the last one, i.e. the pets that do not get adopted. In that group the >10 age group is the highest. This does not contradict what we know about adoption, since it is harder for older animals to get adopted.

Can a name affect adoption speed? 
People usually just rename the pet if it is young enough anyway, so logically it should not have any effect. But let's see...
People working in pet adoption tend to give the name 'Lucky' to change the luck of the pet, and it is quite a common name. Maybe we can compare the adoption rate of the 'Lucky' ones with that of all other pets.

In [None]:
# Collapse Name into two groups: the exact name 'Lucky' versus everything else.
all_data['Name'] = all_data['Name'].where(all_data['Name'] == 'Lucky', 'Others')

In [None]:

# Compare the normalized AdoptionSpeed distribution of pets named 'Lucky' (left)
# with that of all other pets (right).
lucky = all_data.loc[all_data['Name'] == 'Lucky']
others = all_data.loc[all_data['Name'] == 'Others']

# `kind` is passed by keyword: recent pandas versions reject positional
# arguments to Series.plot().
plt.subplot(1, 2, 1)
lucky['AdoptionSpeed'].value_counts(normalize=True).sort_index().plot(kind='bar', color='pink');
plt.title('Lucky');
plt.subplot(1, 2, 2)
others['AdoptionSpeed'].value_counts(normalize=True).sort_index().plot(kind='bar', color='blue');
plt.title('Others');
plt.show()


Wow! 
It seems like having the name 'Lucky' really makes a difference in the short term, but not in the long term.
We can observe that about 8% of 'Lucky' pets get adopted on the first day, while that number is 3% for the others. However, more pets that are not named Lucky get adopted in the first week, making the combined share of these two adoption speed groups about the same for both name categories. 
On a positive but not necessarily statistically significant note, the percentage of pets not getting adopted at all is slightly lower when they are named 'Lucky'. 
We can also see this in a table:

 

In [None]:
# Flag pets named 'Lucky' (1) versus all others (0) in a new column, then cross-tabulate
# the flag with AdoptionSpeed; normalize='index' turns every row into shares.
all_data['isLucky'] = (all_data['Name'] == 'Lucky').astype(int)
lucky_vs_speed = pd.crosstab(all_data['isLucky'], all_data['AdoptionSpeed'], normalize='index')
sns.heatmap(lucky_vs_speed, cmap="YlGnBu", annot=True, cbar=False)

Now, since I believe that it is most of the time an important factor, I will analyse the different breeds of pets and how that affects the adoption speed. 
I should add that I find it quite sad it should have any effect at all. 


In [None]:
# The train and test datasets only contain breed id numbers, so information about the
# breeds has to come from the breed_labels.csv file.
# Inspect that table first to decide how best to merge the files.
breed.info()

In [None]:
# BreedID here matches the Breed1 and Breed2 columns of the main dataset.
breed.head()

In [None]:
# Check whether the breed name is also needed for the second breed (Breed2).
# It looks like most pets do not have a second breed, so it is not used for the name.
# It might still be useful for determining whether a pet is a purebreed.
all_data['Breed2'].value_counts().head()

In [None]:
# Attach the breed name that belongs to each row's Breed1 id.
# A left join keeps every row of all_data, whether or not its Breed1 has a match.
all_data_breed = all_data.merge(
    breed[['BreedID', 'BreedName']],
    how='left',
    left_on='Breed1',
    right_on='BreedID',
)

In [None]:
# Now we have BreedName as another variable and can use it for analysis.
# drop() returns a new frame, so the result has to be assigned back for the BreedID
# join key to actually be removed; errors='ignore' keeps the cell re-runnable.
all_data_breed = all_data_breed.drop(columns='BreedID', errors='ignore')
all_data_breed.drop(columns='Description').head()

In [None]:
#Here we can see thet most common breeds
all_data_breed['BreedName'].value_counts().head(15)

Now I will examine the maturity size of the dogs and which breeds they mostly belong to. I believe this is less important for cats, since cats usually do not differ as much as dogs across breeds. 
Here I also want to take a look at images of the dogs to compare them with the description of their breed.

In [None]:

# For every maturity size, show one picture for each of the five most common dog breeds.

# PetIDs that have at least one file in train_images (file names look like "<PetID>-<n>.jpg").
images = [file_name.split('-')[0] for file_name in os.listdir('../input/train_images/')]
size_dict = {1: 'Small', 2: 'Medium', 3: 'Large', 4: 'Extra Large'}

for m in all_data_breed['MaturitySize'].unique():
    df = all_data_breed.loc[(all_data_breed['Type'] == 'Dog') & (all_data_breed['MaturitySize'] == m)]
    top_breeds = list(df['BreedName'].value_counts().index)[:5]
    # Fall back to the raw code for sizes that have no label in size_dict.
    size_label = size_dict.get(m, m)
    print(f"Most common Breeds of {size_label} dogs:")

    fig = plt.figure(figsize=(25, 4))

    # The loop variable is deliberately not called `breed`: that name is the
    # breed-label DataFrame read from breed_labels.csv, and reusing it here would
    # overwrite the table for every cell that runs afterwards.
    for i, breed_name in enumerate(top_breeds):
        # excluding pets without pictures
        b_df = df.loc[(df['BreedName'] == breed_name) & (df['PetID'].isin(images)), 'PetID']
        if b_df.empty:
            # No pet of this breed has a picture; leave this slot of the figure empty.
            continue
        # Use the second matching pet when there is one, otherwise the only one.
        pet_id = b_df.values[1] if len(b_df) > 1 else b_df.values[0]
        ax = fig.add_subplot(1, 5, i+1, xticks=[], yticks=[])

        # Close the image file once its pixels have been handed to matplotlib.
        with Image.open("../input/train_images/" + pet_id + '-1.jpg') as im:
            ax.imshow(im)
        ax.set_title(f'Breed: {breed_name}')
    plt.show();

In [None]:
# Adoption speed distribution, split by MaturitySize, on a 10x8 inch figure.
ax = plt.subplots(figsize=(10, 8))[1]
sns.countplot(data=all_data_toy, x='AdoptionSpeed', hue='MaturitySize', ax=ax);
ax.set_title('Maturity Size within each AdoptionSpeed Class');


In [None]:
# Cross-tabulate MaturitySize (rows) against AdoptionSpeed and Type (columns), with a
# 'Total' margin and normalize='index', and draw the table as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 10))
size_speed_type = pd.crosstab(
    all_data.MaturitySize,
    [all_data.AdoptionSpeed, all_data.Type],
    rownames=['MaturitySize'],
    colnames=['AdoptionSpeed', 'Type'],
    normalize='index',
    margins=True,
    margins_name='Total',
)
sns.heatmap(size_speed_type, cmap="YlGnBu", annot=True, cbar=True, ax=ax)
# Note that there are very few extra large animals, so the figures for them are mostly noise.
# From this we can see that small or large animals have a better chance than medium ones.
# This might be because most people specifically want either a small animal or a large one,
# which serve different purposes, while a medium one fits neither purpose.
# We can also observe that size makes things harder for dogs than for cats.

In [None]:
# One count panel per pet Type: AdoptionSpeed on the x-axis, bars split by MaturitySize.
g = sns.catplot(
    data=all_data,
    kind="count",
    x="AdoptionSpeed",
    hue="MaturitySize",
    col="Type",
    height=6,
    aspect=0.7,
);