In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier, Pool

%matplotlib inline

In [2]:
# Load the training and test sets from the local data/ directory.
data = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Show (rows, columns) of both frames as a quick sanity check.
data.shape, test.shape

((26729, 10), (11456, 8))

In [3]:
# Preview the first five training rows.
data.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [4]:
def an_age(age):
    """Convert an age description such as '2 years' or '3 weeks' into days.

    Years count as 365 days, months as 30 days and weeks as 7 days. Any
    other unit is treated as already being expressed in days.

    Parameters
    ----------
    age : str or None
        Text of the form '<count> <unit>'. ``None``, NaN and blank strings
        are treated as an unknown age.

    Returns
    -------
    int
        The age in days, or 0 when the age is missing or blank.

    Raises
    ------
    ValueError
        If the text does not consist of exactly two whitespace-separated
        tokens, or the count is not an integer.
    """
    # Missing values reach us as None or as a float NaN (NaN is the only
    # float that is not equal to itself).
    if age is None or (isinstance(age, float) and age != age):
        return 0

    parts = age.split()
    if not parts:
        # Blank / whitespace-only text carries no age information.
        return 0

    n, period = parts
    # Prefix matching lets the singular and plural spellings share an entry.
    for prefix, days_per_unit in (('year', 365), ('month', 30), ('week', 7)):
        if period.startswith(prefix):
            return int(n) * days_per_unit
    # Anything else is assumed to be a count of days.
    return int(n)

In [5]:
# Replace the textual age column with a day count in both frames. Missing
# ages are filled with '0 years' first, which an_age turns into 0.
# NOTE: the column is overwritten in place, so this cell is meant to be run
# once per fresh load of the CSV files.
for frame in (data, test):
    frame['AgeuponOutcome'] = frame['AgeuponOutcome'].fillna('0 years').apply(an_age)

In [6]:
# Compare the number of distinct values per column between train and test.
# Columns that exist only in the test frame are printed by name alone; an
# explicit membership check is used so that genuine errors are not hidden.
for col in test.columns:
    if col in data.columns:
        print(col, data[col].nunique(), test[col].nunique())
    else:
        print(col)

ID
Name 6374 3712
DateTime 22918 10575
AnimalType 2 2
SexuponOutcome 5 5
AgeuponOutcome 43 44
Breed 1380 913
Color 366 277


In [7]:
# Build the feature frames: drop identifiers, timestamps and (for train) the
# outcome columns, then cast every remaining column to str so that all of
# them can be handed to CatBoost as categorical features.
train = data.drop(columns=['AnimalID', 'DateTime', 'OutcomeType', 'OutcomeSubtype']).astype(str)
ttest = test.drop(columns=['ID', 'DateTime']).astype(str)

In [8]:
# Preview the test feature frame after dropping ID/DateTime and casting to str.
ttest.head()

Unnamed: 0,Name,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,Summer,Dog,Intact Female,300,Labrador Retriever Mix,Red/White
1,Cheyenne,Dog,Spayed Female,730,German Shepherd/Siberian Husky,Black/Tan
2,Gus,Cat,Neutered Male,365,Domestic Shorthair Mix,Brown Tabby
3,Pongo,Dog,Intact Male,120,Collie Smooth Mix,Tricolor
4,Skooter,Dog,Neutered Male,730,Miniature Poodle Mix,White


In [9]:
# Integer class code for each outcome: the names are listed in the order of
# their codes, starting at 0.
tar_dict = {
    outcome: code
    for code, outcome in enumerate(
        ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
    )
}

In [10]:
# Encode every row's OutcomeType as its integer class code from tar_dict,
# as a plain list in the same row order as `data`.
target = list(data['OutcomeType'].map(tar_dict))

In [11]:
# Training pool: all six feature columns are declared categorical.
train_pool = Pool(
    data=train,
    label=target,
    cat_features=[0, 1, 2, 3, 4, 5],
)

In [12]:
# Unlabelled pool for prediction, with the same categorical column indices
# as the training pool.
test_pool = Pool(
    data=ttest,
    cat_features=[0, 1, 2, 3, 4, 5],
)

In [13]:
# Multi-class CatBoost model: 800 depth-5 trees with a small learning rate;
# training progress is logged every 20 iterations.
model = CatBoostClassifier(
    loss_function='MultiClass',
    iterations=800,
    learning_rate=0.01,
    depth=5,
    verbose=20,
)


In [14]:
# Train on the full training pool (no validation set is passed), then get
# the per-class probabilities for every test row.
model.fit(train_pool)
preds_proba = model.predict_proba(test_pool)

0:	learn: -1.5946406	total: 146ms	remaining: 1m 56s
20:	learn: -1.3800753	total: 1.91s	remaining: 1m 10s
40:	learn: -1.2480780	total: 3.69s	remaining: 1m 8s
60:	learn: -1.1589317	total: 5.46s	remaining: 1m 6s
80:	learn: -1.0954372	total: 7.18s	remaining: 1m 3s
100:	learn: -1.0489421	total: 8.94s	remaining: 1m 1s
120:	learn: -1.0143679	total: 10.7s	remaining: 59.8s
140:	learn: -0.9874245	total: 12.4s	remaining: 58.2s
160:	learn: -0.9595622	total: 14.3s	remaining: 56.6s
180:	learn: -0.9366832	total: 16.1s	remaining: 55.1s
200:	learn: -0.9197723	total: 17.9s	remaining: 53.4s
220:	learn: -0.9066656	total: 19.6s	remaining: 51.5s
240:	learn: -0.8964368	total: 21.4s	remaining: 49.7s
260:	learn: -0.8880980	total: 23.2s	remaining: 47.9s
280:	learn: -0.8813256	total: 25s	remaining: 46.2s
300:	learn: -0.8759227	total: 26.7s	remaining: 44.3s
320:	learn: -0.8717592	total: 28.5s	remaining: 42.5s
340:	learn: -0.8682268	total: 30.1s	remaining: 40.5s
360:	learn: -0.8650637	total: 31.9s	remaining: 38.7s

In [15]:
# Inspect the predicted probabilities: one row per test animal, one column
# per outcome class.
preds_proba

array([[0.08690766, 0.00661207, 0.05963382, 0.22292351, 0.62392294],
       [0.54455375, 0.00181051, 0.02529933, 0.27210443, 0.15623198],
       [0.45151007, 0.00295979, 0.02036527, 0.14301212, 0.38215275],
       ...,
       [0.00707252, 0.01489194, 0.11884986, 0.00853724, 0.85064845],
       [0.30868334, 0.0024527 , 0.04827139, 0.4920923 , 0.14850027],
       [0.03451474, 0.00387299, 0.18023376, 0.52515406, 0.25622445]])

In [16]:
# Load the sample submission to check the expected output layout.
sub = pd.read_csv('data/sample_submission.csv')

In [17]:
# Expected layout: an ID column followed by one column per outcome.
sub.head()

Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,1,1,0,0,0,0
1,2,1,0,0,0,0
2,3,1,0,0,0,0
3,4,1,0,0,0,0
4,5,1,0,0,0,0


In [18]:
# Row/column count of the sample submission, to compare with the test set.
sub.shape

(11456, 6)

In [19]:
# Preview the test frame (AgeuponOutcome is already in days at this point)
# to compare its ID column with the sample submission.
test.head()

Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,300,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,730,German Shepherd/Siberian Husky,Black/Tan
2,3,Gus,2016-01-13 12:20:00,Cat,Neutered Male,365,Domestic Shorthair Mix,Brown Tabby
3,4,Pongo,2013-12-28 18:12:00,Dog,Intact Male,120,Collie Smooth Mix,Tricolor
4,5,Skooter,2015-09-24 17:59:00,Dog,Neutered Male,730,Miniature Poodle Mix,White


In [20]:
# Wrap the probabilities in a frame indexed by the test set's own IDs, then
# turn that index into the leading column. preds_proba is row-aligned with
# `test`, so taking the IDs from it avoids a hard-coded row count that would
# silently break if the test file changed.
submission = pd.DataFrame(data=preds_proba, index=test['ID']).reset_index()

In [21]:
# Label the probability columns from the same mapping that encoded the
# training target, ordered by class code, so the column names cannot drift
# from the classes the model was trained on.
submission.columns = ['ID'] + sorted(tar_dict, key=tar_dict.get)

# Check the last rows: IDs should run up to the final test row.
submission.tail()

Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
11451,11452,0.904082,0.002416,0.00561,0.009035,0.078857
11452,11453,0.001607,0.016259,0.033404,0.00448,0.944251
11453,11454,0.007073,0.014892,0.11885,0.008537,0.850648
11454,11455,0.308683,0.002453,0.048271,0.492092,0.1485
11455,11456,0.034515,0.003873,0.180234,0.525154,0.256224


In [22]:
# Write the submission file; the DataFrame index is omitted because ID is
# already a regular column.
submission.to_csv('sub1.csv', index=False)