In [11]:
import pandas as pd
import re
import matplotlib as plt
import numpy as np
from datetime import *

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn import cross_validation
%matplotlib inline

from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline

In [12]:
# Read the dataset
train = pd.read_csv('../train.csv')
test = pd.read_csv('../test.csv')

dataset = train.drop(['AnimalID', 'OutcomeSubtype', 'OutcomeType'], axis=1)
dataset = dataset.append(test.drop('ID', axis=1), ignore_index=True)
print train.shape, test.shape, dataset.shape

(26729, 10) (11456, 8) (38185, 7)


In [13]:
dataset.head()

Unnamed: 0,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,Hambone,2014-02-12 18:22:00,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,Emily,2013-10-13 12:44:00,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,Pearce,2015-01-31 12:28:00,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,,2014-07-11 19:09:00,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,,2013-11-15 12:52:00,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [14]:
dataset.isnull().sum()

Name              10916
DateTime              0
AnimalType            0
SexuponOutcome        1
AgeuponOutcome       24
Breed                 0
Color                 0
dtype: int64

In [15]:
def calculate_age(x):
    if pd.isnull(x):
        return x
    num = int(x.split(' ')[0])
    if 'year' in x:
        return num * 365
    elif 'month' in x:
        return num * 30
    elif 'week' in x:
        return num * 7
    
def has_name(x):
    if pd.isnull(x):
        return 0
    return 1

def is_mix(x):
    if 'Mix' in x:
        return 1
    return 0

In [16]:
dataset['AgeuponOutcome'] = dataset['AgeuponOutcome'].apply(lambda x : calculate_age(x))
dataset['AgeuponOutcome'].fillna(dataset['AgeuponOutcome'].dropna().mean(), inplace=True)
#sns.distplot(dataset['AgeuponOutcome'])


# Since there is only one NA, I will assign it to maximum class
dataset['SexuponOutcome'].fillna('Neutered Male', inplace=True)
#sns.countplot(dataset['SexuponOutcome'])


# Does Animal has a name
dataset['HasName'] = dataset['Name'].apply(has_name)
#sns.countplot(dataset['HasName'])


# Is animal of mix breed?
dataset['IsMix'] = dataset['Breed'].apply(is_mix)
#sns.countplot(dataset['IsMix'])


# Break SexuponOutcome into two - Sterilized and Sex
sex = dataset['SexuponOutcome'].str.split(' ', expand=True)
dataset['Sterilized'] = sex[0]
dataset['Sterilized'].fillna('Unknown', inplace=True)
dataset['Sex'] = sex[1]
dataset['Sex'].fillna('Unknown', inplace=True)
#sns.countplot(dataset['Sex'])
#sns.countplot(dataset['Sterilized'])


dates = dataset['DateTime'].apply(lambda x : datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
dataset['Year'] = dates.apply(lambda x : x.year)
dataset['Month'] = dates.apply(lambda x : x.month)
dataset['Day'] = dates.apply(lambda x : x.weekday())
dataset['Hour'] = dates.apply(lambda x : x.hour)


dataset['Breed_New'] = dataset['Breed'].apply(lambda x: x.split(' Mix')[0])
breeds = dataset['Breed_New'].apply(lambda x : x.split('/'))
dataset['Breed_1'] = breeds.apply(lambda x : x[0])
# Instead of Breed_2, I will use Multiple_Breeds feature
#dataset['Breed_2'] = breeds.apply(lambda x : 'Unknown' if len(x) == 1 else x[1] )
dataset['Multiple_Breeds'] = dataset['Breed'].apply(lambda x : 1 if '/' in x else 0)


colors = dataset['Color'].apply(lambda x : x.split('/'))
dataset['Color_1'] = colors.apply(lambda x : x[0].split(' ')[0])
# Instead of Color_2, I will use Multiple_Colors feature
# dataset['Color_2'] = colors.apply(lambda x : x[1].split(' ')[0] if len(x) > 1 else 'None')
dataset['Multiple_Colors'] = dataset['Color'].apply(lambda x : 1 if '/' in x else 0)


# Encoding
enc = LabelEncoder()
dataset['Color_1'] = enc.fit_transform(dataset['Color_1'])
dataset['Breed_1'] = enc.fit_transform(dataset['Breed_1'])


# Dummy Columns
dummy_columns = ['Sterilized', 'Sex', 'AnimalType']
dataset = pd.get_dummies(dataset, columns=dummy_columns)


# Drop unnecessary columns
drop_columns = ['Name', 'DateTime', 'SexuponOutcome', 'Breed', 'Color', 'Breed_New']
dataset = dataset.drop(drop_columns, axis=1)


#Make breed count table
#breed_count = result['Breed'].value_counts()
#frequent_breeds = breed_count[breed_count > 50].to_dict().keys()
#result['Breed'][~result['Breed'].isin(frequent_breeds)] = 'Rare'

In [17]:
print train.shape, test.shape, dataset.shape

(26729, 10) (11456, 8) (38185, 20)


In [18]:
train_x = dataset.loc[0:26728,]

enc = LabelEncoder()
train_y = enc.fit_transform(train['OutcomeType'])
train_y = pd.DataFrame(train_y)

test_x = dataset.loc[26729:38185,]

In [19]:
train_x.head()

Unnamed: 0,AgeuponOutcome,HasName,IsMix,Year,Month,Day,Hour,Breed_1,Multiple_Breeds,Color_1,Multiple_Colors,Sterilized_Intact,Sterilized_Neutered,Sterilized_Spayed,Sterilized_Unknown,Sex_Female,Sex_Male,Sex_Unknown,AnimalType_Cat,AnimalType_Dog
0,365.0,1,1,2014,2,2,18,191,0,4,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,365.0,1,1,2013,10,6,12,85,0,8,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,730.0,1,1,2015,1,5,12,168,0,3,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,21.0,0,1,2014,7,4,19,85,0,3,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,730.0,0,0,2013,11,4,12,134,1,23,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [20]:
# Cross Validation
X_train, X_test, y_train, y_test = cross_validation.train_test_split(train_x.values, train_y[0].values,
                                                                     test_size=0.3, random_state=0)


def best_params(train_x, train_y):
    rfc = RandomForestClassifier()
    param_grid = { 
        'n_estimators': [50, 400],
        'max_features': ['auto', 'sqrt', 'log2']
    }
    
    CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
    CV_rfc.fit(train_x, train_y)
    return CV_rfc.best_params_

print best_params(train_x.values, train_y[0].values)

{'max_features': 'log2', 'n_estimators': 400}


In [23]:
# RandomForest Classifier 

rf = RandomForestClassifier(n_estimators=400, max_features='log2').fit(X_train, y_train)
print 'Cross Validation for RandomForestClassifier'
print rf.score(X_test, y_test)

prediction = pd.DataFrame(rf.predict_proba(test_x.values))
prediction.columns = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
prediction = pd.concat([test['ID'], prediction], axis=1)
prediction.to_csv('randomforest.csv', index=False)

#Public LeaderBoard Score - 0.81316
prediction.head()

Cross Validation for RandomForestClassifier
0.670781893004


Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,1,0.035,0.0,0.155,0.165,0.645
1,2,0.8625,0.0,0.005,0.08,0.0525
2,3,0.5825,0.005,0.0025,0.0825,0.3275
3,4,0.175,0.005,0.04,0.1975,0.5825
4,5,0.5425,0.0,0.0025,0.4225,0.0325


In [24]:
# XGBClassifier 

xgboost = XGBClassifier(learning_rate =0.05,
 n_estimators=500,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'multi:softprob',
 nthread=4,
 scale_pos_weight=1,
 seed=27).fit(X_train, y_train)

print 'Cross Validation for XGBClassifier'
print xgboost.score(X_test, y_test)

prediction = pd.DataFrame(xgboost.predict_proba(test_x.values))
prediction.columns = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
prediction = pd.concat([test['ID'], prediction], axis=1)
prediction.to_csv('xgbclassifier.csv', index=False)

#Public LeaderBoard Score - 0.74264
prediction.head()

Cross Validation for XGBClassifier
0.689736874922


Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,1,0.025499,0.000899,0.029093,0.1083,0.836208
1,2,0.834436,0.00046,0.008206,0.122327,0.034571
2,3,0.398541,0.001761,0.006869,0.14515,0.447679
3,4,0.276409,0.001159,0.01621,0.152524,0.553698
4,5,0.408119,0.000337,0.005728,0.487298,0.098518
