# Score: 11.32097

### Imports

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

### Load data

In [2]:
dateparse = lambda x: pd.datetime.strptime(x, "%Y-%m-%d %H:%M:%S")

train = pd.read_csv("../../data/shelter/train.csv", parse_dates=['DateTime'], date_parser=dateparse)
test  = pd.read_csv("../../data/shelter/test.csv", parse_dates=['DateTime'], date_parser=dateparse)

print("train shape:", train.shape)
print("test shape:", test.shape)

print("Earliest train time:", min(train.DateTime), "- Latest train time:", max(train.DateTime))
print("Earliest test time:",  min(test.DateTime),  "- Latest test time:",  max(test.DateTime))

train shape: (26729, 10)
test shape: (11456, 8)
Earliest train time: 2013-10-01 09:31:00 - Latest train time: 2016-02-21 19:17:00
Earliest test time: 2013-10-01 10:44:00 - Latest test time: 2016-02-21 18:37:00


### Drop row with missing Sex value

In [3]:
train = train.drop(train[train.SexuponOutcome.isnull()].index)

### Merge train and test data

In [4]:
full = pd.concat([train, test])
train_N = len(train)

### Time

In [5]:
full["weekday"] = full.DateTime.map(lambda x: x.weekday())
full["year"]    = full.DateTime.map(lambda x: x.year)
full["month"]   = full.DateTime.map(lambda x: x.month)
full["hour"]    = full.DateTime.map(lambda x: x.hour)

full["is_weekend"] = full.weekday.map(lambda x: int(x in [5, 6]))

### Create column: time of day

In [6]:
full["night"]   = full.hour.map(lambda x: int(x > 5  and x < 11))
full["morning"] = full.hour.map(lambda x: int(x > 10 and x < 16))
full["midday"]  = full.hour.map(lambda x: int(x > 15 and x < 20))
full["lateday"] = full.hour.map(lambda x: int(x > 19 or  x < 6))

### Create columns sex and is_netured

In [7]:
def get_neut(x):
    if x is np.nan:
        return np.nan
    elif x == "Unknown":
        return 2
    return int("Neutered" in x or "Sprayed" in x)

def get_sex(x):
    if x is np.nan:
        return np.nan
    elif x == "Unknown":
        return 2
    return int("Male" in x)

full["Sex"] = full.SexuponOutcome.map(get_sex)
full["isNetured"] = full.SexuponOutcome.map(get_neut)

full = full.drop(["SexuponOutcome"], axis=1)

### AgeuponOutcome

In [8]:
def format_age(x):
    if x is np.nan:
        return None
    
    num, scale = x.split(" ")
    if scale in ["day", "days"]:
        return int(num)
    elif scale in ["week", "weeks"]:
        return 7 * int(num)
    elif scale in ["month", "months"]:
        return 30 * int(num)
    elif scale in ["year", "years"]:
        return 365 * int(num)

# This may seem like double variables, but people are weird and may think 100 weeks is a lot younger than 2 years
def human_age(x, timescale):
    if x is np.nan:
        return 0
    num, scale = x.split(" ")
    if scale in [timescale, timescale+"s"]:
        return int(num)
    return 0

full["age_year"]  = full.AgeuponOutcome.map(lambda x: human_age(x, "year"))
full["age_month"] = full.AgeuponOutcome.map(lambda x: human_age(x, "month"))
full["age_week"]  = full.AgeuponOutcome.map(lambda x: human_age(x, "week"))
full["age_day"]   = full.AgeuponOutcome.map(lambda x: human_age(x, "day"))
full.AgeuponOutcome = full.AgeuponOutcome.map(format_age)

### Fill in AgeuponOutcome values

In [9]:
full.AgeuponOutcome.fillna(
    full.groupby("Breed")["AgeuponOutcome"].transform("median"), inplace=True)

### Create column: is_baby

In [10]:
full["is_baby"] = full.AgeuponOutcome.map(lambda x: int(x < 365))

### AnimalType mapping

In [11]:
full.AnimalType = full.AnimalType.map({"Dog": 0, "Cat": 1})

# Breed

### Hair type and isMix

Looks at the breed and determines whether the breed is a mix or determine the length of its hair

In [12]:
shorthair_map  = lambda x: 1 if "Shorthair" in x else 0
mediumhair_map = lambda x: 1 if "Medium Hair" in x else 0
longhair_map   = lambda x: 1 if "Longhair" in x else 0
mix_map        = lambda x: 1 if "Mix" in x else 0

full["Shorthair"]  = full.Breed.map(shorthair_map)
full["Mediumhair"] = full.Breed.map(mediumhair_map)
full["Longhair"]   = full.Breed.map(longhair_map)
full["Mix"]        = full.Breed.map(mix_map)

### Create column first breed

In [13]:
full["first_breed"] = full.Breed.map(lambda x: x.split("/")[0].replace(" Mix", ""))

### Breed mapping

In [14]:
def get_col_map(df, col, cutoff):
    val_counts = df[col].value_counts()
    ret = dict()
    counter = 0
    
    for index in val_counts.index:
        count = val_counts[index]
        if count > cutoff:
            ret[index] = counter
            counter += 1
        else:
            ret[index] = counter
    return ret

breed_map = get_col_map(full, "Breed", 300)
first_breed_map = get_col_map(full, "first_breed", 300)

full.Breed       = full.Breed.map(breed_map)
full.first_breed = full.first_breed.map(first_breed_map)

# Color

### Create column: Individual colors

In [15]:
blue_map  = lambda x: 1 if "Blue"      in x else 0
black_map = lambda x: 1 if "White"     in x else 0
white_map = lambda x: 1 if "White"     in x else 0
brown_map = lambda x: 1 if "Brown"     in x else 0
tabby_map = lambda x: 1 if "Tabby"     in x else 0
tan_map   = lambda x: 1 if "Tan"       in x else 0
red_map   = lambda x: 1 if "Red"       in x else 0
choc_map  = lambda x: 1 if "Chocolate" in x else 0


full["Blue"]  = full.Color.map(blue_map)
full["Black"] = full.Color.map(black_map)
full["White"] = full.Color.map(white_map)
full["Brown"] = full.Color.map(brown_map)
full["Tabby"] = full.Color.map(tabby_map)
full["Tan"]   = full.Color.map(tan_map)
full["Red"]   = full.Color.map(red_map)
full["Choc"]  = full.Color.map(choc_map)

### Most popular colors are given a numeric value

In [16]:
color_map = get_col_map(full, "Color", 300)

full.Color = full.Color.map(color_map)

### Name

I'm wondering if a missing name is indicative of anything?  Maybe the name is missing because the dog was transfered quickly or spent a little time at the shelter.  Also some outcome's may rely on having documentation of the name

In [17]:
name_map = lambda x: 0 if x is np.nan else 1

full.Name = full.Name.map(name_map)

### Split full back into train and test

In [18]:
train = full[:train_N]
test = full[train_N:]
full.head()

Unnamed: 0,AgeuponOutcome,AnimalID,AnimalType,Breed,Color,DateTime,ID,Name,OutcomeSubtype,OutcomeType,...,Mix,first_breed,Blue,Black,White,Brown,Tabby,Tan,Red,Choc
0,365.0,A671945,0,13,5,2014-02-12 18:22:00,,1,,Return_to_owner,...,1,20,0,1,1,1,0,0,0,0
1,365.0,A656520,1,0,30,2013-10-13 12:44:00,,1,Suffering,Euthanasia,...,1,0,0,0,0,0,1,0,0,0
2,730.0,A686464,0,1,9,2015-01-31 12:28:00,,1,Foster,Adoption,...,1,1,1,1,1,0,0,0,0,0
3,21.0,A683430,1,0,31,2014-07-11 19:09:00,,0,Partner,Transfer,...,1,0,1,0,0,0,0,0,0,0
4,730.0,A667013,0,13,12,2013-11-15 12:52:00,,0,Partner,Transfer,...,0,20,0,0,0,0,0,1,0,0


### Separate target from predictors

In [19]:
train_y = train.OutcomeType.map({"Adoption": 0, "Transfer": 1, "Return_to_owner": 2, "Euthanasia": 3, "Died": 4})
train_x = train.drop(["DateTime", "OutcomeType", "OutcomeSubtype", "AnimalID", "ID"], axis=1)

test_x = test.drop(["DateTime", "OutcomeType", "OutcomeSubtype", "AnimalID", "ID"], axis=1)

## Construct the model

Let's try out a few models and see which works best through cross validation

In [20]:
# Massive amounts of model imports
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, StratifiedKFold, learning_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

kfold = StratifiedKFold()

def score_model(model):
    score = cross_val_score(model, train_x, train_y, cv=kfold, n_jobs=1, scoring="accuracy")
    print(score)
    print("\nAverage is ...")
    print(sum(score) / len(score))

 ---

## Extra Trees Classifier

### Parameter Search

(open Markdown here to see param search history)

<div hidden>

#### Attempt 1:

Tried:

{"max_depth": [None],
  "max_features": [1, 3, 10],
  "min_samples_split": [2, 3, 10],
  "min_samples_leaf": [1, 3, 10],
  "bootstrap": [False],
  "n_estimators" :[100,300],
  "criterion": ["gini"]}

Best:

{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 10,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'n_estimators': 300}
 
#### Attempt 2:
 
Tried:

ex_param_grid = {
                  "max_depth": [2, 4],
                  "max_features": [10, 14],
                  "min_samples_split": [8, 10],
                  "min_samples_leaf": [3, 8, 10],
                  "n_estimators" :[80, 100]
                }

ExtC = ExtraTreesClassifier(max_depth=None, criterion="gini", bootstrap=False,)

Best:

{'max_depth': 4, 'max_features': 14, 'min_samples_leaf': 8, 'min_samples_split': 10, 'n_estimators': 80}

#### Attempt 3:

Tried:

ex_param_grid = {
                  "max_depth": [None, 4],
                  "max_features": [14, 16],
                  "min_samples_split": [10, 12],
                  "min_samples_leaf": [7, 8],
                  "n_estimators" :[80, 90]
                }

ExtC = ExtraTreesClassifier(max_depth=None, criterion="gini", bootstrap=False,)

Best:

{'max_depth': None, 'max_features': 14, 'min_samples_leaf': 8, 'min_samples_split': 10, 'n_estimators': 80}

</div>

In [22]:
## Search grid for optimal parameters
ex_param_grid = {
                  "max_depth": [None, 4],
                  "max_features": [14, 16],
                  "min_samples_split": [10, 12],
                  "min_samples_leaf": [7, 8],
                  "n_estimators" :[80, 90]
                }

ExtC = ExtraTreesClassifier(max_depth=None, criterion="gini", bootstrap=False,)

gsExtC = GridSearchCV(ExtC, param_grid=ex_param_grid, cv=kfold, scoring="accuracy", verbose=1)

gsExtC.fit(train_x, train_y)

# Best score
print(gsExtC.best_score_)
print(gsExtC.best_params_)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed:  3.3min finished


0.653247530679
{'max_depth': None, 'max_features': 14, 'min_samples_leaf': 8, 'min_samples_split': 10, 'n_estimators': 80}


### Quick setup

In [31]:
# etc_model = ExtraTreesClassifier(
#                                     bootstrap=False,
#                                     criterion="gini",
#                                     max_depth=None,
#                                     max_features=14,
#                                     min_samples_leaf=8,
#                                     min_samples_split=10,
#                                     n_estimators=80
#                                 )
# etc_model.fit(train_x, train_y)
# score_model(etc_model)

[ 0.65536977  0.64769921  0.64679466]

Average is ...
0.649954545986


 ---

## Random Forest Classifier

### Parameter Search

(open Markdown here to see param search history)

<div hidden>

#### Attempt 1:

Tried:

{"max_depth": [None],
  "max_features": [1, 3, 10],
  "min_samples_split": [2, 3, 10],
  "min_samples_leaf": [1, 3, 10],
  "bootstrap": [False],
  "n_estimators" :[100,300],
  "criterion": ["gini"]}

Best:

{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 3,
 'min_samples_leaf': 10,
 'min_samples_split': 10,
 'n_estimators': 300}
 
#### Attempt 2:

Tried:

rf_param_grid = {
                    "max_depth": [None, 1, 2],
                    "max_features": [3, 5],
                    "min_samples_split": [8, 10],
                    "min_samples_leaf": [8, 10],
                    "n_estimators" :[100,300],
                }

RFC = RandomForestClassifier(bootstrap=False, criterion="gini")

Best:

{'max_depth': None, 'max_features': 5, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 300}

#### Attempt 3:

Tried:

rf_param_grid = {
                    "max_features": [5, 7],
                    "min_samples_split": [6, 8],
                    "min_samples_leaf": [6, 8],
                    "n_estimators" :[200, 300, 350],
                }

RFC = RandomForestClassifier(bootstrap=False, criterion="gini", max_depth=None)

Best:

{'max_features': 7, 'min_samples_leaf': 6, 'min_samples_split': 8, 'n_estimators': 350}

</div>

In [23]:
## Search grid for optimal parameters
rf_param_grid = {
                    "max_features": [5, 7],
                    "min_samples_split": [6, 8],
                    "min_samples_leaf": [6, 8],
                    "n_estimators" :[200, 300, 350],
                }

RFC = RandomForestClassifier(bootstrap=False, criterion="gini", max_depth=None)

gsRFC = GridSearchCV(RFC, param_grid=rf_param_grid, cv=kfold, scoring="accuracy", verbose=1)

gsRFC.fit(train_x, train_y)

# Best score
print(gsRFC.best_score_)
print(gsRFC.best_params_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed: 10.0min finished


0.6528733912
{'max_features': 7, 'min_samples_leaf': 6, 'min_samples_split': 8, 'n_estimators': 350}


### Quick setup

In [33]:
# rfc_model = RandomForestClassifier(
#                                     bootstrap=False,
#                                     criterion="gini",
#                                     max_depth=None,
#                                     max_features=5,
#                                     min_samples_leaf=8,
#                                     min_samples_split=8,
#                                     n_estimators=300
#                                   )
# rfc_model.fit(train_x, train_y)
# score_model(rfc_model)

[ 0.64728987  0.64320988  0.64264062]

Average is ...
0.644380120913


---

## Gradient Boost Classifier

### Parameter Search

(open Markdown here to see param search history)

<div hidden>

#### Attempt 1:

Tried:

{'loss' : ["deviance"],
  'n_estimators' : [100,200,300],
  'learning_rate': [0.1, 0.05, 0.01],
  'max_depth': [4, 8],
  'min_samples_leaf': [100,150],
  'max_features': [0.3, 0.1] 
  }

Best:

{'learning_rate': 0.05,
 'loss': 'deviance',
 'max_depth': 8,
 'max_features': 0.3,
 'min_samples_leaf': 100,
 'n_estimators': 100}
 
#### Attempt 2:

Tried:
 
gb_param_grid = {
                    'n_estimators' : [100, 120],
                    'max_depth': [8, 10],
                    'min_samples_leaf': [80, 100],
                    'max_features': [8, 10]
                }

GBC = GradientBoostingClassifier(loss="deviance", learning_rate=0.04)

Best:

{'max_depth': 10, 'max_features': 10, 'min_samples_leaf': 80, 'n_estimators': 120}

#### Attempt 3:

Tried:
 
gb_param_grid = {
                    'n_estimators': [120, 200],
                    'max_depth': [None, 10],
                    'min_samples_leaf': [70, 80, 90],
                    'max_features': [10, 12]
                }

GBC = GradientBoostingClassifier(loss="deviance", learning_rate=0.04)

Best:


</div>

In [24]:
## Search grid for optimal parameters
gb_param_grid = {
                    'n_estimators': [120, 200],
                    'max_depth': [None, 10],
                    'min_samples_leaf': [70, 80, 90],
                    'max_features': [10, 12]
                }

GBC = GradientBoostingClassifier(loss="deviance", learning_rate=0.04)

gsGBC = GridSearchCV(GBC, param_grid=gb_param_grid, cv=kfold, scoring="accuracy", verbose=1)

gsGBC.fit(train_x, train_y)

# Best score
print(gsGBC.best_score_)
print(gsGBC.best_params_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed: 236.6min finished


0.668026040108
{'max_depth': 10, 'max_features': 12, 'min_samples_leaf': 70, 'n_estimators': 120}


### Quick setup

In [35]:
# gbc_model = GradientBoostingClassifier(
#                                           loss = "deviance",
#                                           learning_rate = 0.05,
#                                           n_estimators = 120,
#                                           max_depth = 10,
#                                           min_samples_leaf = 80,
#                                           max_features = 10
#                                       )
# gbc_model.fit(train_x, train_y)
# score_model(gbc_model)

[ 0.66389855  0.66262626  0.66284944]

Average is ...
0.663124753078


---

## Ada Boost Classifier

### Parameter Search

(open Markdown here to see param search history)

<div hidden>

#### Attempt 1:

tried:

{
"n_estimators": [1, 50, 100],
"learning_rate": [0.05, 0.1, 0.2],
"base_estimator\__max_depth": [None, 2, 4],
"base_estimator\__splitter" :   ["best", "random"],
"base_estimator\__criterion" : ["gini", "entropy"]
}

best:

{'base_estimator\__criterion': 'entropy', 'base_estimator\__max_depth': 4, 'base_estimator\__splitter': 'random', 'learning_rate': 0.1, 'n_estimators': 100}

#### Attempt 2:

tried:

ada_param_grid = {
                    "n_estimators": [50, 100, 120, 200],
                    "base_estimator__max_depth": [3, 4, 6],
                 }

DTC = DecisionTreeClassifier(random_state=17, splitter="random", criterion="entropy")

ADA = AdaBoostClassifier(base_estimator=DTC, learning_rate=0.08)

best:

{'base_estimator\__max_depth': 3, 'n_estimators': 120}

#### Attempt 2:

tried:

ada_param_grid = {
                    "n_estimators": [50, 100, 120, 200],
                    "base_estimator__max_depth": [3, 4, 6],
                 }

DTC = DecisionTreeClassifier(random_state=17, splitter="random", criterion="entropy")

ADA = AdaBoostClassifier(base_estimator=DTC, learning_rate=0.08)

best:

{'base_estimator__max_depth': 3, 'n_estimators': 120}

</div>

In [25]:
## Search grid for optimal parameters
ada_param_grid = {
                    "n_estimators": [110, 120, 130],
                    "base_estimator__max_depth": [None, 3],
                 }

DTC = DecisionTreeClassifier(random_state=17, splitter="random", criterion="entropy")
ADA = AdaBoostClassifier(base_estimator=DTC, learning_rate=0.08)

gsADA = GridSearchCV(ADA, param_grid=ada_param_grid, cv=kfold, scoring="accuracy", verbose=1)

gsADA.fit(train_x, train_y)

# Best score
print(gsADA.best_score_)
print(gsADA.best_params_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  2.5min finished


0.643482490272
{'base_estimator__max_depth': 3, 'n_estimators': 120}


### Quick setup

In [29]:
# dtc = DecisionTreeClassifier(
#                                 random_state=17,
#                                 splitter="random",
#                                 criterion="entropy",
#                                 max_depth=3
#                             )

# ada_model = AdaBoostClassifier(
#                                 base_estimator=dtc,
#                                 learning_rate=0.08,
#                                 n_estimators=120
#                               )

# ada_model.fit(train_x, train_y)
# score_model(ada_model)

[ 0.64930984  0.63647587  0.64454923]

Average is ...
0.64344498084


---

## XGB

### Model fitting

In [26]:
xgb_model = XGBClassifier()
xgb_model.fit(train_x, train_y)
score_model(xgb_model)

[ 0.6539109   0.65162738  0.65229595]

Average is ...
0.652611409538


---

## LGBM

### Model fitting

In [27]:
lgbm_model = LGBMClassifier()
lgbm_model.fit(train_x, train_y)
score_model(lgbm_model)

[ 0.67197845  0.6657688   0.66879982]

Average is ...
0.668849024355


### Averaging all the models

In [28]:
etc_model = gsExtC.best_estimator_
rfc_model = gsRFC.best_estimator_
gbc_model = gsGBC.best_estimator_
ada_model = gsADA.best_estimator_

model = VotingClassifier(estimators=[
                                        ("ETC", etc_model),                                    
                                        ("RFC", rfc_model),
                                        ("GBC",gbc_model),
                                        ("XGB", xgb_model),
                                        ("ADA", ada_model),
                                        ("LGBM", lgbm_model)
                                      ], voting='soft')

#### Fit and score model
model = model.fit(train_x, train_y)
score_model(model)

[ 0.66760184  0.66296296  0.66385989]

Average is ...
0.664808229623


### Predicting

In [29]:
predictions = model.predict(test_x)

### Save predictions to CSV

In [30]:
pd.DataFrame({
    "ID": test.ID.astype(int),
    "Adoption": [1 if pred == 0 else 0 for pred in predictions],
    "Died": [1 if pred == 4 else 0 for pred in predictions],
    "Euthanasia": [1 if pred == 3 else 0 for pred in predictions],
    "Return_to_owner": [1 if pred == 2 else 0 for pred in predictions],
    "Transfer": [1 if pred == 1 else 0 for pred in predictions]
}).to_csv('../../submissions/shelter_voting_powerful_tuning.csv',index=False)