In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import make_scorer
from evaluate import get_score
from xgboost import XGBClassifier as XGB
from xgboost import XGBRegressor as XGBR
from lightgbm import LGBMClassifier as LGBM
from lightgbm import LGBMRegressor as LGBMR
from pprint import pprint

# Preparation

In [2]:
train = pd.read_csv('data/clean_train.csv')
train_x = train.drop(columns = ['AdoptionSpeed'])
train_y = train['AdoptionSpeed']

In [3]:
XGB_GRID = {
    'max_depth': [6, 7],
    'n_jobs': [-1],
    'n_estimators': [200],
    'reg_alpha': [0],
    'reg_lambda': [0, 0.001, 0.003, 0.1],
}
xgb = XGB()

In [18]:
LGBM_GRID = {
    'num_leaves': [20, 50, 100],
    'max_depth': [5, 6, 7],
    'n_estimators': [200],
    'subsample_for_bin': [2000],
    'min_child_samples': [25, 35, 45],
    'reg_alpha': [0],
    'reg_lambda': [0, 0.01, 0.03, 0.1]
}
lgbm = LGBM()

In [5]:
scorer = make_scorer(get_score, greater_is_better = True)
scorer_regr = make_scorer(get_score_regr, greater_is_better = True)
ss = ShuffleSplit(n_splits = 5, test_size = 0.3)

# Cross Validation

In [None]:
grid_search = GridSearchCV(estimator = xgb, param_grid = XGB_GRID, scoring = scorer, cv = ss, n_jobs = -1)
grid_search.fit(train_x.values, train_y.values)
cv_df = pd.DataFrame(grid_search.cv_results_)
cv_df.sort_values(by = 'rank_test_score')

In [22]:
grid_search = GridSearchCV(estimator = lgbm, param_grid = LGBM_GRID, scoring = scorer, cv = ss, n_jobs = -1)
grid_search.fit(train_x.values, train_y.values)
cv_df = pd.DataFrame(grid_search.cv_results_)
cv_df.sort_values(by = 'rank_test_score')



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_child_samples,param_n_estimators,param_num_leaves,param_reg_alpha,param_reg_lambda,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
46,3.994507,0.442794,1.737495,0.497275,6,25,200,100,0,0.03,...,0.350182,0.012325,1,0.561412,0.579171,0.589974,0.559639,0.585751,0.575190,0.012471
58,3.657975,0.576823,0.999731,0.412659,6,35,200,100,0,0.03,...,0.349669,0.017594,2,0.543460,0.555168,0.565361,0.546566,0.560623,0.554236,0.008251
69,3.835187,0.730246,1.340666,0.628583,6,45,200,100,0,0.01,...,0.349525,0.009429,3,0.519054,0.540261,0.534291,0.529055,0.538417,0.532215,0.007625
55,3.299366,0.549257,2.003960,0.467596,6,35,200,50,0,0.1,...,0.348596,0.010846,4,0.549295,0.553117,0.561785,0.546738,0.555398,0.553267,0.005206
59,3.864075,0.680451,1.678927,0.730988,6,35,200,100,0,0.1,...,0.348462,0.014276,5,0.550943,0.552330,0.562133,0.547426,0.560039,0.554574,0.005591
53,3.592915,0.465672,1.734916,0.537731,6,35,200,50,0,0.01,...,0.348371,0.014205,6,0.552179,0.561254,0.563000,0.546360,0.560555,0.556670,0.006366
15,2.987800,0.139049,0.752657,0.219929,5,35,200,20,0,0.1,...,0.347723,0.016356,7,0.474910,0.478648,0.485585,0.480925,0.495587,0.483131,0.007123
21,3.231728,0.240853,0.966104,0.438192,5,35,200,100,0,0.01,...,0.347654,0.016422,8,0.489361,0.488974,0.500686,0.492553,0.503046,0.494924,0.005850
17,3.222873,0.182245,0.630190,0.170800,5,35,200,50,0,0.01,...,0.347654,0.016422,8,0.489361,0.488974,0.500686,0.492553,0.503046,0.494924,0.005850
25,2.841035,0.564462,0.569472,0.183801,5,45,200,20,0,0.01,...,0.347558,0.012088,10,0.470928,0.465928,0.473677,0.471567,0.478984,0.472217,0.004232


In [21]:
cv_df.sort_values(by = 'rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_child_samples,param_n_estimators,param_num_leaves,param_reg_alpha,param_reg_lambda,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
1,2.86864,0.231804,1.718956,1.015624,5,25,200,20,0,0.01,...,0.366032,0.012637,1,0.498074,0.486529,0.485173,0.477341,0.485873,0.486598,0.006633
7,3.33573,0.127136,0.802334,0.112929,5,25,200,50,0,0.1,...,0.365835,0.005028,2,0.50407,0.497094,0.49975,0.487433,0.497649,0.497199,0.005465
11,3.392644,0.070078,1.697174,0.674358,5,25,200,100,0,0.1,...,0.365835,0.005028,2,0.50407,0.497094,0.49975,0.487433,0.497649,0.497199,0.005465
0,2.556233,0.107797,1.988605,0.710369,5,25,200,20,0,0.0,...,0.365824,0.007905,4,0.499114,0.486668,0.490571,0.471265,0.48905,0.487334,0.009065
76,4.807536,0.26788,2.326447,0.400357,7,25,200,50,0,0.0,...,0.365521,0.005465,5,0.635436,0.612646,0.631078,0.628962,0.618859,0.625396,0.00838


# Prediction

In [6]:
test = pd.read_csv('data/test/test.csv')
test_x = test.drop(columns = ['Name', 'RescuerID', 'PetID', 'Description'])
pet_id = test['PetID']

In [7]:
LGBM_PARAMS = {
    'num_leaves': 100,
    'max_depth': 5,
    'n_estimators': 200,
    'subsample_for_bin': 2000,
    'min_child_samples': 25,
    'reg_lambda': 0.01
}
lgbm.fit(train_x, train_y)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [8]:
predict = lgbm.predict(test_x)

In [9]:
result = pd.DataFrame({'PetID': pet_id, 'AdoptionSpeed': predict})