In [57]:
import os
import json
import pprint
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import make_scorer
from evaluate import get_score
from xgboost import XGBClassifier as XGB
from xgboost import XGBRegressor as XGBR
from lightgbm import LGBMClassifier as LGBM
from lightgbm import LGBMRegressor as LGBMR

# Preparation

In [73]:
# Read the pre-cleaned training table, then separate the 'AdoptionSpeed'
# column (the prediction target) from every remaining column (the features).
train = pd.read_csv('data/clean_train.csv')
train_x, train_y = (
    train.drop(columns=['AdoptionSpeed']),
    train['AdoptionSpeed'],
)

In [34]:
# Candidate hyperparameter values for the XGBoost classifier. Each entry is a
# list of values to try; only max_depth and reg_lambda actually vary, so the
# grid expands to 2 * 4 = 8 combinations.
XGB_GRID = dict(
    max_depth=[6, 7],
    n_jobs=[-1],
    n_estimators=[200],
    reg_alpha=[0],
    reg_lambda=[0, 0.001, 0.003, 0.1],
)

# Base estimator, constructed with library defaults.
xgb = XGB()

In [39]:
# Candidate hyperparameter values for the LightGBM classifier. Each entry is a
# list of values to try; num_leaves, max_depth and reg_lambda vary, so the
# grid expands to 3 * 3 * 2 = 18 combinations.
LGBM_GRID = dict(
    num_leaves=[20, 50, 100],
    max_depth=[5, 6, 7],
    n_estimators=[200],
    subsample_for_bin=[2000],
    min_child_samples=[25],
    reg_alpha=[0],
    reg_lambda=[0, 0.1],
)

# Base estimator, constructed with library defaults.
lgbm = LGBM()

In [40]:
# Wrap the project metric `get_score` so scikit-learn model selection can use
# it; higher values are treated as better.
scorer = make_scorer(get_score, greater_is_better=True)

# Five random 70/30 train/validation splits. The fixed random_state makes the
# splits identical on every run, so cross-validation scores and rankings are
# reproducible and comparable between the different grid searches.
ss = ShuffleSplit(n_splits=5, test_size=0.3, random_state=42)

# Cross Validation

In [21]:
# Search XGB_GRID for the XGBoost classifier, scoring every combination with
# `scorer` on the `ss` splits. The data is passed as plain NumPy arrays.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=XGB_GRID,
    scoring=scorer,
    cv=ss,
    n_jobs=-1,
).fit(train_x.values, train_y.values)

# Tabulate the per-combination results and display them best-ranked first.
cv_df = pd.DataFrame(grid_search.cv_results_)
cv_df.sort_values(by='rank_test_score')

KeyboardInterrupt: 

In [41]:
# Search LGBM_GRID for the LightGBM classifier, scoring every combination with
# `scorer` on the `ss` splits. The data is passed as plain NumPy arrays.
grid_search = GridSearchCV(
    estimator=lgbm,
    param_grid=LGBM_GRID,
    scoring=scorer,
    cv=ss,
    n_jobs=-1,
).fit(train_x.values, train_y.values)

# Tabulate the per-combination results and display them best-ranked first.
cv_df = pd.DataFrame(grid_search.cv_results_)
cv_df.sort_values(by='rank_test_score')



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_child_samples,param_n_estimators,param_num_leaves,param_reg_alpha,param_reg_lambda,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
3,4.036347,0.230582,0.979198,0.594688,5,25,200,50,0,0.1,...,0.35321,0.00792,1,0.50095,0.4947,0.50087,0.519549,0.499364,0.503086,0.008541
5,3.942066,0.191549,1.082606,0.392277,5,25,200,100,0,0.1,...,0.35321,0.00792,1,0.50095,0.4947,0.50087,0.519549,0.499364,0.503086,0.008541
8,4.010782,0.334271,1.622404,0.637375,6,25,200,50,0,0.0,...,0.35319,0.009273,3,0.566323,0.578491,0.56296,0.590471,0.568877,0.573424,0.009969
16,4.374925,0.174943,2.45899,0.534697,7,25,200,100,0,0.0,...,0.352292,0.009094,4,0.636482,0.647035,0.627423,0.65268,0.637873,0.640299,0.008774
11,4.0502,0.105529,1.302007,0.482863,6,25,200,100,0,0.1,...,0.351556,0.009959,5,0.571759,0.574765,0.56351,0.587208,0.569464,0.573341,0.007855
6,3.973975,0.101108,1.280237,0.493773,6,25,200,20,0,0.0,...,0.351256,0.005342,6,0.504334,0.503393,0.510805,0.526944,0.5113,0.511355,0.008438
0,3.817647,0.016347,2.027609,0.271244,5,25,200,20,0,0.0,...,0.350855,0.005719,7,0.48912,0.488387,0.488836,0.503407,0.490153,0.491981,0.005743
1,3.798891,0.061882,1.178486,0.667634,5,25,200,20,0,0.1,...,0.350716,0.004722,8,0.489803,0.482592,0.485158,0.503997,0.484426,0.489195,0.007773
9,3.809491,0.356414,1.702875,0.51427,6,25,200,50,0,0.1,...,0.349454,0.007942,9,0.568579,0.570281,0.565985,0.578182,0.570464,0.570698,0.004073
13,3.583586,0.111942,0.92225,0.10719,7,25,200,20,0,0.1,...,0.349219,0.006776,10,0.520678,0.52071,0.52827,0.538087,0.520787,0.525706,0.006846


# Prediction

In [None]:
# Read the raw test table. PetID is kept aside to label the predictions; the
# listed columns are dropped to form the feature matrix.
# NOTE(review): train_x comes from a separately cleaned file, so confirm the
# remaining test columns match the training features in name and order.
test = pd.read_csv('data/test/test.csv')
pet_id = test['PetID']
test_x = test.drop(
    columns=['Name', 'RescuerID', 'PetID', 'Description'],
)

In [None]:
# Final LightGBM hyperparameters, taken from a top-ranked row of the LGBM grid
# search results (max_depth=5, num_leaves=100, reg_lambda=0.1).
# reg_lambda was previously 0.01, a value the grid (0, 0.1) never evaluated.
LGBM_PARAMS = {
    'num_leaves': 100,
    'max_depth': 5,
    'n_estimators': 200,
    'subsample_for_bin': 2000,
    'min_child_samples': 25,
    'reg_lambda': 0.1,
}

# Build the estimator from LGBM_PARAMS before fitting. Previously the dict was
# defined but never used, so the default-configured LGBM() was trained and the
# tuned hyperparameters were silently ignored.
lgbm = LGBM(**LGBM_PARAMS)
lgbm.fit(train_x, train_y)

In [None]:
# Predict a label for every row of the test features with the fitted model.
predict = lgbm.predict(X=test_x)

In [None]:
# Pair each pet identifier with its predicted label in a two-column frame.
result = pd.DataFrame(
    {
        'PetID': pet_id,
        'AdoptionSpeed': predict,
    }
)