In [34]:
import pandas as pd
import os
import numpy as np

In [36]:
# Mount Google Drive inside the Colab runtime so that files stored on Drive
# can be reached through the /content/gdrive/ path.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [37]:
# Switch the working directory to the Drive root so that the relative CSV
# path used when loading the training data resolves against it.
os.chdir('/content/gdrive/My Drive/')

In [38]:
# Load the training table; the path is relative to the current working directory.
train = pd.read_csv('train_nonanswer_jo_sum_final.csv')

In [87]:
# Separate model inputs from the target.
#   train.x -> every column except the target ('Party') and 'USER_ID'
#   train.y -> the 'Party' column
# NOTE(review): these are stored as plain attributes on the DataFrame object,
# not as columns. pandas emits a UserWarning for this pattern and the
# attributes presumably do not survive copies of `train`; separate variables
# would be safer, but the rest of the notebook reads `train.x` / `train.y`.
train.x = train.drop(columns=['Party', 'USER_ID'])
train.y = train.loc[:, 'Party']

In [40]:
!pip install catboost



In [97]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold

In [3]:
# Installing the most recent version of skopt directly from Github
!pip install git+https://github.com/scikit-optimize/scikit-optimize.git

Collecting git+https://github.com/scikit-optimize/scikit-optimize.git
  Cloning https://github.com/scikit-optimize/scikit-optimize.git to /tmp/pip-req-build-35ikik7r
  Running command git clone -q https://github.com/scikit-optimize/scikit-optimize.git /tmp/pip-req-build-35ikik7r
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pyaml>=16.9
  Using cached https://files.pythonhosted.org/packages/15/c4/1310a054d33abc318426a956e7d6df0df76a6ddfa9c66f6310274fb75d42/pyaml-20.4.0-py2.py3-none-any.whl
Building wheels for collected packages: scikit-optimize
  Building wheel for scikit-optimize (PEP 517) ... [?25l[?25hdone
  Created wheel for scikit-optimize: filename=scikit_optimize-0.9.dev0-cp36-none-any.whl size=102002 sha256=557c90f362b0a1032a465177794004d5bedd121fde0fb5e9640c243a3f43bbf8
  Stored in directory: /tmp/pip-ephem-wheel-cache-4d9ofqxh/wheels/11/6f/86/2b77217

In [42]:
# Model selection
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [43]:
# Skopt functions
from skopt import BayesSearchCV
from skopt import gp_minimize # Bayesian optimization using Gaussian Processes
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args # decorator to convert a list of parameters to named arguments
from sklearn.metrics import make_scorer

In [88]:
# Positional indices of the feature columns whose dtype is `object`; these are
# the columns passed to CatBoost as categorical features further down.
categorical_features_indices = np.where(
    (train.x.dtypes == object).to_numpy()
)[0]

In [89]:
# Display the detected categorical column positions as the cell output.
categorical_features_indices

array([ 0,  2,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
       53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
       70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
       87, 88, 89, 90, 91, 92])

In [46]:
# Base CatBoost estimator that the Bayesian search clones and tunes.
# The model optimises Logloss and reports Accuracy in its training log.
# NOTE(review): `early_stopping_rounds` presumably has no effect unless an
# eval_set is supplied at fit time - confirm against how the search calls fit.
clf2 = CatBoostClassifier(**{
    'loss_function': 'Logloss',
    'eval_metric': 'Accuracy',
    'iterations': 1000,
    'early_stopping_rounds': 20,
    'nan_mode': 'Max',
    'cat_features': categorical_features_indices,
    'task_type': 'GPU',
    'verbose': 500,
})

In [47]:
# Hyper-parameter ranges explored by the Bayesian search.
param_space = {
    'depth': [8, 9, 10],
    'learning_rate': Real(0.01, 0.5, 'log-uniform'),
    # randomness for scoring splits; leaning towards keeping this one
    'random_strength': Real(1e-9, 10, 'log-uniform'),
    # L2 regularization
    'l2_leaf_reg': Integer(2, 30),
    # TODO: verify, then try removing it
    'scale_pos_weight': Real(0.01, 10.0, 'uniform'),
    'grow_policy': ['Lossguide', 'Depthwise'],
}

# Shuffled, seeded 5-fold stratified splitter used to score each candidate.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Bayesian search over `param_space` around the base classifier `clf2`:
# 10 candidates, scored by negative log loss, best candidate refit at the end.
bayes_cv_tuner = BayesSearchCV(
    estimator=clf2,
    search_spaces=param_space,
    scoring='neg_log_loss',
    cv=cv_splitter,
    n_iter=10,
    n_jobs=1,  # use just 1 job with CatBoost in order to avoid segmentation fault
    return_train_score=False,
    refit=True,
    optimizer_kwargs={'base_estimator': 'GP'},
    random_state=22,
)

In [49]:
def status_print(optim_result):
    """Status callback during Bayesian hyperparameter search.

    Prints the number of candidates evaluated so far, the best
    cross-validation score found and the parameters that produced it. All
    values are read from the module-level ``bayes_cv_tuner``.

    The score is labelled with the tuner's own ``scoring`` setting instead of
    a hard-coded metric name, so the printed label always matches the number
    (e.g. a ``neg_log_loss`` score is negative and is reported as such).

    Parameters
    ----------
    optim_result
        Value passed in by whoever invokes the callback; not used here.
    """
    # One entry per evaluated candidate; no need to build a DataFrame to count.
    n_models = len(bayes_cv_tuner.cv_results_['params'])

    print('Model #{}\nBest {}: {}\nBest params: {}\n'.format(
        n_models,
        bayes_cv_tuner.scoring,
        np.round(bayes_cv_tuner.best_score_, 6),
        bayes_cv_tuner.best_params_
    ))

In [50]:
# Run the hyper-parameter search on the full training data; `status_print` is
# passed as the search callback to report progress.
result = bayes_cv_tuner.fit(train.x, train.y, callback=status_print)

0:	learn: 0.7716957	total: 53.5ms	remaining: 26.7s
499:	learn: 0.8220306	total: 25.2s	remaining: 0us
0:	learn: 0.7744932	total: 44.8ms	remaining: 22.4s
499:	learn: 0.8215787	total: 24.9s	remaining: 0us
0:	learn: 0.7836294	total: 46.8ms	remaining: 23.4s
499:	learn: 0.8317629	total: 24.7s	remaining: 0us
0:	learn: 0.7702803	total: 45.6ms	remaining: 22.7s
499:	learn: 0.8271157	total: 24.8s	remaining: 0us
0:	learn: 0.7736367	total: 45.3ms	remaining: 22.6s
499:	learn: 0.8227101	total: 25s	remaining: 0us
Model #1
Best accuracy: -0.755548
Best params: OrderedDict([('depth', 9), ('grow_policy', 'Lossguide'), ('l2_leaf_reg', 28), ('learning_rate', 0.028637181863626668), ('random_strength', 1.204860821395814e-08), ('scale_pos_weight', 3.5269515444602026)])

0:	learn: 0.7006734	total: 59.2ms	remaining: 29.5s
499:	learn: 0.9996710	total: 25.4s	remaining: 0us
0:	learn: 0.6990865	total: 51.8ms	remaining: 25.9s
499:	learn: 1.0000000	total: 25.3s	remaining: 0us
0:	learn: 0.7129296	total: 48.5ms	remaini

In [60]:
# CatBoost classifier with fixed hyper-parameters.
# NOTE(review): the tuned values below are presumably the best parameters
# reported by the Bayesian search - confirm against the search output.
bestCB = CatBoostClassifier(**{
    # training setup
    'loss_function': 'Logloss',
    'eval_metric': 'Accuracy',
    'iterations': 500,
    'early_stopping_rounds': 20,
    'task_type': 'GPU',
    'verbose': 500,
    # data handling
    'cat_features': categorical_features_indices,
    'nan_mode': 'Max',
    # tuned hyper-parameters
    'depth': 8,
    'learning_rate': 0.16181022985210577,
    'random_strength': 4.224249262387503,    # randomness for scoring splits; leaning towards keeping this one
    'l2_leaf_reg': 21,                       # L2 regularization
    'scale_pos_weight': 1.9838833310054074,  # TODO: verify, then try removing it
    'grow_policy': 'Depthwise',
})

In [90]:
# Shuffled, seeded 5-fold stratified splitter used for the manual
# cross-validation loop.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=77)

In [98]:
# Manual stratified cross-validation of the fixed CatBoost configuration.
# Each fold trains a fresh model, uses the held-out part as eval_set and
# prints the accuracy on that same held-out part.
# NOTE(review): because the validation fold also drives best-iteration
# selection, the printed accuracies are presumably optimistic.
fold_scores = []
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train.x, train.y)):
    # split() yields positional row indices, so rows must be selected by
    # position (.iloc). Label-based .loc / [] only gives the same rows when
    # the index happens to be the default 0..n-1 range.
    trn_x, trn_y = train.x.iloc[trn_idx], train.y.iloc[trn_idx]
    val_x, val_y = train.x.iloc[val_idx], train.y.iloc[val_idx]

    cb_model = CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='Accuracy',
        early_stopping_rounds=20,
        task_type='GPU',
        verbose=500,
        cat_features=categorical_features_indices,
        iterations=500,
        nan_mode='Max',
        depth=8,
        learning_rate=0.16181022985210577,
        random_strength=4.224249262387503,    # randomness for scoring splits; leaning towards keeping this one
        l2_leaf_reg=21,                       # L2 regularization
        scale_pos_weight=1.9838833310054074,  # TODO: verify, then try removing it
        grow_policy='Depthwise',
    )
    cb_model.fit(
        trn_x,
        trn_y,
        eval_set=(val_x, val_y),
        cat_features=categorical_features_indices,
        use_best_model=True,
        verbose=True,
    )

    y_pred = cb_model.predict(val_x)
    fold_accuracy = accuracy_score(val_y, y_pred)
    fold_scores.append(fold_accuracy)
    print(fold_accuracy)

# Summary across folds so the overall result does not have to be read off the log.
print('Mean accuracy over {} folds: {}'.format(len(fold_scores), np.mean(fold_scores)))


0:	learn: 0.7113347	test: 0.6424652	best: 0.6424652 (0)	total: 71.1ms	remaining: 35.5s
1:	learn: 0.7102788	test: 0.6689806	best: 0.6689806 (1)	total: 141ms	remaining: 35.2s
2:	learn: 0.7147074	test: 0.6716226	best: 0.6716226 (2)	total: 216ms	remaining: 35.9s
3:	learn: 0.7135452	test: 0.6768323	best: 0.6768323 (3)	total: 290ms	remaining: 36s
4:	learn: 0.7197675	test: 0.6755166	best: 0.6768323 (3)	total: 361ms	remaining: 35.7s
5:	learn: 0.7186186	test: 0.6748693	best: 0.6768323 (3)	total: 421ms	remaining: 34.7s
6:	learn: 0.7189423	test: 0.6787740	best: 0.6787740 (6)	total: 484ms	remaining: 34.1s
7:	learn: 0.7182789	test: 0.6807581	best: 0.6807581 (7)	total: 550ms	remaining: 33.8s
8:	learn: 0.7246844	test: 0.6827316	best: 0.6827316 (8)	total: 613ms	remaining: 33.5s
9:	learn: 0.7271388	test: 0.6840472	best: 0.6840472 (9)	total: 677ms	remaining: 33.2s
10:	learn: 0.7292695	test: 0.6788058	best: 0.6840472 (9)	total: 734ms	remaining: 32.6s
11:	learn: 0.7312410	test: 0.6781585	best: 0.6840472 (

In [None]:
def catboost_eval(bagging_temperature,
                  depth,
                  learning_rate,
                  min_data_in_leaf,
                  max_leaves,
                  l2_leaf_reg,
                  border_count):
    """Return the mean weighted F1 of a CatBoost model over 5 stratified folds.

    For every fold a GPU ``CatBoostClassifier`` is trained on the training
    part with the validation part as ``eval_set``; the weighted F1 of its
    predictions on the validation part is recorded. The function returns the
    average of the five fold scores.

    Parameters
    ----------
    bagging_temperature, depth, min_data_in_leaf, max_leaves, l2_leaf_reg, border_count
        Numeric values that are rounded to the nearest integer before being
        passed to CatBoost under the same name.
    learning_rate
        Passed to CatBoost unchanged.

    Returns
    -------
    float
        ``sum(fold F1 scores) / 5``.

    Notes
    -----
    The function reads the module-level names ``RANDOM_STATE``,
    ``catX_train``, ``caty_train`` and ``cat_features``.
    NOTE(review): none of these are defined in the visible part of the
    notebook - they must exist before this function is called.
    NOTE(review): ``bagging_temperature`` is rounded to an integer like the
    other parameters; confirm this is intended for a continuous parameter.
    Labels and predictions are cast with ``astype(int)`` before scoring, so
    the targets are assumed to be integer-convertible - TODO confirm.
    """
    # Imported here because the notebook's import cells only bring in
    # log_loss and accuracy_score from sklearn.metrics.
    from sklearn.metrics import f1_score

    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

    params = {
        'iterations': 1000,
        'eval_metric': 'Accuracy',
        'random_seed': 1234,
        'learning_rate': learning_rate,
        'min_data_in_leaf': int(round(min_data_in_leaf)),
        'depth': int(round(depth)),
        'max_leaves': int(round(max_leaves)),
        'l2_leaf_reg': int(round(l2_leaf_reg)),
        'border_count': int(round(border_count)),
        'bagging_temperature': int(round(bagging_temperature)),
    }

    X = catX_train.values
    # Convert to an ndarray so the positional indices produced by split()
    # select by position even when caty_train is a Series with a
    # non-default index.
    y = np.asarray(caty_train)

    f1 = []
    for tr_ind, val_ind in skf.split(X, y):
        X_train, y_train = X[tr_ind], y[tr_ind]
        X_valid, y_valid = X[val_ind], y[val_ind]

        ## https://catboost.ai/docs/concepts/python-reference_catboost_eval-metrics.html
        clf = CatBoostClassifier(**params,
                                 task_type="GPU",
                                 leaf_estimation_iterations=10,
                                 use_best_model=True,
                                 od_type="Iter",
                                 logging_level='Silent',
                                 )
        clf.fit(X_train,
                y_train,
                cat_features=cat_features,
                eval_set=(X_valid, y_valid),
                verbose=False,
                )

        y_pred = clf.predict(X_valid)

        f1_value = f1_score(y_valid.astype(int),
                            y_pred.astype(int),
                            average='weighted')
        f1.append(f1_value)

    return sum(f1) / n_splits