In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from mlxtend.classifier import StackingCVClassifier
from mlxtend.feature_selection import ColumnSelector

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from catboost import CatBoostClassifier

In [2]:
# Directory that holds every CSV read by this notebook (feature tables and targets).
PATH_TO_DATA = './mlxtend_input/'
# One seed shared by the CatBoost models and the stacking wrapper.
SEED = 17

In [3]:
# First feature table ('xaandera' files), indexed by match id.
# Every column gets the 'xaandera_' prefix; presumably this keeps names distinct
# once the three feature tables are concatenated side by side.
train_1 = pd.read_csv(
    os.path.join(PATH_TO_DATA, 'train_xaandera.csv'),
    index_col='match_id_hash',
).rename(columns=lambda name: 'xaandera_' + str(name))

# Same treatment for the matching test table.
test_1 = pd.read_csv(
    os.path.join(PATH_TO_DATA, 'test_xaandera.csv'),
    index_col='match_id_hash',
).rename(columns=lambda name: 'xaandera_' + str(name))

In [4]:
# Second feature table ('antibyte' files), indexed by match id.
# Every column gets the 'antibyte_' prefix; presumably this keeps names distinct
# once the three feature tables are concatenated side by side.
train_2 = pd.read_csv(
    os.path.join(PATH_TO_DATA, 'train_antibyte.csv'),
    index_col='match_id_hash',
).rename(columns=lambda name: 'antibyte_' + str(name))

# Same treatment for the matching test table.
test_2 = pd.read_csv(
    os.path.join(PATH_TO_DATA, 'test_antibyte.csv'),
    index_col='match_id_hash',
).rename(columns=lambda name: 'antibyte_' + str(name))

In [5]:
# Third feature table ('sheregeda' files), indexed by match id.
# Every column gets the 'sheregeda_' prefix; presumably this keeps names distinct
# once the three feature tables are concatenated side by side.
train_3 = pd.read_csv(
    os.path.join(PATH_TO_DATA, 'train_sheregeda.csv'),
    index_col='match_id_hash',
).rename(columns=lambda name: 'sheregeda_' + str(name))

# Same treatment for the matching test table.
test_3 = pd.read_csv(
    os.path.join(PATH_TO_DATA, 'test_sheregeda.csv'),
    index_col='match_id_hash',
).rename(columns=lambda name: 'sheregeda_' + str(name))

In [15]:
# Place the three feature tables side by side (column-wise), for train and test alike.
X = pd.concat([train_1, train_2, train_3], axis=1, sort=False)
test = pd.concat([test_1, test_2, test_3], axis=1, sort=False)

# For each source table, record the (first, last) integer column position inside X.
# The pipelines further down turn these pairs into positional ranges for ColumnSelector.
columns_1 = tuple(
    X.columns.get_loc(name) for name in (train_1.columns[0], train_1.columns[-1])
)
columns_2 = tuple(
    X.columns.get_loc(name) for name in (train_2.columns[0], train_2.columns[-1])
)
columns_3 = tuple(
    X.columns.get_loc(name) for name in (train_3.columns[0], train_3.columns[-1])
)

In [16]:
# Match ids excluded from training (the reason for excluding them is not
# recorded in this notebook). The same list is applied to the targets later on.
drop_matches = [
    'c42dcffb66e8aeb5098690d8f1811a25',
    '2eb2ce35ed3dc6249b32082d052cb9dc',
    'a2cf3c288fd289e9fd9259b0672f8119',
    'f8bb9eb724fe1fc6823cb48bcdd383d7',
    '924ea902cc80bea694154846f2b60d81',
    '2a43d7f0611bd90cc780c090734722b4',
    '16bf0141245691f8aa1c7702c7a62a59',
    '86f40281e8b61b080cdd8e385931e5a0',
    '5a22abffd910b17ff5bc1517475ff0db',
    'ac92f34c19c068773fcf27ea6ba514dc',
    'd464de7959de7b1038bc053a63aa64d7',
    '0c1d5f10a55ac235ea796d8359c8f03b',
    '09b06773d4ba11e858e1042194985488',
    '42eec2fb2b810502f7c0ba48395600e1',
    '1801fa71983a9aff5a8f477983871541',
    'b31c089780e58ced730afafd39097356',
    'f9c159f8dcc5621769f47e6edc9db5b9',
    '9d7ce9819d6204edd7fd7aab576e9b27',
    'bdcf25c4ca853c5c79fd8ad486a0ac34',
    '87d1ce73a40e9fe16ff600cc3d0663c7'
]
# Rebind instead of dropping in place, and tolerate ids that are already gone:
# X is built in an earlier cell, so an in-place drop made this cell raise
# KeyError whenever it was executed a second time. The trade-off is that a
# mistyped id is skipped silently rather than reported.
X = X.drop(index=drop_matches, errors='ignore')

In [17]:
# Load the training labels, indexed by match id like the feature tables.
targets = pd.read_csv(
    os.path.join(PATH_TO_DATA, 'train_targets.csv'),
    index_col='match_id_hash'
)
# Remove the excluded matches without mutating in place.
targets = targets.drop(index=drop_matches)
# Select the labels in X's row order. The estimator pairs X and y by position,
# not by index label, so a different row order between the targets file and the
# feature files would otherwise attach labels to the wrong matches without any
# error. A match id present in X but absent from the targets raises KeyError here.
y = targets.loc[X.index, 'radiant_win'].map({True: 1, False: 0})

In [9]:
# Hold-out split, currently disabled: the stack below is fit on the full X / y.
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=SEED)

In [10]:
# Base learner for the second feature table: keep only the columns whose
# positions fall in the inclusive range recorded in columns_2, then fit CatBoost.
# NOTE(review): no eval_set is passed anywhere in this notebook, so
# early_stopping_rounds presumably has no effect here — confirm.
pipe2 = make_pipeline(
    ColumnSelector(cols=tuple(range(columns_2[0], columns_2[1] + 1))),
    CatBoostClassifier(
        boosting_type='Plain',
        depth=6,
        learning_rate=0.01,
        l2_leaf_reg=3,
        early_stopping_rounds=1000,
        random_state=SEED,
        verbose=False,
    ),
)

In [11]:
# Base learner for the third feature table: keep only the columns whose
# positions fall in the inclusive range recorded in columns_3, then fit CatBoost
# with AUC as the evaluation metric.
pipe3 = make_pipeline(
    ColumnSelector(cols=tuple(range(columns_3[0], columns_3[1] + 1))),
    CatBoostClassifier(
        iterations=1000,
        depth=8,
        learning_rate=0.036778105553030696,
        l2_leaf_reg=30,
        border_count=255,
        bagging_temperature=0.0,
        random_strength=0.0011406276175633515,
        scale_pos_weight=1.0,
        eval_metric='AUC',
        custom_loss=['AUC'],
        random_state=SEED,
        verbose=False,
    ),
)


In [18]:
# Second-level model: a CatBoost classifier with default hyper-parameters,
# seeded and silenced.
meta_params = dict(verbose=False, random_state=SEED)
meta_classifier = CatBoostClassifier(**meta_params)

# Stack pipe2 and pipe3 only; no pipeline is built for the first feature table.
# With use_probas=True the meta classifier is trained on predicted class
# probabilities, and drop_last_proba=True removes the last probability column.
sclf = StackingCVClassifier(
    meta_classifier=meta_classifier,
    classifiers=[pipe2, pipe3],
    use_probas=True,
    drop_last_proba=True,
    random_state=SEED,
    verbose=True,
)

# Fit on the full training set; the trailing ';' hides the estimator repr.
sclf.fit(X, y);

Fitting 2 classifiers...
Fitting classifier1: pipeline (1/2)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

In [None]:
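# Hold-out ROC AUC check, currently disabled: X_valid / y_valid exist only when
# the commented-out train_test_split cell is re-enabled.
# y_pred = sclf.predict_proba(X_valid)[:, 1]
# roc_auc_score(y_valid, y_pred)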

In [None]:
# Take column 1 of predict_proba as the predicted probability for each test match.
y_pred = sclf.predict_proba(test)[:, 1]
# Write one row per test match: the match id index plus a 'radiant_win_prob' column.
(
    pd.Series(y_pred, index=test.index, name='radiant_win_prob')
    .to_frame()
    .to_csv('submission_stacking.csv')
)

In [None]:
# Visual sanity check of the predicted test probabilities.
# NOTE(review): seaborn deprecated `distplot` in 0.11 in favour of
# `histplot` / `displot`; consider migrating once the installed version is confirmed.
plt.figure(figsize=(16, 6))
dist_ax = sns.distplot(y_pred)
dist_ax.set_title('Distribution of predictions');