In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import StackingClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

In [2]:
import pickle
# Training features; later passed to `fit` as the X argument.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load artifacts produced by a trusted pipeline.
with open("data/all_train_features1.p", mode="rb") as train_features_file:
    X = pickle.load(train_features_file)

In [3]:
# Held-out features; later used for scoring and for predict_proba on the test split.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load artifacts produced by a trusted pipeline.
with open("data/all_test_features1.p", mode="rb") as test_features_file:
    X_test = pickle.load(test_features_file)

In [4]:
# Training labels; later passed to `fit` together with X.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load artifacts produced by a trusted pipeline.
with open("data/y_desire_train_data_label1.p", mode="rb") as train_labels_file:
    y = pickle.load(train_labels_file)

In [5]:
# Held-out labels; later compared against predictions made on X_test.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load artifacts produced by a trusted pipeline.
with open("data/y_desire_test_data_label1.p", mode="rb") as test_labels_file:
    y_test = pickle.load(test_labels_file)

In [6]:
# Features of the rows to be submitted; the submission table is built from
# the model's predict_proba output on these.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load artifacts produced by a trusted pipeline.
with open("data/all_sub_features1.p", mode="rb") as submission_features_file:
    sub_features = pickle.load(submission_features_file)

In [7]:
import lightgbm
import xgboost

# Gradient-boosting base learners that are later listed in `estimators`.
#
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# subsample_freq / bagging_freq is greater than zero. The DART and GBDT models
# below set subsample=0.5 without a frequency, so that setting may be a no-op.
# Confirm before relying on it.

# LightGBM, DART boosting.
lgb_clf_dart = lightgbm.LGBMClassifier(
    boosting_type='dart',
    learning_rate=0.15,
    subsample=0.5,
    num_leaves=20,
)

# LightGBM, plain gradient-boosted trees with a depth cap.
lgb_clf_gbdt = lightgbm.LGBMClassifier(
    boosting_type='gbdt',
    learning_rate=0.1,
    subsample=0.5,
    max_depth=4,
    num_leaves=20,
)

# LightGBM in random-forest mode; bagging is configured explicitly here.
lgb_clf_rf = lightgbm.LGBMClassifier(
    boosting_type='rf',
    bagging_freq=1,
    bagging_fraction=0.75,
    feature_fraction=0.75,
    num_leaves=20,
)

# XGBoost, tree booster.
xgb_clf_gbdt = xgboost.XGBClassifier(
    booster='gbtree',
    colsample_bylevel=0.75,
    learning_rate=0.05,
    max_depth=6,
    n_estimators=200,
    subsample=0.6,
)

# XGBoost, DART booster with deeper trees.
xgb_clf_dart = xgboost.XGBClassifier(
    booster='dart',
    colsample_bylevel=0.85,
    learning_rate=0.05,
    max_depth=8,
    n_estimators=200,
    subsample=0.75,
)

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Grid search over random-forest hyper-parameters, scored by ROC AUC.
#
# Alternative value lists that were previously left here as commented-out code:
#   max_depth [7, 9, 11, 13]; min_samples_leaf [1, 3, 5];
#   min_samples_split [2, 4, 6, 8, 10]; max_features ["sqrt", "log2"];
#   n_estimators [100].
#
# random_state is pinned so that fitting the same configuration twice builds
# the same forest; an unseeded forest makes every re-run of the search (and
# everything stacked on top of it) produce different scores.
base = RandomForestClassifier(random_state=42)
rf_param = {
    "max_depth": [17, 19, 21],
    "min_samples_leaf": [1],
    "min_samples_split": [4],
    "max_features": ["log2"],
    "n_estimators": [800, 1000, 1200, 1400],
}

rf_grid = GridSearchCV(base, rf_param, n_jobs=-1, scoring='roc_auc')

In [12]:
from sklearn.metrics import roc_auc_score
def evaluate_roc_auc(clf, features, labels):
    """Score a fitted classifier with ROC AUC on the given data.

    Parameters:
        clf: fitted classifier exposing ``predict_proba`` and ``classes_``.
        features: inputs forwarded unchanged to ``clf.predict_proba``.
        labels: true labels forwarded unchanged to ``roc_auc_score``.

    Returns:
        Whatever ``roc_auc_score`` returns for ``labels`` against the
        probability column of class label ``1``.

    Raises:
        ValueError: if ``1`` is not one of ``clf.classes_``.
    """
    probabilities = clf.predict_proba(features)

    # The column order of predict_proba follows clf.classes_, so look up where
    # label 1 sits instead of assuming it is the last column.
    class_labels = list(clf.classes_)
    column_of_one = class_labels.index(1)

    positive_scores = probabilities[:, column_of_one]
    return roc_auc_score(labels, positive_scores)

In [13]:
# (name, estimator) pairs used as the base learners of the stacking ensemble.
#
# The scikit-learn learners that draw random numbers (forest, AdaBoost, extra
# tree) get a fixed random_state so that refitting the stack gives the same
# predictions; unseeded, every refit produces different probabilities.
#
# NOTE(review): ExtraTreeClassifier (imported from sklearn.tree) is a single
# randomized tree. If the extremely-randomized-trees ensemble was intended,
# that is sklearn.ensemble.ExtraTreesClassifier. Confirm.
estimators=[
    ('lgb_dart', lgb_clf_dart),
    ('lgb_gbdt', lgb_clf_gbdt),
    ('lgb_rf', lgb_clf_rf),
    ('xgb_dart', xgb_clf_dart),
    ('xgb_gbdt', xgb_clf_gbdt),
    ('rf', RandomForestClassifier(n_estimators=1000, max_depth=17, n_jobs=-1,
                                  random_state=42)),
    ('ada', AdaBoostClassifier(n_estimators=200, learning_rate=0.5,
                               random_state=42)),
    ('extraTree', ExtraTreeClassifier(splitter="random", min_samples_split=26,
                                      random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=15))
]

In [15]:
# Grid search over an XGBoost tree-booster classifier, scored by ROC AUC.
# NOTE(review): `final` is not referenced by any other cell visible in this
# notebook; the stacking cell passes rf_grid as its final estimator.
final_base = xgboost.XGBClassifier(n_estimators=200, booster="gbtree")

params = dict(
    learning_rate=[0.01, 0.03, 0.05, 0.1],
    max_depth=[2, 4, 6, 8, 10],
    subsample=[0.6, 0.75, 0.85, 1],
    colsample_bylevel=[0.6, 0.75, 0.85, 1],
)

final = GridSearchCV(
    estimator=final_base,
    param_grid=params,
    n_jobs=-1,
    scoring='roc_auc',
    verbose=1,
)

In [17]:
from sklearn.ensemble import StackingClassifier
# Stacking ensemble: the learners in `estimators` are the base layer and the
# grid-searched random forest (rf_grid) is the final estimator on top of them.
stacking = StackingClassifier(
    estimators=estimators,
    final_estimator=rf_grid,
    n_jobs=-1,
    verbose=1,
)

In [18]:
# Fit the whole stack on the training split. The fitted estimator is the
# cell's displayed value.
stacking.fit(X=X, y=y)

StackingClassifier(estimators=[('lgb_dart',
                                LGBMClassifier(boosting_type='dart',
                                               learning_rate=0.15,
                                               num_leaves=20, subsample=0.5)),
                               ('lgb_gbdt',
                                LGBMClassifier(max_depth=4, num_leaves=20,
                                               subsample=0.5)),
                               ('lgb_rf',
                                LGBMClassifier(bagging_fraction=0.75,
                                               bagging_freq=1,
                                               boosting_type='rf',
                                               feature_fraction=0.75,
                                               num_leaves=20)),
                               ('xgb_dart',
                                XGBClassifier(base_sco...
                               ('extraTree',
                                Ext

In [27]:
# Score of the fitted stack on the held-out split, using the estimator's own
# default `score` metric.
stacking_test_score = stacking.score(X_test, y_test)
stacking_test_score

0.8685

In [91]:
# Fit the random-forest grid search on its own, directly on the raw training
# features rather than on base-learner predictions.
# NOTE(review): scikit-learn meta-estimators normally clone the estimators they
# are given, so this fit presumably leaves the already-fitted `stacking`
# untouched. Confirm.
rf_grid.fit(X=X, y=y)

GridSearchCV(estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [17, 19, 21], 'max_features': ['log2'],
                         'min_samples_leaf': [1], 'min_samples_split': [4],
                         'n_estimators': [800, 1000, 1200, 1400]},
             scoring='roc_auc')

In [92]:
# Second probability column from the standalone random-forest search on the
# held-out split.
rf_test_proba = rf_grid.predict_proba(X_test)
rf_test_proba[:, 1]

array([0.98991381, 0.24064671, 1.        , ..., 0.99875   , 0.9840873 ,
       0.99207261])

In [19]:
# Second probability column from the stack on the held-out split.
stacking_test_proba = stacking.predict_proba(X_test)
stacking_test_proba[:, 1]

array([0.01147761, 0.01203465, 0.94886599, ..., 0.00627788, 1.        ,
       0.99583333])

In [66]:
# Same call as the earlier cell, executed at a later point in the session.
# NOTE(review): the output recorded below differs from the earlier run, so
# `stacking` was evidently a different fitted object at that time. On a fresh
# Restart & Run All only one of the two recorded outputs can be reproduced.
stacking_test_proba = stacking.predict_proba(X_test)
stacking_test_proba[:, 1]

array([0.99614888, 0.24602529, 1.        , ..., 0.994375  , 0.99875   ,
       0.99856252])

In [67]:
# Both probability columns from the stack on the held-out split.
stacking.predict_proba(X=X_test)

array([[0.00385112, 0.99614888],
       [0.75397471, 0.24602529],
       [0.        , 1.        ],
       ...,
       [0.005625  , 0.994375  ],
       [0.00125   , 0.99875   ],
       [0.00143748, 0.99856252]])

In [46]:
from sklearn.metrics import roc_auc_score

In [21]:
# ROC AUC of the stack on the held-out split, scored on the second
# predict_proba column.
stacking_test_scores = stacking.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, stacking_test_scores)

0.9390968524213106

In [22]:
# Both probability columns from the stack for the submission rows.
stacking.predict_proba(X=sub_features)

array([[0.84591862, 0.15408138],
       [0.67876987, 0.32123013],
       [0.001875  , 0.998125  ],
       ...,
       [0.73915965, 0.26084035],
       [0.60882032, 0.39117968],
       [0.6541461 , 0.3458539 ]])

In [23]:
# Sum of the hard predictions for the submission rows, as a quick sanity
# check on the predicted label distribution.
sub_predictions = stacking.predict(sub_features)
sum(sub_predictions)

948

In [24]:
# Class probabilities for the submission rows; the submission table is built
# from the second column of this array.
y_sub = stacking.predict_proba(X=sub_features)

In [25]:
# Submission table: 1-based row ids next to the second probability column.
submission_ids = range(1, len(y_sub) + 1)
predicted_proba = y_sub[:, 1]
result = pd.DataFrame({"Id": submission_ids, "Predicted": predicted_proba})

In [26]:
# Display the submission table (pandas truncates the middle rows in the output).
result

Unnamed: 0,Id,Predicted
0,1,0.154081
1,2,0.321230
2,3,0.998125
3,4,0.177157
4,5,0.146794
...,...,...
1995,1996,0.376561
1996,1997,0.029227
1997,1998,0.260840
1998,1999,0.391180


In [28]:
# Write the submission table to disk without the pandas index column.
# NOTE(review): the file name looks like a typo of "submission.csv". It is
# kept unchanged here because renaming alters which file gets written;
# confirm and rename deliberately.
submission_path = 'submissioin.csv'
result.to_csv(submission_path, index=False)