In [1]:
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb

from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

In [2]:
# One seed shared by the shuffle and the split so that Restart & Run All
# reproduces the same hold-out set (42 matches the seed the LightGBM cell uses).
SEED = 42

# Load the feature matrix, the labels and the unlabeled submission features.
# delimiter=None makes genfromtxt split on any run of whitespace.
X = np.genfromtxt('data/X_train.txt', delimiter=None)
Y = np.genfromtxt('data/Y_train.txt', delimiter=None)
X_submit = np.genfromtxt('data/X_test.txt', delimiter=None)

# Shuffle rows of X and Y in the same way (seeded, so the order is repeatable)
s = np.random.RandomState(SEED).permutation(X.shape[0])
X = X[s]
Y = Y[s]

# Split 80/20 into training/test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=SEED)

# TEMPORARY local-validation mode: the models below train on the 80% split and
# "submit" on the 20% hold-out, so the final cell can score against Y_test.
# The real X_submit loaded above is discarded here; remove these three lines
# to train on all data and predict the real submission set.
X = X_train # CHANGE BACK
Y = Y_train # CHANGE BACK
X_submit = X_test # CHANGE BACK

In [3]:
def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since one was started.

    Called with no argument (or any falsy value) it returns the current
    ``datetime`` to be used as the start mark. Called with a start mark it
    prints the elapsed hours, minutes and seconds and returns ``None``.
    """
    if start_time:
        elapsed = (datetime.now() - start_time).total_seconds()
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)
        print('\nTime taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))
        return None
    # No start mark supplied: hand back "now" so the caller can pass it in later.
    return datetime.now()
        
def test(clf):
    """Fit ``clf`` on the hold-out training split and print its ROC AUC.

    Reads the notebook-level ``X_train``, ``Y_train``, ``X_test`` and
    ``Y_test`` arrays. ``clf`` must provide ``fit`` and ``predict_proba``;
    column 1 of the predicted probabilities is scored against ``Y_test``.

    The optional notebook-level flag ``do_tests`` switches the check off when
    it is set to a falsy value. Returns ``None`` in every case.
    """
    # No visible cell defines `do_tests`, so read it defensively instead of
    # raising NameError; when the flag is absent the evaluation runs.
    if not globals().get('do_tests', True):
        return
    clf.fit(X_train, Y_train)
    predictions = clf.predict_proba(X_test)
    score = roc_auc_score(Y_test, predictions[:,1])
    print(score)

In [4]:
from sklearn.ensemble import RandomForestClassifier

# Base learner 1: random forest. Fit on (X, Y), then keep its class
# probabilities for both the training rows and the submission rows; the
# stacking cell concatenates them with the other models' outputs.
start_time = timer()
rf = RandomForestClassifier(
    n_estimators=120,
    max_depth=24,
    min_samples_leaf=5,
    n_jobs=-1,
    random_state=42,  # fixed seed: repeatable forest, same seed as the LightGBM model
)

rf.fit(X, Y)
# NOTE(review): rf_train is predicted on the same rows the forest was fit on,
# so it is an in-sample (optimistic) estimate.
rf_train = rf.predict_proba(X)
rf_test = rf.predict_proba(X_submit)
timer(start_time)


Time taken: 0 hours 0 minutes and 20.55 seconds.


In [5]:
from sklearn.ensemble import ExtraTreesClassifier

# Base learner 2: extremely randomized trees. Fit on (X, Y), then keep its
# class probabilities for the training rows and the submission rows so the
# stacking cell can use them as features.
start_time = timer()
et = ExtraTreesClassifier(
    n_estimators=68,
    max_depth=39,
    min_samples_leaf=3,
    n_jobs=-1,
    random_state=42,  # fixed seed: repeatable ensemble, same seed as the LightGBM model
)

et.fit(X, Y)
# NOTE(review): et_train is predicted on the rows the model was fit on (in-sample).
et_train = et.predict_proba(X)
et_test = et.predict_proba(X_submit)
timer(start_time)


Time taken: 0 hours 0 minutes and 7.43 seconds.


In [6]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Base learner 3: AdaBoost over depth-9 decision trees. Fit on (X, Y), then
# keep its class probabilities for the training rows and the submission rows
# so the stacking cell can use them as features.
start_time = timer()
base_clf = DecisionTreeClassifier(max_depth=9)
# The weak learner is passed positionally on purpose: scikit-learn renamed the
# keyword from `base_estimator` to `estimator` (1.2) and later removed the old
# name, while the first positional slot is the weak learner in both APIs.
# random_state makes the boosting run repeatable (same seed as the LightGBM model).
ada = AdaBoostClassifier(base_clf, n_estimators=100, random_state=42)

ada.fit(X, Y)
# NOTE(review): ada_train is predicted on the rows the model was fit on (in-sample).
ada_train = ada.predict_proba(X)
ada_test = ada.predict_proba(X_submit)
timer(start_time)


Time taken: 0 hours 3 minutes and 40.81 seconds.


In [7]:
from lightgbm import LGBMClassifier

# Base learner 4: LightGBM gradient-boosted trees. Fit on (X, Y) and keep the
# class probabilities for the training rows and the submission rows, which the
# stacking cell concatenates with the other models' outputs.
start_time = timer()

gb = LGBMClassifier(
    # Tree shape
    num_leaves=2514,
    max_depth=22,
    # Boosting schedule
    n_estimators=100,
    learning_rate=0.02,
    # Extra weight on the positive class
    scale_pos_weight=1.6,
    # Execution / reproducibility
    # NOTE(review): device='gpu' presumably needs a GPU-enabled LightGBM build - confirm.
    device='gpu',
    n_jobs=4,
    random_state=42,
)
gb.fit(X, Y)

# In-sample probabilities first, then the submission-set probabilities.
gb_train, gb_test = gb.predict_proba(X), gb.predict_proba(X_submit)

timer(start_time)


Time taken: 0 hours 2 minutes and 52.26 seconds.


In [8]:
from sklearn.model_selection import GridSearchCV

# Level-1 (stacking) features: the four base models' predict_proba outputs
# placed side by side, one row per sample.
# They are stored under a dedicated name rather than `X_train`, so the raw
# hold-out features created by the data-loading cell (and read by `test()`)
# are not overwritten.
# NOTE(review): the training meta-features are in-sample base-model
# predictions; out-of-fold predictions would give the stacker a less
# optimistic signal - consider cross_val_predict.
stack_train = np.concatenate((et_train, rf_train, ada_train, gb_train), axis=1)
x_test = np.concatenate((et_test, rf_test, ada_test, gb_test), axis=1)

start_time = timer()
# Small, heavily subsampled XGBoost model used as the meta-learner.
stack = xgb.XGBClassifier(
    n_estimators=8,
    max_depth=3,
    subsample=0.197,
    colsample_bytree=0.4,
    objective='binary:logistic',
    nthread=1,
    scale_pos_weight=1.336,
)

# Y is the label vector the base models were fit on, so its rows line up
# with stack_train.
stack.fit(stack_train, Y)
timer(start_time)


Time taken: 0 hours 0 minutes and 0.89 seconds.


In [9]:
# Score the stacked model: column 1 of predict_proba is ranked against Y_test.
# NOTE(review): this only lines up while X_submit is the hold-out split (the
# "CHANGE BACK" override in the data-loading cell); with the real submission
# set there are no labels to score against.
predictions = stack.predict_proba(x_test)
score = roc_auc_score(Y_test, predictions[:,1])
print(score)

# Two-column table: row index as ID, column-1 probability as Prob1.
Y_submit = np.vstack((np.arange(x_test.shape[0]), predictions[:,1])).T
# Six decimals instead of two: AUC depends only on the ranking, and rounding
# to 0.01 ties many rows together so the saved file would not match the score
# printed above. `delimiter` is omitted because savetxt ignores it when fmt is
# a single multi-field format string.
np.savetxt('Y_submit.txt', Y_submit, '%d, %.6f', header='ID,Prob1', comments='')

0.7765162216030855
