In [8]:
import numpy as np
import matplotlib.pyplot as plt

from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

In [9]:
# Read the training features, training labels and the submission features
# from whitespace-delimited text files (delimiter=None), in that order.
X, Y, X_submit = (
    np.genfromtxt(path, delimiter=None)
    for path in ('data/X_train.txt', 'data/Y_train.txt', 'data/X_test.txt')
)

# Quick sanity check on the size of the training matrix.
print(X.shape)

(200000, 14)


In [10]:
# Standardise the features. The scaler statistics come from the training
# matrix X only and are then re-used unchanged for the submission matrix.
# NOTE(review): the visible cells below fit and score on raw X rather than
# X_scaled -- confirm whether the scaled arrays are consumed elsewhere.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_submit_scaled = scaler.transform(X_submit)

In [11]:
def timer(start_time=None):
    """Two-mode stopwatch helper.

    Called without a (truthy) ``start_time`` it returns the current
    ``datetime`` so the caller can keep it as a starting mark.

    Called with a previously returned ``start_time`` it prints the time
    elapsed since that mark as hours, minutes and seconds (seconds rounded
    to two decimals) and returns ``None``.
    """
    if start_time:
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        thour, remainder = divmod(elapsed_seconds, 3600)
        tmin, tsec = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
        return None

    # No mark supplied: start the clock.
    return datetime.now()

In [12]:
# No hyperparameters are searched here: the grid is empty, so GridSearchCV
# evaluates the single fixed configuration of `clf` below. In effect this is
# plain cross-validation of one model, followed by the search's default
# refit of that model on all of X.
params = {
}

clf = XGBClassifier(
    learning_rate=0.02,
    n_estimators=100,
    nthread=1,  # one thread per model; parallelism comes from n_jobs=-1 below
    subsample=1.0,
    min_child_weight=5,
    max_depth=13,
    gamma=0.5,
    colsample_bytree=0.8,
)

grid = GridSearchCV(
    clf, params,
    scoring='roc_auc',
    # Pin the fold count explicitly. The library default changed from 3 to 5
    # folds in scikit-learn 0.22, which would silently change the reported
    # cross-validated score on a newer install.
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Time the cross-validation plus the final refit.
start_time = timer()
grid.fit(X, Y)
timer(start_time)

# Mean cross-validated ROC AUC of the (only) candidate, and its parameters
# (an empty dict, because nothing was searched).
print(grid.best_score_)
print(grid.best_params_)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.4min finished



 Time taken: 0 hours 3 minutes and 5.68 seconds.
0.7556973362736767
{}


In [14]:
# X is the same data the search was (re)fit on, so this AUC is a
# training-set score: it is optimistically biased and is NOT an estimate of
# generalisation performance. The cross-validated figure is printed next to
# it so the two are never confused.
predictions = grid.predict_proba(X)
score = roc_auc_score(Y, predictions[:, 1])  # column 1 = P(class 1)
print('Training-set AUC (optimistic): %s' % score)
print('Cross-validated AUC: %s' % grid.best_score_)

0.8512519674859262
