In [2]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the training features, training labels and the features to predict for
# submission. delimiter=None makes genfromtxt split fields on whitespace.
X, Y, X_submit = (
    np.genfromtxt(path, delimiter=None)
    for path in ('data/X_train.txt', 'data/Y_train.txt', 'data/X_test.txt')
)

print(X.shape)

(200000, 14)


In [4]:
# Standardise the features. The scaler's statistics are learned from the
# training matrix only and then applied unchanged to both matrices, so nothing
# about the submission set leaks into the transform.
# NOTE(review): the cells visible further down fit and predict on the unscaled
# X / X_submit, so X_scaled and X_submit_scaled appear to be unused - confirm.
scaler = StandardScaler()
scaler.fit(X)
X_scaled, X_submit_scaled = (scaler.transform(features) for features in (X, X_submit))

In [5]:
def timer(start_time=None):
    """Two-mode stopwatch helper.

    Called without an argument (or with any falsy value) it returns the
    current ``datetime`` so the caller can keep it as a start mark.

    Called with a start mark it prints the wall-clock time elapsed since that
    mark, split into hours, minutes and seconds, and returns ``None``.
    """
    # Start mode: hand back "now" for the caller to pass in again later.
    if not start_time:
        return datetime.now()

    # Report mode: break the elapsed seconds down into h / m / s.
    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    print('\nTime taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))

In [6]:
# Hyper-parameter grid for the exhaustive search: 4 x 4 x 4 = 64 candidates.
# NOTE(review): the output saved under this cell reports 48 candidates and a
# best max_depth of 22, which this grid cannot produce - it appears to come
# from an earlier version of the grid. Re-run the cell to refresh it.
gridParams = {
    'num_leaves': np.arange(2014, 4014, 500),      # 2014, 2514, 3014, 3514
    'max_depth': np.arange(19, 26, 2),             # 19, 21, 23, 25
    'scale_pos_weight': np.arange(0.1, 2.1, 0.5),  # approx. 0.1, 0.6, 1.1, 1.6

    # Dimensions that are currently switched off:
    #'subsample': np.arange(0.4, 1, 0.1),
    #'colsample_bytree': np.arange(0.4, 1, 0.1),
    #'min_data_in_leaf': np.arange(1, 202, 50),
    #'reg_alpha' : np.arange(0.5, 1.5, 0.2),
    #'reg_lambda' : np.arange(0.5, 1.5, 0.2),
}


# Base estimator; every setting not listed in gridParams stays fixed.
# n_jobs=1 here because the parallelism comes from GridSearchCV below.
# NOTE(review): `silent` was deprecated in later LightGBM releases in favour
# of verbose=-1 - confirm against the installed version.
clf = LGBMClassifier(
    learning_rate=0.02,
    n_estimators=100,
    n_jobs=1,
    device='gpu',
    silent=True,
    random_state=42,
)

# Pin the fold count explicitly: GridSearchCV's default changed from 3 to 5
# folds in scikit-learn 0.22, and the results recorded in this notebook come
# from 3-fold runs.
# NOTE(review): n_jobs=4 starts four workers that each request device='gpu' -
# confirm the GPU can be shared like that.
grid = GridSearchCV(
    clf, gridParams,
    scoring='roc_auc',
    cv=3,
    n_jobs=4,
    verbose=1,
)

# Here we go! Time the whole search (fit on the unscaled training matrix).
start_time = timer()
grid.fit(X, Y)
timer(start_time)

# Mean cross-validated ROC AUC of the best candidate, and its parameters.
print(grid.best_score_)
print(grid.best_params_)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 19.4min
[Parallel(n_jobs=4)]: Done 144 out of 144 | elapsed: 124.9min finished



Time taken: 2 hours 8 minutes and 9.67 seconds.
0.7658680856819227
{'max_depth': 22, 'num_leaves': 2514, 'scale_pos_weight': 1.6}


KeyError: 'colsample_bytree'

In [11]:
# In-sample check: ROC AUC of the refit best estimator on the same X / Y that
# the search was fitted on. Because the model has already seen these rows the
# figure is optimistic; grid.best_score_ is the cross-validated estimate.
# roc_auc_score is imported in the notebook's import cell.
predictions = grid.predict_proba(X)
# Column 1 holds the probability of the second class in grid.classes_
# (presumably the positive label - confirm).
score = roc_auc_score(Y, predictions[:, 1])
print(score)

0.9080258007913724


In [7]:
# Build the two-column submission table: the row index as ID and the predicted
# probability from column 1 of predict_proba as Prob1.
Y_submit = np.vstack((np.arange(X_submit.shape[0]), grid.predict_proba(X_submit)[:, 1])).T

# Write six decimals instead of two: the model is tuned on ROC AUC, a ranking
# metric, and rounding to two decimals collapses the predictions onto at most
# 101 distinct values, creating ties. No `delimiter` is passed because
# np.savetxt ignores it when `fmt` is a multi-format string.
np.savetxt('Y_submit.txt', Y_submit, '%d, %.6f', header='ID,Prob1', comments='')

## Scores (best mean cross-validated ROC AUC per tuning iteration)

#### Iteration 1 of parameter tuning
```
gridParams = {
    'num_leaves': np.arange(14, 6015, 1000),
    'max_depth': np.arange(2,25,10),
    'scale_pos_weight': np.arange(1, 10000, 1000),
}


clf = LGBMClassifier(
    learning_rate=0.02,
    n_estimators=100,
    n_jobs=1,
    device='gpu',
    silent=True,
)

Time taken: 4 hours 21 minutes and 19.44 seconds.
0.7639511706338389
{'max_depth': 22, 'num_leaves': 3014, 'scale_pos_weight': 1}
```

#### Iteration 2 of parameter tuning
```
gridParams = {
    'num_leaves': np.arange(2014, 4014, 500),
    'max_depth': np.arange(12,27,5),
    'scale_pos_weight': np.arange(0.1, 2.1, 0.5),
}

clf = LGBMClassifier(
    learning_rate=0.02,
    n_estimators=100,
    n_jobs=1,
    device='gpu',
    silent=True,
)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Time taken: 2 hours 8 minutes and 9.67 seconds.
0.7658680856819227
{'max_depth': 22, 'num_leaves': 2514, 'scale_pos_weight': 1.6}
```
