In [19]:
# Row index at which the stacked train+test feature matrix is split:
# rows before it are used as training data, the rest as test data.
train_size = 47_500

In [14]:
import itertools
import time

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
%matplotlib inline

In [3]:
# Print arrays with 9 decimal digits and never wrap a row onto a second line.
np.set_printoptions(precision=9, linewidth=np.inf)

In [4]:
def _load_arr0(path):
    """Return the array stored under key 'arr_0' in the .npz archive at `path`.

    The archive is opened in a `with` block so its file handle is closed as
    soon as the array has been read, instead of staying open until garbage
    collection.
    """
    with np.load(path) as archive:
        return archive['arr_0']


# Training features, training targets and test features.
X = _load_arr0('../../X_train.npz')
Y = _load_arr0('../../Y_train.npz')
TX = _load_arr0('../../X_test.npz')

In [5]:
# Stack the training rows on top of the test rows (row-wise, i.e. along axis 0)
# so that feature engineering is applied to both in a single pass.
XTX = np.concatenate([X, TX])

In [6]:
# Drop the references to the separate train/test matrices; XTX now holds both.
X = TX = None

In [7]:
# Extend the raw features with two derived groups:
#   * the element-wise square of every feature,
#   * running sums along each row, restarted at column 5000 so that the
#     column ranges [0, 5000) and [5000, end) are accumulated independently.
XTX_square = np.square(XTX)
XTX_cumsum = np.hstack([
    np.cumsum(XTX[:, :5000], axis=1),
    np.cumsum(XTX[:, 5000:], axis=1),
])
# Final layout, column-wise: raw | squared | cumulative sums.
XTX_all = np.hstack([XTX, XTX_square, XTX_cumsum])

In [8]:
# Drop the references to the intermediate matrices; only XTX_all is needed from here on.
XTX_square = XTX_cumsum = XTX = None

In [9]:
# Sanity check: display the dimensions of the combined feature matrix.
XTX_all.shape

(50000, 30000)

In [10]:
# Undo the earlier stacking: the first `train_size` rows are the training set,
# everything after them is the test set.
X = XTX_all[:train_size]
X_test = XTX_all[train_size:]
print(X.shape, X_test.shape)

(47500, 30000) (2500, 30000)


In [11]:
# calculate type 1 error
def err1(y, y_pred):
    """Mean absolute error.

    Sums |y_pred - y| over all elements and divides by len(y_pred).
    """
    abs_diff = np.abs(y_pred - y)
    total = np.sum(1.0 * abs_diff)
    return total / len(y_pred)

# calculate type 2 error
def err2(y, y_pred):
    """Mean relative error.

    Sums |y_pred - y| / y over all elements and divides by len(y_pred).
    The first argument `y` is the denominator, so argument order matters.
    """
    rel_diff = np.abs(y_pred - y) / y
    return np.sum(rel_diff) / len(y_pred)

In [None]:
# scikit-learn scorers for both error metrics. greater_is_better=False marks
# them as losses (lower is better).
scorer = {
    name: make_scorer(metric, greater_is_better=False)
    for name, metric in (('t1', err1), ('t2', err2))
}

In [None]:
# select best params
# Exhaustive search over a small LightGBM grid, evaluated with K-fold CV on a
# random subsample of the training rows. The best parameter set is tracked
# separately for each of the two error metrics.
y_id = 0          # column of Y that is tuned on
n_samples = 1000  # size of the random subsample used for tuning
n_folds = 5       # number of CV folds (also the divisor for the mean error)

# Local generator: the subsample is reproducible without touching NumPy's
# global random state.
rng = np.random.RandomState(0)
idx = rng.permutation(X.shape[0])[:n_samples]
X_tmp, y_tmp = X[idx], Y[idx]
params = {'boosting_type': ['gbdt', 'dart'],
          'num_leaves': [31, 511],
          'learning_rate': [0.05],
          'n_estimators': [100],
          'n_jobs': [23]}

# np.inf rather than the np.Inf alias, which NumPy 2.0 removed.
t1_err = np.inf
t2_err = np.inf
t1_best_params = {}
t2_best_params = {}
keys, values = zip(*params.items())
for v in itertools.product(*values):
    param = dict(zip(keys, v))
    print(param)
    t1_errr = 0
    t2_errr = 0
    kf = KFold(n_splits=n_folds, shuffle=False)
    for train_idx, val_idx in kf.split(X_tmp):
        # The fold indices are positions inside the subsample, so they must
        # index X_tmp / y_tmp; indexing X / Y would silently use the first
        # n_samples rows of the full data instead of the random sample.
        X_train, X_val = X_tmp[train_idx], X_tmp[val_idx]
        y_train, y_val = y_tmp[train_idx, y_id], y_tmp[val_idx, y_id]

        model = lgb.LGBMRegressor(**param)
        model.fit(X_train, y_train)

        # Predict once and pass (truth, prediction) in the order the metric
        # signatures expect; err2 divides by its first argument.
        y_pred = model.predict(X_val)
        t1_errr += err1(y_val, y_pred)
        t2_errr += err2(y_val, y_pred)

    # Keep the parameter set with the lowest mean error for each metric.
    if t1_errr / n_folds < t1_err:
        t1_best_params = param
        t1_err = t1_errr / n_folds
    if t2_errr / n_folds < t2_err:
        t2_best_params = param
        t2_err = t2_errr / n_folds

In [12]:
# Keyword arguments passed to ExtraTreesRegressor in the cross-validation cell.
# NOTE(review): newer scikit-learn releases appear to spell the 'mae' criterion
# as 'absolute_error' - confirm against the installed version before re-running.
params = dict(
    n_estimators=500,
    criterion='mae',
    max_depth=5,
    max_features='sqrt',
    bootstrap=True,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# 5-fold cross-validated ExtraTrees fit. Produces:
#   vy          - out-of-fold predictions for every training row (folds in order),
#   ty          - test-set predictions averaged over the fold models,
#   feature_imp - one feature-importance vector per fold model.
feature_imp = []

for y_id in range(1):  # single pass: each model is fitted on every column of Y at once
    fold_val_preds = []
    fold_test_preds = []
    for train_idx, val_idx in KFold(n_splits=5, shuffle=False).split(X):
        model = ExtraTreesRegressor(**params)
        model.fit(X[train_idx], Y[train_idx])
        feature_imp.append(model.feature_importances_)

        fold_val_preds.append(model.predict(X[val_idx]))
        fold_test_preds.append(model.predict(X_test))
    # Unshuffled folds are contiguous, so stacking them in order restores row order.
    vy = np.concatenate(fold_val_preds, axis=0)
    ty = np.mean(np.array(fold_test_preds), axis=0)

# Everything is written transposed, without index or header.
ty, vy = ty.T, vy.T
feature_imp = np.array(feature_imp).T
for array, path in ((ty, 't_test.csv'), (vy, 't_val.csv'), (feature_imp, 'feature_imp.csv')):
    df = pd.DataFrame(array)
    df.to_csv(path, index=False, header=False)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed: 76.3min
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed: 194.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 240.8min finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.1s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.2s
[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed:    0.3s finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.1s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.1s
[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed: 74.7min
[Parallel(n_jobs=-1)]: 

In [33]:
# Load the validation predictions from 't_val.csv'; header=None because the file has no header row.
df = pd.read_csv('t_val.csv', header=None)

In [24]:
# Swap rows and columns. This cell is not idempotent: running it twice
# restores the original orientation.
df = df.T

In [34]:
# Check the frame's dimensions after loading.
df.shape

(47500, 3)

In [35]:
# Preview the first rows of the validation predictions.
df.head()

Unnamed: 0,0,1,2
0,0.460213,156.195177,0.785738
1,0.467361,162.414069,0.621672
2,0.458956,186.073495,0.590581
3,0.462112,150.266357,0.782895
4,0.472306,151.22309,0.616289


In [37]:
# Mean absolute error of each prediction column against the matching column
# of Y, multiplied by a per-column factor (300, 1 and 200 respectively).
val_preds = np.array(df)
for col, factor in enumerate((300, 1, 200)):
    print(err1(val_preds[:, col], Y[:, col]) * factor)

71.71191251429006
37.37213659635686
7.188204132907542


In [38]:
# Relative error of each prediction column against the matching column of Y.
# NOTE(review): err2 divides by its first argument, and the first argument here
# is the prediction column rather than Y - confirm this is the intended
# denominator.
val_preds = np.array(df)
for col in range(3):
    print(err2(val_preds[:, col], Y[:, col]))

0.5067467139874647
0.28831892372090595
0.049928028674931124


In [None]:
# Pasted copy of the three err2 values printed by the previous cell; nothing is computed here.
0.5067467139874647
0.28831892372090595
0.049928028674931124

In [27]:
# Overwrite 't_val.csv' with the current frame, without index column or header row.
df.to_csv('t_val.csv', index=False, header=False)

In [28]:
# Load the test predictions from 't_test.csv'; header=None because the file has no header row.
df = pd.read_csv('t_test.csv', header=None)

In [29]:
# Swap rows and columns. This cell is not idempotent: running it twice
# restores the original orientation.
df = df.T

In [30]:
# Check the frame's dimensions after the transpose.
df.shape

(2500, 3)

In [31]:
# Preview the first rows of the test predictions.
df.head()

Unnamed: 0,0,1,2
0,0.474519,166.01848,0.603821
1,0.538965,145.684505,0.89676
2,0.490911,113.634429,0.794169
3,0.459936,164.424856,0.784917
4,0.454375,87.02936,0.749957


In [32]:
# Overwrite 't_test.csv' with the current frame, without index column or header row.
df.to_csv('t_test.csv', index=False, header=False)