In [None]:
import gc
import ubiquant
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from lightgbm import LGBMRegressor
from scipy.stats import pearsonr as p
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import StratifiedKFold


In [None]:
# Names of the 300 raw feature columns: f_0 ... f_299.
features = [f'f_{idx}' for idx in range(300)]

# Projection matrix read from CSV and stored as float16; feature blocks are
# multiplied by it (features @ B) before training and inference.
B = np.loadtxt(
    '../input/thereducertm/superduper_ls.csv',
    delimiter=',',
).astype(np.float16)
print(B.min(), B.max(), B.shape[1])

In [None]:
# Per-chunk accumulators; they are concatenated into single arrays afterwards.
x_train, time_ids, invest_ids, y_train = [], [], [], []

# Stream train.csv in 200,000-row chunks. For each chunk keep only the
# projected features (raw features @ B) plus the id/target columns, all
# downcast to 16-bit dtypes.
for i, chunk in enumerate(
    pd.read_csv(
        '../input/ubiquant-market-prediction/train.csv',
        chunksize=200000,
    )
):
    print(i)
    x_train.append((chunk[features].values @ B).astype(np.float16))
    time_ids.append(chunk['time_id'].astype(np.int16))
    invest_ids.append(chunk['investment_id'].astype(np.int16))
    y_train.append(chunk['target'].astype(np.float16))
gc.collect()

In [None]:
# Collapse each list of per-chunk pieces into one contiguous array. Every name
# is rebound right after its own concatenation so the chunk list can be freed.
x_train = np.concatenate(x_train, axis=0)
invest_ids = np.concatenate(invest_ids, axis=0)
time_ids = np.concatenate(time_ids, axis=0)
y_train = np.concatenate(y_train, axis=0)

# One shape per line, in the order: features, investment ids, time ids, targets.
print(x_train.shape, invest_ids.shape, time_ids.shape, y_train.shape, sep='\n')
gc.collect()

In [None]:
# Prepend investment_id as column 0 of the feature matrix so it is passed to
# the models alongside the projected features.
# invest_ids is int16 and x_train is float16, so NumPy promotes the combined
# matrix to float32.
# NOTE: this cell is not idempotent - re-running it prepends another id column.
# invest_ids must NOT be deleted here: it is still sliced and handed to
# kfold.split in later cells.
x_train = np.concatenate([invest_ids.reshape(-1, 1), x_train], axis=1)
# Printed explicitly: a bare `x_train.shape` would be discarded because it is
# not the last expression of the cell.
print(x_train.shape)
gc.collect()

In [None]:
# Row index at which the final fifth of the rows begins (total - total // 5).
len(x_train) - len(x_train) // 5

In [None]:
# Keep only the final fifth of the rows for training. The offset is derived
# from the data instead of being hard-coded, so the cell stays correct if the
# input size changes (it should evaluate to 2513128 on the full train.csv -
# TODO confirm against the output of the previous cell).
# It is computed once, before any array is sliced, so that all four arrays are
# cut at the same row.
# NOTE: not idempotent - re-running this cell slices the arrays again.
recent_start = x_train.shape[0] - x_train.shape[0] // 5
x_train = x_train[recent_start:]
invest_ids = invest_ids[recent_start:]
time_ids = time_ids[recent_start:]
y_train = y_train[recent_start:]

In [None]:
# 10-fold cross-validation grouped by time_id, so all rows that share a time_id
# land in the same fold.
kfold = GroupKFold(n_splits=10)

# Fitted models from every fold; later cells append to and read from this list.
models = []

# Hyperparameters shared by every XGBoost fold model.
xgb_params = dict(
    n_estimators=800,
    learning_rate=0.05,
    max_depth=12,
    subsample=0.9,
    colsample_bytree=0.9,
    missing=-999,
    random_state=1,
    tree_method='gpu_hist',
)

# invest_ids fills the `y` slot of split(); the folds are driven by `groups`
# (presumably `y` is ignored by GroupKFold - verify against the sklearn docs).
for fold, (tr_idx, va_idx) in enumerate(kfold.split(x_train, invest_ids, groups=time_ids)):
    print(fold)
    # Materialise the validation split once; it is used for early stopping and
    # again for the Pearson score below.
    X_va, y_va = x_train[va_idx], y_train[va_idx]

    model = xgb.XGBRegressor(**xgb_params)
    model.fit(
        x_train[tr_idx],
        y_train[tr_idx],
        early_stopping_rounds=20,
        eval_set=[(X_va, y_va)],
        verbose=100,
    )

    # Pearson correlation (statistic and p-value) on the held-out fold.
    print(p(y_va, model.predict(X_va)))
    models.append(model)

    # Drop the fold copies so they do not outlive the iteration.
    del X_va, y_va
    gc.collect()

In [None]:
# NOTE(review): this import would normally live in the top import cell.
from lightgbm import early_stopping, log_evaluation

# LightGBM training callbacks: stop after 20 rounds without improvement on the
# eval set (silently), and log the eval metric every 100 iterations.
callbacks = [
    early_stopping(stopping_rounds=20, verbose=0),
    log_evaluation(period=100),
]

In [None]:
# Hyperparameters shared by every LightGBM fold model.
# NOTE(review): per the LightGBM docs, `subsample` (bagging_fraction) only
# takes effect when `subsample_freq` (bagging_freq) is > 0; it is left at its
# default here - confirm whether row bagging was intended.
lgbm_params = dict(
    device='gpu',
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    tree_learner='serial',
    n_estimators=1000,
    num_leaves=64,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.6,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=2022,
)

# Same time_id-grouped folds as the XGBoost loop; the fitted LightGBM models
# are appended to the same `models` list so both families are averaged at
# inference time.
for fold, (tr_idx, va_idx) in enumerate(kfold.split(x_train, invest_ids, groups=time_ids)):
    print(fold)
    # Materialise the validation split once; it is used for early stopping and
    # again for the Pearson score below.
    X_va, y_va = x_train[va_idx], y_train[va_idx]

    model = LGBMRegressor(**lgbm_params)
    model.fit(
        x_train[tr_idx],
        y_train[tr_idx],
        eval_set=[(X_va, y_va)],
        eval_metric='rmse',
        callbacks=callbacks,
    )

    # Pearson correlation (statistic and p-value) on the held-out fold.
    print(p(y_va, model.predict(X_va)))
    models.append(model)

    # Drop the fold copies so they do not outlive the iteration.
    del X_va, y_va
    gc.collect()

In [None]:
# Training is finished: release the large training arrays before inference.
del x_train, invest_ids, time_ids, y_train
gc.collect()

In [None]:
# Inference through the competition API: each iteration yields one test batch
# and the prediction frame to fill in for it.
env = ubiquant.make_env()
iter_test = env.iter_test()
for test_df, sample_prediction_df in iter_test:
    # Same preprocessing as training: project the raw features through B
    # (float16) and prepend investment_id (int16) as column 0.
    invest_ids = test_df['investment_id'].values.astype(np.int16)
    x_tt = np.concatenate(
        [
            invest_ids.reshape(-1, 1),
            (test_df[features].values @ B).astype(np.float16),
        ],
        axis=1,
    )

    # Ensemble: unweighted mean over every fitted fold model in `models`.
    y_preds = [fitted.predict(x_tt) for fitted in models]
    sample_prediction_df['target'] = np.mean(y_preds, axis=0)

    env.predict(sample_prediction_df)
    display(sample_prediction_df)

    # Free the per-batch arrays before the next batch arrives.
    del x_tt, y_preds
    gc.collect()