In [None]:
import os
import gc
import sys
import glob

import numpy as np
import pandas as pd

import matplotlib.pylab as plt
import seaborn as sns

from tqdm import tqdm
from itertools import cycle
from scipy.stats import pearsonr

from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import ensemble
from sklearn import decomposition
from sklearn import tree
import category_encoders as ce

import cudf
import cuml
from cuml.linear_model import LogisticRegression, Ridge, Lasso
from cuml.neighbors import KNeighborsClassifier
from cuml.svm import SVC, SVR
from cuml.ensemble import RandomForestClassifier

import lightgbm as lgb
import xgboost as xgb
import catboost as cat

# Never truncate the column list when a frame is displayed.
pd.options.display.max_columns = None

# Apply the ggplot style first so the colours read below come from it.
plt.style.use("ggplot")
_prop_cycle = plt.rcParams["axes.prop_cycle"]
# A plain list of the style's colours, plus an endless iterator built from a
# separate list so the two objects stay independent of each other.
color_pal = _prop_cycle.by_key()["color"]
color_cycle = cycle(_prop_cycle.by_key()["color"])

# Silence every warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

In [None]:
%%time
train_df = pd.read_pickle("../input/ump-train-picklefile/train.pkl")

train_df = cudf.from_pandas(train_df.sample(frac=0.25))
test_df = cudf.read_csv("../input/ubiquant-market-prediction/example_test.csv")
ss_df = cudf.read_csv("../input/ubiquant-market-prediction/example_sample_submission.csv")

train_df.shape, test_df.shape, ss_df.shape

In [None]:
# Name of the column the models are fitted against.
target_col = 'target'

# Model inputs: every column of train_df other than time_id, row_id and target.
non_feature_cols = ['time_id', 'row_id', 'target']
train_cols = [col for col in train_df.columns if col not in non_feature_cols]

In [None]:
# 5-fold cross-validation grouped by time_id, so that all rows sharing a
# time_id fall on the same side of every train/validation split.
gkf = model_selection.GroupKFold(n_splits=5)

models = []   # one fitted SVR per fold, reused at inference time
scores = []   # per-fold Pearson correlation on the held-out rows
oof_predictions = np.zeros(len(train_df))   # out-of-fold prediction per row

# The sklearn splitter receives host arrays for the labels and group keys.
fold_splits = gkf.split(
    train_df,
    train_df[target_col].to_array(),
    groups=train_df['time_id'].to_array(),
)

for idx, (train_idx, valid_idx) in enumerate(fold_splits):

    print("=" * 100)
    print("FOLD : ", idx)
    print("_" * 100)

    # Slice each side of the split once, then separate features from label.
    train_part = train_df.iloc[train_idx]
    valid_part = train_df.iloc[valid_idx]

    X_train, y_train = train_part[train_cols], train_part[target_col]
    X_valid, y_valid = valid_part[train_cols], valid_part[target_col]

    print("Trian :", X_train.shape, y_train.shape)
    print("Valid :", X_valid.shape, y_valid.shape)

    model = SVR()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_valid)
    # One host copy of the predictions serves both scoring and OOF storage.
    y_pred_host = y_pred.to_array()

    # pearsonr returns (correlation, p-value); only the correlation is kept.
    score = pearsonr(y_pred_host, y_valid.to_array())[0]

    print(f"### FOLD {idx} SCORE : {score}")

    models.append(model)
    scores.append(score)

    oof_predictions[valid_idx] = y_pred_host

print("FOLDS AVG : ", np.mean(scores))

In [None]:
# Set up the competition inference API: create the environment and obtain the
# iterator that the prediction loop consumes batch by batch.
# NOTE(review): `ubiquant` is presumably only importable inside the Kaggle
# competition runtime, and make_env() may be callable only once per kernel
# session (restart the kernel before re-running this cell) — confirm both.
import ubiquant
env = ubiquant.make_env()  
iter_test = env.iter_test()

In [None]:
# Score every test batch with all fold models and submit their equal-weight
# average. The ensemble size follows len(models), so changing the number of
# CV folds needs no edit here.
if not models:
    raise RuntimeError("No trained fold models available; run the training cell first.")

for (test_df, sample_prediction_df) in iter_test:
    # Same feature columns as in training, moved onto the GPU. A separate name
    # leaves the batch frame handed over by the API untouched.
    test_features = cudf.from_pandas(test_df[train_cols])

    # One host array of predictions per fold model, averaged element-wise.
    fold_preds = [fold_model.predict(test_features).to_array() for fold_model in models]
    pred = np.mean(fold_preds, axis=0)

    sample_prediction_df['target'] = pred
    # Hand the filled-in batch back to the API before the next one is served.
    env.predict(sample_prediction_df)