In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 500)


# Standard plotly imports
import plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode
import cufflinks
import cufflinks as cf
import plotly.figure_factory as ff
from sklearn import preprocessing
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Record the installed XGBoost release in the notebook output.
xgb_version = xgb.__version__
print("XGBoost version:", xgb_version)

In [None]:
def directory_size_bytes(start_path):
    """Return the combined size in bytes of every file below ``start_path``.

    The tree is traversed with ``os.walk``. A path that does not exist yields
    no files, so the result is 0. Files whose size cannot be read (for example
    dangling symlinks, or entries removed while the walk is in progress) are
    skipped instead of aborting the whole scan.

    Args:
        start_path: Directory whose contents are measured.

    Returns:
        Total size in bytes as an ``int``.
    """
    total = 0
    for path, _dirs, files in os.walk(start_path):
        for f in files:
            try:
                total += os.path.getsize(os.path.join(path, f))
            except OSError:
                # Unreadable entry: leave it out of the total.
                continue
    return total


print('# File sizes')
start_path = '../input/ubiquant-market-prediction'  # directory whose total size is reported
total_size = directory_size_bytes(start_path)
# Reported in decimal megabytes (10**6 bytes), rounded to two places.
print("Directory size: " + str(round(total_size / 1000000, 2)) + 'MB')

In [None]:
%%time
# Only a subset of columns is read from disk to keep memory usage down.
num_feat = 50
features = [f'f_{i}' for i in range(num_feat)]

basic_cols = ['target']
test_cols = ['time_id', 'investment_id']

# Columns read from train.csv: the target plus the selected features.
cols = [*basic_cols, *features]
# Identifier columns plus features; not referenced by the loads below.
train_cols = [*test_cols, *features]

# load data
train = pd.read_csv(
    '../input/ubiquant-market-prediction/train.csv',
    usecols=cols,
)
test_df = pd.read_csv(
    '../input/ubiquant-market-prediction/example_test.csv',
    usecols=features,
)
sample_prediction_df = pd.read_csv(
    '../input/ubiquant-market-prediction/example_sample_submission.csv'
)
print("Data is loaded!")

In [None]:
# Report the (rows, columns) shape of each loaded frame.
shape_reports = (
    ('train shape is {}', train),
    ('example_test shape is {}', test_df),
    ('sample_prediction_df shape is {}', sample_prediction_df),
)
for template, frame in shape_reports:
    print(template.format(frame.shape))

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Missing Values Count

# Number of null entries in each column of `train`.
missing_values_count = train.isnull().sum()
print (missing_values_count)
# DataFrame.size is rows * columns. It replaces np.product(train.shape),
# a deprecated alias that NumPy removed in 2.0.
total_cells = train.size
total_missing = missing_values_count.sum()
# Share of all cells in `train` that are null, as a percentage.
print ("% of missing data = ",(total_missing/total_cells) * 100)

In [None]:
# Separate the model inputs (feature columns) from the regression target.
X_train = train[features]
y_train = train['target']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation. The split is taken from `train`
# directly rather than from X_train/y_train, so re-running this cell always
# yields the same partition instead of repeatedly shrinking the previous one.
X_train, X_test, y_train, y_test = train_test_split(
    train.loc[:, features],
    train.loc[:, 'target'],
    test_size=0.1,
    random_state=0,
)

In [None]:
# Gradient-boosted regressor trained with the GPU histogram tree method.
# The LightGBM-style keywords previously passed here (tree_learner,
# num_leaves, feature_fraction) are not XGBoost parameters, so they never
# influenced training; they are dropped to keep the configuration honest.
# NOTE(review): colsample_bytree is XGBoost's per-tree column subsampling
# option; enable it if the 0.6 feature fraction was actually intended
# (doing so will change the fitted model).
regressor = xgb.XGBRegressor(
    n_estimators=1000,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=2022,
    tree_method='gpu_hist',  # build trees with the histogram algorithm on the GPU
)

# Fit with the held-out split as the evaluation set; training stops early when
# the evaluation metric has not improved for 6 consecutive boosting rounds.
# NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
# constructor rather than on fit() - confirm against the version printed above.
regressor.fit(X_train, y_train, early_stopping_rounds=6, eval_set=[(X_test, y_test)], verbose=1)

In [None]:
# Submission loop driven by the `ubiquant` module's environment object.
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test()
# Each iteration yields a test frame and the matching submission frame.
# Note that this rebinds the `test_df` / `sample_prediction_df` names loaded earlier.
for (test_df, sample_prediction_df) in iter_test:
    # Predict from the same feature columns used for training, passed as a plain array.
    preds = regressor.predict(test_df[features].values)
    sample_prediction_df['target'] = preds
    # Hand the filled-in frame back to the environment.
    env.predict(sample_prediction_df)