In [None]:
import gc

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import plotly.express as px

In [None]:
# Load the training data from a parquet file (path is relative to the
# notebook's working directory) and preview the first rows.
train = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')
train.head()

### Exploratory Data Analysis

Let's look at the shape of the training data set.

In [None]:
# (number of rows, number of columns) of the training frame
print(train.shape)

Number of unique time IDs

In [None]:
# Count of distinct values in the time_id column
print(train['time_id'].nunique())

There are 1211 unique time IDs. Let's look at the minimum and maximum time ID.

In [None]:
# Smallest and largest time_id. The Series reductions are vectorised, whereas
# the builtin min()/max() iterate over every row in Python.
print(train['time_id'].min())
print(train['time_id'].max())

It looks like some of the time steps are missing (the range of time IDs is wider than the number of unique values). Let's look at the number of unique investment IDs.

In [None]:
# Count of distinct values in the investment_id column
print(train['investment_id'].nunique())

### Memory Reduction

In [None]:
# MEMORY
# Release the EDA frame before the modelling section loads its own copy of the
# data. Deleting the one large object (and forcing a collection) frees the
# memory without wiping the imports and every other name, which is what
# `%reset -f` would do.
del train
gc.collect()

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to the smallest dtype that fits.

    Handling per column dtype:

    * signed / unsigned integers -> smallest integer type of the same
      signedness whose range contains the column's min and max (inclusive);
    * floats -> smallest float type whose finite range contains the column's
      min and max (NaNs are ignored by ``min``/``max``);
    * ``object`` -> ``category``;
    * everything else (bool, datetime, timedelta, complex, and pandas
      extension dtypes such as category or nullable integers) is left as is.

    A column is never widened: if no narrower type fits (for example because
    the column contains ``inf`` or is entirely NaN) its dtype is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default True
        Allow float columns to be stored as ``float16``. Half precision keeps
        only about three significant decimal digits, so pass ``False`` to stop
        at ``float32`` when that loss matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float32, np.float64)
    if use_float16:
        float_types = (np.float16,) + float_types

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes are not NumPy dtypes; min()/astype() on them can
        # fail or change semantics, so leave them alone.
        if not isinstance(col_type, np.dtype):
            continue

        kind = col_type.kind
        if kind == 'O':
            df[col] = df[col].astype('category')
            continue
        if kind == 'i':
            candidates, limits = signed_types, np.iinfo
        elif kind == 'u':
            candidates, limits = unsigned_types, np.iinfo
        elif kind == 'f':
            candidates, limits = float_types, np.finfo
        else:
            # bool, datetime, timedelta, complex: no meaningful numeric downcast.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            # Candidates are ordered by size; stop once they are no narrower
            # than the current dtype so a column is never upcast.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the type's limit still fits.
            # Comparisons with NaN are False, so empty/all-NaN columns are kept.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio: a frame can report zero bytes.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

### LightGBM Baseline

In [None]:
import numpy as np
import pandas as pd

from pathlib import Path

from lightgbm import *
from sklearn.model_selection import *

In [None]:
# Directories for the pickled training data and the test/submission files.
# Nothing is loaded here; the files are read in later cells.
train_path = Path('../input/ump-train-picklefile')
test_path = Path('../input/ubiquant-market-prediction')

In [None]:
# NOTE(review): unpickling can execute arbitrary code embedded in the file --
# only load pickles that come from a trusted source.
train = pd.read_pickle(train_path/'train.pkl')
# Downcast column dtypes to shrink the frame before splitting it.
train = reduce_mem_usage(train)

# row_id and time_id are removed so they are not used as features; every
# remaining column except 'target' becomes a feature.
train.drop(['row_id', 'time_id'], axis=1, inplace=True)
X = train.drop(['target'], axis=1)
y = train["target"]
# Drop the full frame as soon as X/y exist to keep peak memory down.
del train

# shuffle=False keeps the original row order, so the last 1% of rows form the
# validation set. random_state has no effect when shuffling is disabled.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.01, random_state=42, shuffle=False)
del X
del y

In [None]:
# Gradient-boosted regression baseline optimised for RMSE.
model = LGBMRegressor(
        objective="regression",
        metric="rmse",
        boosting_type="gbdt",
        n_estimators=1000,
        num_leaves=100,
        max_depth=25,
        learning_rate=0.01,
        subsample=0.8,
        # LightGBM only applies row subsampling (bagging) when the bagging
        # frequency is non-zero; with the default of 0, `subsample` is ignored.
        subsample_freq=1,
        # Bagging is random, so fix the seed to make runs reproducible.
        random_state=42
)

# Track RMSE on the hold-out rows, logging every 10 rounds and stopping once
# the validation score has not improved for 10 consecutive rounds.
model.fit(X_train, y_train,
          eval_set=[(X_valid, y_valid)],
          eval_metric='rmse',
          callbacks=[early_stopping(10), log_evaluation(10)])

In [None]:
# Read the example test file and the example submission file for inspection.
test = pd.read_csv(test_path/'example_test.csv')
sample = pd.read_csv(test_path/'example_sample_submission.csv')

In [None]:
# Preview the first rows of the example test file.
display(test.head())

In [None]:
# Preview the first rows of the example submission file.
display(sample.head())

In [None]:
# Submission loop: the competition API yields one batch of test rows together
# with the matching submission frame; each batch must be answered through
# env.predict before the next one is served.
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test()

# Predict on exactly the columns the model was fitted on, in the same order.
# Training dropped both row_id and time_id, so dropping only row_id from the
# test frame would misalign the features whenever another non-feature column
# is present.
feature_cols = list(model.feature_name_)

for (test_df, sample_prediction_df) in iter_test:
    # Column selection builds a new frame, so the frame handed out by the API
    # is not mutated.
    sample_prediction_df['target'] = model.predict(test_df[feature_cols])
    env.predict(sample_prediction_df)