In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only Kaggle input directory.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This notebook demonstrates a simple approach: training a LightGBM model that uses Pearson correlation as a custom evaluation metric during hyperparameter tuning.

The notebook also covers data pre-processing and EDA to give a high-level understanding of the target and feature distributions as well as their pairwise correlations.

Some of this work is inspired by popular notebooks in the competition.

EDA:

- https://www.kaggle.com/ilialar/ubiquant-eda-and-baseline
- https://www.kaggle.com/lucamassaron/eda-target-analysis#Target-analysis
- https://www.kaggle.com/marketneutral/ubiquant-feature-exploration#Thinking-(and-Trading)-Fast-and-Slow

In [None]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import plotly.graph_objects as go
import matplotlib.pyplot as plt

import plotly.express as px

import lightgbm as lgb

from tqdm.notebook import tqdm
from sklearn.base import clone
from sklearn.metrics import (
    roc_auc_score,
    mean_squared_error
)

In [None]:
# Column names used throughout the notebook.
target = 'target'
features = [f'f_{i}' for i in range(300)]

# dtype requested for every column that is read from the CSV:
# integer identifiers, half-precision floats for the target and all features.
data_types_dict = {
    'time_id': 'int32',
    'investment_id': 'int16',
    "target": 'float16',
    **{feature_name: 'float16' for feature_name in features},
}

def load_data(data_folder, file_name, data_types_dict):
    """Read ``<data_folder>/<file_name>.csv`` into a DataFrame.

    Parameters
    ----------
    data_folder : str
        Directory that contains the CSV file (a trailing separator is fine).
    file_name : str
        File name without the ``.csv`` extension.
    data_types_dict : dict
        Maps column name -> dtype. Only these columns are read, each with the
        requested dtype.

    Returns
    -------
    pandas.DataFrame
        The selected columns, with the first selected column (in file order)
        used as the index.
    """
    csv_path = os.path.join(data_folder, file_name + '.csv')
    return pd.read_csv(
        csv_path,
        # Pass a real list rather than a dict view.
        usecols=list(data_types_dict),
        dtype=data_types_dict,
        # Position 0 refers to the first of the *selected* columns.
        index_col=0,
    )

In [None]:
# Load the competition training set with the compact dtypes declared above.
train_data = load_data(
    data_folder='/kaggle/input/ubiquant-market-prediction',
    file_name='train',
    data_types_dict=data_types_dict,
)

# EDA of feature and target

In [None]:
# Histogram of the per-investment standard deviation of the target.
std_target = train_data.groupby(['investment_id'])['target'].std()

fig, ax = plt.subplots(1, 1, figsize=(12, 6))
std_target.plot.hist(bins=60, ax=ax)
ax.set_title("standard deviation of target distribution")
plt.show()

In [None]:
# Three stacked panels, one figure: mean of target, std of target and number
# of distinct investments for every time_id.
# All panels are drawn on explicit axes and shown once. Calling plt.show()
# between panels splits them across figures, and the third panel previously
# reused slot 2 of the grid.
target_grouped_by_time = train_data.groupby(['time_id'])['target']
panel_mean = target_grouped_by_time.mean()
panel_std = target_grouped_by_time.std()
panel_n_investments = train_data.groupby(['time_id'])['investment_id'].nunique()

fig, axes = plt.subplots(3, 1, figsize=(12, 10), sharex=True)

panel_mean.plot(ax=axes[0])
axes[0].axhline(y=np.mean(panel_mean), color='r', linestyle='--', label="mean")
axes[0].set_title("Mean of target by time")
axes[0].legend()

panel_std.plot(ax=axes[1])
axes[1].axhline(y=np.mean(panel_std), color='r', linestyle='--', label="mean")
axes[1].set_title("STD of target by time")
axes[1].legend()

panel_n_investments.plot(ax=axes[2])
axes[2].set_title("Number of investment_id by time")

fig.tight_layout()
plt.show()

In [None]:
# Cross-sectional mean and standard deviation of the target at every time_id.
target_groups = train_data.groupby(['time_id'])['target']
mean_target_by_time = target_groups.mean()
std_target_by_time = target_groups.std()

# Row-level versions of the same statistics. String aggregation names are used
# so the per-row std is the same pandas sample std as std_target_by_time,
# instead of depending on how pandas dispatches np.std.
mean_target_at_time = target_groups.transform('mean')
std_target_at_time = target_groups.transform('std')

# Downstream cells read these two columns to build abs_z_score.
train_data['target_from_mean'] = train_data['target'] - mean_target_at_time
train_data['std_target_at_time'] = std_target_at_time

# Mean target over time with a +/- one standard deviation band.
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
ax.plot(mean_target_by_time.index, mean_target_by_time, "o-", color="r")
ax.fill_between(
    mean_target_by_time.index,
    mean_target_by_time - std_target_by_time,
    mean_target_by_time + std_target_by_time,
    alpha=0.2,
    color="r")
ax.axhline(y=np.mean(mean_target_by_time), color='g', linestyle='--', label="mean")
ax.set_ylabel("target")
ax.set_xlabel("time")
ax.legend()  # the "mean" label was defined but never displayed
plt.show()

In [None]:
# Magnitude of each row's deviation from the cross-sectional mean, in raw
# units and in units of that time_id's standard deviation.
deviation_magnitude = np.abs(train_data['target_from_mean'])
train_data['abs_target_from_mean'] = deviation_magnitude
train_data['abs_z_score'] = deviation_magnitude / train_data['std_target_at_time']

# Per-investment mean and std of both deviation measures.
summary_stats = ['mean', 'std']
train_data.groupby(['investment_id']).agg({
    'abs_target_from_mean': summary_stats,
    'abs_z_score': summary_stats,
}).reset_index()

# Time-series autocorrelation

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# Partial autocorrelation of the per-time_id mean target, first 25 lags.
plot_pacf(
    mean_target_by_time,
    lags=25,
    title="Partial Autocorrelation chart: (Mean Target)",
)
plt.show()

In [None]:
# Partial autocorrelation of the per-time_id target standard deviation, first 25 lags.
plot_pacf(
    std_target_by_time,
    lags=25,
    title="Partial Autocorrelation chart: (Std Target)",
)
plt.show()

I'm wondering whether the temporal difference of historical targets correlates with the target.

In [None]:
# Lagged targets per investment: the target 1 and 2 observations earlier for
# the same investment_id, in time_id order.
#
# train_data is indexed by time_id, which repeats for every investment, so a
# shifted Series built from a re-sorted copy cannot be aligned back by label.
# Pandas either raises on the duplicated labels or, if only ties were
# reordered, silently assigns values to the wrong rows. Instead, work on a
# copy with a positional RangeIndex, sort it stably, and restore the original
# row order before assigning by position.
# Assumes the index of train_data is named 'time_id', as produced by load_data.
chronological = (
    train_data[['investment_id', 'target']]
    .reset_index()
    .sort_values(by=['time_id'], ascending=True, kind='stable')
)
target_by_investment = chronological.groupby(['investment_id'])['target']

# sort_index() puts the shifted values back in train_data's row order.
train_data['target_lag1'] = target_by_investment.shift(1).sort_index().to_numpy()
train_data['target_lag2'] = target_by_investment.shift(2).sort_index().to_numpy()

In [None]:
# Lagged targets with missing values (no earlier observation) treated as 0.
lag1_or_zero = np.where(train_data['target_lag1'].isna(), 0, train_data['target_lag1'])
lag2_or_zero = np.where(train_data['target_lag2'].isna(), 0, train_data['target_lag2'])

# Change from the previous observation to the current one ...
train_data['diff_target_lag1'] = train_data['target'] - lag1_or_zero
# ... and change between the two previous observations.
train_data['diff_target_lag1_2'] = lag1_or_zero - lag2_or_zero

In [None]:
# Pearson correlation between (lag1 - lag2) of the target and the current target.
lagdiff_vs_target = np.corrcoef(train_data['diff_target_lag1_2'], train_data['target'])
lagdiff_vs_target[0][1]

In [None]:
# Pearson correlation between (target - lag1) and (lag1 - lag2).
diff_vs_lagdiff = np.corrcoef(train_data['diff_target_lag1'], train_data['diff_target_lag1_2'])
diff_vs_lagdiff[0][1]

As the correlation between $target_{t-1} - target_{t-2}$ and $target$ is low, this temporal-difference feature of the target doesn't appear to be a great predictor.

# Correlation between features and target

In [None]:
# Correlation between the target and each feature, computed separately for every time_id.
rows_by_time = train_data.groupby('time_id')

# For each feature: the 2x2 correlation matrix per time_id is flattened with
# unstack(), and column 1 is the (target, feature) entry.
corr_lists = [
    rows_by_time[['target', feature_name]].corr().unstack().iloc[:, 1]
    for feature_name in features
]

# One row per time_id, one column per feature.
corr_data = np.stack(corr_lists, axis=1)

In [None]:
# Label the rows of the correlation matrix with the sorted distinct time_ids
# (train_data's index) and the columns with the feature names.
time_id_axis = pd.Index(np.unique(train_data.index), name='time_id')
corr_dataframe = pd.DataFrame(corr_data, columns=features, index=time_id_axis)
corr_dataframe

In [None]:
# Per-time_id correlation with the target for two example features, each with
# a dashed line at that same feature's mean correlation.
# The reference line for the f_180 panel was previously computed from f_7.
# Both panels are drawn in one figure and shown once.
fig, axes = plt.subplots(2, 1, figsize=(12, 8), sharex=True)

for panel_ax, feature_name in zip(axes, ['f_164', 'f_180']):
    corr_over_time = corr_dataframe[feature_name]
    panel_ax.axhline(y=np.mean(corr_over_time), color='r', linestyle='--', label="mean")
    panel_ax.plot(corr_dataframe.index, corr_over_time)
    panel_ax.set_title(f"Correaltion of between target and {feature_name} by time")
    panel_ax.legend()

fig.tight_layout()
plt.show()

Standard deviation of correlations over time depends on the number of investments

In [None]:
def top_correlated_features(data, features, target):
    """Absolute Pearson correlation between ``target`` and each feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``target`` column and every column in ``features``.
        The correlations are computed on this frame, so passing a subset of
        rows yields the correlations within that subset.
    features : list of str
        Feature column names.
    target : str
        Name of the target column.

    Returns
    -------
    pandas.Series
        ``|corr(target, feature)|`` indexed by feature name, in the order of
        ``features``.
    """
    target_values = data[target]
    corrs = [np.corrcoef(target_values, data[feature])[0][1] for feature in features]
    feature_corrs = pd.Series(np.abs(np.asarray(corrs, dtype=float)), index=features)
    return feature_corrs

# The 20 features with the largest absolute correlation to the target,
# drawn as a horizontal bar chart with the strongest feature at the top.
all_feature_corrs = top_correlated_features(train_data, features, target)
top_features_series = all_feature_corrs.nlargest(20)
top_features_ax = top_features_series.plot(kind='barh', figsize=(12, 6))
top_features_ax.invert_yaxis()

Features highly correlated with the target might be correlated with each other. Adding all of them together wouldn't be so informative to the model. Let's try removing those with high correlation with each other.

In [None]:
# Pairwise correlations among the top-20 features (names sorted alphabetically),
# shown as a hierarchically clustered heat map.
top_feature_names = sorted(set(top_features_series.nlargest(20).index))
top_20_correlations = train_data.loc[:, top_feature_names].corr()
sns.clustermap(top_20_correlations, figsize=(20, 20), cmap="mako", vmin=-1, vmax=1)

The correlation cluster map shows that f_270, f_119, and f_76 have strong negative correlation. f_25, f_71, and f_155 also have strong positive correlation. From these 2 groups, I keep the features most correlated with the target. This step can be automated later in model training, but here I show a simple single step.

In [None]:
# Features dropped because they are strongly correlated with other top features.
remove_list = ['f_76', 'f_270', 'f_71', 'f_155', 'f_119']

# All 300 feature names, in order, minus the ones listed above.
features_left = [
    feature_name
    for feature_name in (f'f_{i}' for i in range(300))
    if feature_name not in remove_list
]

In [None]:
# Top-20 features by absolute correlation with the target after the removals.
remaining_feature_corrs = top_correlated_features(train_data, features_left, target)
top_features_series_2 = remaining_feature_corrs.nlargest(20)
top_features_ax_2 = top_features_series_2.plot(kind='barh', figsize=(12, 6))
top_features_ax_2.invert_yaxis()

In [None]:
# Clustered heat map of pairwise correlations among the new top-20 features
# (names sorted alphabetically).
top_feature_names_2 = sorted(set(top_features_series_2.index))
top_20_correlations_2 = train_data.loc[:, top_feature_names_2].corr()
sns.clustermap(top_20_correlations_2, figsize=(20, 20), cmap="mako", vmin=-1, vmax=1)

From the previous section, we see that the data has a consistent mean target over time, and seemingly consists of investment types spanning a wide range of risks. An outperforming investment would have a target higher than the mean, and vice versa. I wonder whether the set of features correlated with the target varies with investment performance. In other words, does feature importance depend on investment performance?

If this is the case, we could train different models using the important features of each investment class. By removing unnecessary features, a model would suffer less from noise.

In [None]:
# Rows whose target sits close to (|z| < 0.20) or far from (|z| > 0.80)
# the cross-sectional mean of their time_id.
near_mean_rows = train_data['abs_z_score'] < 0.20
far_from_mean_rows = train_data['abs_z_score'] > 0.80

underperform_investments = train_data.loc[near_mean_rows]
outperform_investments = train_data.loc[far_from_mean_rows]

In [None]:
# Top-20 features by absolute correlation with the target, for each subset.
underperform_features = top_correlated_features(
    data=underperform_investments, features=features, target=target
).nlargest(20)
outperform_features = top_correlated_features(
    data=outperform_investments, features=features, target=target
).nlargest(20)

In [None]:
# Side-by-side bar charts of the top features for the two subsets.
fig, ax = plt.subplots(1, 2, figsize=(16, 6))

panels = [
    (underperform_features, 'Most Predictive Features for low-risk investment'),
    (outperform_features, 'Most Predictive Features for high-risk investment'),
]
for panel_ax, (panel_corrs, panel_title) in zip(ax, panels):
    bars_ax = panel_corrs.plot(kind='barh', title=panel_title, legend=False, ax=panel_ax)
    bars_ax.invert_yaxis()  # strongest feature at the top

ax[0].set_xlabel("Pearson Corr with Target")
ax[1].set_xlabel("Pearson Corr with Target")

The graph shows that feature importance is independent of investment performance.


# Training model

We train the model with the set of features determined in the previous step. Assuming `time_id` is properly ordered chronologically in this time-series data, we should split the training and validation sets by time to avoid spillover effects from leakage of future information.

In [None]:
def corr(a, b, w):
    """Weighted Pearson correlation of ``a`` and ``b`` with weights ``w``.

    Computed as the weighted covariance of ``a`` and ``b`` divided by the
    square root of the product of their weighted variances.
    """
    total_weight = np.sum(w)

    def weighted_cov(x, y):
        # Centre both inputs on their weighted means before multiplying.
        x_centered = x - np.average(x, weights=w)
        y_centered = y - np.average(y, weights=w)
        return np.sum(w * x_centered * y_centered) / total_weight

    covariance = weighted_cov(a, b)
    scale = np.sqrt(weighted_cov(a, a) * weighted_cov(b, b))
    return covariance / scale

def corr_metric(labels, preds):
    """Evaluation metric returning the correlation of labels and predictions.

    Returns a ``(name, value, is_higher_better)`` tuple; the value is ``corr``
    evaluated with a weight of 1 for every observation.
    """
    uniform_weights = np.ones(len(labels))
    score = corr(labels, preds, uniform_weights)
    return 'corr', score, True

def corr_eval(preds, dataset):
    """Custom LightGBM evaluation function: Pearson correlation.

    Parameters
    ----------
    preds : array-like
        Predictions for the rows of ``dataset``.
    dataset : object with ``get_label()``
        Evaluation data; its labels are the ground truth.

    Returns
    -------
    tuple
        ``('corr', value, True)``, where ``True`` marks higher as better.
    """
    labels = dataset.get_label()
    # Correlate labels with the predictions. Correlating labels with
    # themselves always gives 1.0 and makes the metric useless.
    return 'corr', np.corrcoef(labels, preds)[0][1], True

Manual random search for hyperparameter tuning

In [None]:
from sklearn.model_selection import ParameterGrid

# Draw n_round random hyperparameter configurations.
n_round = 10
dicts = []

for _ in range(n_round):
    random_params = {
        'num_leaves': 2 ** np.random.randint(3, 8),              # 8, 16, ..., 128
        'learning_rate': 10 ** (-np.random.uniform(0.1, 2)),     # roughly 0.01 to 0.8
        'min_data_in_leaf': np.random.randint(50, 1000),
        'bagging_fraction': 0.5,
        # LightGBM only applies bagging_fraction when bagging_freq is non-zero;
        # this mirrors subsample_freq=1 in the time-split cross-validation.
        'bagging_freq': 1,
        'feature_pre_filter': False
    }
    # A plain list append replaces np.append, which rebuilt an object array
    # on every iteration.
    dicts.append(random_params)

# Column-wise view of the draws: parameter name -> list of the n_round sampled values.
# NOTE(review): feeding this to ParameterGrid expands to the Cartesian product
# of the value lists, not the n_round sampled configurations. Iterate over
# `dicts` to evaluate exactly the sampled ones.
merged_random_params = {k: [d[k] for d in dicts] for k in dicts[0]}

In [None]:
# LightGBM dataset built from the selected top features and the target column.
lgb_train_data = lgb.Dataset(
    data=train_data[top_feature_names_2],
    label=train_data['target'],
)

def corr_eval(preds, dataset):
    """Custom LightGBM evaluation function: Pearson correlation.

    Parameters
    ----------
    preds : array-like
        Predictions for the rows of ``dataset``.
    dataset : object with ``get_label()``
        Evaluation data; its labels are the ground truth.

    Returns
    -------
    tuple
        ``('corr', value, True)``, where ``True`` marks higher as better.
    """
    labels = dataset.get_label()
    # Correlate labels with the predictions. Correlating labels with
    # themselves always gives 1.0, so early stopping and model comparison
    # would see a constant score.
    return 'corr', np.corrcoef(labels, preds)[0][1], True

def lgbCV(lgb_train_data, params, num_boost_round, early_stopping_rounds):
    """Run 5-fold, non-stratified LightGBM cross-validation.

    ``corr_eval`` is passed as the custom evaluation function and progress is
    reported every 50 rounds. The evaluation history produced by ``lgb.cv``
    is returned unchanged.
    """
    cv_options = dict(
        nfold=5,
        num_boost_round=num_boost_round,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=50,
        seed=112,
        feval=corr_eval,
        stratified=False,
        show_stdv=True,
    )
    return lgb.cv(params, lgb_train_data, **cv_options)

In [None]:
# report_params = [(params, lgbCV(lgb_train_data, params,  num_boost_round = 2000, early_stopping_rounds = 100)) for params in ParameterGrid(merged_random_params)]

Time-based cross-validation

In [None]:
from sklearn.model_selection import TimeSeriesSplit
import lightgbm
from lightgbm import LGBMRegressor

def time_split_cross_validation(train_data, features, target):
    """Fit one LightGBM regressor per expanding-window time split.

    The split is made over the sorted distinct ``time_id`` values rather than
    over row positions. Every row of a given ``time_id`` therefore lands
    entirely in the training part or entirely in the validation part, and
    validation periods always come after the training periods, whatever the
    row order of ``train_data``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training rows; ``time_id`` must be the index (or an index level) or a
        column.
    features : list of str
        Feature columns fed to the model.
    target : str
        Name of the target column.

    Returns
    -------
    (dict, dict)
        ``models[fold]`` is the fitted ``LGBMRegressor`` and
        ``corr_scores[fold]`` is the Pearson correlation between its
        predictions and the target on that fold's validation rows.

    Notes
    -----
    ``num_leaves``, ``learning_rate`` and ``subsample`` are drawn at random
    for every fold, so fold scores are not comparable as repeated runs of a
    single configuration.
    """
    models = dict()
    corr_scores = dict()

    # Resolve each row's time_id whether it lives in the index or in a column.
    if 'time_id' in train_data.index.names:
        row_time_ids = train_data.index.get_level_values('time_id').to_numpy()
    else:
        row_time_ids = train_data['time_id'].to_numpy()
    unique_time_ids = np.unique(row_time_ids)  # sorted ascending

    tscv = TimeSeriesSplit(max_train_size=None, n_splits=10)

    for fold, (train_pos, valid_pos) in enumerate(tscv.split(unique_time_ids)):

        # Map the split over distinct time_ids back to row masks.
        train = train_data[np.isin(row_time_ids, unique_time_ids[train_pos])]
        valid = train_data[np.isin(row_time_ids, unique_time_ids[valid_pos])]

        lgbm = LGBMRegressor(
            num_leaves=2 ** np.random.randint(3, 8),
            learning_rate=10 ** (-np.random.uniform(0.1, 2)),
            n_estimators=2000,
            min_child_samples=1000,
            subsample=np.random.uniform(0.5, 1.0),
            subsample_freq=1
        )

        # NOTE(review): early_stopping_rounds as a fit() keyword is the
        # LightGBM 3.x API; newer releases expect an early-stopping callback.
        # Confirm against the installed version.
        lgbm.fit(train[features], train[target]
                 , eval_set=[(valid[features], valid[target])]
                 , eval_metric=corr_metric
                 , early_stopping_rounds=100)

        preds = lgbm.predict(valid[features])

        models[fold] = lgbm
        corr_scores[fold] = np.corrcoef(valid[target], preds)[0][1]

    return models, corr_scores

In [None]:
# Fit one model per time split on the de-correlated top features.
models, scores = time_split_cross_validation(
    train_data=train_data,
    features=top_feature_names_2,
    target='target',
)

When making predictions, we can use average of predictions from multiple models to mitigate overfitting.

In [None]:
def apply_model(models, df, features):
    """Write the average prediction of all models into ``df['target']``.

    Parameters
    ----------
    models : dict
        Maps any key (e.g. fold number) to a fitted model exposing
        ``predict``.
    df : pandas.DataFrame
        Frame holding the ``features`` columns; its ``target`` column is
        created or overwritten in place.
    features : list of str
        Feature columns passed to every model.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, for convenience.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    fitted_models = list(models.values())
    if not fitted_models:
        raise ValueError("apply_model needs at least one model")

    feature_frame = df[features]
    # Average the per-model predictions and store the result. The previous
    # version added predictions onto the existing target and discarded the
    # division by the number of models.
    per_model_preds = [model.predict(feature_frame) for model in fitted_models]
    df['target'] = np.mean(per_model_preds, axis=0)
    return df

In progress, let's do

* Time-span stratified cross-validation: stratifying the groups of count(time_id) * count(investment_id) to ensure equal number of target corresponding to each (time_id, investment_id)
* Hyperparameter optimization with Optuna

In [None]:
# import ubiquant
# env = ubiquant.make_env()   # initialize the environment

# iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission

# for (test_df, sample_prediction_df) in iter_test:
#     apply_model(models, sample_prediction_df, top_feature_names_2)  # make your predictions here
#     env.predict(sample_prediction_df)   # register your predictions

# Other concerns and ideas

- Concerns
1. It's unclear what the target is. It could be a measure of investment performance, such as log-return or alpha, computed over overlapping windows. For example, the target at time t may be derived from information at time t+1 or t+2, representing the next 2-period return. If this is the case, the time-series split for cross-validation should leave an appropriate gap between the training and validation periods, so that results are validated on a truly out-of-train-sample period.
2. There is obviously an unequal number of investments over time, but I rarely leverage the information about investments that are missing in some periods. The number of investments could be a good signal for the target. For instance, the lower `count(investment_id)` is, the more likely the target is to deviate from its $\mathbb{E}_{investment}[target]$.

- Further ideas
1. We can conduct stationarity test on the target to investigate the long-term behavior. The plot of mean and standard deviation over time signifies overall target is mean-reverting. However, the long-term behavior of each investment_id can be different. We can compute p-value of unit-root testing from `adfuller(train_data.loc[train_data.investment_id == X])`.
2. I'd like to experiment with transforming $target$ into $\Delta(target)$ or $\Delta(target) - \Delta\mathbb{E}_{investment}[target]$, where $\Delta$ refers to the temporal difference of values. This should capture the derivative of the target better than directly predicting the target. However, it doesn't guarantee the highest correlation (the evaluation metric of this competition).