In [None]:
import os
import gc

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from tqdm.notebook import tqdm

# Plot defaults: ggplot theme and 9x9 figures.
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': [9, 9]})

# DataFrame rendering: never hide columns, show up to 350 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 350

## Update

- Changed the Pearson correlation calculation, following [this discussion](https://www.kaggle.com/c/ubiquant-market-prediction/discussion/303627)
- Changed GroupKFold to GroupTimeSeriesSplit, following [this discussion](https://www.kaggle.com/c/ubiquant-market-prediction/discussion/304036)
- Using only the 80% most frequent investment ids due to memory errors  

---

- Using GroupKFold: 
    - CV: 0.12937332275514163
    - LB: 0.108
- Using GroupTimeSeriesSplit:
    - CV: 0.11928179218531812
    - LB: ?

In [None]:
# Dataset directory that holds the pickled training frame read below.
PATH = '../input/ubiquant-market-prediction-half-precision-pickle'

In [None]:
# List the files available in the dataset directory.
os.listdir(PATH)

In [None]:
%%time

# Read the pickled training frame from the dataset directory.
# NOTE(review): the directory name suggests the features are stored in half
# precision -- confirm with train.info().
train = pd.read_pickle(f'{PATH}/train.pkl')

# Cast the identifier columns to plain integers, one column at a time so the
# whole (large) frame is never copied in a single astype call.
id_columns = ('time_id', 'investment_id')
for col in id_columns:
    train[col] = train[col].astype(int)

In [None]:
# Run a garbage-collection pass now that the frame has been loaded and cast.
gc.collect()

In [None]:
# Column dtypes and memory usage of the training frame.
train.info()

In [None]:
# Row count of the full training frame; used below to size the EDA samples.
N_ROWS = train.shape[0]
# Names of the 300 anonymised feature columns: f_0 ... f_299.
features = [f'f_{i}' for i in range(300)]

## EDA

In [None]:
# Preview the training frame.
train

In [None]:
# Total number of missing cells in the frame (author's note: there are none).
train.isna().sum().sum()

In [None]:
# Number of rows per investment_id, most frequent first.
train['investment_id'].value_counts()

In [None]:
# Distinct investment ids present in the training frame, and how many there are.
unique_investment_ids = train['investment_id'].unique()
print(unique_investment_ids, unique_investment_ids.shape, sep='\n')

In [None]:
# for x in unique_investment_ids[:20]:
#     train[train['investment_id'] == x].plot('time_id', 'target', title=f'investment_id = {x}', figsize=(6, 6));

### On the target

In [None]:
# Distribution of the target.
train['target'].plot.hist();

In [None]:
# Summary statistics of the target.
train[['target']].describe()

In [None]:
# Histogram of the per-feature means, estimated on a reproducible 2% row sample.
(
    train.sample(int(N_ROWS * 0.02), random_state=1)[features]
    .describe()
    .loc['mean']
    .plot.hist(title='0 mean')
);

In [None]:
# Histogram of the per-feature standard deviations on a reproducible 2% row sample.
# The title has to be passed by keyword: the first positional parameter of
# Series.plot.hist is not the title (it is `by` in recent pandas and `bins` in
# older releases), so a positional '1 std' never reaches the plot title.
train.sample(int(N_ROWS * 0.02), random_state=1)[features].describe().T['std'].plot.hist(title='1 std');

In [None]:
# %%time

# many highly correlated features, so some types of models can struggle with this
# train.sample(int(N_ROWS * 0.001), random_state=1)[['target'] + features].corr().style.background_gradient(axis=None)  

In [None]:
# f_109 against the target on a reproducible 2% row sample.
(
    train.sample(int(N_ROWS * 0.02), random_state=1)[['f_109', 'target']]
    .plot.scatter(x='f_109', y='target')
);

In [None]:
# f_108 against the target on a reproducible 2% row sample.
(
    train.sample(int(N_ROWS * 0.02), random_state=1)[['f_108', 'target']]
    .plot.scatter(x='f_108', y='target')
);

## Modelling

In [None]:
# Keep only the 80% most frequent investment ids (the full set caused memory errors).
N = int(len(unique_investment_ids) * 0.8)
# value_counts() is sorted by frequency and *indexed* by investment_id, so the
# ids to keep are in its index. Passing the Series itself to isin() would match
# investment ids against the row counts instead of against the ids.
investments_to_use = train['investment_id'].value_counts().index[:N]
print('Selecting:', N)
print('Before:', train.shape)
train = train[train['investment_id'].isin(investments_to_use)].reset_index(drop=True)
print('After:', train.shape)

train

In [None]:
from sklearn import linear_model
from sklearn import metrics
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
import lightgbm as lgbm
from sklearn import svm

In [None]:
from typing import Iterator, Tuple


class GroupTimeSeriesSplit:
    """
    From: https://www.kaggle.com/c/ubiquant-market-prediction/discussion/304036

    Time-ordered, group-aware cross-validation splitter.

    The unique time ids found in ``groups`` are cut into consecutive chunks of
    at least ``holdout_size`` ids each. Starting from the most recent chunk and
    walking backwards, each of the last ``n_folds`` chunks is used once as the
    test set; the matching training set is every row whose ``time_id`` is
    strictly smaller than the smallest time id of that chunk. Test time ids
    therefore never appear in the training data and the splits are temporal.

    Parameters
    ----------
    n_folds : int
        Maximum number of folds to yield (fewer if there are fewer chunks).
    holdout_size : int
        Target number of unique time ids per test chunk.
    groups : pd.Series or array-like
        Time id of every row; only its unique values are used.
    """
    def __init__(self, n_folds: int, holdout_size: int, groups: pd.Series) -> None:
        self.n_folds = n_folds
        self.holdout_size = holdout_size
        self.groups = groups

    def split(self, X: pd.DataFrame) -> Iterator[Tuple[pd.Index, pd.Index]]:
        """
        Yield ``(train_index, test_index)`` pairs, most recent test chunk first.

        Parameters
        ----------
        X : pd.DataFrame
            Frame to split; must contain a ``time_id`` column. The yielded
            values are index labels of ``X`` (suitable for ``X.loc``).

        Raises
        ------
        ValueError
            If ``holdout_size`` is larger than the number of unique time ids,
            so that not even one chunk can be formed.

        Notes
        -----
        If ``n_folds`` reaches back to the oldest chunk, that fold's training
        index is empty because no row precedes it in time.
        """
        # Sorted unique time ids; np.asarray accepts a Series as well as a plain array/list
        unique_time_ids = np.unique(np.asarray(self.groups))

        n_chunks = len(unique_time_ids) // self.holdout_size
        if n_chunks < 1:
            # np.array_split would raise a less helpful ValueError here
            raise ValueError(
                f'Cannot build any fold: holdout_size={self.holdout_size} '
                f'with only {len(unique_time_ids)} unique time ids.'
            )

        # Chunk the time ids, then reverse so we work backwards in time and
        # keep only the n_folds most recent chunks
        test_chunks = np.array_split(unique_time_ids, n_chunks)[::-1][:self.n_folds]

        for time_ids in test_chunks:
            # Test rows: every row whose time id belongs to the current chunk
            test_index = X.loc[X['time_id'].isin(time_ids)].index

            # Train rows: everything strictly before the chunk's first time id.
            # A gap could be added here with time_id < (min - gap).
            train_index = X.loc[X['time_id'] < np.min(time_ids)].index

            yield train_index, test_index

In [None]:
# Run a garbage-collection pass before the cross-validation cell.
gc.collect()

In [None]:
%%time

# Columns pulled for every fold: the model features plus the two identifier
# columns (identifiers are used for reporting/scoring, never as model inputs).
FEATS = features + ['investment_id', 'time_id']

pearsons = []  # mean per-time_id Pearson correlation of each fold
models = []    # fitted model of each fold, in fold order
scalers = []   # only filled if a standardisation step is added (currently none)

FOLDS = 5
gtss = GroupTimeSeriesSplit(n_folds=FOLDS, holdout_size=20, groups=train['time_id'])
for fold, (tr, val) in enumerate(gtss.split(train)):
    print('FOLD:', fold)

    # Materialise the training part, then drop the index to free memory early
    X_train = train.loc[tr, FEATS]
    y_train = train.loc[tr, 'target']
    del tr
    gc.collect()

    # Same for the validation part
    X_val = train.loc[val, FEATS]
    y_val = train.loc[val, 'target']
    del val
    gc.collect()

    print('Train time_id range:', X_train['time_id'].min(), '->', X_train['time_id'].max())
    print('Val time_id range:', X_val['time_id'].min(), '->', X_val['time_id'].max())

    # Alternatives that were tried at this point and are currently disabled:
    # StandardScaler preprocessing, lgbm.LGBMRegressor(random_state=1, max_depth=3)
    # and svm.LinearSVR(random_state=1, loss='squared_epsilon_insensitive', dual=False).

    # fit on the feature columns only
    model = linear_model.LinearRegression(
        n_jobs=-1
    )
    model.fit(X_train[features], y_train)
    models.append(model)

    del X_train, y_train
    gc.collect()

    # metrics
    # submissions are evaluated on the mean of the Pearson correlation coefficient for each time ID.
    # Collect only the three columns needed for scoring in a narrow frame, so the
    # wide validation frame can be released before the groupby.
    val_scores = pd.DataFrame({
        'time_id': X_val['time_id'].values,
        'y_true': y_val.values,
        'y_pred': model.predict(X_val[features]),
    })

    del X_val, y_val
    gc.collect()

    # Selecting the value columns before apply() keeps the grouping column out
    # of the frames passed to the lambda.
    pearson = (
        val_scores
        .groupby('time_id')[['y_true', 'y_pred']]
        .apply(lambda g: pearsonr(g['y_true'], g['y_pred'])[0])
        .mean()
    )
    print('Pearson:', pearson)
    print()
    pearsons.append(pearson)

    del val_scores
    gc.collect()

print('-' * 30)
print('Mean:', np.mean(pearsons))
print('Std:', np.std(pearsons))

In [None]:
# Position of the fold with the highest validation Pearson score.
BEST_FOLD = np.asarray(pearsons).argmax()
print(BEST_FOLD)

In [None]:
# Linear Regression (GroupKFold) -> LB: 0.108
# Mean: 0.12937332275514163
# Std: 0.006988315782118249

# Linear Regression (GroupTimeSeriesSplit) -> LB: 0.102
# Mean: 0.11933183135552154
# Std: 0.0416659840190710

## Submission

In [None]:
import ubiquant

env = ubiquant.make_env()
iter_test = env.iter_test()

# The competition API yields (test batch, prediction template) pairs; each
# filled-in template is handed back through env.predict().
for (test_df, sample_prediction_df) in iter_test:

    # Predict with the single model from the best validation fold (there is no
    # averaging across folds). Only the f_* columns are model inputs, so no
    # time_id has to be derived from row_id here.
    sample_prediction_df['target'] = models[BEST_FOLD].predict(test_df[features])

    env.predict(sample_prediction_df)
    display(sample_prediction_df)