In [None]:
import pandas as pd
import numpy as np
import os
from scipy.stats import pearsonr
from sklearn.model_selection import GroupKFold
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import mixed_precision
from tensorflow.keras import backend as K
from tqdm.notebook import tqdm
import random
import warnings
import gc
# Silence every warning and raise pandas' display limits so wide frames
# are shown in full (up to 500 columns / 100 rows).
warnings.simplefilter('ignore')
pd.options.display.max_columns = 500
pd.options.display.max_rows = 100

In [None]:
# Function to get hardware strategy
def get_hardware_strategy():
    """Detect a TPU and build the matching ``tf.distribute`` strategy.

    When a TPU cluster can be resolved, the function connects to it,
    initialises the TPU system, builds a ``TPUStrategy`` and switches the
    global Keras mixed-precision policy to ``mixed_bfloat16``. Otherwise it
    falls back to TensorFlow's default strategy and leaves the policy alone.

    Returns:
        tuple: ``(tpu, strategy)`` where ``tpu`` is the ``TPUClusterResolver``
        (or ``None`` when no TPU was found) and ``strategy`` is the
        distribution strategy to build and train the model under.
    """
    try:
        # TPU detection. No parameters necessary if TPU_NAME environment variable is
        # set: this is always the case on Kaggle.
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        # The resolver raises ValueError when it cannot locate a TPU.
        tpu = None

    if tpu:
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        # Stable API; `tf.distribute.experimental.TPUStrategy` is deprecated.
        strategy = tf.distribute.TPUStrategy(tpu)
        # Side effect: changes the process-wide Keras dtype policy.
        policy = mixed_precision.Policy('mixed_bfloat16')
        mixed_precision.set_global_policy(policy)
    else:
        # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
        strategy = tf.distribute.get_strategy()

    print("REPLICAS: ", strategy.num_replicas_in_sync)
    return tpu, strategy

# Resolve the accelerator first: the batch size below scales with the
# number of replicas reported by the strategy.
tpu, strategy = get_hardware_strategy()

# ---- Configuration ----
FOLDS = 5                                          # Folds
EPOCHS = 15
BATCH_SIZE = strategy.num_replicas_in_sync * 32    # 32 samples per replica
LR = 0.0008                                        # Learning rate
MODEL_SEED = 42                                    # Model seed
VERBOSE = 2                                        # Verbosity
AUTO = tf.data.experimental.AUTOTUNE               # For tf.dataset

In [None]:
def seed_everything(seed):
    """Apply one seed to every random source this notebook touches.

    Seeds Python's ``random`` module, NumPy's global RNG and TensorFlow's
    global seed, and stores the value in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: integer seed value.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)


seed_everything(MODEL_SEED)

In [None]:
# Load the training frame from a pre-built pickle (presumably stored in half
# precision, judging by the dataset name — confirm with `train.dtypes`).
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
train = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')

In [None]:
# Feature list: every column of `train` except the identifier columns and the target
features = [
    name
    for name in train.columns
    if name not in {'row_id', 'time_id', 'investment_id', 'target'}
]

In [None]:
# Some feature engineering
# Rank the features by absolute Pearson correlation with the target; the
# strongest ones are used afterwards to encode time_id.
N_BEST_CORR = 100       # how many top-correlated features to keep
CORR_CHUNK_SIZE = 100   # columns per .corr() call, to bound peak memory

corr_parts = []
for start in range(0, len(features), CORR_CHUNK_SIZE):
    chunk_cols = features[start:start + CORR_CHUNK_SIZE]
    chunk_corr = train[chunk_cols + ['target']].corr()['target']
    # Drop the target's self-correlation (always 1.0) explicitly rather than
    # skipping a fixed number of leading rows after sorting, which only works
    # for a specific number of chunks and depends on how ties are ordered.
    corr_parts.append(chunk_corr.drop('target'))

abs_corr = pd.concat(corr_parts).abs().sort_values(ascending = False)
best_corr = abs_corr.index[:N_BEST_CORR].to_list()
del corr_parts, abs_corr

In [None]:
# Add time id related features: for each selected column, the mean of that
# column over all rows sharing a time_id, broadcast back onto every row
# (market general features to relate time_ids)
time_id_features = []
for col in tqdm(best_corr):
    agg_name = f'time_id_{col}'
    per_time_mean = train.groupby('time_id')[col].mean()
    # Map each row's time_id to its group mean and store it as float16
    train[agg_name] = train['time_id'].map(per_time_mean).astype(np.float16)
    time_id_features.append(agg_name)
print(f'We added {len(time_id_features)} features related to time_id')

In [None]:
# Check the frame's (rows, columns) now that the time_id_* columns were added
train.shape

In [None]:
# Preview the first rows, including the newly added time_id_* columns
train.head()

In [None]:
# Update feature list. Order-preserving de-duplication keeps this cell
# idempotent: re-running it does not append the time_id features twice.
features = list(dict.fromkeys(features + time_id_features))
np.save('features.npy', np.array(features))
np.save('best_corr.npy', np.array(best_corr))
# Store out of folds predictions
oof_predictions = np.zeros(len(train))
# Initiate GroupKFold: rows sharing a `group` value are never split between
# the training and validation side of a fold. The groups below are contiguous
# time_id ranges, so validation rows come from time blocks absent from training.
kfold = GroupKFold(n_splits = FOLDS)
# Create groups based on time_id: half-open range [edge_i, edge_{i+1}) -> group i
GROUP_EDGES = [0, 280, 585, 825, 1030, 1400]
group_codes = pd.cut(train['time_id'], bins = GROUP_EDGES, right = False, labels = False)
if group_codes.isna().any():
    # Fail with a clear message instead of an obscure NaN-to-int cast error
    raise ValueError(
        f'time_id values outside [{GROUP_EDGES[0]}, {GROUP_EDGES[-1]}) cannot be assigned to a group'
    )
train['group'] = group_codes.astype(np.int16)

In [None]:
# Preview the first rows again; the frame now also carries the `group` column
train.head()

In [None]:
def transform_csv2pickle(path, usecols, dtype, out_path='train.pkl'):
    """Read selected CSV columns with explicit dtypes and pickle the frame.

    Args:
        path: location of the source CSV file.
        usecols: names of the columns to load.
        dtype: mapping of column name -> dtype, forwarded to ``pd.read_csv``.
        out_path: where the pickle is written; defaults to ``'train.pkl'``
            in the current working directory.

    Returns:
        None. The only effect is the pickle file written to ``out_path``.
    """
    # Forward the caller's `dtype` argument; reading a module-level `dtypes`
    # here would ignore the argument or fail when that global is undefined.
    frame = pd.read_csv(
        path,
        usecols=usecols,
        dtype=dtype
    )
    frame.to_pickle(out_path)


# Inputs for the one-off CSV -> pickle conversion (the call itself is kept
# commented out at the bottom of this cell).
path = '../input/ubiquant-market-prediction/train.csv'

basecols = ['row_id', 'time_id', 'investment_id', 'target']
# Named `csv_features` rather than `features` so this cell does not overwrite
# the notebook-wide `features` list, which also holds the time_id_* columns
# and is read by the fold-export loop.
csv_features = [f'f_{i}' for i in range(300)]

# Column dtypes for pd.read_csv: compact ids, float32 for the target and features
dtypes = {
    'row_id': 'str',
    'time_id': 'uint16',
    'investment_id': 'uint16',
    'target': 'float32',
    **{col: 'float32' for col in csv_features},
}

# transform_csv2pickle(path, basecols + csv_features, dtypes)

In [None]:

# Write the train / validation split of every fold to disk as pickles
# (x_* hold the feature columns, y_* the target).
feature_frame = train[features]  # select the feature columns once, not per fold
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, groups = train['group'])):
    print(f'Training fold {fold + 1}')

    # kfold.split yields positional row indices, so slice with .iloc; .loc
    # would only agree with it when the index is the default 0..n-1 range.
    x_train, x_val = feature_frame.iloc[trn_ind], feature_frame.iloc[val_ind]
    y_train, y_val = train['target'].iloc[trn_ind], train['target'].iloc[val_ind]

    x_train.to_pickle('./x_train_' + str(fold) + '.pkl')
    x_val.to_pickle('./x_val_' + str(fold) + '.pkl')
    y_train.to_pickle('./y_train_' + str(fold) + '.pkl')
    y_val.to_pickle('./y_val_' + str(fold) + '.pkl')
del feature_frame