# Ranking problem

As LG mentioned in the [discussion](https://www.kaggle.com/competitions/ubiquant-market-prediction/discussion/314237), this task can be simplified to a ranking problem for the following reasons.

(1) The metric is the average of the Pearson correlation coefficients computed for each time_id.

(2) The distribution of targets within each time_id can be fitted to a normal distribution.

So I tried a LightGBM Ranker to sort the investment_ids by target within each time_id. If you have an idea for improving the score with a ranking predictor, please leave a comment. Thanks.

I referred to the [code](https://github.com/masahiro-mochizuki/signate-fundamentals-challange-1st-place) of Mochizuki, which was the 1st-place solution to a volatility prediction competition on the Tokyo market.

In [None]:
import pandas as pd
import numpy as np
import gc

import matplotlib.pyplot as plt
import seaborn as sns

#from sklearn.model_selection import GroupKFold
#from sklearn import linear_model
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
import lightgbm
from sklearn import linear_model

from lightgbm import LGBMRanker
#from lightgbm import LGBMRegressor
#import sklearn
#from sklearn.decomposition import PCA
#import matplotlib.ticker as ticker

import warnings
warnings.filterwarnings("ignore")

# Load Dataset

In [None]:
%%time
train = (pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')
         .sort_values(['time_id', 'investment_id'])
         .drop(columns=['row_id'])
         .query('time_id > 599')
         .reset_index(drop=True));

In [None]:
# Print the column dtypes and memory footprint of the loaded frame.
train.info()

In [None]:
# Names of the 300 feature columns: f_0 ... f_299.
features = ['f_' + str(i) for i in range(300)]

# Cast the features to float16 one column at a time, then print the frame
# summary again so it can be compared with the one printed before the cast.
for col in features:
    as_half = train[col].astype(np.float16)
    train[col] = as_half
train.info()

In [None]:
# Mean of the target within each time_id, plotted against time_id.
train.groupby('time_id').target.mean().plot()

In [None]:
# Standard deviation of the target within each time_id, plotted against time_id.
train.groupby('time_id').target.std().plot()

In [None]:
# Number of distinct investment_ids present in each time_id, plotted against time_id.
train.groupby('time_id').investment_id.nunique().plot()

In [None]:
# Cross-sectional z-score of every feature: within each time_id subtract the
# group mean and divide by the group standard deviation, storing the result as
# float16 in a new '<feature>_zscore' column.
#
# The groupby is built once and reused for every column instead of being
# rebuilt twice per feature.  The string aggregations 'mean' / 'std' pin the
# pandas group reductions (sample std, ddof=1) that the np.mean / np.std
# callables were being translated to.
# NOTE: a group whose std is 0 or undefined yields inf/NaN z-scores.
by_time_id = train.groupby('time_id')
for col in features:
    group_mean = by_time_id[col].transform('mean')
    group_std = by_time_id[col].transform('std')
    train[f'{col}_zscore'] = ((train[col] - group_mean) / group_std).astype(np.float16)

In [None]:
# Column names of the per-time_id z-scored features: f_0_zscore ... f_299_zscore.
features_zscore = ['f_%d_zscore' % i for i in range(300)]

# LightGBM Ranker

In [None]:
target = 'target'


def get_model_input(df):
    """Prepare a frame for ``LGBMRanker`` and compute its query group sizes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time_id`` column and the column named by the
        module-level ``target``.  The frame is not modified.
        NOTE(review): the group sizes are returned in ascending ``time_id``
        order, so they only line up with the rows if ``df`` is already
        ordered by ``time_id`` - confirm at the call site.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        * a copy of ``df`` indexed by ``time_id`` with an extra integer
          column ``target_rank``: the rank of the target within its
          ``time_id``, where 1 is the largest target and ties share the
          smallest rank (``method='min'``);
        * the number of rows per ``time_id``, sorted by ``time_id``.

    Raises
    ------
    ValueError
        Raised by the integer cast if a rank is NaN (missing target).
    """
    query_list = df['time_id'].value_counts().sort_index()

    # The builtin ``int`` replaces the ``np.int`` alias, which was removed
    # in NumPy 1.24.
    target_rank = (
        df.groupby('time_id')[target]
        .rank(method='min', ascending=False)
        .astype(int)
    )

    # set_index returns a new frame, so the rank column is attached to that
    # copy (positionally) rather than written into the caller's frame.
    out = df.set_index('time_id')
    out['target_rank'] = target_rank.to_numpy()
    return out, query_list

In [None]:
from typing import Iterator, Tuple


class GroupTimeSeriesSplit:
    """
    From: https://www.kaggle.com/c/ubiquant-market-prediction/discussion/304036
    Custom class to create a Group Time Series Split. We ensure
    that the time id values that are in the testing data are not a part
    of the training data & the splits are temporal.

    Parameters
    ----------
    n_folds : maximum number of (train, test) pairs to yield.
    holdout_size : approximate number of distinct time ids per test chunk.
    groups : the time id of every row (only its unique values are used).
    """
    def __init__(self, n_folds: int, holdout_size: int, groups: pd.Series) -> None:
        self.n_folds = n_folds
        self.holdout_size = holdout_size
        self.groups = groups

    def split(self, X: pd.DataFrame) -> Iterator[Tuple[pd.Index, pd.Index]]:
        """Yield ``(train_index, test_index)`` label pairs, latest chunk first.

        ``X`` must have a ``time_id`` column.  The unique time ids taken from
        ``groups`` are cut into ``len(unique) // holdout_size`` nearly equal
        chunks; the last ``n_folds`` chunks are used as test sets, walking
        backwards in time.  Each train set is every row of ``X`` whose
        ``time_id`` is strictly below the smallest time id of its test chunk,
        so the earliest chunk, if reached, gets an empty train index.

        Raises
        ------
        ValueError
            If ``holdout_size`` is not positive or exceeds the number of
            distinct time ids (no chunk could be formed).
        """
        if self.holdout_size <= 0:
            raise ValueError(
                f'holdout_size must be a positive integer, got {self.holdout_size}'
            )

        # Take the group column and get the unique (sorted) values
        unique_time_ids = np.unique(self.groups.values)

        n_chunks = len(unique_time_ids) // self.holdout_size
        if n_chunks == 0:
            raise ValueError(
                f'holdout_size={self.holdout_size} is larger than the number of '
                f'distinct time ids ({len(unique_time_ids)})'
            )

        # Split the time ids into chunks of about holdout_size and reverse
        # so we work backwards in time, then keep the first n_folds chunks.
        chunks = np.array_split(unique_time_ids, n_chunks)[::-1][:self.n_folds]

        time_id = X['time_id']
        for time_ids in chunks:
            # Index the labels with the boolean mask directly; this avoids
            # materialising a filtered copy of X just to read its index.
            test_index = X.index[time_id.isin(time_ids).to_numpy()]

            # Train on everything strictly before the test chunk. A gap
            # could be added here with time_id < (min - gap).
            train_index = X.index[(time_id < np.min(time_ids)).to_numpy()]

            yield train_index, test_index

In [None]:
# Number of distinct investment_ids in the training frame; the training cell
# uses it to size the ranker's label_gain and lambdarank_truncation_level.
max_labels = train.investment_id.nunique()
print(max_labels)
#del train

In [None]:
def custom_metric(y, t):
    """Spearman rank correlation between ``t`` and ``y`` as an eval metric.

    Returns the triple ``('rho', correlation, True)``.
    NOTE(review): presumably this follows LightGBM's custom ``eval_metric``
    convention of (name, value, is_higher_better), with ``y`` the labels and
    ``t`` the predictions - confirm against the LightGBM docs.  NaNs in
    either input propagate into the returned correlation.
    """
    from scipy.stats import spearmanr

    rho, _p_value = spearmanr(t, y, nan_policy="propagate")
    return 'rho', rho, True

In [None]:
%%time

#FEATS = features + ['investment_id', 'time_id']

pearsonrs_folds = []
rankmodels = []
avgmodels = []
stdmodels = []
#pearsonrs_bytimeid = {}

FOLDS = 5
gtss = GroupTimeSeriesSplit(n_folds=FOLDS, holdout_size=60, groups=train['time_id'])
for fold, (tr, val) in enumerate(gtss.split(train)):
    print('FOLD:', fold)
    
    # use a fraction to training
    tr_df = train.loc[tr]
    tr_df, tr_query_list = get_model_input(tr_df)
    
    del tr
    gc.collect()
    
    val_df = train.loc[val]
    val_df, val_query_list = get_model_input(val_df)
    del val
    gc.collect()
    
    print('Train time_id range:', tr_df.index.min(), '->',tr_df.index.max())
    print('Val time_id range:', val_df.index.min(), '->', val_df.index.max())
    
    # store time_id to calculate Pearson correlation
    #time_ids_val = val_df.index.values

    rankmodel = LGBMRanker(
    #device="",
    boosting_type="gbdt",
    objective="lambdarank",
    metric="None",
    label_gain=np.arange(1, max_labels+1),
    lambdarank_truncation_level=max_labels,
    num_estimators=100,
    early_stopping_round=10,
    num_leaves=2**6-1,
    learning_rate=0.1,
    max_bin=128,
    #max_drop=0,
    #bagging_freq=1,
    #bagging_fraction=0.8,
    #feature_fraction=0.5,
    #lambdarank_norm=False,
    #seed=123,
    #min_data_in_leaf=100,
    #min_sum_hessian_in_leaf=1e-2,
    n_jobs=-1,
)
    rankmodel.fit(
    tr_df[features_zscore],
    tr_df['target_rank'],
    group=tr_query_list,
    eval_set = [(val_df[features_zscore], val_df['target_rank'])],
    eval_group=[list(val_query_list)],
    eval_metric=custom_metric
)
    rankmodels.append(rankmodel)

    tr_avg = tr_df.groupby('time_id')[features + [target]].mean()
    tr_target_std = tr_df.groupby('time_id')[target].std()
    #val_avg = val_df.groupby('time_id')[features + [target]].mean()
    #val_target_std = val_df.groupby('time_id')[target].std()

    del tr_df
    gc.collect()
    
    avgmodel = linear_model.Ridge()
    stdmodel = linear_model.Ridge()
    
    avgmodel.fit(tr_avg[features], tr_avg[target])#, eval_set = (val_avg[features], val_avg[target]), early_stopping_rounds = 10)
    stdmodel.fit(tr_avg[features], tr_target_std)#, eval_set = (val_avg[features], val_target_std), early_stopping_rounds = 10)
    
    avgmodels.append(avgmodel)
    stdmodels.append(stdmodel)
    
    del tr_avg, tr_target_std#, val_avg, val_target_std
    
    #avg_pred = tr_df.groupby('time_id')[target].mean().median()
    #std_pred = tr_df.groupby('time_id')[target].std().median()
    
    time_ids=[]
    pearsonrs=[]
    for t_id in val_df.index.unique():
    # pred = model.predict(df[df["era"]== era][features])
        df_tmp = val_df.loc[val_df.index == t_id]
        df_tmp.loc[:,"rank_pred"] = rankmodel.predict(df_tmp[features_zscore])
        #df_time_id.loc[:,"rank_pred"] = df_time_id.loc[:,"rank_pred"].rank()
        #ranks.extend(list(pred))
        
        avg_pred = avgmodel.predict(df_tmp[features].mean().values.reshape(1,-1))
        std_pred = stdmodel.predict(df_tmp[features].mean().values.reshape(1,-1))
        std_pred = max([std_pred, 0])
        
        df_tmp = df_tmp.sort_values('rank_pred')
        df_tmp.loc[:,'preds'] = np.sort(np.random.normal(avg_pred, std_pred, df_tmp.shape[0]))[::-1]
        
        metric = pearsonr(df_tmp['target'], df_tmp['preds'])[0]
        time_ids.append(t_id)
        pearsonrs.append(metric)
    
    res_df = pd.Series(pearsonrs, index = time_ids)
    print(f'fold{fold}_Pearsonr:',np.mean(res_df))
    
    #pearsonrs_bytimeid[fold] = res_df
    pearsonrs_folds.append(np.mean(res_df))
    del res_df, val_df, rankmodel, avgmodel, stdmodel
    gc.collect()
    
print('-' * 30)
print('Mean:', np.mean(pearsonrs_folds))
print('Std:', np.std(pearsonrs_folds))

In [None]:
#avg_pred = train.groupby('time_id')[target].mean().median()
#std_pred = train.groupby('time_id')[target].std().median()
#print('avg:', avg_pred)
#print('std:', std_pred)

In [None]:
%%time

import ubiquant
env = ubiquant.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission

for (test_df, sample_prediction_df) in iter_test:
    test_df['time_id'] = test_df['row_id'].apply(lambda x: int(x.split('_')[0]))
    test_df['target'] = 0
    for col in features:
        test_df[f'{col}_zscore'] = (test_df[col] - test_df.groupby('time_id')[col].transform(np.mean)) / test_df.groupby('time_id')[col].transform(np.std)
    
    for rankmodel, avgmodel, stdmodel in zip(rankmodels, avgmodels, stdmodels):
        test_df['rank_pred'] = rankmodel.predict(test_df[features_zscore])
    
        avg_pred = avgmodel.predict(test_df[features].mean().values.reshape(1,-1))
        std_pred = stdmodel.predict(test_df[features].mean().values.reshape(1,-1))
        std_pred = max([std_pred, 0])
    
        test_df = test_df.sort_values('rank_pred')
        test_df['target'] += np.sort(np.random.normal(avg_pred, std_pred, test_df.shape[0]))[::-1]
    # initialize columns
    #test_df['target']  = 0
    test_df['target'] /= len(rankmodels)
    test_df = test_df.sort_values(['time_id', 'investment_id'])
    
    env.predict(test_df[['row_id','target']])

## Thanks!