# Motivation

The metric of this competition is the mean of the Pearson correlation coefficients computed for each time_id.

As mentioned in the [notebook](https://www.kaggle.com/code/gunesevitan/ubiquant-market-prediction-eda) of GUNES EVITAN,
the Pearson correlation does not change as long as the standard deviations and the order of the values are kept the same within each time ID.
Therefore, the actual values of the predictions don't matter, and in that sense the metric is similar to a ranking metric.

This suggests to me that a model trained on the z-scores of the features and targets within each time_id would be a good predictor of the order of the target values within each time_id.

And if the standard deviation is assumed to be 1.0, the target values can be replaced with their z-scores.

In [None]:
import pandas as pd
import numpy as np
#import gc

#import matplotlib.pyplot as plt
#import seaborn as sns

#import sklearn
#from sklearn.decomposition import PCA
#import matplotlib.ticker as ticker

from sklearn.model_selection import GroupKFold
#from sklearn import linear_model
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
import lightgbm
from sklearn import linear_model

# Data Loading

In [None]:
# Build the training frame one step at a time:
#   1. read the low-memory parquet dump,
#   2. order rows by (time_id, investment_id),
#   3. discard the row_id column,
#   4. keep only rows with time_id > 599,
#   5. renumber the index from 0.
train = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')
train = train.sort_values(['time_id', 'investment_id'])
train = train.drop(columns=['row_id'])
train = train.query('time_id > 599')
train = train.reset_index(drop=True)

In [None]:
# Print a summary of the loaded frame (columns, dtypes, memory usage).
train.info()

In [None]:
# Show the dtype of every column of the training frame.
train.dtypes

In [None]:
# Preview the first rows of the filtered, sorted training frame.
train.head()

In [None]:
# Names of the 300 raw feature columns: f_0 ... f_299.
features = ['f_' + str(idx) for idx in range(300)]

# Down-cast each feature column to half precision, one column at a time,
# then print the frame summary again to see the effect.
for name in features:
    train[name] = train[name].astype('float16')
train.info()

# Convert values to Zscore

In [None]:
# Standardise every feature within its time_id:
#     z = (x - mean_of_time_id) / std_of_time_id
#
# The groupby is built once and reused for all columns instead of being
# re-created twice per column. The string aliases 'mean' / 'std' select the
# built-in group aggregations (sample std, ddof=1); passing np.mean / np.std
# is deprecated in pandas and will stop being mapped to them.
by_time = train.groupby('time_id')
for col in features:
    col_by_time = by_time[col]
    zscore = (train[col] - col_by_time.transform('mean')) / col_by_time.transform('std')
    # Store directly as float16 so no wider intermediate column is kept.
    # Columns are still inserted one by one to avoid materialising a second
    # full-size frame in memory.
    train[f'{col}_zscore'] = zscore.astype(np.float16)

In [None]:
# Names of the z-scored feature columns: f_0_zscore ... f_299_zscore.
features_zscore = list(map('f_{}_zscore'.format, range(300)))

In [None]:
# Number of missing values in each z-scored feature column ...
nan_info = train[features_zscore].isna().sum()
# ... and display only the columns that contain at least one.
nan_info.loc[nan_info.gt(0)]

In [None]:
# Raw features inspected here: plot their per-time_id standard deviation,
# one line per feature.
nan_cols = ['f_%d' % idx for idx in (102, 124, 153, 170, 175, 182, 200, 272)]
grouped_by_time = train.groupby('time_id')
for name in nan_cols:
    per_time_std = grouped_by_time[name].std()
    per_time_std.plot()

In [None]:
# Exclude these z-scored columns from the model inputs and report how many
# features remain.
zscore_nan_cols = [
    'f_%d_zscore' % idx
    for idx in (102, 124, 153, 170, 175, 182, 200, 272)
]
for dropped in zscore_nan_cols:
    features_zscore.remove(dropped)
print(len(features_zscore))

In [None]:
# Standardise the target within each time_id: (target - mean) / std.
# The groupby is built once and reused for both statistics; the string
# aliases 'mean' / 'std' replace the deprecated np.mean / np.std callables
# and select the same built-in aggregations (sample std, ddof=1).
target_by_time = train.groupby('time_id')['target']
train['target_zscore'] = (
    (train['target'] - target_by_time.transform('mean'))
    / target_by_time.transform('std')
).astype(np.float32)

In [None]:
# Count the rows whose target z-score is missing (NaN).
train.target_zscore.isnull().sum()

In [None]:
# Renumber the rows 0..n-1 and keep the time_id column as the grouping key
# for the cross-validation split.
train.reset_index(drop=True, inplace=True)
groups = train.time_id

In [None]:
# target
y = train['target_zscore']
X = train[features_zscore]

In [None]:
# The training frame is not referenced again below (X, y and groups were
# extracted from it), so drop the name - presumably to release memory before
# training.
del train

# Let's train

In [None]:
from lightgbm import LGBMRegressor

folds = 5
models = []

# Split by time_id so that rows sharing a time_id never end up on both sides
# of a train/validation split.
kf = GroupKFold(n_splits=folds)

for fold, (tr, val) in enumerate(kf.split(X, y, groups)):
    print('FOLD:', fold)
    # GroupKFold yields positional row indices, so select with .iloc; this
    # stays correct even when X / y do not carry a 0..n-1 index. X already
    # holds exactly the `features_zscore` columns.
    train_X = X.iloc[tr]
    train_y = y.iloc[tr]
    valid_X = X.iloc[val]
    valid_y = y.iloc[val]

    # Hyper-parameters are drawn at random (unseeded) for every fold, so each
    # fold trains a differently configured model and runs are not
    # reproducible.
    lgbm = LGBMRegressor(
        num_leaves=2 ** np.random.randint(3, 8),
        learning_rate=10 ** (-np.random.uniform(0.1, 2)),
        n_estimators=100,
        min_child_samples=1000,
        subsample=np.random.uniform(0.5, 1.0),
        subsample_freq=1,
        n_jobs=-1,
    )

    # Early stopping is requested through the callback API: the
    # `early_stopping_rounds` argument of fit() was deprecated and later
    # removed from LightGBM, while the callback works in both old and new
    # releases.
    lgbm.fit(
        train_X,
        train_y,
        eval_set=[(valid_X, valid_y)],
        callbacks=[lightgbm.early_stopping(stopping_rounds=10)],
    )

    preds_valid = lgbm.predict(valid_X)
    print('rmse:', np.sqrt(mean_squared_error(valid_y.values, preds_valid)))
    print('pearsonr:', pearsonr(valid_y.values, preds_valid)[0])

    # The competition metric is the mean of the per-time_id Pearson
    # correlations, which can differ from the pooled correlation printed
    # above, so report it as well. Time ids where the correlation is
    # undefined (NaN) are skipped by mean().
    scored = pd.DataFrame({
        'time_id': groups.iloc[val].values,
        'y': valid_y.values,
        'pred': preds_valid,
    })
    per_time_corr = scored.groupby('time_id')[['y', 'pred']].apply(
        lambda g: g['y'].corr(g['pred'])
    )
    print('mean pearsonr per time_id:', per_time_corr.mean())

    models.append(lgbm)

    # Release the per-fold frames before the next fold allocates its own.
    del preds_valid, train_X, valid_X, train_y, valid_y, scored, per_time_corr

In [None]:
# Plot feature importances of `lgbm`. At this point the name refers to the
# model fitted in the last cross-validation fold only, not to an average over
# all models in `models`.
lightgbm.plot_importance(lgbm, figsize = (20, 60))

# Prediction

In [None]:
import ubiquant

env = ubiquant.make_env()    # initialize the environment
iter_test = env.iter_test()  # an iterator which loops over the test set and sample submission

# Only the features the models were trained on need to be standardised.
# Derive their raw column names by stripping the '_zscore' suffix.
zscore_suffix = '_zscore'
raw_cols = [name[:-len(zscore_suffix)] for name in features_zscore]

for (test_df, sample_prediction_df) in iter_test:
    # The part of row_id before the first '_' is used as the time_id.
    test_df['time_id'] = test_df['row_id'].apply(lambda x: int(x.split('_')[0]))

    # Per-time_id z-score of all needed features in one vectorised pass:
    # a single groupby instead of two per column, and one result frame
    # instead of hundreds of single-column inserts into test_df.
    # NOTE(review): the training features were cast to float16 *before*
    # z-scoring, whereas the raw test dtypes are used here (as in the
    # original code) - confirm whether exact parity matters.
    by_time = test_df.groupby('time_id')[raw_cols]
    zscores = (test_df[raw_cols] - by_time.transform('mean')) / by_time.transform('std')
    zscores = zscores.astype(np.float16)
    zscores.columns = features_zscore

    # Average the predictions of all fold models.
    prediction = np.zeros(len(test_df))
    for model in models:
        prediction += model.predict(zscores)
    test_df['target'] = prediction / len(models)

    env.predict(test_df[['row_id', 'target']])