Ver1:

I made two changes to this notebook https://www.kaggle.com/yosukeyama/ubiquant-simple-lgbm-train-infer

* added a custom metric 
* The validation set is split off by time_id

Ver2:

* use all the features.

Ver4:

* Use data from recent time_ids for training

Ver5:

* 5-seed averaging

Ver6:

* np.corrcoef -> scipy.stats.pearsonr
* Check predictions by changing iteration.



In [None]:
!pip install  ../input/numpy-indexed-v035/numpy_indexed-0.3.5-py2.py3-none-any.whl -qq

In [None]:
import pickle
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
import numpy_indexed as npi
gc.enable()

In [None]:
# Feature columns used by the model: 'f_0' through 'f_299'.
# Restricting the column list keeps memory usage down.
n_features = 300
features = [f'f_{i}' for i in range(n_features)]

# Read the training table from parquet and preview it in the notebook.
train_path = '../input/speed-up-reading-csv-to-pickle/train_low_mem.parquet'
train_df = pd.read_parquet(train_path)
display(train_df)

In [None]:
# Hold out by time: rows with time_id in 0..999 train the model,
# rows with time_id in 1000..1219 validate it.
time_ids = train_df['time_id']
train = train_df[time_ids.isin(range(1000))]
valid = train_df[time_ids.isin(range(1000, 1220))]

# Preview both partitions.
for part in (train, valid):
    display(part)

In [None]:
# Build the numpy inputs for LightGBM.
# investment_id is listed first, so it is column 0 of both feature matrices.
input_cols = ['investment_id'] + features
tr_x, tr_y = train[input_cols].values, train['target'].values
val_x, val_y = valid[input_cols].values, valid['target'].values
# time_id of every validation row, kept as a Series for the per-time_id metric.
val_time_id = valid['time_id']

In [None]:
# Drop the DataFrame references now that the model inputs have been extracted,
# then force a garbage collection to release the memory.
del train_df, train, valid
gc.collect()

## metric

In [None]:
import numpy_indexed as npi

# numpy_indexed example: group_by() builds a grouping from an array of keys,
# and split() partitions another array of the same length by those keys.

groups = np.array([1, 1, 1, 2, 2, 3, 3, 1])
group = npi.group_by(groups)
# Splitting the positions 0..7 shows which rows fall into each group.
group.split(np.arange(len(groups)))

In [None]:
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error

class MetricCorr:
    """Per-time_id Pearson correlation (and RMSE) metrics for LightGBM.

    The grouping of rows by time_id is computed once in the constructor and
    reused to split every array passed in later, so those arrays must be in
    the same row order as the ``time_id`` series given to ``__init__``.
    """

    def __init__(self, time_id: pd.Series):
        """Precompute the grouping of rows by ``time_id``.

        Args:
            time_id: time_id of each evaluated row, in row order.
        """
        self.time_id = npi.group_by(time_id.values)

    def corrs(self, preds, target):
        """Return the Pearson correlation of ``preds`` vs ``target`` per time_id.

        Args:
            preds: predicted values, one per row.
            target: true values, one per row.

        Returns:
            A list with one correlation coefficient per time_id group, in the
            order in which ``split`` yields the groups.
        """
        pred_groups = self.time_id.split(preds)
        target_groups = self.time_id.split(target)
        return [pearsonr(p, t)[0] for p, t in zip(pred_groups, target_groups)]

    def corr_mean_lgb(self, preds: np.ndarray, data: "lgb.Dataset"):
        """LightGBM ``feval``: mean of the per-time_id Pearson correlations.

        Args:
            preds: predictions for the rows of ``data``.
            data: dataset whose labels are the true targets.

        Returns:
            ``('corr_mean', value, True)``; the last item tells LightGBM that
            a higher value is better.
        """
        corr_mean = np.mean(self.corrs(preds, data.get_label()))
        return 'corr_mean', corr_mean, True

    def rmse_lgb(self, preds: np.ndarray, data: "lgb.Dataset"):
        """LightGBM ``feval``: root mean squared error over all rows.

        Args:
            preds: predictions for the rows of ``data``.
            data: dataset whose labels are the true targets.

        Returns:
            ``('rmse', value, False)``; the last item tells LightGBM that
            a lower value is better.
        """
        labels = data.get_label()
        # sklearn's signature is (y_true, y_pred); the value is symmetric.
        rmse = np.sqrt(mean_squared_error(labels, preds))
        return 'rmse', rmse, False

# Metric helper grouped by the validation time_ids; arrays passed to it must follow val_time_id's row order.
metric = MetricCorr(val_time_id)

## lgb model

In [None]:
# LightGBM training parameters.
# 'metrics' is the string 'None' (not the None object); the metrics that get
# reported are the custom feval functions passed to lgb.train.
params = dict(
    objective='regression',
    learning_rate=0.05,
    num_leaves=63,
    max_depth=6,
    verbosity=-1,
    min_data_in_leaf=300,
    metrics='None',
)

def single_lightgbm(tr_x, tr_y, val_x, val_y, params, categorical_features=None, seed=None):
    """Train one LightGBM model and predict on the validation set.

    Args:
        tr_x: training feature matrix.
        tr_y: training targets.
        val_x: validation feature matrix.
        val_y: validation targets; scored every round with the module-level
            ``metric`` (mean per-time_id correlation and RMSE).
        params: LightGBM parameters. The dict is copied, never modified.
        categorical_features: columns to treat as categorical. ``None``
            (the default) means no categorical columns.
        seed: if not ``None``, used as the LightGBM ``seed`` for this run.

    Returns:
        ``(pred, model, evals_result)``: predictions for ``val_x``, the
        trained booster, and the per-round validation metric history.
    """
    # Avoid a shared mutable default argument.
    if categorical_features is None:
        categorical_features = []

    # Work on a copy so this run's seed does not leak into the caller's dict.
    params = dict(params)
    if seed is not None:
        params['seed'] = seed
        print('seed', seed)

    lgb_train = lgb.Dataset(tr_x, tr_y, categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(val_x, val_y, categorical_feature=categorical_features)

    evals_result = {}
    # The verbose_eval= / evals_result= keywords of lgb.train were removed in
    # LightGBM 4.0; the equivalent callbacks work on both old and new releases.
    # log_evaluation replaced print_evaluation in LightGBM 3.3.
    log_callback = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
    callbacks = [
        log_callback(period=50),
        lgb.record_evaluation(evals_result),
    ]

    model = lgb.train(params, lgb_train, valid_sets=lgb_eval,
                      num_boost_round=1000,
                      feval=[metric.corr_mean_lgb, metric.rmse_lgb],
                      callbacks=callbacks)

    pred = model.predict(val_x)
    return pred, model, evals_result

In [None]:
# Column 0 of tr_x / val_x is investment_id, so it is passed as the categorical feature.
pred, model, evals_result = single_lightgbm(tr_x, tr_y, val_x, val_y, params, categorical_features=[0], seed=0)

In [None]:
import matplotlib.pyplot as plt
# Validation metrics per boosting round: correlation on top, RMSE below.
plt.subplot(211)
plt.plot(evals_result['valid_0']['corr_mean'], label='Pearson correlation')
plt.legend()
plt.subplot(212)
plt.plot(evals_result['valid_0']['rmse'], label='RMSE')
plt.legend()
# Trailing semicolon suppresses the notebook's echo of the return value.
plt.xlabel('iteration');

In [None]:
# Histogram of predicted values per iteration
itr_list = [100, 300, 500, 1000]
# Predict on the validation set using only the first `itr` boosting rounds.
preds = [model.predict(val_x, num_iteration=itr) for itr in itr_list]

# Size the subplot grid from itr_list instead of hard-coding four panels.
n_cols = 2
n_rows = (len(itr_list) + n_cols - 1) // n_cols
for i, (itr, pred_) in enumerate(zip(itr_list, preds)):
    plt.subplot(n_rows, n_cols, i + 1)
    plt.hist(pred_, bins=50, range=(-1.5, 1.5))
    # Same y-range on every panel so the histograms are directly comparable.
    plt.ylim(0, 280000)
    plt.title(f'num_iteration: {itr}')
plt.tight_layout()

In [None]:
# Overlay the per-iteration prediction histograms on a single axis.
hist_style = dict(alpha=0.4, bins=50, range=(-1.5, 1.5))
for i, pred_i in enumerate(preds):
    plt.hist(pred_i, label=f'num_round{itr_list[i]}', **hist_style)
plt.legend()
plt.tight_layout()

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
# Take the x positions from the metric's own grouping so the axis always has
# exactly one entry per correlation, even if some time_ids are absent.
valid_time_ids = metric.time_id.unique
for itr, pred_ in zip(itr_list, preds):
    # Arguments follow the declared order corrs(preds, target).
    valid_corrs = metric.corrs(pred_, val_y)
    plt.plot(valid_time_ids, valid_corrs, label=f'num_round {itr}')
plt.xlabel('time_id')
plt.ylabel('corr')
plt.legend()
plt.title('Pearson correlation by time_id')
plt.tight_layout()

In [None]:
# Release the training and validation arrays and force a garbage collection.
del tr_y, tr_x, val_y, val_x
gc.collect()