# 0.Import libraries and pickle file

- The original data is too large to load directly, so we load the pickle file created in [this notebook](https://www.kaggle.com/shashimo/ubiquant-how-to-make-pickle-file) instead.

In [None]:
import pandas as pd
import pickle
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import gc
import warnings
warnings.simplefilter('ignore')

In [None]:
# Load the pre-built training DataFrame from the pickle file.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so this
# must only ever be pointed at a pickle from a trusted source.
with open('/kaggle/input/ubiquanttrainpicklefile/train.pickle', 'rb') as f:
    train = pickle.load(f)
# Reclaim any temporary objects created while unpickling.
gc.collect()

In [None]:
# Store investment_id as a pandas categorical column.
# NOTE(review): presumably so that the later fit() call, which uses
# categorical_feature='auto', treats it as a categorical feature - confirm
# against the LightGBM documentation.
train['investment_id'] = train['investment_id'].astype('category')
gc.collect()

# 1.Check dataset

In [None]:
# Display the (rows, columns) dimensions of the training frame.
train.shape

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Print the distinct time_id values present in the training data.
print(train['time_id'].unique())

# 2.Feature engineering

In [None]:
def feature_eng(df,features):
    """Add row-wise summary statistics of the feature columns to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns. It is modified in place.
    features : list
        Labels of the columns to aggregate across (one value per row).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying an extra ``'mean'`` column.
    """
    # Compute the statistic first so the temporary feature sub-frame is
    # released before the new column is attached.
    row_mean = df[features].mean(axis=1)
    df['mean'] = row_mean
    # Disabled candidate statistics, all computed row-wise over `features`
    # in the same way as the mean:
    #   'median'                         -> .median(axis=1)
    #   'q01','q05','q10','q25',
    #   'q75','q90','q95','q99'          -> .quantile(q=<fraction>, axis=1)
    #   'max' / 'min'                    -> .max(axis=1) / .min(axis=1)
    gc.collect()
    return df

# 3.Pearson correlation coefficient

- The evaluation metric of this competition is the mean of the Pearson correlation coefficient (PCC) computed separately for each time ID.
- The functions below compute that PCC score.

In [None]:
def pcc_score(t,x,y):
    """Return the mean per-time-id Pearson correlation of predictions vs. targets.

    Parameters
    ----------
    t : array-like
        time_id of every row.
    x : array-like
        True target value of every row.
    y : array-like
        Predicted value of every row.

    All three inputs are matched by position and must have the same length.

    Returns
    -------
    float
        Mean of ``pcc(target, pred)`` over the distinct time ids. The result
        is NaN if the correlation of any single time id is NaN.
    """
    # Name the columns explicitly instead of depending on the names the
    # incoming Series happen to carry; np.ravel also discards any index so
    # the three inputs are aligned purely by position.
    frame = pd.DataFrame({
        'time_id': np.ravel(t),
        'target': np.ravel(x),
        'pred': np.ravel(y),
    })
    # One groupby pass replaces a full boolean-mask scan per time id.
    # sort=False keeps the groups in order of first appearance.
    scores = [
        pcc(group['target'], group['pred'])
        for _, group in frame.groupby('time_id', sort=False)
    ]
    return np.mean(scores)

def pcc(x,y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    Parameters
    ----------
    x, y : array-like
        Numeric sequences of equal length, paired by position.

    Returns
    -------
    float
        The correlation, or NaN when it is undefined: empty input, or either
        input having zero variance (which includes single-element input).
    """
    # Convert to float64 arrays so the sums below are accumulated in double
    # precision whatever the input dtype is, and so pandas indexes play no role.
    x_arr = np.asarray(x, dtype=float)
    y_arr = np.asarray(y, dtype=float)
    if x_arr.size == 0 or y_arr.size == 0:
        return np.nan
    x_diff = x_arr - x_arr.mean()
    y_diff = y_arr - y_arr.mean()
    # np.dot does the sum of squares in native code; the builtin sum() would
    # iterate element by element in Python.
    denom = np.sqrt(np.dot(x_diff, x_diff)) * np.sqrt(np.dot(y_diff, y_diff))
    # `not denom > 0` is also true when denom is NaN (NaN in the inputs).
    if not denom > 0:
        return np.nan
    return np.dot(x_diff, y_diff) / denom

# 4.LGBM with TimeSeriesSplit Cross-Validation

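- TimeSeriesSplit splits by row position, so with multiple time series (as in this competition) it cannot guarantee that a fold boundary falls exactly between two `time_id` values, which can leak information between train and validation. Even so, I think it is one of the best options that are readily available.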

In [None]:
# Remove the row_id column in place (no second copy of the frame is bound).
train.drop(columns=['row_id'], inplace=True)
# Separate the target column from the remaining columns.
y = train['target']
x = train.drop(columns=['target'])
# The combined frame is no longer needed; release it before training.
del train
gc.collect()

In [None]:
# Cross-validation setup: folds are taken over the row order of `x`.
splits=5
cv = TimeSeriesSplit(n_splits=splits)
scores_rmse=[]
scores_pcc=[]
models = []
# Not passed to fit() below, which uses categorical_feature='auto'.
categorical_features = ['investment_id']
# Columns aggregated by feature_eng: everything except the two id columns.
feats = [f for f in x.columns if f not in ['time_id','investment_id']]
gc.collect()

x=feature_eng(x,feats)
gc.collect()

# Positions of every column the model is trained on (all but time_id).
# Selecting rows and columns in one iloc call yields independent frames, so
# no column has to be dropped in place from a slice of `x`.
model_col_idx = [pos for pos, col in enumerate(x.columns) if col != 'time_id']

for fold, (train_idx, val_idx) in enumerate(cv.split(x)):
    print(f'Start Fold:{fold}')
    #print(f'time_id of train_x is{x.time_id.iloc[train_idx].unique()}') #check time_id
    #print(f'time_id of val_x is{x.time_id.iloc[val_idx].unique()}')
    # time_id is not a model input but is needed to group the PCC score.
    val_time=x['time_id'].iloc[val_idx]
    train_x=x.iloc[train_idx, model_col_idx]
    val_x=x.iloc[val_idx, model_col_idx]
    train_y,val_y = y.iloc[train_idx], y.iloc[val_idx]
    print(f'size of train_x is{train_x.shape}',f'size of val_x is{val_x.shape}')

    model=lgb.LGBMRegressor(random_state=0,learning_rate=0.05,n_estimators=10000)

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds=` / `verbose=` keyword arguments of fit() are
    # deprecated and are not accepted by newer LightGBM releases.
    model.fit(train_x,train_y,eval_set=[(val_x,val_y),(train_x,train_y)],
              categorical_feature='auto',eval_metric='rmse',
              callbacks=[lgb.early_stopping(stopping_rounds=10),
                         lgb.log_evaluation(period=20)])

    pred=model.predict(val_x)
    score_rmse=np.sqrt(mean_squared_error(val_y,pred))
    score_pcc=pcc_score(val_time,val_y,pred)
    scores_rmse.append(score_rmse)
    scores_pcc.append(score_pcc)
    models.append(model)
    print(f'Score of Fold{fold} is RMSE:{score_rmse}, PCC:{score_pcc}')
    print('*'*80)
    gc.collect()

# Average of the per-fold validation scores.
print(f'Result RMSE:{np.mean(scores_rmse)}, PCC:{np.mean(scores_pcc)}')

# 5.Submit

In [None]:
def inference(models,df):
    """Average the predictions of all trained models for one batch of rows.

    Parameters
    ----------
    models : list
        Fitted models exposing ``predict``.
    df : pandas.DataFrame
        Batch to score; must contain ``investment_id`` and the columns listed
        in the module-level ``feats``. It is not modified.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the models' predictions, one value per row.
    """
    # Work on a copy: the caller passes a column slice of its test frame, and
    # assigning new columns into such a slice is a chained assignment that
    # pandas warns about and that may or may not write through.
    df = df.copy()
    df['investment_id'] = df['investment_id'].astype('category')
    df=feature_eng(df,feats)
    y_preds = [model.predict(df) for model in models]
    return np.mean(y_preds, axis=0)

In [None]:
# Columns handed to inference(): the id column followed by f_0 ... f_299.
n_features = 300
test_cols = ['investment_id']
features = test_cols + [f'f_{idx}' for idx in range(n_features)]

In [None]:
# Submission loop. NOTE(review): `ubiquant` is presumably the
# competition-provided API module - confirm in the competition environment.
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test()
# Each iteration yields a batch of test rows together with the frame that
# has to be filled in and handed back through env.predict().
for (test_df, sample_prediction_df) in iter_test:
    # Score only the columns listed in `features` with the averaged fold models.
    sample_prediction_df['target'] = inference(models,test_df[features])
    env.predict(sample_prediction_df) 