In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import gc
from tqdm import tqdm
import matplotlib.pyplot as plt
from scipy.stats import pearsonr as pc
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostRegressor

In [None]:
# Load the training table from the parquet file into a DataFrame.
df = pd.read_parquet(
    "../input/ubiquant-parquet/train_low_mem.parquet"
)

## Model

In [None]:
# Name of the label column and the number of cross-validation folds.
target = 'target'
n_splits = 5

# Model inputs: every column of `df` other than time_id, row_id and target.
features = [col for col in df.columns if col not in ('time_id', 'row_id', 'target')]

# Shuffled stratified splitter; the fixed random_state keeps folds repeatable.
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Container for the fitted models.
models = []

In [None]:
def run(train):
    """Cross-validate a CatBoost regressor on ``train`` and keep the fold models.

    For each split produced by the module-level ``kfold`` (stratified on the
    ``investment_id`` column of ``train``), a ``CatBoostRegressor`` is fitted on
    the training rows with the validation rows as the early-stopping eval set.
    The fitted model is appended to the module-level ``models`` list and scored
    with the Pearson correlation between its validation predictions and the
    validation target.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain every column named in the module-level ``features``, the
        ``target`` column, and ``investment_id``.

    Side effects
    ------------
    Appends one fitted model per fold to ``models`` (repeated calls keep
    appending) and prints each fold's correlation followed by the mean
    correlation over all folds.
    """
    scores = []
    splits = kfold.split(train[features], train['investment_id'])
    for fold, (train_id, valid_id) in enumerate(splits):
        train_part = train.iloc[train_id]
        valid_part = train.iloc[valid_id]
        x_train, y_train = train_part[features], train_part[target]
        x_valid, y_valid = valid_part[features], valid_part[target]

        model = CatBoostRegressor(n_estimators=2000, max_depth=3, task_type='GPU')
        model.fit(
            x_train,
            y_train,
            eval_set=[(x_valid, y_valid)],
            verbose=20,
            early_stopping_rounds=100,
        )
        models.append(model)

        y_pred = model.predict(x_valid)
        # pearsonr returns (correlation, p-value). Keep only the correlation;
        # storing the whole pair would make np.mean below average the
        # p-values into the reported score.
        score, _ = pc(y_pred, y_valid)
        scores.append(score)
        print(f"Fold{fold}:{score}")

        # Release per-fold temporaries before the next fit.
        gc.collect()
    print(np.mean(scores))

In [None]:
# Train and cross-validate on the last 150,000 rows of the loaded frame only.
run(df[-150_000:])

In [None]:
import ubiquant

# Inference: for every batch yielded by the competition environment, average
# the predictions of all fitted fold models and hand the result back.
env = ubiquant.make_env()
iter_test = env.iter_test()
for test_df, sample_prediction_df in iter_test:
    # Keep only the columns the models were trained on.
    test_df = test_df[features]

    # Accumulate each model's prediction, then divide by the model count to
    # get the ensemble mean.
    pred = np.zeros(len(test_df))
    for model in models:
        pred += model.predict(test_df)
    pred /= len(models)

    sample_prediction_df['target'] = pred
    env.predict(sample_prediction_df)