In [None]:
# Mount Google Drive at /content/drive; every data and model path used later in
# this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!mkdir /content/drive/MyDrive/sarwar/catboost_model

In [None]:
import os
import numpy as np
import pandas as pd 
from tqdm import tqdm_notebook
from sklearn.model_selection import GroupKFold,KFold,StratifiedKFold
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import warnings
warnings.filterwarnings('ignore')
import lightgbm as lgb
import gc
import pickle
from IPython.display import FileLink
import random
import joblib
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Activation,BatchNormalization
from tensorflow.keras import regularizers
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import backend as K
# Raise pandas' display limits so wide feature frames are not truncated when shown.
pd.options.display.max_columns=500
pd.options.display.max_rows=100

In [None]:
# One seed for every random number generator the notebook touches, so repeated
# runs are comparable. `seed` is reused later when the model is configured.
seed=2022
# NOTE(review): PYTHONHASHSEED is presumably only honoured at interpreter start-up,
# so setting it here would affect child processes but not this kernel -- confirm.
os.environ['PYTHONHASHSEED']=str(seed)
for seed_fn in (random.seed,np.random.seed,tf.random.set_seed):
    seed_fn(seed)

In [None]:
%%time
# Load the pickled training frame from Drive; the %%time cell magic above reports
# how long the read takes (it has to stay the first line of the cell).
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
data_path="/content/drive/MyDrive/sarwar/"
train=pd.read_pickle(os.path.join(data_path,"train.pickle"))

In [None]:
# Preview the first ten rows of the training frame.
train.head(10)

In [None]:
# Report (rows, columns) of the loaded training frame.
print("Train shape:",train.shape)

In [None]:
# Names of the raw feature columns: f_0, f_1, ..., f_{n_features-1}.
n_features = 300
features = ['f_' + str(idx) for idx in range(n_features)]

In [None]:
# metric
def pearson_correlation(df):
    """Mean per-time_id Pearson correlation between 'target' and 'prediction'.

    df must contain the columns 'time_id', 'target' and 'prediction'.
    A 2x2 correlation matrix is computed for every time_id group; the
    off-diagonal (target vs. prediction) entry of each group is kept and the
    entries are averaged with pandas' default NaN-skipping mean.
    """
    per_time_corr=df.groupby('time_id')[['target','prediction']].corr()
    # After unstack() the columns are (matrix column, matrix row) pairs; pick the
    # cross term by label rather than by position.
    cross_term=per_time_corr.unstack()[('target','prediction')]
    return cross_term.mean()

In [None]:
from scipy.stats import ttest_ind

def _pair_features_by_ttest(frame, columns):
    """Pair every column with the other column that gives the largest Welch p-value.

    Parameters
    ----------
    frame : DataFrame holding all of `columns`.
    columns : list of column names to compare pairwise.

    Returns
    -------
    list of (column, partner, pvalue) tuples, one per column, sorted by pvalue in
    descending order.

    NOTE(review): a t-test compares the *means* of two samples, so a high p-value
    says the two features have similar means -- it is not a correlation measure.
    Confirm this is the notion of "related" that is intended.
    """
    n_cols=len(columns)
    # The two-sided Welch p-value is symmetric in its two samples, so each
    # unordered pair is tested once and mirrored instead of being computed twice.
    pvalues=[[None]*n_cols for _ in range(n_cols)]
    for i in range(n_cols):
        left=frame[columns[i]]
        for j in range(i+1,n_cols):
            pvalue=ttest_ind(left,frame[columns[j]],equal_var=False).pvalue
            pvalues[i][j]=pvalue
            pvalues[j][i]=pvalue

    pairs=[]
    for i in range(n_cols):
        candidates=[(columns[j],pvalues[i][j]) for j in range(n_cols) if j!=i]
        if not candidates:  # a single column has nothing to be paired with
            continue
        # max() returns the first maximal element, i.e. the same tie-break as a
        # stable descending sort followed by [0]. Assumes no NaN p-values.
        partner,best_pvalue=max(candidates,key=lambda item:item[1])
        pairs.append((columns[i],partner,best_pvalue))
    pairs.sort(key=lambda item:item[2],reverse=True)
    return pairs

# (feature, best partner, p-value) for every raw feature, highest p-value first.
features_group=_pair_features_by_ttest(train,[f'f_{i}' for i in range(n_features)])

In [None]:
# Show the ten pairs at the head of the list (sorted by p-value, descending).
features_group[0:10]

In [None]:
# Persist the pairing to Drive so the pairwise t-test cell does not have to be re-run.
# NOTE(review): the tuples mix names and floats, so np.array presumably stores
# everything (including the p-values) as strings -- confirm if the values are needed later.
np.save("/content/drive/MyDrive/sarwar/catboost_model/features_group.npy",np.array(features_group))

In [None]:
# Reload the saved pairing; from here on features_group is a NumPy array rather
# than the list of tuples built above.
features_group=np.load("/content/drive/MyDrive/sarwar/catboost_model/features_group.npy")

In [None]:
# Sanity-check the reloaded array by showing its first ten rows.
features_group[0:10]

In [None]:
# Add interaction features: take every second entry among the first 251 pairs and
# store the element-wise product of the two columns under their concatenated names.
filter_features=features_group[0:251:2]
for left,right,_pvalue in filter_features:
    train[left+right]=train[left].mul(train[right])


In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostRegressor

In [None]:
# Train one CatBoost regressor per K-fold split, collect the out-of-fold (OOF)
# predictions, save every fold model, and score the OOF predictions with the
# per-time_id Pearson metric.

# NOTE(review): the feature list is positional -- it assumes the first four columns
# of `train` are not model inputs and that everything after them is (including any
# product columns added earlier in the notebook). Confirm against the column order.
features=['investment_id']+train.columns.to_list()[4:]

# Single definition of the output location; created up front so that a skipped
# mkdir cell cannot make save_model/to_csv fail after training has finished.
model_dir="/content/drive/MyDrive/sarwar/catboost_model"
os.makedirs(model_dir,exist_ok=True)

# Hyper-parameters are the same for every fold, so they are built once.
# NOTE(review): fit() below also passes early_stopping_rounds=30, which according
# to the CatBoost docs selects the 'Iter' overfitting detector -- the
# od_type/od_pval/od_wait values here are presumably overridden. Confirm and keep
# only the intended configuration.
catboost_params=dict(iterations=1500,
                     learning_rate=0.03,
                     depth=10,
                     l2_leaf_reg=3.0,
                     model_size_reg=1,
                     leaf_estimation_method='Gradient',
                     thread_count=-1,
                     use_best_model=True,
                     od_pval=10**(-5),
                     od_wait=20,
                     od_type='IncToDec',
                     random_strength=1,
                     eval_metric='MAE',
                     train_dir='metric_visualization',
                     bagging_temperature=2,
                     task_type='GPU',
                     devices='0',
                     bootstrap_type='Bayesian',
                     name='Ubiquant',
                     random_state=seed,
                     max_bin=100,
                     min_data_in_leaf=300,
                     score_function='L2',
                     )

kfold=KFold(n_splits=5,shuffle=False)
# Store out of folds predictions
oof_predictions = np.zeros(len(train))
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train)):
    print(f'Training fold {fold + 1}')
    x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
    y_train, y_val = train['target'].iloc[trn_ind], train['target'].iloc[val_ind]
    n_training_rows = x_train.shape[0]
    n_validation_rows = x_val.shape[0]
    print(f'Training with {n_training_rows} rows')
    print(f'Validating with {n_validation_rows} rows')
    # The message used to say "dart boosting", a leftover from a LightGBM notebook.
    print(f'Training CatBoost model with {len(features)} features...')
    model=CatBoostRegressor(**catboost_params)
    model.fit(x_train,y_train,
              cat_features=[0],  # position 0 of `features` is 'investment_id'
              eval_set=(x_val,y_val),
              verbose=50,
              plot=False,
              metric_period=None,
              early_stopping_rounds=30)

    # Predict validation set
    val_pred = model.predict(x_val)
    # Add validation prediction to out of folds array
    oof_predictions[val_ind] = val_pred.reshape(-1)
    # Save model to disk for inference
    model.save_model(os.path.join(model_dir,f"fold_{fold+1}.bin"))
    # Release the per-fold copies before the next split is materialised.
    del x_val,y_val,model,x_train,y_train,val_pred
    gc.collect()
# Compute out of folds Pearson Correlation Coefficient (for each time_id)
oof_df = pd.DataFrame({'time_id': train['time_id'], 'target': train['target'], 'prediction': oof_predictions})
# Save out of folds csv for blending
oof_df.to_csv(os.path.join(model_dir,'simple_cat.csv'), index = False)
score = pearson_correlation(oof_df)
print(f'Our out of folds mean pearson correlation coefficient is {score}')
