In [None]:
# Mount Google Drive at /content/drive; later cells read and write under
# /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!mkdir /content/drive/MyDrive/sarwar/lightgbm_model

In [None]:
import os
import numpy as np
import pandas as pd 
from tqdm import tqdm_notebook
from sklearn.model_selection import GroupKFold,KFold,StratifiedKFold
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import warnings
warnings.filterwarnings('ignore')
import lightgbm as lgb
import gc
import pickle
from IPython.display import FileLink
import random
import joblib
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Activation,BatchNormalization
from tensorflow.keras import regularizers
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import backend as K
# Widen pandas' display limits so wide frames are shown without truncation:
# up to 500 columns and 100 rows.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 100

In [None]:
# Single seed shared by every random number generator used in this notebook
# (also reused as LightGBM's bagging seed in the params cell).
seed = 2022

# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
# here is not expected to affect hashing in the running kernel — confirm.
os.environ['PYTHONHASHSEED'] = str(seed)

# Seed NumPy, the stdlib `random` module and TensorFlow.
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

In [None]:
%%time
# Load the training frame from the mounted Drive folder.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
data_path="/content/drive/MyDrive/sarwar/"
train=pd.read_pickle(os.path.join(data_path,"train.pickle"))

In [None]:
# Preview the first 10 rows of the training frame.
train.head(10)

In [None]:
# Report the (rows, columns) shape of the loaded training frame.
print("Train shape:",train.shape)

In [None]:
# Model input columns are named f_0 ... f_{n_features - 1}.
n_features = 300
features = ['f_%d' % idx for idx in range(n_features)]

In [None]:
# metric
def pearson_correlation(df):
    """Mean per-``time_id`` Pearson correlation between target and prediction.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``time_id``, ``target`` and ``prediction``.

    Returns
    -------
    float
        The correlation of ``target`` with ``prediction`` computed separately
        inside every ``time_id`` group, then averaged over the groups
        (groups whose correlation is NaN are skipped by ``mean``).
    """
    # One 2x2 correlation matrix per time_id, stacked under a (time_id, variable) index.
    corr_by_time = df.groupby('time_id')[['target', 'prediction']].corr()
    # Take the 'target' row of every matrix and read its 'prediction' entry,
    # i.e. the off-diagonal target/prediction coefficient for each time_id.
    target_vs_prediction = corr_by_time.xs('target', level=1)['prediction']
    return target_vs_prediction.mean()

In [None]:
# Keep the investment_id column as its own Series; it is used below to build
# and adapt the IntegerLookup layer.
investment_id=train['investment_id']

In [None]:
# Training hyperparameters.
# NOTE(review): none of these are read by the cells visible here — presumably
# consumed by Keras training code elsewhere; confirm.
EPOCHS, BATCH_SIZE = 20, 1024
LR, VERBOSE = 0.01, 2

In [None]:
%%time
# Distinct investment ids seen in the training data.
investment_ids = list(investment_id.unique())
# Vocabulary cap for the lookup layer: one slot per distinct id plus one extra
# (presumably for the out-of-vocabulary token — TODO confirm).
investment_id_size = len(investment_ids) + 1
investment_id_lookup_layer = layers.IntegerLookup(max_tokens=investment_id_size)
# Learn the id vocabulary from the full column, with the work placed on the CPU device.
with tf.device("cpu"):
    investment_id_lookup_layer.adapt(investment_id)

In [None]:
# LightGBM training parameters (DART-boosted regression), grouped by purpose.
# Trailing "a --> b" notes record earlier values of a setting.
params = {
    # --- task / objective ---------------------------------------------------
    'task': 'train',
    'objective': 'regression',
    'boosting': 'dart',
    'metric': ['mse', 'rmse', 'l1'],
    'learning_rate': 0.1,

    # --- tree shape ---------------------------------------------------------
    'max_depth': 10,
    'num_leaves': 100,             # 80 --> 100
    'min_data_in_leaf': 300,
    'min_sum_hessian_in_leaf': 1e-2,
    'min_gain_to_split': 0.01,     # 0.0 ---> 0.01
    'path_smooth': 10**(-2),

    # --- row / column sampling and randomised splits ------------------------
    'feature_fraction': 0.75,
    'bagging_fraction': 0.75,
    'bagging_freq': 5,             # 1 ---> 5
    'bagging_seed': seed,
    'extra_trees': 'true',
    'extra_seed': 7,

    # --- regularisation -----------------------------------------------------
    'lambda_l1': 0.5,
    'lambda_l2': 1,

    # --- DART-specific ------------------------------------------------------
    'drop_rate': 0.3,
    'xgboost_dart_mode': 'true',

    # --- histogram binning --------------------------------------------------
    'max_bin': 200,
    'min_data_in_bin': 40,

    # --- runtime ------------------------------------------------------------
    'force_col_wise': 'true',
    'num_threads': -1,
    'verbosity': -1,
}

In [None]:
# 10-fold LightGBM (DART) training with out-of-fold (OOF) predictions.
#
# For every fold: fit on the training rows, predict the held-out rows, store
# those predictions in `oof_predictions`, and save the fitted booster to Drive.
# After the loop the OOF predictions are written to CSV and scored with the
# mean per-time_id Pearson correlation.
features = [f'f_{i}' for i in range(n_features)]

# Output folder (created by the mkdir cell above); same location as before,
# now derived from `data_path` instead of repeating the absolute path.
model_dir = os.path.join(data_path, 'lightgbm_model')

# `verbose_eval` was removed from lgb.train() in LightGBM 4.0 and raises a
# TypeError there; periodic metric logging is done through a callback instead.
# `log_evaluation` exists from 3.3; older releases call it `print_evaluation`.
make_log_callback = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# Unshuffled split: each fold is one contiguous block of rows.
# NOTE(review): rows sharing a time_id can end up on both sides of a fold
# boundary — confirm this is acceptable for the validation scheme.
kfold = KFold(n_splits=10, shuffle=False)

# Store out of folds predictions (one slot per training row, filled fold by fold)
oof_predictions = np.zeros(len(train))

for fold, (trn_ind, val_ind) in enumerate(kfold.split(train)):
    print(f'Training fold {fold + 1}')
    x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
    y_train, y_val = train['target'].iloc[trn_ind], train['target'].iloc[val_ind]
    n_training_rows = x_train.shape[0]
    n_validation_rows = x_val.shape[0]

    # Build lgbm datasets from the raw numpy arrays
    train_set = lgb.Dataset(x_train.values, y_train.values.ravel(), free_raw_data=True)
    val_set = lgb.Dataset(x_val.values, y_val.values.ravel(), free_raw_data=True)

    # The training frames are no longer needed once the Datasets exist;
    # x_val is kept because it is used for prediction below.
    del x_train, y_train
    gc.collect()

    print(f'Training with {n_training_rows} rows')
    print(f'Validating with {n_validation_rows} rows')
    print(f'Training dart boosting model with {len(features)} features...')

    # Train and evaluate
    # NOTE(review): LightGBM documents early stopping as unavailable with
    # boosting='dart', so the early_stopping callback is presumably a no-op
    # with the current params and all 160 rounds are run — confirm.
    model = lgb.train(
        params,
        train_set,
        num_boost_round=160,  # 150--->160
        valid_sets=[train_set, val_set],
        valid_names=['train', 'valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=20),
            make_log_callback(period=50),  # log metrics every 50 rounds
        ],
    )

    # Predict validation set and add it to the out of folds array
    val_pred = model.predict(x_val.values)
    oof_predictions[val_ind] = val_pred

    # Save model to disk for inference
    joblib.dump(model, os.path.join(model_dir, f'lgbm_{fold + 1}.pkl'))

    del x_val, y_val, train_set, val_set, model
    gc.collect()

# Compute out of folds Pearson Correlation Coefficient (for each time_id)
oof_df = pd.DataFrame({'time_id': train['time_id'], 'target': train['target'], 'prediction': oof_predictions})
# Save out of folds csv for blending
oof_df.to_csv(os.path.join(model_dir, 'simple_lgbm.csv'), index=False)
score = pearson_correlation(oof_df)
print(f'Our out of folds mean pearson correlation coefficient is {score}')