In [None]:
from IPython.core.display import display, HTML

import pandas as pd
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import gc
import psutil
import pickle
from tqdm import tqdm
from joblib import Parallel, delayed

from sklearn import preprocessing, model_selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_selection import RFECV
from sklearn.metrics import r2_score
from sklearn.model_selection import GroupKFold, KFold
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn import utils

import matplotlib.pyplot as plt 
import seaborn as sns
import numpy.matlib

import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Presumably the output directory for submissions; not referenced again in the visible part of this notebook.
path_submissions = '/'
# Name of the label column; read by train_and_evaluate_lgb when slicing each fold's target.
target_name = 'target'
# Not referenced again in the visible part of this notebook.
scores_folds = {}


The preprocessed feature data used below is produced by this notebook: https://www.kaggle.com/res1235/preprocessing-rapids-finish-in-3-mins

## Prepare data

In [None]:
# Load the precomputed training feature table from the preprocessing notebook's Feather output.
train = pd.read_feather('../input/preprocessing-rapids-finish-in-3-mins/feature_train.feather')

In [None]:
# Encode stock_id as consecutive integer codes. The ids are cast to strings first,
# so the encoder orders its classes lexicographically rather than numerically.
le = LabelEncoder()
train['stock_id'] = le.fit_transform(train['stock_id'].astype(str).tolist())

In [None]:
# Helper functions: report process memory and downcast DataFrame dtypes
def get_memory_usage():
    """Return the current process's memory use in GiB, rounded to two decimals.

    Reads the first field of psutil's ``memory_info()`` for this process
    (documented by psutil as the resident set size).
    """
    current_process = psutil.Process(os.getpid())
    used_bytes = current_process.memory_info()[0]
    return np.round(used_bytes / 2.**30, 2)
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Integer columns are narrowed to int8/int16/int32/int64 and float columns to
    float16/float32/float64, based only on each column's min and max. Columns whose
    dtype is not one of int16/int32/int64/float16/float32/float64 are left untouched.

    Caveat: float columns are cast to float16 whenever their *range* fits, and
    float16 keeps only about three significant decimal digits, so values may be
    rounded noticeably.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final size in MB and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            limits, candidates, fallback = np.iinfo, (np.int8, np.int16, np.int32, np.int64), None
        else:
            limits, candidates, fallback = np.finfo, (np.float16, np.float32), np.float64
        # Bounds are inclusive: a value equal to a dtype's min/max is representable,
        # so e.g. a column spanning -128..127 fits int8. If min/max are NaN (empty or
        # all-NaN column) every comparison is False and the fallback applies.
        new_type = next(
            (t for t in candidates if limits(t).min <= c_min and c_max <= limits(t).max),
            fallback,
        )
        if new_type is not None:
            df[col] = df[col].astype(new_type)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Display the current process memory (GiB, as returned by get_memory_usage).
get_memory_usage()

In [None]:
# Downcast numeric columns to smaller dtypes; reduce_mem_usage modifies the frame in place and returns it.
train = reduce_mem_usage(train)

## Recursive feature elimination with cross-validation

In [None]:
# Candidate features: every column except time_id, row_id and the target.
features = [col for col in train.columns if col not in {'time_id','target','row_id'}]

In [None]:
# Replace missing values with the sentinel -999, modifying `train` in place.
train.fillna(-999, inplace=True)

In [None]:
# Force a garbage-collection pass; the cell output is the number of unreachable objects found.
gc.collect()

In [None]:

def rmspe(y_true, y_pred):
    """Root mean squared percentage error between ``y_true`` and ``y_pred``.

    Each error is divided by the corresponding true value before squaring, so
    the result is relative to the magnitude of ``y_true``.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

def feval_rmspe(y_pred, lgb_train):
    """Custom LightGBM evaluation function reporting RMSPE.

    Parameters
    ----------
    y_pred : array-like
        Predictions for the dataset being evaluated.
    lgb_train : object exposing ``get_label()``
        Dataset whose labels are compared against ``y_pred``.

    Returns
    -------
    tuple
        ``('RMSPE', score, False)``; the trailing ``False`` marks the metric as
        lower-is-better.
    """
    score = rmspe(lgb_train.get_label(), y_pred)
    return 'RMSPE', score, False
# LightGBM parameters shared by the feature-selection estimator and the final models.
params = dict(
    # Learning task and booster
    objective='rmse',
    boosting_type='gbdt',
    # Tree growth and histogram binning (max_depth=-1 means no depth limit)
    max_depth=-1,
    max_bin=255,
    min_data_in_leaf=750,
    learning_rate=0.1,
    # Row subsampling: 72% of the rows, re-drawn every 3 iterations
    subsample=0.72,
    subsample_freq=3,
    # L1 / L2 regularisation
    lambda_l1=0.5,
    lambda_l2=1.0,
    # The column at position 0 is treated as categorical
    categorical_column=[0],
    # Reproducibility, parallelism and logging
    seed=2021,
    n_jobs=-1,
    verbose=-1,
)

In [None]:
%time
clf = lgb.LGBMRegressor(**params)
rfe = RFECV(estimator=clf, step=20, cv=GroupKFold(n_splits=5), scoring= 'neg_mean_squared_error', verbose=2)
rfe.fit(X = train[features], y = train['target'], groups = train['time_id'])

In [None]:
# Force a garbage-collection pass; the cell output is the number of unreachable objects found.
gc.collect()

In [None]:
# Size of the feature subset RFECV selected.
print('Optimal number of features:', rfe.n_features_)

In [None]:
# Mean CV score of every feature subset RFECV evaluated. `grid_scores_` was deprecated in
# scikit-learn 1.0 and later removed, so prefer `cv_results_` and fall back for old versions.
if hasattr(rfe, 'cv_results_'):
    mean_scores = np.asarray(rfe.cv_results_['mean_test_score'])
else:
    mean_scores = np.asarray(rfe.grid_scores_)

# Scores are ordered from the smallest subset to the full feature set, and consecutive
# subsets differ by `step` features (the last one is clipped to min_features_to_select).
# Rebuild the real subset sizes so the x-axis matches its label instead of 1..len(scores).
# This assumes an integer `step`, as passed when `rfe` was built.
n_total = len(rfe.ranking_)
n_selected = [max(n_total - i * rfe.step, rfe.min_features_to_select)
              for i in range(len(mean_scores))][::-1]

plt.figure(figsize=(14, 8))
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score")
plt.plot(n_selected, mean_scores)
plt.show()

In [None]:
# Keep only the features RFECV selected (`support_` is the boolean mask of rank-1 features).
# Convert to a plain list: train_and_evaluate_lgb calls features.insert(0, 'stock_id'), which
# only works in place on a list - pandas Index.insert returns a new object instead.
features = train[features].columns[rfe.support_].tolist()

## Modeling

In [None]:
# Reload the feature table from disk. This replaces the `train` frame that was label-encoded,
# downcast by reduce_mem_usage and filled with -999 in the cells above, so the modeling below
# runs on the data exactly as stored in the Feather file.
train = pd.read_feather('../input/preprocessing-rapids-finish-in-3-mins/feature_train.feather')
# Root mean squared percentage error, the custom metric reported during training
def rmspe(y_true, y_pred):
    """Root mean squared percentage error between ``y_true`` and ``y_pred``.

    Each error is divided by the corresponding true value before squaring, so
    the result is relative to the magnitude of ``y_true``.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

def feval_rmspe(y_pred, lgb_train):
    """Custom LightGBM evaluation function reporting RMSPE.

    Parameters
    ----------
    y_pred : array-like
        Predictions for the dataset being evaluated.
    lgb_train : object exposing ``get_label()``
        Dataset whose labels are compared against ``y_pred``.

    Returns
    -------
    tuple
        ``('RMSPE', score, False)``; the trailing ``False`` marks the metric as
        lower-is-better.
    """
    score = rmspe(lgb_train.get_label(), y_pred)
    return 'RMSPE', score, False

def train_and_evaluate_lgb(train, params, features):
    """Fit one LightGBM booster per GroupKFold split and print the out-of-fold RMSPE.

    Folds are grouped by ``time_id``, so rows sharing a ``time_id`` never end up in
    both the training and the validation part of the same fold. Every sample is
    weighted by ``1 / target**2``. A gain-importance plot is drawn for each fold.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``time_id``, ``stock_id``, the column named by the module-level
        ``target_name`` and every column listed in ``features``.
    params : dict
        Parameters forwarded to ``lgb.train``.
    features : sequence of str
        Feature column names (list, pandas Index, ...). The sequence is copied and
        never modified; ``'stock_id'`` is prepended to the copy when it is missing.

    Returns
    -------
    list
        The trained boosters, one per fold, in fold order.
    """
    n_splits = 5

    # Work on a private list. Calling insert() on the argument itself would mutate
    # the caller's list, and is a silent no-op for a pandas Index (Index.insert
    # returns a new object), which would leave 'stock_id' out of the features.
    features = list(features)
    if 'stock_id' not in features:
        features.insert(0, 'stock_id')

    y = train[target_name]
    # split() yields positional row numbers, so rows and columns are selected with
    # iloc; label-based loc is only equivalent for a default RangeIndex.
    feature_pos = [train.columns.get_loc(col) for col in features]

    # Out-of-fold predictions, filled fold by fold
    oof_predictions = np.zeros(train.shape[0])
    gfold = GroupKFold(n_splits=n_splits)
    models = []

    # `verbose_eval` / `early_stopping_rounds` keyword arguments were removed from
    # lgb.train in LightGBM 4; the equivalent callbacks work across versions.
    # `log_evaluation` replaced `print_evaluation` in LightGBM 3.3.
    log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

    for fold, (train_idx, val_idx) in enumerate(gfold.split(train, y, train['time_id'])):
        print('CV {}/{}'.format(fold + 1, n_splits))

        x_train = train.iloc[train_idx, feature_pos]
        y_train = y.iloc[train_idx]
        x_val = train.iloc[val_idx, feature_pos]
        y_val = y.iloc[val_idx]

        # Root mean squared percentage error weights
        train_weights = 1 / np.square(y_train)
        val_weights = 1 / np.square(y_val)

        train_dataset = lgb.Dataset(x_train, y_train, weight=train_weights,
                                    categorical_feature=['stock_id'])
        val_dataset = lgb.Dataset(x_val, y_val, weight=val_weights,
                                  categorical_feature=['stock_id'])
        model = lgb.train(params=params,
                          num_boost_round=1000,
                          train_set=train_dataset,
                          valid_sets=[train_dataset, val_dataset],
                          feval=feval_rmspe,
                          callbacks=[lgb.early_stopping(20), log_evaluation(50)])

        lgb.plot_importance(model, max_num_features=20, importance_type='gain')
        models.append(model)
        # Add predictions to the out of folds array
        oof_predictions[val_idx] = model.predict(x_val)

    rmspe_score = rmspe(y, oof_predictions)
    _ = gc.collect()
    print(f'Our out of folds RMSPE is {rmspe_score}')

    # Return the per-fold models
    return models
# Train and evaluate
models= train_and_evaluate_lgb(train,params, features)

In [None]:
# Persist the per-fold models; the context manager guarantees the file is flushed and closed.
with open('./models.pkl', 'wb') as model_file:
    pickle.dump(models, model_file)