Reference
- https://www.kaggle.com/code/swimmy/lgbm-model-fe-portfolio/notebook Thanks!

In this notebook, I use Optuna to search for LightGBM hyperparameters, and draw several diagrams to analyze the impact of different parameter combinations and the importance of each parameter.

If you want to take a look at Optuna, see my Notion page: [\[Optuna\] Tutorial for Optuna](https://dylanheddedly.notion.site/Optuna-Basic-Tutorial-for-Optuna-f440ce5746c8404586895252e6c17a63)

In [None]:
import optuna
import pandas as pd
import numpy as np
import sklearn.model_selection as sms 
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit
import joblib
import seaborn as sns
import os
import gc
from sklearn.metrics import mean_squared_error
from sklearn.utils import indexable
from sklearn.utils.validation import _num_samples
from sklearn.model_selection._split import _BaseKFold
from optuna.integration import LightGBMPruningCallback
from optuna.visualization import plot_contour
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
import warnings
warnings.filterwarnings("ignore")


In [None]:
# calculate factors functions
def returnMAE(df, n):
    """Add a per-security rolling mean of ``Close`` as column ``average{n}``.

    Rows are grouped by ``SecuritiesCode``; within each group the mean is taken
    over the trailing ``n`` rows in the frame's current row order, so the frame
    should already be sorted chronologically within each security. Windows
    that hold fewer than ``n`` rows are averaged over the rows available.

    Args:
        df: Frame with ``SecuritiesCode`` and ``Close`` columns. Modified in
            place.
        n: Rolling window length, in rows.

    Returns:
        The same ``df`` object with the ``average{n}`` column added.
    """
    # groupby/transform returns a result aligned with df's own rows. This
    # avoids depending on a module-level list of codes, re-filtering the whole
    # frame once per security, and seeding pd.concat with a dtype-less empty
    # Series.
    closes = df.groupby('SecuritiesCode', sort=False)['Close']
    df[f'average{n}'] = closes.transform(
        lambda prices: prices.rolling(window=n, min_periods=1).mean()
    )
    return df


def returnEWM(df):
    """Add a per-security exponentially weighted mean of ``Close`` as ``EWM``.

    The mean uses ``com=0.5`` with ``adjust=True`` and is computed separately
    for each ``SecuritiesCode``, following the frame's current row order.

    Args:
        df: Frame with ``SecuritiesCode`` and ``Close`` columns. Modified in
            place.

    Returns:
        The same ``df`` object with the ``EWM`` column added.
    """
    # transform keeps the result aligned with df's rows, so no module-level
    # list of codes and no per-security rescans of the frame are needed.
    closes = df.groupby('SecuritiesCode', sort=False)['Close']
    df['EWM'] = closes.transform(
        lambda prices: prices.ewm(com=0.5, adjust=True).mean()
    )
    return df

def returnBoll(df,n=20):
    """Add per-security Bollinger-style bands of ``Close``.

    For each ``SecuritiesCode`` the trailing ``n``-row mean and sample standard
    deviation of ``Close`` are computed in the frame's current row order, and
    the bands are ``mean - 2 * std`` (``bollDown``) and ``mean + 2 * std``
    (``bollUp``). A window holding a single row has no sample standard
    deviation, so both bands are NaN there.

    Args:
        df: Frame with ``SecuritiesCode`` and ``Close`` columns. Modified in
            place.
        n: Rolling window length, in rows.

    Returns:
        The same ``df`` object with ``bollDown`` and ``bollUp`` added.
    """
    closes = df.groupby('SecuritiesCode', sort=False)['Close']
    # The built-in rolling mean/std replace a Python lambda that was called
    # once per window (twice over, once for each band).
    centre = closes.transform(
        lambda prices: prices.rolling(window=n, min_periods=1).mean()
    )
    spread = closes.transform(
        lambda prices: prices.rolling(window=n, min_periods=1).std()
    )
    df['bollDown'] = centre - 2 * spread
    df['bollUp'] = centre + 2 * spread
    return df

def returnMAEVolume(df, n=12):
    """Add a per-security rolling mean of ``Volume`` as ``MAEVolume_{n}``.

    The mean covers the trailing ``n`` rows of each ``SecuritiesCode`` in the
    frame's current row order; shorter windows use the rows available.

    Args:
        df: Frame with ``SecuritiesCode`` and ``Volume`` columns. Modified in
            place.
        n: Rolling window length, in rows.

    Returns:
        The same ``df`` object with the ``MAEVolume_{n}`` column added.
    """
    # transform keeps the result aligned with df's rows, so no module-level
    # list of codes and no per-security rescans of the frame are needed.
    volumes = df.groupby('SecuritiesCode', sort=False)['Volume']
    df[f'MAEVolume_{n}'] = volumes.transform(
        lambda traded: traded.rolling(window=n, min_periods=1).mean()
    )
    return df

def returnSTDVolume(df, n=10):
    """Add a per-security rolling std of ``Volume`` as ``STDVolume_{n}``.

    The sample standard deviation covers the trailing ``n`` rows of each
    ``SecuritiesCode`` in the frame's current row order. A window holding a
    single row yields NaN.

    Args:
        df: Frame with ``SecuritiesCode`` and ``Volume`` columns. Modified in
            place.
        n: Rolling window length, in rows.

    Returns:
        The same ``df`` object with the ``STDVolume_{n}`` column added.
    """
    # transform keeps the result aligned with df's rows, so no module-level
    # list of codes and no per-security rescans of the frame are needed.
    volumes = df.groupby('SecuritiesCode', sort=False)['Volume']
    df[f'STDVolume_{n}'] = volumes.transform(
        lambda traded: traded.rolling(window=n, min_periods=1).std()
    )
    return df

def calUpNumber(x):
    """Count how many consecutive steps in ``x`` are increases.

    Every adjacent pair ``(x[i-1], x[i])`` is compared and the pair counts when
    ``x[i] > x[i-1]``. A window of ``k`` values therefore contributes ``k - 1``
    comparisons. Pairs involving NaN never count.

    The previous implementation computed ``previous - current > 0``, which
    counts decreases despite the function's name, and its slice misalignment
    skipped the first pair of every window.

    Args:
        x: One-dimensional window of values (a pandas Series or array-like).

    Returns:
        The number of increases, as an ``int``.
    """
    values = np.asarray(x, dtype='float64')
    # np.diff yields current - previous for each adjacent pair; comparisons
    # against NaN are False, so missing values are ignored.
    return int((np.diff(values) > 0).sum())
        
def returnUpDate(df, n=13):
    """Add the rolling ``calUpNumber`` of ``Volume`` as ``NumberUp_{n-1}``.

    For each ``SecuritiesCode``, ``calUpNumber`` is applied to every trailing
    window of up to ``n`` ``Volume`` rows, in the frame's current row order.
    The column is named after ``n - 1``, the number of adjacent pairs in a
    full window.

    Args:
        df: Frame with ``SecuritiesCode`` and ``Volume`` columns. Modified in
            place.
        n: Rolling window length, in rows.

    Returns:
        The same ``df`` object with the ``NumberUp_{n-1}`` column added.
    """
    # transform keeps the result aligned with df's rows, so no module-level
    # list of codes and no per-security rescans of the frame are needed.
    volumes = df.groupby('SecuritiesCode', sort=False)['Volume']
    df[f'NumberUp_{n-1}'] = volumes.transform(
        # raw=False hands calUpNumber a Series window, as before.
        lambda traded: traded.rolling(window=n, min_periods=1).apply(calUpNumber, raw=False)
    )
    return df

def returnWillingness(df, n=26):
    """Add the per-security ``Willness`` ratio.

    For each ``SecuritiesCode``, in the frame's current row order:

    * ``diff1 = High - previous Close``
    * ``diff2 = previous Close - Low``

    ``Willness`` is the trailing ``n``-row sum of ``diff1`` divided by the
    trailing ``n``-row sum of ``diff2``. The first row of every security has
    no previous close and is therefore NaN. A zero denominator is not
    special-cased, so the usual floating-point division result is stored.

    Args:
        df: Frame with ``SecuritiesCode``, ``High``, ``Low`` and ``Close``
            columns. Modified in place (only ``Willness`` is added).
        n: Rolling window length, in rows.

    Returns:
        The same ``df`` object with the ``Willness`` column added.
    """
    codes = df['SecuritiesCode']
    # A grouped shift keeps the previous close from leaking across securities.
    # It also avoids assigning helper columns onto a filtered slice of df,
    # which is what triggered SettingWithCopy behaviour before.
    previous_close = df.groupby('SecuritiesCode', sort=False)['Close'].shift(1)
    upside = df['High'] - previous_close
    downside = previous_close - df['Low']

    def _rolling_sum(values):
        # Trailing n-row sum within each security, aligned with df's rows.
        return values.groupby(codes, sort=False).transform(
            lambda part: part.rolling(window=n, min_periods=1).sum()
        )

    df['Willness'] = _rolling_sum(upside) / _rolling_sum(downside)
    return df

In [None]:
 # Load the training price table from the competition's input directory.
 df = pd.read_csv(
     '../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv',
 )

In [None]:
# Encode each date as a YYYYMMDD integer and drop rows that miss any
# price/volume field.
df = (
    df.assign(Date=pd.to_datetime(df['Date']).dt.strftime('%Y%m%d').astype(int))
      .dropna(subset=['Open', 'High', 'Low', 'Close', 'Volume'])
)
# Sorted array of the distinct security codes left after cleaning.
SecuritiesCode = np.sort(df['SecuritiesCode'].unique())
# Order rows by security, then by date, so the rolling features below see each
# security's rows in date order.
df = df.sort_values(by=['SecuritiesCode', 'Date'])

In [None]:
# Mean of Close/High/Low, multiplied by the traded volume.
df['money_flow'] = (df['Close'] + df['High'] + df['Low']) / 3 * df['Volume']
# Apply the factor functions in a fixed order; the order determines the column
# order of the resulting frame.
df = (
    df.pipe(returnMAE, 5)
      .pipe(returnMAE, 10)
      .pipe(returnMAE, 20)
      .pipe(returnMAE, 60)
      .pipe(returnMAE, 120)
      .pipe(returnBoll, n=20)
      .pipe(returnUpDate)
      .pipe(returnMAEVolume)
      .pipe(returnSTDVolume)
      .pipe(returnSTDVolume, 20)
      .pipe(returnWillingness)
)

In [None]:
# Second name for the engineered frame; this is a plain assignment, so both
# names refer to the same DataFrame object (no copy is made).
stock_price_data = df

In [None]:
# Every column from position 12 onward is treated as an engineered feature.
# NOTE(review): assumes the raw CSV contributes exactly 12 columns - confirm.
new_features = list(df.columns)[12:]
# Model inputs: a fixed set of raw columns followed by the engineered ones.
features = ['Date', 'Open', 'Close', 'High', 'Low', 'SecuritiesCode', *new_features]

In [None]:
# TimeSeriesSplit slices rows by position. The frame was ordered by
# (SecuritiesCode, Date) for feature engineering, which would make every fold
# boundary fall between groups of securities rather than between dates.
# Re-order chronologically so that each validation fold lies after its
# training fold in time.
stock_price_data = stock_price_data.sort_values(by=['Date', 'SecuritiesCode'])
# Feature matrix and single-column target frame, row-aligned with each other.
train_x = stock_price_data[features]
train_y = stock_price_data[['Target']]

Note that `LightGBMPruningCallback` is used here. This callback lets Optuna stop unpromising trials early, which speeds up the parameter search. However, it should be combined with a Bayesian, tree-model-based search strategy (Optuna's default) rather than random search.

In [None]:


def Objective(trial, X, y):
    """Optuna objective: mean validation RMSE of LightGBM over time-series folds.

    Samples one hyperparameter set from ``trial``, fits an ``LGBMRegressor`` on
    each ``TimeSeriesSplit`` fold with early stopping and Optuna pruning, and
    returns the mean root-mean-squared error over the folds.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature frame; rows are split by position, so they are expected to
            be in chronological order.
        y: Target frame/series, row-aligned with ``X``.

    Returns:
        Mean RMSE across the validation folds (lower is better).
    """
    n_splits = 5
    # Params Grid
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 200, 3000),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        'learning_rate': trial.suggest_float('learning_rate',0.01, 0.3),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.9, step=0.1),
        # LightGBM only applies bagging_fraction when bagging_freq is non-zero.
        # It is suggested as a single-choice categorical so that it is also
        # recorded in study.best_params.
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.9, step=0.1),
    }
    # Cross-validation
    cv = TimeSeriesSplit(n_splits)
    cv_scores = np.empty(n_splits)
    # Pruning callback watching the default "l2" validation metric.
    # NOTE(review): the same callback instance is reused for every fold, so
    # the boosting iteration (its report step) repeats across folds; Optuna
    # appears to ignore reports for steps it has already seen, so pruning is
    # likely driven by the first fold only - confirm.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "l2")
    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        fold_train_x, fold_valid_x = X.iloc[train_idx], X.iloc[valid_idx]
        fold_train_y, fold_valid_y = y.iloc[train_idx], y.iloc[valid_idx]
        # Model
        model = LGBMRegressor(**params)
        model.fit(
            fold_train_x,
            fold_train_y,
            eval_set=[(fold_valid_x, fold_valid_y)],
            # The early-stopping callback replaces the `early_stopping_rounds`
            # fit argument, which newer LightGBM releases deprecate/remove.
            callbacks=[lgb.early_stopping(stopping_rounds=100), pruning_callback],
        )

        predictions = model.predict(fold_valid_x)
        cv_scores[fold] = np.sqrt(mean_squared_error(fold_valid_y, predictions))

    return np.mean(cv_scores)

In [None]:
# Create a study that minimizes the objective's return value.
study = optuna.create_study(study_name='LGBM Regression', direction='minimize')


def func(trial):
    """Bind the training data so Optuna can call the objective with a trial only."""
    return Objective(trial, train_x, train_y)


# Run 50 trials of the search.
study.optimize(func, n_trials=50)

In [None]:
# Copy the best trial's parameters and pin the seed (set for test).
params_best = {**study.best_params, 'random_seed': 16}

In [None]:
def lgb_2(train_x, train_y, validation_x, validation_y, index, params):
    """Fit one ``LGBMRegressor`` with early stopping on a validation set.

    Args:
        train_x: Training features.
        train_y: Training target.
        validation_x: Validation features used for early stopping.
        validation_y: Validation target used for early stopping.
        index: Fold index. Not used by this function; kept so existing callers
            keep working.
        params: Keyword arguments forwarded to ``LGBMRegressor``.

    Returns:
        The fitted ``LGBMRegressor``.
    """
    model = LGBMRegressor(**params)
    model.fit(
        train_x,
        train_y,
        eval_set=[(validation_x, validation_y)],
        # The early-stopping callback replaces the `early_stopping_rounds` fit
        # argument, which newer LightGBM releases deprecate/remove.
        callbacks=[lgb.early_stopping(stopping_rounds=200)],
    )
    return model

In [None]:
# Train one model per time-series fold with the tuned parameters.
# TimeSeriesSplit() uses scikit-learn's default number of splits.
cv = TimeSeriesSplit()
for index, (train_id, val_id) in enumerate(cv.split(train_x, train_y)):
    model_lightgbm = lgb_2(train_x.iloc[train_id], train_y.iloc[train_id], train_x.iloc[val_id], train_y.iloc[val_id], index, params_best)
    # Persist every fold's model. Previously each iteration overwrote
    # `model_lightgbm` and nothing was saved, while the inference cell in this
    # notebook loads files with exactly these names.
    joblib.dump(model_lightgbm, f'./model_lightgbm_sk_{index}.pkl')

    print('Output of LightGBM Model training..')

In [None]:
# Disabled inference/submission cell: everything below is one bare string
# literal, so none of it runs. As written it would, for each batch from the
# competition iterator, rebuild the same engineered features, average the
# predictions of five pickled models (./model_lightgbm_sk_0.pkl through
# ./model_lightgbm_sk_4.pkl, which must exist in the working directory), rank
# the securities by predicted value and submit the ranks.
# NOTE(review): the rank assignment hard-codes 2000 rows per batch - confirm.
'''import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    prices['Date'] = pd.to_datetime(prices['Date']).dt.strftime('%Y%m%d').astype(int)
    df = prices
    df['money_flow'] = (df.Close + df.High + df.Low)/3 * df.Volume
    df = returnMAE(df, 5) 
    df = returnMAE(df, 10) 
    df = returnMAE(df, 20) 
    df = returnMAE(df, 60) 
    df = returnMAE(df, 120) 
    df = returnBoll(df,n=20)
    df = returnUpDate(df)
    df = returnMAEVolume(df)
    df = returnSTDVolume(df)
    df = returnSTDVolume(df, 20)
    df = returnWillingness(df)
    df = df[features]
    lgbm_pred = list()
    for i in range(5):
        joblib_model = joblib.load(f'./model_lightgbm_sk_{i}.pkl')
        pred = joblib_model.predict(df)
        lgbm_pred.append(pred)
    lgbm_pred = np.mean(lgbm_pred, axis = 0)
    print(lgbm_pred)
    sample_prediction['prediction'] = lgbm_pred
    print(sample_prediction)
    sample_prediction = sample_prediction.sort_values(by='prediction', ascending=False)
    print(sample_prediction)
    sample_prediction['Rank'] = np.arange(0, 2000)
    sample_prediction = sample_prediction.sort_values(by='SecuritiesCode', ascending=True)
    sample_prediction = sample_prediction.drop(columns=['prediction'])
    print(sample_prediction)
    submission = sample_prediction[['Date', 'SecuritiesCode', 'Rank']]
    print(submission)
    env.predict(submission)
'''

In [None]:
# Show Optuna's optimization-history plot for the finished study.
plot_optimization_history(study)

In [None]:
# Show Optuna's plot of the intermediate values reported by the study's trials.
plot_intermediate_values(study)

In [None]:
# Show Optuna's parallel-coordinate plot of the sampled parameters.
plot_parallel_coordinate(study)

In [None]:
# Show Optuna's parameter-importance plot for the study.
plot_param_importances(study)