Check out my new work here: [\[Visualization\] Useful Stock Market Feature | Kaggle](https://www.kaggle.com/code/dylanhedded/visualization-useful-stock-market-factors)


In [None]:

import optuna
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
import joblib
import seaborn as sns
import os
import gc
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.utils import indexable
from sklearn.utils.validation import _num_samples
from sklearn.model_selection._split import _BaseKFold
from optuna.integration import LightGBMPruningCallback
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Load the stock price table from the Kaggle input dataset.
stock_price_data = pd.read_csv(
    '../input/reduced-stock-prices-std-outlier/reduced_stock_prices_std_outlier.csv'
)


In [None]:
# Re-encode the Date column as an integer of the form YYYYMMDD.
parsed_dates = pd.to_datetime(stock_price_data['Date'])
stock_price_data['Date'] = parsed_dates.dt.strftime('%Y%m%d').astype(int)


In [None]:
# Self-defined GroupTimeSeriesSplit
class GroupTimeSeriesSplit(_BaseKFold):
    """Expanding-window time-series splitter that keeps groups intact.

    The samples are partitioned into consecutive groups, and each split
    trains on the groups preceding a test window and tests on the next
    ``n_groups // (n_splits + 1)`` groups, so that no group is shared between
    the training and test indices of a split.

    The rows must already be ordered by ascending group label: group
    boundaries are derived from the per-label counts returned by
    ``np.unique`` (which are in sorted-label order) and applied to the row
    positions ``0..n_samples-1``.

    Parameters
    ----------
    n_splits : int, default=5
        Number of splits to generate.
    max_train_size : int, default=None
        Upper bound on the number of training *samples* per split. When set,
        only the most recent whole groups whose combined size does not exceed
        this bound are used for training.
    """

    def __init__(self, n_splits=5, *, max_train_size=None):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_size = max_train_size

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` pairs of row positions.

        Parameters
        ----------
        X : array-like
            Training data; only its number of samples is used.
        y : array-like, default=None
            Only checked for a consistent length.
        groups : array-like
            Group label of every sample, ordered ascending (see class
            docstring). Required.

        Raises
        ------
        ValueError
            If ``groups`` is None, if there are fewer groups than
            ``n_splits + 1``, or if ``max_train_size`` is too small to hold
            even the single group preceding a test window.
        """
        # Without this guard, np.unique(None) yields one pseudo-group and the
        # caller gets a misleading "folds > groups" error further down.
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")

        n_splits = self.n_splits
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_folds = n_splits + 1
        indices = np.arange(n_samples)
        # Size of every group, in sorted-label order.
        group_counts = np.unique(groups, return_counts=True)[1]
        # Cut the row positions into one index array per group.
        groups = np.split(indices, np.cumsum(group_counts)[:-1])
        n_groups = _num_samples(groups)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds ={0} greater"
                 " than the number of groups: {1}.").format(n_folds, n_groups))
        test_size = (n_groups // n_folds)
        # The first training window absorbs the remainder groups so that every
        # test window has exactly `test_size` groups.
        test_starts = range(test_size + n_groups % n_folds,
                            n_groups, test_size)
        for test_start in test_starts:
            test_indices = np.concatenate(
                groups[test_start:test_start + test_size])
            if self.max_train_size:
                # trailing_sizes[i] = number of samples in groups i..test_start-1.
                trailing_sizes = np.cumsum(
                    group_counts[:test_start][::-1])[::-1]
                # The mask is False...False True...True, so searchsorted
                # returns the first group whose trailing window fits the cap.
                train_start = np.searchsorted(
                    trailing_sizes < self.max_train_size + 1, True)
                if train_start >= test_start:
                    raise ValueError(
                        "max_train_size={0} is smaller than the group "
                        "immediately preceding the test window; no training "
                        "samples can be selected.".format(self.max_train_size))
                yield (np.concatenate(groups[train_start:test_start]),
                       test_indices)
            else:
                yield (np.concatenate(groups[:test_start]),
                       test_indices)

In [None]:
def lgbm_model(train_x, train_y, validation_x, valiastion_y,index):
    """Train one LightGBM regressor with early stopping and save it to disk.

    Parameters
    ----------
    train_x, train_y
        Features and label used to fit the model.
    validation_x, valiastion_y
        Features and label of the hold-out set that is monitored for early
        stopping.
    index
        Identifier of the fold; the model is written to
        ``model_lightgbm_{index}.txt`` in the working directory.

    Returns
    -------
    tuple
        ``(model_lightgbm, evals_result_lgbm)``: the trained booster and the
        dictionary of recorded evaluation results.
    """
    params = {
        # baseline parameters
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 100,
        "learning_rate" : 0.05,
        # NOTE(review): LightGBM documents that bagging only takes effect when
        # bagging_freq > 0; presumably this fraction is currently inert --
        # confirm before relying on it.
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        "bagging_seed" : 42,
        "seed": 42
    }

    lg_train = lgb.Dataset(train_x, label=train_y)
    lg_validation = lgb.Dataset(validation_x, label=valiastion_y)
    evals_result_lgbm = {}

    # The early_stopping_rounds / evals_result / verbose_eval keyword
    # arguments were removed from lgb.train in LightGBM 4.0; the callbacks
    # below are the equivalent that is accepted by both 3.3+ and 4.x.
    callbacks = [
        lgb.early_stopping(stopping_rounds=200),
        lgb.record_evaluation(evals_result_lgbm),
        lgb.log_evaluation(period=100),
    ]

    model_lightgbm = lgb.train(
        params,
        lg_train,
        num_boost_round=2000,
        valid_sets=[lg_train, lg_validation],
        callbacks=callbacks,
    )

    model_lightgbm.save_model(f'model_lightgbm_{index}.txt')

    return  model_lightgbm, evals_result_lgbm

In [None]:
# Model inputs: the encoded date plus the raw price/volume columns.
train_x = stock_price_data[
    ['Date', 'Open', 'Close', 'High', 'Low', 'Volume']
]
# Regression label.
train_y = stock_price_data[['Target']]
# Date of every row, kept as a single-column frame.
groups = stock_price_data[['Date']]

In [None]:
# Five expanding-window folds, each validated on the next 2 * 2000 rows.
gtscv = TimeSeriesSplit(n_splits=5, test_size=2 * 2000)
for index, (train_id, val_id) in enumerate(gtscv.split(train_x)):
    # Slice out this fold's training and validation rows by position.
    fold_train_x, fold_train_y = train_x.iloc[train_id], train_y.iloc[train_id]
    fold_val_x, fold_val_y = train_x.iloc[val_id], train_y.iloc[val_id]

    model_lightgbm, evals_result_lgbm = lgbm_model(
        fold_train_x, fold_train_y, fold_val_x, fold_val_y, index
    )

    # Plot the recorded RMSE curves for this fold.
    ax = lgb.plot_metric(evals_result_lgbm, metric='rmse')
    plt.show()
    print('Output of LightGBM Model training..')

In [None]:

import jpx_tokyo_market_prediction

# Columns fed to the models.
feature_columns = ['Date', 'Open', 'Close', 'High', 'Low', 'Volume']

# Load the five saved fold models once, instead of re-reading them from disk
# for every test batch.
fold_models = [
    lgb.Booster(model_file=f'./model_lightgbm_{i}.txt') for i in range(5)
]

env = jpx_tokyo_market_prediction.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    print(prices)
    # Same integer YYYYMMDD encoding of Date as applied to the training data.
    prices['Date'] = pd.to_datetime(prices['Date']).dt.strftime('%Y%m%d').astype(int)
    df = prices[feature_columns]

    # One prediction vector per fold model, averaged into the final score.
    lgbm_pred = list()
    for model in fold_models:
        prediction = model.predict(df, num_iteration=model.best_iteration)
        lgbm_pred.append(prediction)
        print(lgbm_pred)
    lgbm_pred = np.mean(lgbm_pred, axis = 0)
    print(lgbm_pred)

    # NOTE(review): this positional assignment assumes `prices` and
    # `sample_prediction` list the securities in the same row order -- confirm.
    sample_prediction['prediction'] = lgbm_pred
    print(sample_prediction)

    # Rank 0 goes to the highest predicted value.
    sample_prediction = sample_prediction.sort_values(by='prediction', ascending=False)
    print(sample_prediction)
    # Size the ranks from the batch itself; a hard-coded 2000 raises a
    # length-mismatch error whenever a batch has a different number of rows.
    sample_prediction['Rank'] = np.arange(len(sample_prediction))
    sample_prediction = sample_prediction.sort_values(by='SecuritiesCode', ascending=True)
    sample_prediction = sample_prediction.drop(columns=['prediction'])
    print(sample_prediction)
    submission = sample_prediction[['Date', 'SecuritiesCode', 'Rank']]
    print(submission)
    env.predict(submission)
