### まずは有効なすべてのTickerとTimestampの組合せを取得する

In [None]:
# Row position of the first sample emitted by generate_data: rows
# 0..WINDOW-1 of every history frame are skipped
# (presumably reserved as look-back history for a later step - TODO confirm).
WINDOW = 200
# NOTE(review): not referenced in the cells visible here; meaning to be
# confirmed against the code that consumes it.
RADIUS = 1
# NOTE(review): not referenced in the cells visible here; presumably the
# mini-batch size used further down - confirm.
BATCH_SIZE = 32

In [None]:
import numpy as np
import pandas as pd

def generate_data(df, window=None):
    """Build one sample per row of ``df`` from row position ``window`` onward.

    The per-row ``df.iloc[i]`` lookups of the earlier implementation are
    replaced by vectorised column access, which produces the same values
    without materialising a Series for every row.

    Args:
        df: DataFrame with 'Open' and 'Close' columns. Its index labels are
            returned as the sample timestamps.
        window: Number of leading rows to skip. When omitted, the
            module-level ``WINDOW`` constant is used.

    Returns:
        A tuple ``(target_indices, timestamps, target_changes)`` of three
        equal-length lists:
        - positional row indices ``window .. len(df) - 1``,
        - the index label of each of those rows,
        - ``log(Close + 1e-10) - log(Open + 1e-10)`` for each of those rows.
        All three lists are empty when ``df`` has ``window`` rows or fewer.
    """
    if window is None:
        window = WINDOW

    n_rows = len(df)
    if n_rows <= window:
        # Nothing to emit; also avoids touching columns of a short/empty frame.
        return [], [], []

    target_indices = list(range(window, n_rows))
    timestamps = list(df.index[window:])

    # Small offset keeps log() finite when a price is exactly zero.
    eps = 1e-10
    close_prices = df['Close'].to_numpy(dtype=float)[window:]
    open_prices = df['Open'].to_numpy(dtype=float)[window:]
    target_changes = list(np.log(close_prices + eps) - np.log(open_prices + eps))

    return target_indices, timestamps, target_changes

In [None]:
import os
import joblib
from tqdm import tqdm
from joblib import Parallel, delayed

# Directory that is listed for '.joblib' files; each file is loaded by
# process_file and read for its 'ticker' and 'history_df' entries.
input_dir = 'history'

def process_file(filename, input_dir):
    """Load one saved history file and turn it into per-sample lists.

    Args:
        filename: Name of the file inside ``input_dir``.
        input_dir: Directory that contains ``filename``.

    Returns:
        ``None`` when ``filename`` does not end with '.joblib'. Otherwise a
        dict with the keys 'tickers', 'target_indices', 'timestamps' and
        'target_changes'; 'tickers' repeats the file's ticker once per
        sample and the other three come from ``generate_data``.
    """
    is_history_file = filename.endswith('.joblib')
    if not is_history_file:
        return None

    payload = joblib.load(os.path.join(input_dir, filename))
    symbol = payload['ticker']
    frame = payload['history_df']

    # Build the samples for this ticker.
    indices, stamps, changes = generate_data(frame)

    samples = {}
    samples['tickers'] = [symbol] * len(indices)
    samples['target_indices'] = indices
    samples['timestamps'] = stamps
    samples['target_changes'] = changes
    return samples

# Run process_file over every '.joblib' file found in input_dir, in parallel.
files = [name for name in os.listdir(input_dir) if name.endswith('.joblib')]

# One delayed process_file call per file, drawn through a tqdm progress bar.
pending_jobs = (
    delayed(process_file)(filename, input_dir)
    for filename in tqdm(files, desc="Processing files")
)

# n_jobs=-1 asks joblib to use all CPU cores.
results = Parallel(n_jobs=-1)(pending_jobs)

# Merge the per-file results into four flat, index-aligned lists.
tickers, target_indices, timestamps, target_changes = [], [], [], []

for result in results:
    # process_file returns None for files it skipped.
    if result is None:
        continue
    tickers += result['tickers']
    target_indices += result['target_indices']
    timestamps += result['timestamps']
    target_changes += result['target_changes']

### test_start_timestamp よりも前のデータを学習に使用する

In [None]:
# datetime/timedelta are no longer needed by this cell; the import is kept
# because other cells may rely on it.
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split

# Cut-off between train/validation and test samples: 365 days before now.
# The timestamp is created timezone-aware in Asia/Tokyo. Taking a naive
# datetime.now() and labelling it Asia/Tokyo afterwards shifts the cut-off by
# the UTC offset whenever the machine's local timezone is not JST.
test_start_timestamp = pd.Timestamp.now(tz='Asia/Tokyo') - pd.Timedelta(days=365)

all_indices = np.arange(len(tickers))

# Samples strictly before the cut-off go to train/validation.
is_train = [timestamp < test_start_timestamp for timestamp in timestamps]
train_indices = all_indices[np.asarray(is_train, dtype=bool)]

# NOTE(review): this is a shuffled split with library defaults and no
# random_state, so the train/validation membership changes between runs.
train_indices, val_indices = train_test_split(train_indices)

# Samples at or after the cut-off form the test set.
is_test = [timestamp >= test_start_timestamp for timestamp in timestamps]
test_indices = all_indices[np.asarray(is_test, dtype=bool)]

In [None]:
import seaborn as sns

# Plot a histogram of the collected target_changes values
# (the per-sample log(Close) - log(Open) differences built above).
sns.histplot(target_changes)

In [None]:
# Reciprocal of the NaN-ignoring standard deviation of target_changes,
# displayed as the cell output.
# NOTE(review): presumably inspected to pick a scale factor for the
# targets - confirm where the value is used.
1 / np.nanstd(target_changes)