In [100]:
import yfinance as yf
import numpy as np
import pandas as pd

# ティッカーシンボル指定
ticker = "^N225"
# ticker = "^TOPX"
# ticker = "7203.T" # トヨタ
# ticker = "6758.T" # SONY
# ticker = "9434.T" # ソフトバンク
# ticker = "6502.T" # 東芝
# ticker = "8306.T" # 三菱UFJ
# ticker = "6501.T" # 日立
# ticker = "6861.T" # キーエンス
# ticker = "6098.T" # リクルート
# ticker = "9983.T" # ファストリ
# ticker = "9432.T" # NTT
stock = yf.Ticker(ticker)
start = "2009-01-01"
end = "2023-12-31"
data_master = stock.history(start = start, end = end)

# 欠損値除去
data = data_master.dropna().copy()

# 対数差分 (log差分) を計算する関数
def log_diff(series):
    return np.log(series) - np.log(series.shift(1))

# OHLCVに対して対数差分を計算
data["LogDiff_Open"] = log_diff(data["Open"])
data["LogDiff_High"] = log_diff(data["High"])
data["LogDiff_Low"] = log_diff(data["Low"])
data["LogDiff_Close"] = log_diff(data["Close"])
data["LogDiff_Volume"] = log_diff(data["Volume"])

# 終値の階差特徴量（当日終値 - 過去 n 営業日前終値）
data["Close_diff_2"] = data["Close"] - data["Close"].shift(2)
data["Close_diff_4"] = data["Close"] - data["Close"].shift(4)
data["Close_diff_8"] = data["Close"] - data["Close"].shift(8)
data["Close_diff_16"] = data["Close"] - data["Close"].shift(16)

# 正解ラベルの作成
# 16営業日後の終値 (Close.shift(-16)) と翌営業日の始値 (Open.shift(-15)) を比較
# i日目のラベル = (Close_{i+16} > Open_{i+1}) ? 1 : 0
data["future_close"] = data["Close"].shift(-16)
data["next_open"] = data["Open"].shift(-15)

data["y_label"] = (data["future_close"] > data["next_open"]).astype(int)

# 不要列や欠損のある行を除去
# future_close, next_open はラベル計算用の中間列なので削除
data = data.drop(columns=["future_close", "next_open"])
data = data.dropna()

# これで data にはOHLCVの対数差分、終値の階差、および y_label が揃ったDataFrameが完成
print(data.head())


                                  Open         High          Low        Close  \
Date                                                                            
2009-01-28 00:00:00+09:00  8052.250000  8171.629883  7936.589844  8106.290039   
2009-01-29 00:00:00+09:00  8201.160156  8305.379883  8138.990234  8251.240234   
2009-01-30 00:00:00+09:00  8142.879883  8142.879883  7922.390137  7994.049805   
2009-02-02 00:00:00+09:00  7908.509766  7955.750000  7795.270020  7873.979980   
2009-02-03 00:00:00+09:00  7862.950195  8084.410156  7800.799805  7825.509766   

                              Volume  Dividends  Stock Splits  LogDiff_Open  \
Date                                                                          
2009-01-28 00:00:00+09:00  140000000        0.0           0.0      0.034023   
2009-01-29 00:00:00+09:00  160300000        0.0           0.0      0.018324   
2009-01-30 00:00:00+09:00  148000000        0.0           0.0     -0.007132   
2009-02-02 00:00:00+09:00  159700000 

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [94]:
!pip install PyWavelets

112798.77s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Defaulting to user installation because normal site-packages is not writeable


In [101]:
import numpy as np
import pandas as pd

# Haarフィルター
h = np.array([1/np.sqrt(2), 1/np.sqrt(2)])
g = np.array([1/np.sqrt(2), -1/np.sqrt(2)])

v_current = data["Close"].values
N = len(v_current)
max_level = 4

def line_index_for_level(j, t):
    # j: level (1-based), t: 0-basedインデックス
    # line = (2^j - 1) + 2^j * t
    return (2**j - 1) + (2**j)*t

for j in range(1, max_level+1):
    length_prev = len(v_current)
    length_new = length_prev // 2

    v_j = np.zeros(length_new)
    w_j = np.zeros(length_new)

    for t in range(length_new):
        # 2t, 2t+1から計算
        v_j[t] = g[0]*v_current[2*t] + g[1]*v_current[2*t+1]
        w_j[t] = h[0]*v_current[2*t] + h[1]*v_current[2*t+1]

    # 元フレームに合わせた列作成
    w_col = np.full(N, np.nan)
    v_col = np.full(N, np.nan)

    # 各tについて、指定の行番号に配置
    # 行番号は0始まりと仮定（問題文も0行目としている）
    for t in range(length_new):
        idx = line_index_for_level(j, t)
        if idx < N:
            w_col[idx] = w_j[t]
            v_col[idx] = v_j[t]
        else:
            # インデックス範囲外なら終了
            break

    data[f"w_{j}"] = w_col
    data[f"v_{j}"] = v_col

    # 次のレベルへ
    v_current = v_j

# 前方補間で不足分を過去の値で埋める
data = data.ffill()

print(data.head())



                                  Open         High          Low        Close  \
Date                                                                            
2009-01-28 00:00:00+09:00  8052.250000  8171.629883  7936.589844  8106.290039   
2009-01-29 00:00:00+09:00  8201.160156  8305.379883  8138.990234  8251.240234   
2009-01-30 00:00:00+09:00  8142.879883  8142.879883  7922.390137  7994.049805   
2009-02-02 00:00:00+09:00  7908.509766  7955.750000  7795.270020  7873.979980   
2009-02-03 00:00:00+09:00  7862.950195  8084.410156  7800.799805  7825.509766   

                              Volume  Dividends  Stock Splits  LogDiff_Open  \
Date                                                                          
2009-01-28 00:00:00+09:00  140000000        0.0           0.0      0.034023   
2009-01-29 00:00:00+09:00  160300000        0.0           0.0      0.018324   
2009-01-30 00:00:00+09:00  148000000        0.0           0.0     -0.007132   
2009-02-02 00:00:00+09:00  159700000 

In [102]:
data.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits',
       'LogDiff_Open', 'LogDiff_High', 'LogDiff_Low', 'LogDiff_Close',
       'LogDiff_Volume', 'Close_diff_2', 'Close_diff_4', 'Close_diff_8',
       'Close_diff_16', 'y_label', 'w_1', 'v_1', 'w_2', 'v_2', 'w_3', 'v_3',
       'w_4', 'v_4'],
      dtype='object')

時系列交差検証

In [108]:
import numpy as np
import pandas as pd
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit

# InfをNaNに置換
data.replace([np.inf, -np.inf], np.nan, inplace=True)
# # NaNを削除
# data = data.dropna()
# 全てのNaNを0で置き換える
data = data.fillna(0)

In [116]:
# 特徴量列を指定
# feature_cols = [
#     "LogDiff_Open", "LogDiff_High", "LogDiff_Low", "LogDiff_Close", "LogDiff_Volume",
#     "Close_diff_2", "Close_diff_4", "Close_diff_8", "Close_diff_16"
# ]
# feature_cols = [
#     "LogDiff_Open", "LogDiff_High", "LogDiff_Low", "LogDiff_Close", "LogDiff_Volume",
#     'w_1', 'v_1', 'w_2', 'v_2', 'w_3', 'v_3', 'w_4', 'v_4'
# ]

# feature_cols = [
#     "Close_diff_2", "Close_diff_4", "Close_diff_8", "Close_diff_16"
# ]
feature_cols = [
    'w_1', 'v_1', 'w_2', 'v_2', 'w_3', 'v_3', 'w_4', 'v_4'
]

X = data[feature_cols].astype(float).values
y = data["y_label"].astype(int).values

print("Number of NaNs in X:", np.isnan(X).sum())
print("Number of Infs in X:", np.isinf(X).sum())


Number of NaNs in X: 0
Number of Infs in X: 0


In [117]:
# TimeSeriesSplitを使用
# n_splitsは分割数で、たとえば5なら5回の時系列分割を行う
tscv = TimeSeriesSplit(n_splits=30)

all_predictions = []
all_actual = []

start_time = time.time()

# 時系列クロスバリデーションループ
for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # モデル学習
    model = XGBClassifier(eval_metric='logloss')
    model.fit(X_train, y_train)
    
    # テスト予測
    preds = model.predict(X_test)
    
    # 結果を蓄積
    all_predictions.extend(preds)
    all_actual.extend(y_test)

end_time = time.time()
execution_time = end_time - start_time

# numpy配列に変換
all_predictions = np.array(all_predictions)
all_actual = np.array(all_actual)

# 評価指標算出
accuracy = accuracy_score(all_actual, all_predictions)
precision = precision_score(all_actual, all_predictions, zero_division=0)
recall = recall_score(all_actual, all_predictions, zero_division=0)
f1 = f1_score(all_actual, all_predictions, zero_division=0)

print("Execution Time:", execution_time, "seconds")
print("Accuracy:", accuracy)
print("Precision(適合率):", precision)
print("Recall(再現率):", recall)
print("F1:", f1)


Execution Time: 3.3525989055633545 seconds
Accuracy: 0.517816091954023
Precision(適合率): 0.5377922784121806
Recall(再現率): 0.5443037974683544
F1: 0.5410284463894968


In [45]:
# (1) 通常GBRT用データ: 既存のfeature_colsをそのまま使用
feature_cols = [
    "LogDiff_Open", "LogDiff_High", "LogDiff_Low", "LogDiff_Close", "LogDiff_Volume",
    "Close_diff_2", "Close_diff_4", "Close_diff_8", "Close_diff_16"
]

X = data[feature_cols].values
y = data["y_label"].values

# (2) Window-based GBRT用の特徴量生成例
#   ここでは簡易例として、過去window_size=16日の終値を全て特徴量として使うとします。
#   実際には過去16日分のLogDiff_～などを展開して特徴量にすることも可能です。
window_size = 16
close_values = data["Close"].values

def create_window_features(values, window_size):
    # valuesは1次元配列: shape (total_samples,)
    # window_size日分を1サンプルにまとめる -> 出来るサンプル数は total_samples - window_size
    samples = []
    for i in range(window_size, len(values)):
        # i行目を予測するために、直近window_size日分を特徴量にする
        samples.append(values[i-window_size:i])
    return np.array(samples)

X_window = create_window_features(close_values, window_size)
# yもwindow_size日後からが有効なので、yも同期させる
y_window = y[window_size:]  # window_size日分シフト

# (3) 評価用関数
def run_timeseries_cv(X, y, n_splits=30):
    tscv = TimeSeriesSplit(n_splits=n_splits)
    all_preds = []
    all_true = []
    start_time = time.time()
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model = XGBClassifier(eval_metric='logloss')
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        
        all_preds.extend(preds)
        all_true.extend(y_test)
    
    end_time = time.time()
    accuracy = accuracy_score(all_true, all_preds)
    precision = precision_score(all_true, all_preds, zero_division=0)
    recall = recall_score(all_true, all_preds, zero_division=0)
    f1 = f1_score(all_true, all_preds, zero_division=0)
    exec_time = end_time - start_time
    return accuracy, precision, recall, f1, exec_time

# (4) 通常GBRT結果
acc_norm, prec_norm, rec_norm, f1_norm, time_norm = run_timeseries_cv(X, y, n_splits=3)
print("Normal GBRT:")
print("Execution Time:", time_norm, "seconds")
print("Accuracy:", acc_norm)
print("Precision:", prec_norm)
print("Recall:", rec_norm)
print("F1:", f1_norm)

# (5) Window-based GBRT結果
acc_w, prec_w, rec_w, f1_w, time_w = run_timeseries_cv(X_window, y_window, n_splits=3)
print("Window-based GBRT:")
print("Execution Time:", time_w, "seconds")
print("Accuracy:", acc_w)
print("Precision:", prec_w)
print("Recall:", rec_w)
print("F1:", f1_w)

# (6) 比較
print("Comparison:")
print("Normal GBRT vs Window-based GBRT")
print(f"Accuracy: {acc_norm:.4f} vs {acc_w:.4f}")
print(f"Precision: {prec_norm:.4f} vs {prec_w:.4f}")
print(f"Recall: {rec_norm:.4f} vs {rec_w:.4f}")
print(f"F1: {f1_norm:.4f} vs {f1_w:.4f}")

Normal GBRT:
Execution Time: 0.33135008811950684 seconds
Accuracy: 0.511944138184491
Precision: 0.538866930171278
Recall: 0.5656984785615491
F1: 0.5519568151147098
Window-based GBRT:
Execution Time: 0.3197751045227051 seconds
Accuracy: 0.4968623108157992
Precision: 0.5364485981308411
Recall: 0.3983344899375434
F1: 0.4571883711668658
Comparison:
Normal GBRT vs Window-based GBRT
Accuracy: 0.5119 vs 0.4969
Precision: 0.5389 vs 0.5364
Recall: 0.5657 vs 0.3983
F1: 0.5520 vs 0.4572


In [54]:
import numpy as np
import pandas as pd
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit

feature_cols = [
    "LogDiff_Open", "LogDiff_High", "LogDiff_Low", "LogDiff_Close", "LogDiff_Volume",
    "Close_diff_2", "Close_diff_4", "Close_diff_8", "Close_diff_16"
]

X = data[feature_cols].values
y = data["y_label"].values

# 分割数設定
n_splits = 30
tscv = TimeSeriesSplit(n_splits=n_splits)

def run_timeseries_cv(X, y, tscv, use_window=False, window_size=100):
    all_predictions = []
    all_actual = []
    start_time = time.time()

    for train_index, test_index in tscv.split(X):
        if use_window:
            # Window-based: 学習データは直近window_sizeサンプルのみ使用
            if len(train_index) < window_size:
                # データがwindow_sizeよりも少ない場合はスキップまたは全データ使用
                X_train, y_train = X[train_index], y[train_index]
            else:
                X_train, y_train = X[train_index[-window_size:]], y[train_index[-window_size:]]
        else:
            # 通常のGBRT: train_index全て使用
            X_train, y_train = X[train_index], y[train_index]

        X_test, y_test = X[test_index], y[test_index]

        # モデル学習
        model = XGBClassifier(eval_metric='logloss')
        model.fit(X_train, y_train)

        # テスト予測
        preds = model.predict(X_test)

        # 結果を蓄積
        all_predictions.extend(preds)
        all_actual.extend(y_test)

    end_time = time.time()
    execution_time = end_time - start_time

    all_predictions = np.array(all_predictions)
    all_actual = np.array(all_actual)

    accuracy = accuracy_score(all_actual, all_predictions)
    precision = precision_score(all_actual, all_predictions, zero_division=0)
    recall = recall_score(all_actual, all_predictions, zero_division=0)
    f1 = f1_score(all_actual, all_predictions, zero_division=0)

    return accuracy, precision, recall, f1, execution_time

# 通常GBRT（全学習データ使用）
acc_normal, prec_normal, rec_normal, f1_normal, time_normal = run_timeseries_cv(X, y, tscv, use_window=False)

# Window-based GBRT（直近100サンプル使用例）
acc_window, prec_window, rec_window, f1_window, time_window = run_timeseries_cv(X, y, tscv, use_window=True, window_size=900)

print("=== Normal GBRT ===")
print("Accuracy:", acc_normal)
print("Precision:", prec_normal)
print("Recall:", rec_normal)
print("F1:", f1_normal)
print("Execution Time:", time_normal)

print("=== Window-based GBRT ===")
print("Accuracy:", acc_window)
print("Precision:", prec_window)
print("Recall:", rec_window)
print("F1:", f1_window)
print("Execution Time:", time_window)


=== Normal GBRT ===
Accuracy: 0.5230769230769231
Precision: 0.5406397482957525
Recall: 0.5636960087479497
F1: 0.5519271948608137
Execution Time: 3.3207638263702393
=== Window-based GBRT ===
Accuracy: 0.5048433048433049
Precision: 0.5241123476417594
Recall: 0.5407326407873154
F1: 0.5322927879440258
Execution Time: 2.958173990249634
