In [2]:
import yfinance as yf
import numpy as np
import pandas as pd
import time
import warnings
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit
import math

# iisignature をインポート
import iisignature

warnings.filterwarnings("ignore")

In [11]:


# 処理対象のティッカー一覧
tickers = [
    "^N225",
    "^TOPX",
    "7203.T",  # トヨタ
    "6758.T",  # SONY
    "9434.T",  # ソフトバンク
    "6502.T",  # 東芝
    "8306.T",  # 三菱UFJ
    "6501.T",  # 日立
    "6861.T",  # キーエンス
    "6098.T",  # リクルート
    "9983.T",  # ファストリ
    "9432.T"   # NTT
]

# 特徴量セット定義（名称を付けて管理）
# 既存セットに加え、Signatureセットを追加
feature_sets = {
    # "Set3": [
    #     "Close_diff_2", "Close_diff_4", "Close_diff_8", "Close_diff_16"
    # ],
    # "Set4": [
    #     "w_1", "v_1", "w_2", "v_2", "w_3", "v_3", "w_4", "v_4"
    # ],
    # 新しく追加：Signatureセット
    # シグネチャは生のClose系列から抽出する例を示す。
    # 16日分のCloseを入力し、2階までのシグネチャを求めるなど。
    "Signature": []
}

# 日付範囲
start = "2009-01-01"
end = "2023-12-31"

# 結果格納用リスト
results = []

# 処理開始
overall_start_time = time.time()

for ticker in tickers:
    try:
        print(f"Processing ticker: {ticker}...")
        ticker_start_time = time.time()

        # データ取得
        stock = yf.Ticker(ticker)
        data_master = stock.history(start=start, end=end)

        # 欠損値除去
        data = data_master.dropna().copy()

        # 終値の階差特徴量
        data["Close_diff_2"] = data["Close"] - data["Close"].shift(2)
        data["Close_diff_4"] = data["Close"] - data["Close"].shift(4)
        data["Close_diff_8"] = data["Close"] - data["Close"].shift(8)
        data["Close_diff_16"] = data["Close"] - data["Close"].shift(16)

        # ラベル作成
        data["future_close"] = data["Close"].shift(-16)
        data["next_open"] = data["Open"].shift(-15)
        data["y_label"] = (data["future_close"] > data["next_open"]).astype(int)

        # Haarフィルター定義
        h = np.array([1/np.sqrt(2), 1/np.sqrt(2)])
        g = np.array([1/np.sqrt(2), -1/np.sqrt(2)])

        v_current = data["Close"].values
        N = len(v_current)
        max_level = 4

        def line_index_for_level(j, t):
            return (2**j - 1) + (2**j)*t

        for j in range(1, max_level+1):
            length_prev = len(v_current)
            length_new = length_prev // 2

            v_j = np.zeros(length_new)
            w_j = np.zeros(length_new)

            for t_i in range(length_new):
                v_j[t_i] = g[0]*v_current[2*t_i] + g[1]*v_current[2*t_i+1]
                w_j[t_i] = h[0]*v_current[2*t_i] + h[1]*v_current[2*t_i+1]

            w_col = np.full(N, np.nan)
            v_col = np.full(N, np.nan)

            for t_i in range(length_new):
                idx = line_index_for_level(j, t_i)
                if idx < N:
                    w_col[idx] = w_j[t_i]
                    v_col[idx] = v_j[t_i]
                else:
                    break

            data[f"w_{j}"] = w_col
            data[f"v_{j}"] = v_col

            v_current = v_j

        # 不要列削除
        data = data.drop(columns=["future_close", "next_open"])
        # 前方補間
        data = data.ffill()

        # InfをNaNに
        data.replace([np.inf, -np.inf], np.nan, inplace=True)
        # NaNを0で埋める
        data = data.fillna(0)

        # y_labelの取得
        y = data["y_label"].values

        # ウィンドウサイズ設定
        window_size = 16

        # シグネチャ計算用設定
        # ここではClose値を1次元系列として使用
        # iisignature.siglevel(dim, level)でシグネチャの次元決定
        # dim=1（Closeのみ）、level=2などを設定
        dim = 1
        sig_level = 2
        # sig_level = 4
        sig_size = iisignature.siglength(dim, sig_level)  # シグネチャの次元数

        # 特徴量セット毎に評価
        for set_name, feature_cols in feature_sets.items():
            print(f"  Evaluating feature set: {set_name}...")
            feature_start_time = time.time()

            # Signatureセットの場合は、feature_colsが空（定義していない）なので
            # シグネチャを計算して特徴量を生成する
            if set_name == "Signature":
                # Close列を取得
                close_values = data["Close"].values
                X_list = []
                y_list = []
                num_samples = len(data)

                for t_i in range(window_size-1, num_samples):
                    # 過去16日分のClose値を抽出
                    window_data = close_values[t_i - window_size + 1 : t_i + 1]
                    # シグネチャ計算には (length, dim) の2次元配列が必要
                    window_data_2d = window_data.reshape(-1, 1)
                    sig_features = iisignature.sig(window_data_2d, sig_level)
                    X_list.append(sig_features)
                    y_list.append(y[t_i])

                X_w = np.array(X_list)
                y_w = np.array(y_list)

            else:
                # 既存手法：特徴量列が存在するか確認
                missing_cols = [col for col in feature_cols if col not in data.columns]
                if missing_cols:
                    print(f"  Missing columns in feature set {set_name}: {missing_cols}")
                    continue  # スキップ

                feature_values = data[feature_cols].values
                num_samples = len(data)
                X_list = []
                y_list = []

                for t_i in range(window_size-1, num_samples):
                    window_data = feature_values[t_i - window_size + 1 : t_i + 1, :]
                    window_features = window_data.flatten()
                    X_list.append(window_features)
                    y_list.append(y[t_i])

                X_w = np.array(X_list)
                y_w = np.array(y_list)

            # 分割数
            n_splits = min(30, len(X_w) - 1)
            if n_splits < 2:
                print(f"  Not enough data for TimeSeriesSplit. Skipping feature set {set_name}.")
                continue

            tscv = TimeSeriesSplit(n_splits=n_splits)
            all_predictions = []
            all_actual = []

            for train_index, test_index in tscv.split(X_w):
                X_train, X_test = X_w[train_index], X_w[test_index]
                y_train, y_test = y_w[train_index], y_w[test_index]

                model = XGBClassifier(eval_metric='logloss')
                model.fit(X_train, y_train)
                preds = model.predict(X_test)

                all_predictions.extend(preds)
                all_actual.extend(y_test)

            feature_end_time = time.time()

            all_predictions = np.array(all_predictions)
            all_actual = np.array(all_actual)

            accuracy = accuracy_score(all_actual, all_predictions)
            precision = precision_score(all_actual, all_predictions, zero_division=0)
            recall = recall_score(all_actual, all_predictions, zero_division=0)
            f1 = f1_score(all_actual, all_predictions, zero_division=0)

            results.append({
                "Ticker": ticker,
                "Feature_Set": set_name,
                "Accuracy": accuracy,
                "Precision": precision,
                "Recall": recall,
                "F1": f1,
                "Execution_Time(sec)": feature_end_time - feature_start_time
            })

        ticker_end_time = time.time()
        print(f"Finished processing ticker: {ticker}. Time taken: {ticker_end_time - ticker_start_time:.2f} seconds")

    except Exception as e:
        print(f"Error processing ticker {ticker}: {e}")
        continue

# 結果出力
print("All Results:")
for res in results:
    print(res)


Processing ticker: ^N225...
  Evaluating feature set: Signature...
Finished processing ticker: ^N225. Time taken: 3.07 seconds
Processing ticker: ^TOPX...
  Evaluating feature set: Signature...
Finished processing ticker: ^TOPX. Time taken: 2.75 seconds
Processing ticker: 7203.T...
  Evaluating feature set: Signature...
Finished processing ticker: 7203.T. Time taken: 2.97 seconds
Processing ticker: 6758.T...
  Evaluating feature set: Signature...
Finished processing ticker: 6758.T. Time taken: 3.11 seconds
Processing ticker: 9434.T...
  Evaluating feature set: Signature...


$6502.T: possibly delisted; no timezone found


Finished processing ticker: 9434.T. Time taken: 2.69 seconds
Processing ticker: 6502.T...
  Evaluating feature set: Signature...
  Not enough data for TimeSeriesSplit. Skipping feature set Signature.
Finished processing ticker: 6502.T. Time taken: 0.00 seconds
Processing ticker: 8306.T...
  Evaluating feature set: Signature...
Finished processing ticker: 8306.T. Time taken: 2.91 seconds
Processing ticker: 6501.T...
  Evaluating feature set: Signature...
Finished processing ticker: 6501.T. Time taken: 2.69 seconds
Processing ticker: 6861.T...
  Evaluating feature set: Signature...
Finished processing ticker: 6861.T. Time taken: 2.89 seconds
Processing ticker: 6098.T...
  Evaluating feature set: Signature...
Finished processing ticker: 6098.T. Time taken: 2.91 seconds
Processing ticker: 9983.T...
  Evaluating feature set: Signature...
Finished processing ticker: 9983.T. Time taken: 3.03 seconds
Processing ticker: 9432.T...
  Evaluating feature set: Signature...
Finished processing ticker

In [12]:
# 結果をDataFrameに変換
results_df = pd.DataFrame(results)
results_df = results_df.drop(columns=['Execution_Time(sec)'])

In [13]:
results_df['Feature_Set'] = results_df['Feature_Set'].replace({'Set3': 'Difference', 'Set4': 'Wavelet'})

In [14]:
results_df

Unnamed: 0,Ticker,Feature_Set,Accuracy,Precision,Recall,F1
0,^N225,Signature,0.502849,0.522594,0.537158,0.529776
1,^TOPX,Signature,0.501389,0.524899,0.516556,0.520694
2,7203.T,Signature,0.49435,0.492786,0.503401,0.498037
3,6758.T,Signature,0.498305,0.487358,0.444124,0.464738
4,9434.T,Signature,0.479487,0.499205,0.516447,0.50768
5,8306.T,Signature,0.495763,0.473582,0.431591,0.451613
6,6501.T,Signature,0.495198,0.496898,0.496059,0.496478
7,6861.T,Signature,0.498305,0.50225,0.500561,0.501404
8,6098.T,Signature,0.497222,0.521739,0.518062,0.519894
9,9983.T,Signature,0.485876,0.495505,0.519689,0.507309
