In [1]:
import yfinance as yf
import numpy as np
import pandas as pd
import time
import warnings
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit

warnings.filterwarnings("ignore")



In [7]:
import time
import yfinance as yf
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
import math

# 処理対象のティッカー一覧
tickers = [
    "^N225",
    "^TOPX",
    "7203.T",  # トヨタ
    "6758.T",  # SONY
    "9434.T",  # ソフトバンク
    "6502.T",  # 東芝
    "8306.T",  # 三菱UFJ
    "6501.T",  # 日立
    "6861.T",  # キーエンス
    "6098.T",  # リクルート
    "9983.T",  # ファストリ
    "9432.T"   # NTT
]

# 特徴量
feature_sets = {
    "Set3": [
        "Close_diff_2", "Close_diff_4", "Close_diff_8", "Close_diff_16"
    ],
    "Set4": [
        "w_1", "v_1", "w_2", "v_2", "w_3", "v_3", "w_4", "v_4"
    ]
}

start = "2009-01-01"
end = "2023-12-31"

results = []

# 処理開始
overall_start_time = time.time()

for ticker in tickers:
    try:
        print(f"Processing ticker: {ticker}...")
        ticker_start_time = time.time()

        stock = yf.Ticker(ticker)
        data_master = stock.history(start=start, end=end)

        # 欠損値除去
        data = data_master.dropna().copy()

        # 終値の階差特徴量
        data["Close_diff_2"] = data["Close"] - data["Close"].shift(2)
        data["Close_diff_4"] = data["Close"] - data["Close"].shift(4)
        data["Close_diff_8"] = data["Close"] - data["Close"].shift(8)
        data["Close_diff_16"] = data["Close"] - data["Close"].shift(16)

        # ラベル作成
        data["future_close"] = data["Close"].shift(-16)
        data["next_open"] = data["Open"].shift(-15)
        data["y_label"] = (data["future_close"] > data["next_open"]).astype(int)

        # Haarフィルター
        h = np.array([1/np.sqrt(2), 1/np.sqrt(2)])
        g = np.array([1/np.sqrt(2), -1/np.sqrt(2)])

        v_current = data["Close"].values
        N = len(v_current)
        max_level = 4

        def line_index_for_level(j, t):
            return (2**j - 1) + (2**j)*t

        for j in range(1, max_level+1):
            length_prev = len(v_current)
            length_new = length_prev // 2

            v_j = np.zeros(length_new)
            w_j = np.zeros(length_new)

            for t_i in range(length_new):
                v_j[t_i] = g[0]*v_current[2*t_i] + g[1]*v_current[2*t_i+1]
                w_j[t_i] = h[0]*v_current[2*t_i] + h[1]*v_current[2*t_i+1]

            w_col = np.full(N, np.nan)
            v_col = np.full(N, np.nan)

            for t_i in range(length_new):
                idx = line_index_for_level(j, t_i)
                if idx < N:
                    w_col[idx] = w_j[t_i]
                    v_col[idx] = v_j[t_i]
                else:
                    break

            data[f"w_{j}"] = w_col
            data[f"v_{j}"] = v_col

            v_current = v_j

        data = data.drop(columns=["future_close", "next_open"])
        data = data.ffill()
        data.replace([np.inf, -np.inf], np.nan, inplace=True)
        data = data.fillna(0)

        y = data["y_label"].values

        # ウィンドウベース特徴量生成
        window_size = 16

        # 特徴量セット毎に評価
        for set_name, feature_cols in feature_sets.items():
            print(f"  Evaluating feature set: {set_name}...")
            feature_start_time = time.time()

            # 特徴量列が存在するか確認
            missing_cols = [col for col in feature_cols if col not in data.columns]
            if missing_cols:
                print(f"  Missing columns in feature set {set_name}: {missing_cols}")
                continue

            # windowベースでXを再構築
            # １次元化
            feature_values = data[feature_cols].values
            num_samples = len(data)
            # t行目のラベルを予測するときにはt-15〜t行を使うためtは15行目から始める
            valid_start = window_size - 1
            X_list = []
            y_list = []

            for t_i in range(valid_start, num_samples):
                window_data = feature_values[t_i - window_size + 1 : t_i + 1, :]  
                window_features = window_data.flatten()
                X_list.append(window_features)
                y_list.append(y[t_i])

            X_w = np.array(X_list)
            y_w = np.array(y_list)

            n_splits = min(30, len(X_w) - 1)
            if n_splits < 2:
                print(f"  Not enough data for TimeSeriesSplit. Skipping feature set {set_name}.")
                continue

            tscv = TimeSeriesSplit(n_splits=n_splits)

            all_predictions = []
            all_actual = []

            for train_index, test_index in tscv.split(X_w):
                X_train, X_test = X_w[train_index], X_w[test_index]
                y_train, y_test = y_w[train_index], y_w[test_index]

                model = XGBClassifier(eval_metric='logloss')
                model.fit(X_train, y_train)
                preds = model.predict(X_test)

                all_predictions.extend(preds)
                all_actual.extend(y_test)

            feature_end_time = time.time()

            all_predictions = np.array(all_predictions)
            all_actual = np.array(all_actual)

            accuracy = accuracy_score(all_actual, all_predictions)
            precision = precision_score(all_actual, all_predictions, zero_division=0)
            recall = recall_score(all_actual, all_predictions, zero_division=0)
            f1 = f1_score(all_actual, all_predictions, zero_division=0)

            results.append({
                "Ticker": ticker,
                "Feature_Set": set_name,
                "Accuracy": accuracy,
                "Precision": precision,
                "Recall": recall,
                "F1": f1,
                "Execution_Time(sec)": feature_end_time - feature_start_time
            })

        ticker_end_time = time.time()
        print(f"Finished processing ticker: {ticker}. Time taken: {ticker_end_time - ticker_start_time:.2f} seconds")

    except Exception as e:
        print(f"Error processing ticker {ticker}: {e}")
        continue  # エラーが発生したティッカーをスキップ

print("All Results:")
for res in results:
    print(res)


Processing ticker: ^N225...
  Evaluating feature set: Set3...
  Evaluating feature set: Set4...
Finished processing ticker: ^N225. Time taken: 17.16 seconds
Processing ticker: ^TOPX...
  Evaluating feature set: Set3...
  Evaluating feature set: Set4...
Finished processing ticker: ^TOPX. Time taken: 16.18 seconds
Processing ticker: 7203.T...
  Evaluating feature set: Set3...
  Evaluating feature set: Set4...
Finished processing ticker: 7203.T. Time taken: 17.55 seconds
Processing ticker: 6758.T...
  Evaluating feature set: Set3...
  Evaluating feature set: Set4...
Finished processing ticker: 6758.T. Time taken: 17.31 seconds
Processing ticker: 9434.T...
  Evaluating feature set: Set3...
  Evaluating feature set: Set4...
Finished processing ticker: 9434.T. Time taken: 9.48 seconds
Processing ticker: 6502.T...


$6502.T: possibly delisted; no timezone found


  Evaluating feature set: Set3...
  Not enough data for TimeSeriesSplit. Skipping feature set Set3.
  Evaluating feature set: Set4...
  Not enough data for TimeSeriesSplit. Skipping feature set Set4.
Finished processing ticker: 6502.T. Time taken: 3.28 seconds
Processing ticker: 8306.T...
  Evaluating feature set: Set3...
  Evaluating feature set: Set4...
Finished processing ticker: 8306.T. Time taken: 17.29 seconds
Processing ticker: 6501.T...
  Evaluating feature set: Set3...
  Evaluating feature set: Set4...
Finished processing ticker: 6501.T. Time taken: 16.92 seconds
Processing ticker: 6861.T...
  Evaluating feature set: Set3...
  Evaluating feature set: Set4...
Finished processing ticker: 6861.T. Time taken: 16.95 seconds
Processing ticker: 6098.T...
  Evaluating feature set: Set3...
  Evaluating feature set: Set4...
Finished processing ticker: 6098.T. Time taken: 13.27 seconds
Processing ticker: 9983.T...
  Evaluating feature set: Set3...
  Evaluating feature set: Set4...
Finish

In [8]:
results_df = pd.DataFrame(results)
results_df = results_df.drop(columns=['Execution_Time(sec)'])

In [11]:
results_df['Feature_Set'] = results_df['Feature_Set'].replace({'Set3': 'Difference', 'Set4': 'Wavelet'})


In [12]:
results_df

Unnamed: 0,Ticker,Feature_Set,Accuracy,Precision,Recall,F1
0,^N225,Difference,0.503989,0.522903,0.555191,0.538563
1,^N225,Wavelet,0.496866,0.517003,0.531694,0.524246
2,^TOPX,Difference,0.505903,0.527103,0.560265,0.543178
3,^TOPX,Wavelet,0.492708,0.516301,0.513907,0.515101
4,7203.T,Difference,0.507345,0.504783,0.50937,0.507066
5,7203.T,Wavelet,0.505932,0.503663,0.468484,0.485437
6,6758.T,Difference,0.501695,0.492771,0.470115,0.481176
7,6758.T,Wavelet,0.505085,0.49654,0.494828,0.495682
8,9434.T,Difference,0.482051,0.502423,0.510673,0.506515
9,9434.T,Wavelet,0.486325,0.50641,0.518883,0.512571


In [13]:
# 必要な列追加
results_df.insert(1, 'Company_Name', None)
results_df.insert(2, 'Industry', None)
results_df.insert(3, 'Sector', None)
results_df.insert(4, 'Market_Cap', None)

# Ticker列から情報を取得して列に追加
for index, ticker in enumerate(results_df['Ticker']):
    if pd.notna(ticker):
        try:
            stock = yf.Ticker(str(ticker))
            info = stock.info

            results_df.at[index, 'Company_Name'] = info.get('shortName', 'N/A')
            results_df.at[index, 'Industry'] = info.get('industry', 'N/A')
            results_df.at[index, 'Sector'] = info.get('sector', 'N/A')
            results_df.at[index, 'Market_Cap'] = info.get('marketCap', 'N/A')

        except Exception as e:
            print(f"Error fetching data for {ticker}: {e}")
    else:
        print(f"Skipping NaN value at index {index}.")

In [14]:
results_df

Unnamed: 0,Ticker,Company_Name,Industry,Sector,Market_Cap,Feature_Set,Accuracy,Precision,Recall,F1
0,^N225,Nikkei 225,,,,Difference,0.503989,0.522903,0.555191,0.538563
1,^N225,Nikkei 225,,,,Wavelet,0.496866,0.517003,0.531694,0.524246
2,^TOPX,,,,,Difference,0.505903,0.527103,0.560265,0.543178
3,^TOPX,,,,,Wavelet,0.492708,0.516301,0.513907,0.515101
4,7203.T,TOYOTA MOTOR CORP,Auto Manufacturers,Consumer Cyclical,34685944070144.0,Difference,0.507345,0.504783,0.50937,0.507066
5,7203.T,TOYOTA MOTOR CORP,Auto Manufacturers,Consumer Cyclical,34685944070144.0,Wavelet,0.505932,0.503663,0.468484,0.485437
6,6758.T,SONY GROUP CORPORATION,Consumer Electronics,Technology,19283639795712.0,Difference,0.501695,0.492771,0.470115,0.481176
7,6758.T,SONY GROUP CORPORATION,Consumer Electronics,Technology,19283639795712.0,Wavelet,0.505085,0.49654,0.494828,0.495682
8,9434.T,SOFTBANK CORP.,Telecom Services,Communication Services,9489193369600.0,Difference,0.482051,0.502423,0.510673,0.506515
9,9434.T,SOFTBANK CORP.,Telecom Services,Communication Services,9489193369600.0,Wavelet,0.486325,0.50641,0.518883,0.512571


In [15]:
output_file = "evaluation_results_WBgbrt.csv"
results_df.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")

Results saved to evaluation_results_WBgbrt.csv


この結果はgbrtの結果　