In [2]:
import yfinance as yf
import numpy as np
import pandas as pd
import time
import warnings
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit

warnings.filterwarnings("ignore")



In [3]:
# 処理対象のティッカー一覧
tickers = [
    "^N225",
    "^TOPX",
    "7203.T",  # トヨタ
    "6758.T",  # SONY
    "9434.T",  # ソフトバンク
    "6502.T",  # 東芝
    "8306.T",  # 三菱UFJ
    "6501.T",  # 日立
    "6861.T",  # キーエンス
    "6098.T",  # リクルート
    "9983.T",  # ファストリ
    "9432.T"   # NTT
]

# 特徴量
feature_sets = {
    "Set1": [
        "LogDiff_Open", "LogDiff_High", "LogDiff_Low", "LogDiff_Close", "LogDiff_Volume",
        "Close_diff_2", "Close_diff_4", "Close_diff_8", "Close_diff_16"
    ],
    "Set2": [
        "LogDiff_Open", "LogDiff_High", "LogDiff_Low", "LogDiff_Close", "LogDiff_Volume",
        "w_1", "v_1", "w_2", "v_2", "w_3", "v_3", "w_4", "v_4"
    ],
    "Set3": [
        "Close_diff_2", "Close_diff_4", "Close_diff_8", "Close_diff_16"
    ],
    "Set4": [
        "w_1", "v_1", "w_2", "v_2", "w_3", "v_3", "w_4", "v_4"
    ]
}

start = "2009-01-01"
end = "2023-12-31"

results = []

# 処理開始
overall_start_time = time.time()



for ticker in tickers:
    try:
        print(f"Processing ticker: {ticker}...")
        ticker_start_time = time.time()

        stock = yf.Ticker(ticker)
        data_master = stock.history(start=start, end=end)

        # 欠損値除去
        data = data_master.dropna().copy()

        # 対数差分計算用関数
        def log_diff(series):
            return np.log(series) - np.log(series.shift(1))

        # OHLCVに対して対数差分を計算
        data["LogDiff_Open"] = log_diff(data["Open"])
        data["LogDiff_High"] = log_diff(data["High"])
        data["LogDiff_Low"] = log_diff(data["Low"])
        data["LogDiff_Close"] = log_diff(data["Close"])
        data["LogDiff_Volume"] = log_diff(data["Volume"])

        # 終値の階差特徴量
        data["Close_diff_2"] = data["Close"] - data["Close"].shift(2)
        data["Close_diff_4"] = data["Close"] - data["Close"].shift(4)
        data["Close_diff_8"] = data["Close"] - data["Close"].shift(8)
        data["Close_diff_16"] = data["Close"] - data["Close"].shift(16)

        # ラベル作成
        data["future_close"] = data["Close"].shift(-16)
        data["next_open"] = data["Open"].shift(-15)
        data["y_label"] = (data["future_close"] > data["next_open"]).astype(int)

        # Haarフィルター
        h = np.array([1/np.sqrt(2), 1/np.sqrt(2)])
        g = np.array([1/np.sqrt(2), -1/np.sqrt(2)])

        v_current = data["Close"].values
        N = len(v_current)
        max_level = 4

        def line_index_for_level(j, t):
            return (2**j - 1) + (2**j)*t

        for j in range(1, max_level+1):
            length_prev = len(v_current)
            length_new = length_prev // 2

            v_j = np.zeros(length_new)
            w_j = np.zeros(length_new)

            for t in range(length_new):
                v_j[t] = g[0]*v_current[2*t] + g[1]*v_current[2*t+1]
                w_j[t] = h[0]*v_current[2*t] + h[1]*v_current[2*t+1]

            w_col = np.full(N, np.nan)
            v_col = np.full(N, np.nan)

            for t in range(length_new):
                idx = line_index_for_level(j, t)
                if idx < N:
                    w_col[idx] = w_j[t]
                    v_col[idx] = v_j[t]
                else:
                    break

            data[f"w_{j}"] = w_col
            data[f"v_{j}"] = v_col

            v_current = v_j

        data = data.drop(columns=["future_close", "next_open"])
        data = data.ffill()
        data.replace([np.inf, -np.inf], np.nan, inplace=True)
        data = data.fillna(0)

        X_all = data
        y = data["y_label"].values

        # 分割器
        tscv = TimeSeriesSplit(n_splits=10)

        # 特徴量セット毎に評価
        for set_name, feature_cols in feature_sets.items():
            print(f"  Evaluating feature set: {set_name}...")
            feature_start_time = time.time()

            # 特徴量列が存在するか確認
            missing_cols = [col for col in feature_cols if col not in data.columns]
            if missing_cols:
                print(f"  Missing columns in feature set {set_name}: {missing_cols}")
                continue

            # 特徴量抽出
            X = X_all[feature_cols].astype(float).values
            y = data["y_label"].values

            # 分割数を調整
            n_splits = min(30, len(X) - 1)
            if n_splits < 2:
                print(f"  Not enough data for TimeSeriesSplit. Skipping feature set {set_name}.")
                continue

            tscv = TimeSeriesSplit(n_splits=n_splits)

            all_predictions = []
            all_actual = []

            for train_index, test_index in tscv.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                model = XGBClassifier(eval_metric='logloss')
                model.fit(X_train, y_train)
                preds = model.predict(X_test)

                all_predictions.extend(preds)
                all_actual.extend(y_test)

            feature_end_time = time.time()

            all_predictions = np.array(all_predictions)
            all_actual = np.array(all_actual)

            accuracy = accuracy_score(all_actual, all_predictions)
            precision = precision_score(all_actual, all_predictions, zero_division=0)
            recall = recall_score(all_actual, all_predictions, zero_division=0)
            f1 = f1_score(all_actual, all_predictions, zero_division=0)

            results.append({
                "Ticker": ticker,
                "Feature_Set": set_name,
                "Accuracy": accuracy,
                "Precision": precision,
                "Recall": recall,
                "F1": f1,
                "Execution_Time(sec)": feature_end_time - feature_start_time
            })

        ticker_end_time = time.time()
        print(f"Finished processing ticker: {ticker}. Time taken: {ticker_end_time - ticker_start_time:.2f} seconds")

    except Exception as e:
        print(f"Error processing ticker {ticker}: {e}")
        continue  # エラーが発生したティッカーをスキップ


Processing ticker: ^N225...
  Evaluating feature set: Set1...
  Evaluating feature set: Set2...
  Evaluating feature set: Set3...
  Evaluating feature set: Set4...
Finished processing ticker: ^N225. Time taken: 13.81 seconds
Processing ticker: ^TOPX...
  Evaluating feature set: Set1...
  Evaluating feature set: Set2...
  Evaluating feature set: Set3...
  Evaluating feature set: Set4...
Finished processing ticker: ^TOPX. Time taken: 12.01 seconds
Processing ticker: 7203.T...
  Evaluating feature set: Set1...
  Evaluating feature set: Set2...
  Evaluating feature set: Set3...
  Evaluating feature set: Set4...
Finished processing ticker: 7203.T. Time taken: 12.38 seconds
Processing ticker: 6758.T...
  Evaluating feature set: Set1...
  Evaluating feature set: Set2...
  Evaluating feature set: Set3...
  Evaluating feature set: Set4...
Finished processing ticker: 6758.T. Time taken: 12.23 seconds
Processing ticker: 9434.T...
  Evaluating feature set: Set1...
  Evaluating feature set: Set2...

$6502.T: possibly delisted; no timezone found


  Evaluating feature set: Set1...
  Not enough data for TimeSeriesSplit. Skipping feature set Set1.
  Evaluating feature set: Set2...
  Not enough data for TimeSeriesSplit. Skipping feature set Set2.
  Evaluating feature set: Set3...
  Not enough data for TimeSeriesSplit. Skipping feature set Set3.
  Evaluating feature set: Set4...
  Not enough data for TimeSeriesSplit. Skipping feature set Set4.
Finished processing ticker: 6502.T. Time taken: 2.65 seconds
Processing ticker: 8306.T...
  Evaluating feature set: Set1...
  Evaluating feature set: Set2...
  Evaluating feature set: Set3...
  Evaluating feature set: Set4...
Finished processing ticker: 8306.T. Time taken: 14.19 seconds
Processing ticker: 6501.T...
  Evaluating feature set: Set1...
  Evaluating feature set: Set2...
  Evaluating feature set: Set3...
  Evaluating feature set: Set4...
Finished processing ticker: 6501.T. Time taken: 13.94 seconds
Processing ticker: 6861.T...
  Evaluating feature set: Set1...
  Evaluating feature s

In [34]:
results_df = pd.DataFrame(results)
results_df = results_df.drop(columns=['Execution_Time(sec)'])

In [35]:
results_df

Unnamed: 0,Ticker,Feature_Set,Accuracy,Precision,Recall,F1
0,^N225,Set1,0.506215,0.524845,0.549892,0.537076
1,^N225,Set2,0.491525,0.51236,0.494577,0.503311
2,^N225,Set3,0.518644,0.536496,0.558026,0.547049
3,^N225,Set4,0.494915,0.515184,0.515184,0.515184
4,^TOPX,Set1,0.514583,0.535443,0.560265,0.547573
5,^TOPX,Set2,0.511458,0.532782,0.554305,0.54333
6,^TOPX,Set3,0.504167,0.52635,0.542384,0.534247
7,^TOPX,Set4,0.505556,0.527529,0.545695,0.536458
8,7203.T,Set1,0.496359,0.494955,0.468276,0.481246
9,7203.T,Set2,0.502241,0.501131,0.497473,0.499296


In [36]:
results_df = results_df[~results_df['Feature_Set'].isin(['Set1', 'Set2'])]
results_df['Feature_Set'] = results_df['Feature_Set'].replace({'Set3': 'Difference', 'Set4': 'Wavelet'})
results_df = results_df.reset_index(drop=True)



In [37]:
results_df

Unnamed: 0,Ticker,Feature_Set,Accuracy,Precision,Recall,F1
0,^N225,Difference,0.518644,0.536496,0.558026,0.547049
1,^N225,Wavelet,0.494915,0.515184,0.515184,0.515184
2,^TOPX,Difference,0.504167,0.52635,0.542384,0.534247
3,^TOPX,Wavelet,0.505556,0.527529,0.545695,0.536458
4,7203.T,Difference,0.509804,0.508965,0.494104,0.501425
5,7203.T,Wavelet,0.494678,0.49365,0.501965,0.497773
6,6758.T,Difference,0.512045,0.502304,0.498286,0.500287
7,6758.T,Wavelet,0.512045,0.50213,0.538857,0.519846
8,9434.T,Difference,0.5,0.516611,0.51405,0.515327
9,9434.T,Wavelet,0.525641,0.534916,0.633058,0.579864


In [38]:
# 必要な列追加
results_df.insert(1, 'Company_Name', None)
results_df.insert(2, 'Industry', None)
results_df.insert(3, 'Sector', None)
results_df.insert(4, 'Market_Cap', None)

# Ticker列から情報を取得して列に追加
for index, ticker in enumerate(results_df['Ticker']):
    if pd.notna(ticker):
        try:
            stock = yf.Ticker(str(ticker))
            info = stock.info

            results_df.at[index, 'Company_Name'] = info.get('shortName', 'N/A')
            results_df.at[index, 'Industry'] = info.get('industry', 'N/A')
            results_df.at[index, 'Sector'] = info.get('sector', 'N/A')
            results_df.at[index, 'Market_Cap'] = info.get('marketCap', 'N/A')

        except Exception as e:
            print(f"Error fetching data for {ticker}: {e}")
    else:
        print(f"Skipping NaN value at index {index}.")


In [39]:
results_df

Unnamed: 0,Ticker,Company_Name,Industry,Sector,Market_Cap,Feature_Set,Accuracy,Precision,Recall,F1
0,^N225,Nikkei 225,,,,Difference,0.518644,0.536496,0.558026,0.547049
1,^N225,Nikkei 225,,,,Wavelet,0.494915,0.515184,0.515184,0.515184
2,^TOPX,,,,,Difference,0.504167,0.52635,0.542384,0.534247
3,^TOPX,,,,,Wavelet,0.505556,0.527529,0.545695,0.536458
4,7203.T,TOYOTA MOTOR CORP,Auto Manufacturers,Consumer Cyclical,34685944070144.0,Difference,0.509804,0.508965,0.494104,0.501425
5,7203.T,TOYOTA MOTOR CORP,Auto Manufacturers,Consumer Cyclical,34685944070144.0,Wavelet,0.494678,0.49365,0.501965,0.497773
6,6758.T,SONY GROUP CORPORATION,Consumer Electronics,Technology,19283639795712.0,Difference,0.512045,0.502304,0.498286,0.500287
7,6758.T,SONY GROUP CORPORATION,Consumer Electronics,Technology,19283639795712.0,Wavelet,0.512045,0.50213,0.538857,0.519846
8,9434.T,SOFTBANK CORP.,Telecom Services,Communication Services,9489193369600.0,Difference,0.5,0.516611,0.51405,0.515327
9,9434.T,SOFTBANK CORP.,Telecom Services,Communication Services,9489193369600.0,Wavelet,0.525641,0.534916,0.633058,0.579864


In [40]:
output_file = "evaluation_results_gbrt.csv"
results_df.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")

Results saved to evaluation_results_gbrt.csv


この結果はgbrtの結果　

In [1]:
data

NameError: name 'data' is not defined