In [9]:
import yfinance as yf
import pandas as pd
import numpy as np
import xgboost as xgb
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV


# 7日間ごとにデータを取得し、結合する関数
def get_last_days_data_in_chunks(ticker, interval="1m"):
    end_date = datetime.now()  # 現在の日付を終了日に設定
    start_date = end_date - timedelta(days=4)  # XX日前を開始日に設定
    all_data = pd.DataFrame()  # 取得した全データを格納するデータフレーム

    while start_date < end_date:
        # 7日間の終了日を設定
        period_end = min(start_date + timedelta(days=7), end_date)
        print(f"Fetching data from {start_date.strftime('%Y-%m-%d')} to {period_end.strftime('%Y-%m-%d')}")
        
        # データの取得
        data = yf.download(ticker, interval=interval, start=start_date.strftime('%Y-%m-%d'), end=period_end.strftime('%Y-%m-%d'))
        
        # データを結合
        all_data = pd.concat([all_data, data])
        
        # 次の開始日に移動
        start_date = period_end

    return all_data

# データの取得
ticker = "^N225"
data = get_last_days_data_in_chunks(ticker)

# 取得したデータの確認
print(data)

Fetching data from 2024-11-21 to 2024-11-25
[*********************100%%**********************]  1 of 1 completed
                                   Open          High           Low  \
Datetime                                                              
2024-11-21 09:00:00+09:00  38352.871094  38374.691406  38352.871094   
2024-11-21 09:01:00+09:00  38342.800781  38342.800781  38301.820312   
2024-11-21 09:02:00+09:00  38294.769531  38294.769531  38248.671875   
2024-11-21 09:03:00+09:00  38239.238281  38247.820312  38204.109375   
2024-11-21 09:04:00+09:00  38212.968750  38212.968750  38161.230469   
...                                 ...           ...           ...   
2024-11-22 15:25:00+09:00  38357.421875  38357.421875  38357.421875   
2024-11-22 15:26:00+09:00  38357.421875  38357.421875  38357.421875   
2024-11-22 15:27:00+09:00  38357.421875  38357.421875  38357.421875   
2024-11-22 15:28:00+09:00  38357.421875  38357.421875  38357.421875   
2024-11-22 15:29:00+09:00  38357.42

In [10]:
# 欠損値の確認
missing_values = data.isnull().sum()
print("欠損値の数:\n", missing_values)

# 欠損値のある行を削除
data = data.dropna()

欠損値の数:
 Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


In [11]:
# 四本値の範囲を確認（負の値や極端に高い値を検出）
for column in ['Open', 'High', 'Low', 'Close', 'Volume']:
    print(f'{column} の異常値:', data[(data[column] < 0) | (data[column] > data[column].mean() + 3 * data[column].std())])

# 異常値を補正（例: 負の値を0に置き換える）
for column in ['Open', 'High', 'Low', 'Close', 'Volume']:
    data[column] = np.where(data[column] < 0, 0, data[column])

Open の異常値: Empty DataFrame
Columns: [Open, High, Low, Close, Adj Close, Volume]
Index: []
High の異常値: Empty DataFrame
Columns: [Open, High, Low, Close, Adj Close, Volume]
Index: []
Low の異常値: Empty DataFrame
Columns: [Open, High, Low, Close, Adj Close, Volume]
Index: []
Close の異常値: Empty DataFrame
Columns: [Open, High, Low, Close, Adj Close, Volume]
Index: []
Volume の異常値: Empty DataFrame
Columns: [Open, High, Low, Close, Adj Close, Volume]
Index: []


In [12]:
# 必要な列のみを選択して標準化
scaler = StandardScaler()
data[['Open', 'High', 'Low', 'Close', 'Volume']] = scaler.fit_transform(data[['Open', 'High', 'Low', 'Close', 'Volume']])

In [13]:
# 必要な列のみを選択
features = data[['Open', 'High', 'Low', 'Close', 'Volume']]

# 1. 5期間移動平均
data['SMA_5'] = data['Close'].rolling(window=5).mean()

# 2. 20期間移動平均
data['SMA_20'] = data['Close'].rolling(window=20).mean()

# 4. 短期・中期移動平均差分（5期間 - 20期間）
data['SMA_diff_5_20'] = data['SMA_5'] - data['SMA_20']

# 5. ボリンジャーバンド上限（20期間）
data['BB_upper'] = data['SMA_20'] + (data['Close'].rolling(window=20).std() * 2)

# 6. ボリンジャーバンド下限（20期間）
data['BB_lower'] = data['SMA_20'] - (data['Close'].rolling(window=20).std() * 2)

# 7. ボリンジャーバンド幅
data['BB_width'] = data['BB_upper'] - data['BB_lower']

# 8. 標準偏差（20期間）
data['Volatility_20'] = data['Close'].rolling(window=20).std()

# 9. ATR（平均値レンジ）
data['Prev_Close'] = data['Close'].shift(1)  # 前日の終値を新しい列に保存

data['TR'] = data[['High', 'Low', 'Prev_Close']].apply(
    lambda x: max(
        x['High'] - x['Low'],
        abs(x['High'] - x['Prev_Close']),
        abs(x['Low'] - x['Prev_Close'])
    ),
    axis=1
)

data['ATR_14'] = data['TR'].rolling(window=14).mean()

# 'Prev_Close'列は不要なので削除する場合は次の行を追加
data.drop(columns=['Prev_Close'], inplace=True)


# 10. RSI（14期間）
delta = data['Close'].diff(1)
gain = delta.where(delta > 0, 0)
loss = -delta.where(delta < 0, 0)
avg_gain = gain.rolling(window=14).mean()
avg_loss = loss.rolling(window=14).mean()
rs = avg_gain / avg_loss
data['RSI_14'] = 100 - (100 / (1 + rs))

# 11. MACD
data['EMA_12'] = data['Close'].ewm(span=12, adjust=False).mean()
data['EMA_26'] = data['Close'].ewm(span=26, adjust=False).mean()
data['MACD'] = data['EMA_12'] - data['EMA_26']

# 12. MACDシグナル
data['MACD_signal'] = data['MACD'].ewm(span=9, adjust=False).mean()

# 13. MACDヒストグラム
data['MACD_histogram'] = data['MACD'] - data['MACD_signal']

# 14. ストキャスティクス（%K）
data['L14'] = data['Low'].rolling(window=14).min()
data['H14'] = data['High'].rolling(window=14).max()
data['%K'] = (data['Close'] - data['L14']) * 100 / (data['H14'] - data['L14'])

# 15. ストキャスティクス（%D）
data['%D'] = data['%K'].rolling(window=3).mean()

# 17. 移動平均の傾き（20期間）
data['SMA_slope_20'] = data['SMA_20'].diff()

# 18. サポートラインとの距離
data['Support_Distance'] = data['Low'] - data['Low'].rolling(window=20).min()

# 19. レジスタンスラインとの距離
data['Resistance_Distance'] = data['High'].rolling(window=20).max() - data['High']

# 20. 価格変化率（5期間）
data['Price_Change_5'] = data['Close'].pct_change(periods=5)

# 21. 価格変化率（20期間）
data['Price_Change_20'] = data['Close'].pct_change(periods=20)

# 22. 瞬間ボラティリティ（差分平方）
data['Instant_Volatility'] = data['Close'].diff().pow(2).rolling(window=5).sum()

# 23. エンベロープ（±2%）
data['Envelope_upper'] = data['SMA_20'] * 1.02
data['Envelope_lower'] = data['SMA_20'] * 0.98

# 24. DMIのADX（14期間）
data['Plus_DM'] = data['High'].diff()
data['Minus_DM'] = -data['Low'].diff()
data['Plus_DI'] = 100 * (data['Plus_DM'].where(data['Plus_DM'] > 0, 0)).ewm(alpha=1/14, min_periods=0).mean() / data['ATR_14']
data['Minus_DI'] = 100 * (data['Minus_DM'].where(data['Minus_DM'] > 0, 0)).ewm(alpha=1/14, min_periods=0).mean() / data['ATR_14']
data['DX'] = (abs(data['Plus_DI'] - data['Minus_DI']) / abs(data['Plus_DI'] + data['Minus_DI'])) * 100
data['ADX'] = data['DX'].ewm(alpha=1/14, min_periods=0).mean()

# 25. 移動平均乖離率（20期間）
data['MA_deviation_20'] = (data['Close'] - data['SMA_20']) / data['SMA_20']

# 26. ウィリアムズ％R（14期間）
data['Williams_R'] = (data['H14'] - data['Close']) / (data['H14'] - data['L14']) * -100

# 27. 終値/始値の比率
data['Close_to_Open'] = data['Close'] / data['Open']

# 28. 終値の変化幅（前日比）
data['Close_Change'] = data['Close'].diff()

# 29. 終値の変化率（前日比、パーセンテージ）
data['Close_Pct_Change'] = data['Close'].pct_change() * 100

# 30. 前のローソク足との比率
data['Previous_Ratio'] = data['Close'] / data['Close'].shift(1)

# 欠損値の発生する先頭行を削除
data = data.dropna()

print(data.head())

                               Open      High       Low     Close  \
Datetime                                                            
2024-11-21 09:20:00+09:00 -0.587188 -0.624834 -0.776734 -0.754246   
2024-11-21 09:21:00+09:00 -0.734837 -0.733086 -0.816783 -0.807898   
2024-11-21 09:22:00+09:00 -0.808477 -0.854041 -0.954521 -0.980190   
2024-11-21 09:23:00+09:00 -0.976817 -0.891263 -0.956519 -0.947002   
2024-11-21 09:24:00+09:00 -0.942823 -0.872442 -0.895276 -0.875010   

                              Adj Close  Volume     SMA_5    SMA_20  \
Datetime                                                              
2024-11-21 09:20:00+09:00  38077.929688     0.0 -0.649417 -0.205616   
2024-11-21 09:21:00+09:00  38069.941406     0.0 -0.674131 -0.285880   
2024-11-21 09:22:00+09:00  38044.289062     0.0 -0.754970 -0.354516   
2024-11-21 09:23:00+09:00  38049.230469     0.0 -0.811776 -0.406527   
2024-11-21 09:24:00+09:00  38059.949219     0.0 -0.872869 -0.444412   

                  

In [14]:
# 終値の1日後の値を追加
data['Next_Close'] = data['Close'].shift(-1)

# 終値が上がるか下がるかのラベルを作成
#　Tureなら1、Falseなら0
data.loc[:, 'Price_Up'] = (data['Next_Close'] > data['Close']).astype(int)

# 最後の行は次の値がないため削除
data = data.dropna()

print(data[[ 'Close', 'Next_Close', 'Price_Up']].head())

                              Close  Next_Close  Price_Up
Datetime                                                 
2024-11-21 09:20:00+09:00 -0.754246   -0.807898         0
2024-11-21 09:21:00+09:00 -0.807898   -0.980190         0
2024-11-21 09:22:00+09:00 -0.980190   -0.947002         1
2024-11-21 09:23:00+09:00 -0.947002   -0.875010         1
2024-11-21 09:24:00+09:00 -0.875010   -0.890122         0


In [15]:
columns_to_scale = ['Open', 'High', 'Low', 'Close', 'Volume', 'SMA_5', 'SMA_20', 'SMA_diff_5_20', 
                    'BB_upper', 'BB_lower', 'BB_width', 'Volatility_20', 'ATR_14', 'RSI_14', 
                    'MACD', 'MACD_signal', 'MACD_histogram', '%K', '%D', 'SMA_slope_20', 
                    'Support_Distance', 'Resistance_Distance', 'Price_Change_5', 'Price_Change_20', 
                    'Instant_Volatility', 'Envelope_upper', 'Envelope_lower', 'Plus_DI', 
                    'Minus_DI', 'DX', 'ADX', 'MA_deviation_20', 'Williams_R', 'Close_to_Open', 
                    'Close_Change', 'Close_Pct_Change', 'Previous_Ratio']

# 特徴量とラベルの定義
X = data[columns_to_scale]
y = data['Price_Up']

In [25]:
#単純なランダムフォレストによる勝率の計算

# データの分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ランダムフォレストの学習と予測
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# 正解率の計算
accuracy = accuracy_score(y_test, y_pred)
print(f'ランダムフォレストのAccuracy: {accuracy:.3f}')

ランダムフォレストのAccuracy: 0.550


In [26]:
#クロスバリエーションを組み込んだランダムフォレストによる勝率の計算

# 時系列クロスバリデーションの設定
tscv = TimeSeriesSplit(n_splits=5)

# モデルの初期化
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# クロスバリデーションの実行
cv_scores = []
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # モデルの訓練
    rf_model.fit(X_train, y_train)
    
    # テストデータで予測
    y_pred = rf_model.predict(X_test)
    
    # 正解率の計算
    accuracy = accuracy_score(y_test, y_pred)
    cv_scores.append(accuracy)

# 各分割のスコアと平均スコアを表示
print(f'各FoldのAccuracy: {cv_scores}')
print(f'時系列クロスバリデーションの平均Accuracy: {np.mean(cv_scores):.3f}')

各FoldのAccuracy: [0.5046728971962616, 0.5514018691588785, 0.4766355140186916, 0.5327102803738317, 0.5981308411214953]
時系列クロスバリデーションの平均Accuracy: 0.533


In [22]:
#クロスバリエーション×ハイパラメータの設定×ランダムフォレスト
#めっちゃ時間かかるからあんまりやりたくない。

# 時系列クロスバリデーションの設定
tscv = TimeSeriesSplit(n_splits=5)

# ハイパーパラメータの設定
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# ランダムフォレストの初期化
rf_model = RandomForestClassifier(random_state=42)

# GridSearchCVの初期化
grid_search = GridSearchCV(
    estimator=rf_model, 
    param_grid=param_grid, 
    cv=tscv, 
    scoring='accuracy', 
    n_jobs=-1, 
    verbose=2
)

# ハイパーパラメータチューニングの実行
grid_search.fit(X, y)

# 最適なモデルとそのスコアを表示
print(f'最適なハイパーパラメータ: {grid_search.best_params_}')
print(f'最適なモデルの平均Accuracy: {grid_search.best_score_:.3f}')


Fitting 5 folds for each of 243 candidates, totalling 1215 fits
最適なハイパーパラメータ: {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
最適なモデルの平均Accuracy: 0.540
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   1.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   1.4s
[CV] END max_depth=None, max_features=sqrt, min_samples_l

In [23]:
#チューニング後に値を入れ直すためのコード
best_params = {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}


rf_model_optimized = RandomForestClassifier(**best_params, random_state=42)
rf_model_optimized.fit(X_train, y_train)

y_pred = rf_model_optimized.predict(X_test)
final_accuracy = accuracy_score(y_test, y_pred)
print(f'最適ハイパーパラメータモデルのAccuracy: {final_accuracy:.3f}')


最適ハイパーパラメータモデルのAccuracy: 0.628


In [27]:
#xgboostで、時短してみた。1つ上と内容は同様のはず。

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import numpy as np

# データの欠損値を処理
X = X.fillna(0)  # 特徴量の欠損値を処理
y = y.dropna()   # ラベルの欠損値を削除

# ラベルの不均衡を修正
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 時系列クロスバリデーションの設定
tscv = TimeSeriesSplit(n_splits=3)

# XGBoostモデルの初期化
xgb_model = XGBClassifier(random_state=42, tree_method='hist')

# パラメータグリッドの設定
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# GridSearchCVの初期化
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=tscv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
    error_score='raise'
)

# ハイパーパラメータチューニングの実行
grid_search.fit(X_resampled, y_resampled)

# 最適なモデルとそのスコアを表示
print(f'最適なハイパーパラメータ: {grid_search.best_params_}')
print(f'最適なモデルの平均Accuracy: {grid_search.best_score_:.3f}')


Fitting 3 folds for each of 108 candidates, totalling 324 fits
最適なハイパーパラメータ: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}
最適なモデルの平均Accuracy: 0.556
[CV 2/3] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8;, score=0.485 total time=   0.4s
[CV 1/3] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8;, score=0.528 total time=   1.0s
[CV 3/3] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=50, subsample=0.8;, score=0.558 total time=   1.0s
[CV 2/3] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0;, score=0.491 total time=   1.5s
[CV 3/3] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0;, score=0.583 total time=   2.6s
[CV 2/3] END colsample_bytree=0.8, learning_rate=0.01, max_depth=10, n_estimators=200, subsample=0.8;, score=0.485 total time=   3.8

In [20]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# データを訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

# 最適なハイパーパラメータでモデルを再構築
best_params = {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 
               'n_estimators': 200, 'subsample': 1.0}
final_model = XGBClassifier(random_state=42, tree_method='hist', **best_params)

# モデルの訓練
final_model.fit(X_train, y_train)

# テストデータで予測
y_pred = final_model.predict(X_test)

# 勝率（Accuracy）を計算
accuracy = accuracy_score(y_test, y_pred)
print(f'最終モデルのテストデータでの勝率（Accuracy）: {accuracy:.3f}')


最終モデルのテストデータでの勝率（Accuracy）: 0.605
