In [32]:
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# 7日間ごとにデータを取得し、結合する関数
def get_last_days_data_in_chunks(ticker, interval="1m"):
    end_date = datetime.now()  # 現在の日付を終了日に設定
    start_date = end_date - timedelta(days=10)  # XX日前を開始日に設定
    all_data = pd.DataFrame()  # 取得した全データを格納するデータフレーム

    while start_date < end_date:
        # 7日間の終了日を設定
        period_end = min(start_date + timedelta(days=7), end_date)
        print(f"Fetching data from {start_date.strftime('%Y-%m-%d')} to {period_end.strftime('%Y-%m-%d')}")
        
        # データの取得
        data = yf.download(ticker, interval=interval, start=start_date.strftime('%Y-%m-%d'), end=period_end.strftime('%Y-%m-%d'))
        
        # データを結合
        all_data = pd.concat([all_data, data])
        
        # 次の開始日に移動
        start_date = period_end

    return all_data

# データの取得
ticker = "^N225"
data = get_last_days_data_in_chunks(ticker)

# 取得したデータの確認
print(data)

Fetching data from 2024-10-28 to 2024-11-04
[*********************100%%**********************]  1 of 1 completed
Fetching data from 2024-11-04 to 2024-11-07
[*********************100%%**********************]  1 of 1 completed
                                   Open          High           Low  \
Datetime                                                              
2024-10-28 09:00:00+09:00  37757.949219  37791.781250  37757.949219   
2024-10-28 09:01:00+09:00  37795.851562  37814.398438  37795.851562   
2024-10-28 09:03:00+09:00  37960.089844  38019.519531  37960.070312   
2024-10-28 09:04:00+09:00  38036.320312  38051.921875  37994.968750   
2024-10-28 09:05:00+09:00  38008.761719  38008.761719  37921.660156   
...                                 ...           ...           ...   
2024-11-06 14:55:00+09:00  39382.179688  39383.558594  39367.261719   
2024-11-06 14:56:00+09:00  39376.781250  39383.878906  39376.781250   
2024-11-06 14:57:00+09:00  39379.710938  39400.781250  39379.710

In [26]:
# 欠損値の確認
missing_values = data.isnull().sum()
print("欠損値の数:\n", missing_values)

# 欠損値のある行を削除
data = data.dropna()

欠損値の数:
 Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


In [27]:
# 四本値の範囲を確認（負の値や極端に高い値を検出）
for column in ['Open', 'High', 'Low', 'Close', 'Volume']:
    print(f'{column} の異常値:', data[(data[column] < 0) | (data[column] > data[column].mean() + 3 * data[column].std())])

# 異常値を補正（例: 負の値を0に置き換える）
for column in ['Open', 'High', 'Low', 'Close', 'Volume']:
    data[column] = np.where(data[column] < 0, 0, data[column])

Open の異常値: Empty DataFrame
Columns: [Open, High, Low, Close, Adj Close, Volume]
Index: []
High の異常値: Empty DataFrame
Columns: [Open, High, Low, Close, Adj Close, Volume]
Index: []
Low の異常値: Empty DataFrame
Columns: [Open, High, Low, Close, Adj Close, Volume]
Index: []
Close の異常値: Empty DataFrame
Columns: [Open, High, Low, Close, Adj Close, Volume]
Index: []
Volume の異常値: Empty DataFrame
Columns: [Open, High, Low, Close, Adj Close, Volume]
Index: []


In [28]:
from sklearn.preprocessing import StandardScaler

# 必要な列のみを選択して標準化
scaler = StandardScaler()
data[['Open', 'High', 'Low', 'Close', 'Volume']] = scaler.fit_transform(data[['Open', 'High', 'Low', 'Close', 'Volume']])

In [29]:
# 必要な列のみを選択
features = data[['Open', 'High', 'Low', 'Close', 'Volume']]

# 1. 5期間移動平均
data['SMA_5'] = data['Close'].rolling(window=5).mean()

# 2. 20期間移動平均
data['SMA_20'] = data['Close'].rolling(window=20).mean()

# 4. 短期・中期移動平均差分（5期間 - 20期間）
data['SMA_diff_5_20'] = data['SMA_5'] - data['SMA_20']

# 5. ボリンジャーバンド上限（20期間）
data['BB_upper'] = data['SMA_20'] + (data['Close'].rolling(window=20).std() * 2)

# 6. ボリンジャーバンド下限（20期間）
data['BB_lower'] = data['SMA_20'] - (data['Close'].rolling(window=20).std() * 2)

# 7. ボリンジャーバンド幅
data['BB_width'] = data['BB_upper'] - data['BB_lower']

# 8. 標準偏差（20期間）
data['Volatility_20'] = data['Close'].rolling(window=20).std()

# 9. ATR（平均値レンジ）
data['Prev_Close'] = data['Close'].shift(1)  # 前日の終値を新しい列に保存

data['TR'] = data[['High', 'Low', 'Prev_Close']].apply(
    lambda x: max(
        x['High'] - x['Low'],
        abs(x['High'] - x['Prev_Close']),
        abs(x['Low'] - x['Prev_Close'])
    ),
    axis=1
)

data['ATR_14'] = data['TR'].rolling(window=14).mean()

# 'Prev_Close'列は不要なので削除する場合は次の行を追加
data.drop(columns=['Prev_Close'], inplace=True)


# 10. RSI（14期間）
delta = data['Close'].diff(1)
gain = delta.where(delta > 0, 0)
loss = -delta.where(delta < 0, 0)
avg_gain = gain.rolling(window=14).mean()
avg_loss = loss.rolling(window=14).mean()
rs = avg_gain / avg_loss
data['RSI_14'] = 100 - (100 / (1 + rs))

# 11. MACD
data['EMA_12'] = data['Close'].ewm(span=12, adjust=False).mean()
data['EMA_26'] = data['Close'].ewm(span=26, adjust=False).mean()
data['MACD'] = data['EMA_12'] - data['EMA_26']

# 12. MACDシグナル
data['MACD_signal'] = data['MACD'].ewm(span=9, adjust=False).mean()

# 13. MACDヒストグラム
data['MACD_histogram'] = data['MACD'] - data['MACD_signal']

# 14. ストキャスティクス（%K）
data['L14'] = data['Low'].rolling(window=14).min()
data['H14'] = data['High'].rolling(window=14).max()
data['%K'] = (data['Close'] - data['L14']) * 100 / (data['H14'] - data['L14'])

# 15. ストキャスティクス（%D）
data['%D'] = data['%K'].rolling(window=3).mean()

# 17. 移動平均の傾き（20期間）
data['SMA_slope_20'] = data['SMA_20'].diff()

# 18. サポートラインとの距離
data['Support_Distance'] = data['Low'] - data['Low'].rolling(window=20).min()

# 19. レジスタンスラインとの距離
data['Resistance_Distance'] = data['High'].rolling(window=20).max() - data['High']

# 20. 価格変化率（5期間）
data['Price_Change_5'] = data['Close'].pct_change(periods=5)

# 21. 価格変化率（20期間）
data['Price_Change_20'] = data['Close'].pct_change(periods=20)

# 22. 瞬間ボラティリティ（差分平方）
data['Instant_Volatility'] = data['Close'].diff().pow(2).rolling(window=5).sum()

# 23. エンベロープ（±2%）
data['Envelope_upper'] = data['SMA_20'] * 1.02
data['Envelope_lower'] = data['SMA_20'] * 0.98

# 24. DMIのADX（14期間）
data['Plus_DM'] = data['High'].diff()
data['Minus_DM'] = -data['Low'].diff()
data['Plus_DI'] = 100 * (data['Plus_DM'].where(data['Plus_DM'] > 0, 0)).ewm(alpha=1/14, min_periods=0).mean() / data['ATR_14']
data['Minus_DI'] = 100 * (data['Minus_DM'].where(data['Minus_DM'] > 0, 0)).ewm(alpha=1/14, min_periods=0).mean() / data['ATR_14']
data['DX'] = (abs(data['Plus_DI'] - data['Minus_DI']) / abs(data['Plus_DI'] + data['Minus_DI'])) * 100
data['ADX'] = data['DX'].ewm(alpha=1/14, min_periods=0).mean()

# 25. 移動平均乖離率（20期間）
data['MA_deviation_20'] = (data['Close'] - data['SMA_20']) / data['SMA_20']

# 26. ウィリアムズ％R（14期間）
data['Williams_R'] = (data['H14'] - data['Close']) / (data['H14'] - data['L14']) * -100

# 27. 終値/始値の比率
data['Close_to_Open'] = data['Close'] / data['Open']

# 28. 終値の変化幅（前日比）
data['Close_Change'] = data['Close'].diff()

# 29. 終値の変化率（前日比、パーセンテージ）
data['Close_Pct_Change'] = data['Close'].pct_change() * 100

# 30. 前のローソク足との比率
data['Previous_Ratio'] = data['Close'] / data['Close'].shift(1)

# 欠損値の発生する先頭行を削除
data = data.dropna()

print(data)

                               Open      High       Low     Close  \
Datetime                                                            
2024-10-28 09:21:00+09:00 -0.427093 -0.364837 -0.488210 -0.375629   
2024-10-28 09:22:00+09:00 -0.383178 -0.396806 -0.604819 -0.628348   
2024-10-28 09:23:00+09:00 -0.619264 -0.559871 -0.616763 -0.583311   
2024-10-28 09:24:00+09:00 -0.529430 -0.433500 -0.507797 -0.423003   
2024-10-28 09:25:00+09:00 -0.382637 -0.388149 -0.499373 -0.440075   
...                             ...       ...       ...       ...   
2024-11-06 14:55:00+09:00  1.515407  1.496467  1.499341  1.495000   
2024-11-06 14:56:00+09:00  1.501550  1.497290  1.523782  1.507037   
2024-11-06 14:57:00+09:00  1.509070  1.540694  1.531304  1.542575   
2024-11-06 14:58:00+09:00  1.543670  1.587098  1.546548  1.603691   
2024-11-06 14:59:00+09:00  1.620691  1.653373  1.642958  1.656311   

                              Adj Close  Volume     SMA_5    SMA_20  \
Datetime                       

In [30]:
# 終値の1日後の値を追加
data['Next_Close'] = data['Close'].shift(-1)

# 終値が上がるか下がるかのラベルを作成
#　Tureなら1、Falseなら0
data.loc[:, 'Price_Up'] = (data['Next_Close'] > data['Close']).astype(int)

# 最後の行は次の値がないため削除
data = data.dropna()

print(data[[ 'Close', 'Next_Close', 'Price_Up']].head())

                              Close  Next_Close  Price_Up
Datetime                                                 
2024-10-28 09:21:00+09:00 -0.375629   -0.628348         0
2024-10-28 09:22:00+09:00 -0.628348   -0.583311         1
2024-10-28 09:23:00+09:00 -0.583311   -0.423003         1
2024-10-28 09:24:00+09:00 -0.423003   -0.440075         0
2024-10-28 09:25:00+09:00 -0.440075   -0.536318         0


In [31]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

columns_to_scale = ['Open', 'High', 'Low', 'Close', 'Volume', 'SMA_5', 'SMA_20', 'SMA_diff_5_20', 
                    'BB_upper', 'BB_lower', 'BB_width', 'Volatility_20', 'ATR_14', 'RSI_14', 
                    'MACD', 'MACD_signal', 'MACD_histogram', '%K', '%D', 'SMA_slope_20', 
                    'Support_Distance', 'Resistance_Distance', 'Price_Change_5', 'Price_Change_20', 
                    'Instant_Volatility', 'Envelope_upper', 'Envelope_lower', 'Plus_DI', 
                    'Minus_DI', 'DX', 'ADX', 'MA_deviation_20', 'Williams_R', 'Close_to_Open', 
                    'Close_Change', 'Close_Pct_Change', 'Previous_Ratio']

# 特徴量とラベルの定義
X = data[columns_to_scale]  # 標準化した特徴量
y = data['Price_Up']

# データの分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ランダムフォレストの学習と予測
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# 正解率の計算
accuracy = accuracy_score(y_test, y_pred)
print(f'ランダムフォレストのAccuracy: {accuracy:.3f}')

ランダムフォレストのAccuracy: 0.512
