In [102]:
import yfinance as yf
import pandas as pdimport 
import numpy as np

# データの取得
ticker = "^N225"
data = yf.download(ticker, interval="1m", period="3d")
print(data.head())

[*********************100%%**********************]  1 of 1 completed
                                   Open          High           Low  \
Datetime                                                              
2024-10-31 09:00:00+09:00  39179.718750  39179.718750  39126.699219   
2024-10-31 09:01:00+09:00  39167.718750  39171.160156  39145.000000   
2024-10-31 09:02:00+09:00  39156.609375  39184.351562  39154.789062   
2024-10-31 09:03:00+09:00  39151.011719  39158.230469  39145.058594   
2024-10-31 09:04:00+09:00  39164.781250  39185.960938  39159.250000   

                                  Close     Adj Close  Volume  
Datetime                                                       
2024-10-31 09:00:00+09:00  39138.699219  39138.699219       0  
2024-10-31 09:01:00+09:00  39153.070312  39153.070312       0  
2024-10-31 09:02:00+09:00  39154.789062  39154.789062       0  
2024-10-31 09:03:00+09:00  39149.390625  39149.390625       0  
2024-10-31 09:04:00+09:00  39180.601562  39180.60

In [103]:
# 欠損値の確認
missing_values = data.isnull().sum()
print("欠損値の数:\n", missing_values)

# 欠損値のある行を削除
data = data.dropna()

欠損値の数:
 Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


In [104]:
# 四本値の範囲を確認（負の値や極端に高い値を検出）
for column in ['Open', 'High', 'Low', 'Close', 'Volume']:
    print(f'{column} の異常値:', data[(data[column] < 0) | (data[column] > data[column].mean() + 3 * data[column].std())])

# 異常値を補正（例: 負の値を0に置き換える）
for column in ['Open', 'High', 'Low', 'Close', 'Volume']:
    data[column] = np.where(data[column] < 0, 0, data[column])

Open の異常値: Empty DataFrame
Columns: [Open, High, Low, Close, Adj Close, Volume]
Index: []
High の異常値: Empty DataFrame
Columns: [Open, High, Low, Close, Adj Close, Volume]
Index: []
Low の異常値: Empty DataFrame
Columns: [Open, High, Low, Close, Adj Close, Volume]
Index: []
Close の異常値: Empty DataFrame
Columns: [Open, High, Low, Close, Adj Close, Volume]
Index: []
Volume の異常値: Empty DataFrame
Columns: [Open, High, Low, Close, Adj Close, Volume]
Index: []


In [105]:
from sklearn.preprocessing import StandardScaler

# 必要な列のみを選択して標準化
scaler = StandardScaler()
data[['Open', 'High', 'Low', 'Close', 'Volume']] = scaler.fit_transform(data[['Open', 'High', 'Low', 'Close', 'Volume']])

In [106]:
# 必要な列のみを選択
features = data[['Open', 'High', 'Low', 'Close', 'Volume']]

# 1. 5期間移動平均
data['SMA_5'] = data['Close'].rolling(window=5).mean()

# 2. 20期間移動平均
data['SMA_20'] = data['Close'].rolling(window=20).mean()

# 4. 短期・中期移動平均差分（5期間 - 20期間）
data['SMA_diff_5_20'] = data['SMA_5'] - data['SMA_20']

# 5. ボリンジャーバンド上限（20期間）
data['BB_upper'] = data['SMA_20'] + (data['Close'].rolling(window=20).std() * 2)

# 6. ボリンジャーバンド下限（20期間）
data['BB_lower'] = data['SMA_20'] - (data['Close'].rolling(window=20).std() * 2)

# 7. ボリンジャーバンド幅
data['BB_width'] = data['BB_upper'] - data['BB_lower']

# 8. 標準偏差（20期間）
data['Volatility_20'] = data['Close'].rolling(window=20).std()

# 9. ATR（平均値レンジ）
data['Prev_Close'] = data['Close'].shift(1)  # 前日の終値を新しい列に保存

data['TR'] = data[['High', 'Low', 'Prev_Close']].apply(
    lambda x: max(
        x['High'] - x['Low'],
        abs(x['High'] - x['Prev_Close']),
        abs(x['Low'] - x['Prev_Close'])
    ),
    axis=1
)

data['ATR_14'] = data['TR'].rolling(window=14).mean()

# 'Prev_Close'列は不要なので削除する場合は次の行を追加
data.drop(columns=['Prev_Close'], inplace=True)


# 10. RSI（14期間）
delta = data['Close'].diff(1)
gain = delta.where(delta > 0, 0)
loss = -delta.where(delta < 0, 0)
avg_gain = gain.rolling(window=14).mean()
avg_loss = loss.rolling(window=14).mean()
rs = avg_gain / avg_loss
data['RSI_14'] = 100 - (100 / (1 + rs))

# 11. MACD
data['EMA_12'] = data['Close'].ewm(span=12, adjust=False).mean()
data['EMA_26'] = data['Close'].ewm(span=26, adjust=False).mean()
data['MACD'] = data['EMA_12'] - data['EMA_26']

# 12. MACDシグナル
data['MACD_signal'] = data['MACD'].ewm(span=9, adjust=False).mean()

# 13. MACDヒストグラム
data['MACD_histogram'] = data['MACD'] - data['MACD_signal']

# 14. ストキャスティクス（%K）
data['L14'] = data['Low'].rolling(window=14).min()
data['H14'] = data['High'].rolling(window=14).max()
data['%K'] = (data['Close'] - data['L14']) * 100 / (data['H14'] - data['L14'])

# 15. ストキャスティクス（%D）
data['%D'] = data['%K'].rolling(window=3).mean()

# 17. 移動平均の傾き（20期間）
data['SMA_slope_20'] = data['SMA_20'].diff()

# 18. サポートラインとの距離
data['Support_Distance'] = data['Low'] - data['Low'].rolling(window=20).min()

# 19. レジスタンスラインとの距離
data['Resistance_Distance'] = data['High'].rolling(window=20).max() - data['High']

# 20. 価格変化率（5期間）
data['Price_Change_5'] = data['Close'].pct_change(periods=5)

# 21. 価格変化率（20期間）
data['Price_Change_20'] = data['Close'].pct_change(periods=20)

# 22. 瞬間ボラティリティ（差分平方）
data['Instant_Volatility'] = data['Close'].diff().pow(2).rolling(window=5).sum()

# 23. エンベロープ（±2%）
data['Envelope_upper'] = data['SMA_20'] * 1.02
data['Envelope_lower'] = data['SMA_20'] * 0.98

# 24. DMIのADX（14期間）
data['Plus_DM'] = data['High'].diff()
data['Minus_DM'] = -data['Low'].diff()
data['Plus_DI'] = 100 * (data['Plus_DM'].where(data['Plus_DM'] > 0, 0)).ewm(alpha=1/14, min_periods=0).mean() / data['ATR_14']
data['Minus_DI'] = 100 * (data['Minus_DM'].where(data['Minus_DM'] > 0, 0)).ewm(alpha=1/14, min_periods=0).mean() / data['ATR_14']
data['DX'] = (abs(data['Plus_DI'] - data['Minus_DI']) / abs(data['Plus_DI'] + data['Minus_DI'])) * 100
data['ADX'] = data['DX'].ewm(alpha=1/14, min_periods=0).mean()

# 25. 移動平均乖離率（20期間）
data['MA_deviation_20'] = (data['Close'] - data['SMA_20']) / data['SMA_20']

# 26. ウィリアムズ％R（14期間）
data['Williams_R'] = (data['H14'] - data['Close']) / (data['H14'] - data['L14']) * -100

# 27. 終値/始値の比率
data['Close_to_Open'] = data['Close'] / data['Open']

# 28. 終値の変化幅（前日比）
data['Close_Change'] = data['Close'].diff()

# 29. 終値の変化率（前日比、パーセンテージ）
data['Close_Pct_Change'] = data['Close'].pct_change() * 100

# 30. 前のローソク足との比率
data['Previous_Ratio'] = data['Close'] / data['Close'].shift(1)

# 欠損値の発生する先頭行を削除
data = data.dropna()

print(data.head())

                               Open      High       Low     Close  \
Datetime                                                            
2024-10-31 09:20:00+09:00  1.451245  1.424979  1.399954  1.375354   
2024-10-31 09:21:00+09:00  1.370649  1.345217  1.341134  1.325588   
2024-10-31 09:22:00+09:00  1.335464  1.371361  1.360766  1.397648   
2024-10-31 09:23:00+09:00  1.404644  1.457929  1.430049  1.461233   
2024-10-31 09:24:00+09:00  1.472622  1.446308  1.459393  1.464164   

                              Adj Close  Volume     SMA_5    SMA_20  \
Datetime                                                              
2024-10-31 09:20:00+09:00  39074.019531     0.0  1.469745  1.641475   
2024-10-31 09:21:00+09:00  39056.378906     0.0  1.431772  1.627836   
2024-10-31 09:22:00+09:00  39081.921875     0.0  1.399448  1.617558   
2024-10-31 09:23:00+09:00  39104.460938     0.0  1.401886  1.611220   
2024-10-31 09:24:00+09:00  39105.500000     0.0  1.404797  1.600627   

                  

In [107]:
# 終値の1日後の値を追加
data['Next_Close'] = data['Close'].shift(-1)

# 終値が上がるか下がるかのラベルを作成
#　Tureなら1、Falseなら0
data.loc[:, 'Price_Up'] = (data['Next_Close'] > data['Close']).astype(int)

# 最後の行は次の値がないため削除
data = data.dropna()

print(data[[ 'Close', 'Next_Close', 'Price_Up']].head())

                              Close  Next_Close  Price_Up
Datetime                                                 
2024-10-31 09:20:00+09:00  1.375354    1.325588         0
2024-10-31 09:21:00+09:00  1.325588    1.397648         1
2024-10-31 09:22:00+09:00  1.397648    1.461233         1
2024-10-31 09:23:00+09:00  1.461233    1.464164         1
2024-10-31 09:24:00+09:00  1.464164    1.480805         1


In [117]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

columns_to_scale = ['Open', 'High', 'Low', 'Close', 'Volume', 'SMA_5', 'SMA_20', 'SMA_diff_5_20', 
                    'BB_upper', 'BB_lower', 'BB_width', 'Volatility_20', 'ATR_14', 'RSI_14', 
                    'MACD', 'MACD_signal', 'MACD_histogram', '%K', '%D', 'SMA_slope_20', 
                    'Support_Distance', 'Resistance_Distance', 'Price_Change_5', 'Price_Change_20', 
                    'Instant_Volatility', 'Envelope_upper', 'Envelope_lower', 'Plus_DI', 
                    'Minus_DI', 'DX', 'ADX', 'MA_deviation_20', 'Williams_R', 'Close_to_Open', 
                    'Close_Change', 'Close_Pct_Change', 'Previous_Ratio']

# 特徴量とラベルの定義
X = data[columns_to_scale]  # 標準化した特徴量
y = data['Price_Up']

# データの分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ランダムフォレストの学習と予測
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# 正解率の計算
accuracy = accuracy_score(y_test, y_pred)
print(f'ランダムフォレストのAccuracy: {accuracy:.3f}')

ランダムフォレストのAccuracy: 0.517


In [81]:
# 欠損値がある行を削除
data.dropna(inplace=True)
# 欠損値の確認
missing_values = data.isnull().sum()
print("欠損値の数:\n", missing_values)

欠損値の数:
 Open                   0
High                   0
Low                    0
Close                  0
Adj Close              0
Volume                 0
SMA_5                  0
SMA_20                 0
SMA_diff_5_20          0
BB_upper               0
BB_lower               0
BB_width               0
Volatility_20          0
TR                     0
ATR_14                 0
RSI_14                 0
EMA_12                 0
EMA_26                 0
MACD                   0
MACD_signal            0
MACD_histogram         0
L14                    0
H14                    0
%K                     0
%D                     0
SMA_slope_20           0
Support_Distance       0
Resistance_Distance    0
Price_Change_5         0
Price_Change_20        0
Instant_Volatility     0
Envelope_upper         0
Envelope_lower         0
Plus_DM                0
Minus_DM               0
Plus_DI                0
Minus_DI               0
DX                     0
ADX                    0
MA_deviation_20  

In [86]:
from sklearn.preprocessing import StandardScaler

# 標準化対象の列を選択
columns_to_scale = ['Open', 'High', 'Low', 'Close', 'Volume', 'SMA_5', 'SMA_20', 'SMA_diff_5_20', 
                    'BB_upper', 'BB_lower', 'BB_width', 'Volatility_20', 'ATR_14', 'RSI_14', 
                    'MACD', 'MACD_signal', 'MACD_histogram', '%K', '%D', 'SMA_slope_20', 
                    'Support_Distance', 'Resistance_Distance', 'Price_Change_5', 'Price_Change_20', 
                    'Instant_Volatility', 'Envelope_upper', 'Envelope_lower', 'Plus_DI', 
                    'Minus_DI', 'DX', 'ADX', 'MA_deviation_20', 'Williams_R', 'Close_to_Open', 
                    'Close_Change', 'Close_Pct_Change', 'Previous_Ratio']

# 外れ値の検出と削除
outliers = {}
for column in columns_to_scale:
    outlier_condition = (data[column] < data[column].mean() - 3 * data[column].std()) | (data[column] > data[column].mean() + 3 * data[column].std())
    outliers[column] = outlier_condition.sum()

print("各特徴量の外れ値の数:")
for column, count in outliers.items():
    print(f"{column}: {count} 個")

# 外れ値を含む行を削除
data_cleaned = data[~(data[columns_to_scale] < (data[columns_to_scale].mean() - 3 * data[columns_to_scale].std())).any(axis=1) &
                    ~(data[columns_to_scale] > (data[columns_to_scale].mean() + 3 * data[columns_to_scale].std())).any(axis=1)].copy()

# 標準化
scaler = StandardScaler()
data_cleaned.loc[:, columns_to_scale] = scaler.fit_transform(data_cleaned[columns_to_scale])

# 標準化後のデータを表示（確認用）
print("\n外れ値を削除し標準化した後のデータ:")
print(data_cleaned[columns_to_scale].head())


各特徴量の外れ値の数:
Open: 0 個
High: 0 個
Low: 0 個
Close: 0 個
Volume: 0 個
SMA_5: 0 個
SMA_20: 0 個
SMA_diff_5_20: 5 個
BB_upper: 0 個
BB_lower: 4 個
BB_width: 3 個
Volatility_20: 3 個
ATR_14: 2 個
RSI_14: 1 個
MACD: 0 個
MACD_signal: 3 個
MACD_histogram: 6 個
%K: 0 個
%D: 0 個
SMA_slope_20: 2 個
Support_Distance: 9 個
Resistance_Distance: 4 個
Price_Change_5: 6 個
Price_Change_20: 7 個
Instant_Volatility: 10 個
Envelope_upper: 0 個
Envelope_lower: 0 個
Plus_DI: 1 個
Minus_DI: 1 個
DX: 3 個
ADX: 4 個
MA_deviation_20: 1 個
Williams_R: 0 個
Close_to_Open: 2 個
Close_Change: 2 個
Close_Pct_Change: 6 個
Previous_Ratio: 6 個

外れ値を削除し標準化した後のデータ:
                               Open      High       Low     Close  Volume  \
Datetime                                                                    
2024-11-01 09:29:00+09:00  1.595656  1.568150  1.628792  1.561080     0.0   
2024-11-01 09:30:00+09:00  1.543105  1.452261  1.324770  1.284826     0.0   
2024-11-01 09:31:00+09:00  1.265087  1.681452  1.396298  1.797045     0.0   
2024-11-01

In [87]:
# Xとyを結合してデータフレームにし、欠損値を確認
df_combined = data_cleaned[columns_to_scale].copy()
df_combined['Price_Up'] = data['Price_Up']

# 欠損値を含む行を削除して、再度Xとyに分割
df_combined.dropna(inplace=True)
X = df_combined[columns_to_scale]
y = df_combined['Price_Up']


In [88]:
# Xとyを結合してデータフレームを作成し、欠損値のある行を削除
df_combined = X.copy()
df_combined['Price_Up'] = y

# 欠損値のある行を削除
df_combined.dropna(inplace=True)

# 特徴量Xとターゲットyに再度分割
X = df_combined.drop(columns=['Price_Up'])
y = df_combined['Price_Up']


In [89]:
print(f'特徴量Xのサンプル数: {len(X)}')
print(f'ターゲットyのサンプル数: {len(y)}')


特徴量Xのサンプル数: 228
ターゲットyのサンプル数: 228
