In [None]:
import pandas as pd
import mplfinance as mpf
import ta
import matplotlib.pyplot as plt
from foregues.dataset import ForeguesDataset
import numpy as np
import seaborn as sns
import japanize_matplotlib

In [None]:
# Load the raw bars from a semicolon-separated file that has no header row.
# Column names are supplied here and the parsed 'timestamp' column becomes the
# index. (The path suggests 2023 EUR/USD one-minute data.)
data = pd.read_csv(
    'data/histdata/HISTDATA_COM_ASCII_EURUSD_M1_2023/DAT_ASCII_EURUSD_M1_2023.csv',
    sep=';',
    header=None,
    names=['timestamp', 'open', 'high', 'low', 'close', 'volume'],
    index_col='timestamp',
    parse_dates=['timestamp'],
    date_format='%Y%m%d %H%M%S',
)

In [None]:
# Resample to the chosen bar size (5 minutes): first open, highest high,
# lowest low, last close and summed volume per bucket. Rows that still
# contain a missing value afterwards are dropped.
resampled_data = (
    data
    .resample('5min')
    .agg(dict(open='first', high='max', low='min', close='last', volume='sum'))
    .dropna()
)

print(f"リサンプリング後のデータサイズ: {len(resampled_data)}")

In [None]:
def calculate_label_distribution(data, th1, th2, prediction_period=288):
    """Compute the label distribution produced by one (th1, th2) threshold pair.

    For row ``i`` the reference price is the close of row ``i - 1`` and the
    look-ahead window is rows ``i + 1`` .. ``i + prediction_period`` (row ``i``
    itself is in neither).  With ``future_high`` / ``future_low`` being the
    highest high / lowest low of that window, the label is:

    * ``1`` (buy):  ``future_low > ref - th1`` and ``future_high >= ref + th2``
    * ``2`` (sell): ``future_high < ref + th1`` and ``future_low <= ref - th2``
    * ``0`` (do nothing): anything else (buy is checked first)

    Row 0 (no previous bar) and the trailing rows without a complete
    look-ahead window are not labelled and do not enter the distribution.

    Args:
        data: DataFrame with ``'close'``, ``'high'`` and ``'low'`` columns,
            processed positionally in row order.
        th1: Adverse-move tolerance, in price units.
        th2: Favourable-move target, in price units.
        prediction_period: Number of rows in the look-ahead window.

    Returns:
        ``None`` when no row can be labelled; otherwise a dict with
        ``'counts'`` (array of at least 3 entries: do-nothing, buy, sell),
        ``'percentages'`` (counts as percent of total), ``'total'`` (number of
        labelled rows) and ``'balance_score'``
        (``calculate_balance_score(counts)``).

    Raises:
        ValueError: If ``prediction_period`` is smaller than 1.
    """
    if prediction_period < 1:
        raise ValueError("prediction_period must be at least 1")

    n_rows = len(data)
    # Labelled rows are first_row <= i < stop_row.
    # NOTE(review): stop_row reproduces the original `i + 1 + period >= len`
    # cut-off, which also leaves out the last row whose window would still be
    # complete.  Kept as is so the counts do not change -- confirm intent.
    first_row = 1
    stop_row = n_rows - 1 - prediction_period
    if stop_row <= first_row:
        return None

    closes = data['close'].to_numpy()
    # A rolling window ending at row j covers rows j - period + 1 .. j, so the
    # look-ahead extremes for row i are the rolling values at row i + period.
    # min_periods=1 makes missing values be skipped, like Series.max()/min().
    rolling_high = data['high'].rolling(prediction_period, min_periods=1).max().to_numpy()
    rolling_low = data['low'].rolling(prediction_period, min_periods=1).min().to_numpy()

    prev_close = closes[first_row - 1:stop_row - 1]
    future_high = rolling_high[first_row + prediction_period:stop_row + prediction_period]
    future_low = rolling_low[first_row + prediction_period:stop_row + prediction_period]

    is_buy = (future_low > prev_close - th1) & (future_high >= prev_close + th2)
    is_sell = (future_high < prev_close + th1) & (future_low <= prev_close - th2)
    # np.select takes the first matching condition, so buy wins over sell
    # exactly as in an if/elif chain.
    labels = np.select([is_buy, is_sell], [1, 2], default=0)

    label_counts = np.bincount(labels, minlength=3)
    total = int(labels.size)

    return {
        'counts': label_counts,
        'percentages': label_counts / total * 100,
        'total': total,
        'balance_score': calculate_balance_score(label_counts)
    }

def calculate_balance_score(counts):
    """Score how evenly samples are spread across classes (1.0 = perfectly even).

    The score is ``exp(-KL(uniform || observed))``, where ``observed`` are the
    class proportions (each floored at 1e-8 so empty classes do not produce a
    division by zero) and ``uniform`` assigns ``1 / n_classes`` to every class.
    The number of classes is taken from ``len(counts)``.

    Args:
        counts: 1-D sequence or array of per-class sample counts.

    Returns:
        A value in (0, 1]; ``0.0`` when there are no samples at all.
    """
    counts = np.asarray(counts, dtype=float)
    total = counts.sum()
    if counts.size == 0 or total == 0:
        return 0.0

    # Floor the observed proportions so log() and the division stay finite.
    epsilon = 1e-8
    observed = np.maximum(counts / total, epsilon)
    # Ideal distribution: every class equally frequent.
    uniform = np.full(counts.size, 1.0 / counts.size)
    kl_div = np.sum(uniform * np.log(uniform / observed))
    return np.exp(-kl_div)

# Threshold candidates over a wide range, written in price units.
# Pip conversion: 1 pip = 0.0001, 5 pips = 0.0005, 10 pips = 0.001.
th1_candidates = np.array([
    0.0001, 0.0002, 0.0003, 0.0004,   # 1-4 pips
    0.0005, 0.0006, 0.0007, 0.0008,   # 5-8 pips
    0.0009, 0.001, 0.0012, 0.0015,    # 9-15 pips
    0.002, 0.0025, 0.003, 0.004,      # 20-40 pips
    0.005, 0.006, 0.008, 0.01,        # 50-100 pips
])

th2_candidates = np.array([
    0.0003, 0.0005, 0.0008,   # 3-8 pips
    0.001, 0.0012, 0.0015,    # 10-15 pips
    0.002, 0.0025, 0.003,     # 20-30 pips
    0.004, 0.005, 0.006,      # 40-60 pips
    0.008, 0.01, 0.012,       # 80-120 pips
    0.015, 0.02, 0.025,       # 150-250 pips
    0.03, 0.04, 0.05,         # 300-500 pips
])

print(f"th1候補数: {len(th1_candidates)}")
print(f"th2候補数: {len(th2_candidates)}")
print(f"総組み合わせ数: {len(th1_candidates) * len(th2_candidates)}")

In [None]:
# Run the threshold grid search: evaluate every (th1, th2) pair with
# th2 > th1 and remember the pair with the highest balance score.
results = []
best_balance_score = 0
best_params = None

print("閾値探索を開始...")
total_combinations = len(th1_candidates) * len(th2_candidates)
current_combination = 0

for th1 in th1_candidates:
    for th2 in th2_candidates:
        current_combination += 1

        # Report progress before any `continue`, so a milestone is not lost
        # when the pair that lands on it is skipped by the constraint below.
        if current_combination % 50 == 0:
            print(f"進捗: {current_combination}/{total_combinations} ({current_combination/total_combinations*100:.1f}%)")

        # Constraint: th2 must be strictly greater than th1
        if th2 <= th1:
            continue

        # Compute the label distribution for this pair
        dist = calculate_label_distribution(resampled_data, th1, th2)

        if dist is None:
            continue

        result = {
            'th1': th1,
            'th2': th2,
            # Pip values are rounded so that binary floating-point residue
            # from the multiplication does not end up in table/axis labels.
            'th1_pips': round(th1 * 10000, 6),
            'th2_pips': round(th2 * 10000, 6),
            'count_0': dist['counts'][0],  # do nothing
            'count_1': dist['counts'][1],  # buy
            'count_2': dist['counts'][2],  # sell
            'pct_0': dist['percentages'][0],
            'pct_1': dist['percentages'][1],
            'pct_2': dist['percentages'][2],
            'total': dist['total'],
            'balance_score': dist['balance_score']
        }

        results.append(result)

        # Track the best score seen so far
        if dist['balance_score'] > best_balance_score:
            best_balance_score = dist['balance_score']
            best_params = (th1, th2)

print(f"\n探索完了！有効な組み合わせ数: {len(results)}")
if best_params is not None:
    print(f"最高バランススコア: {best_balance_score:.4f}")
    print(f"最適パラメータ: th1={best_params[0]} ({best_params[0]*10000:.1f}pips), th2={best_params[1]} ({best_params[1]*10000:.1f}pips)")

In [None]:
# Turn the list of per-pair result dicts into a DataFrame for analysis.
results_df = pd.DataFrame(results)

# Keep the 20 rows with the highest balance score.
top_balanced = results_df.nlargest(n=20, columns='balance_score')

print("バランススコア上位20位:")
# Show thresholds in pips, then class counts, class percentages and the score.
summary_columns = [
    'th1_pips', 'th2_pips',
    'count_0', 'count_1', 'count_2',
    'pct_0', 'pct_1', 'pct_2',
    'balance_score',
]
display(top_balanced[summary_columns].round(3))

In [None]:
# Visualize the search results in three side-by-side panels.
fig, axes = plt.subplots(1, 3, figsize=(20, 5))

# 1. Heatmap of the balance score over the (th1, th2) grid
pivot_balance = results_df.pivot_table(
    values='balance_score', 
    index='th1_pips', 
    columns='th2_pips', 
    aggfunc='mean'
)

sns.heatmap(pivot_balance, ax=axes[0], cmap='viridis', cbar_kws={'label': 'Balance Score'})
axes[0].set_title('バランススコア (閾値別)')
axes[0].set_xlabel('th2 (pips)')
axes[0].set_ylabel('th1 (pips)')

# 2. Class distribution of the 10 best-balanced parameter sets
top_10 = results_df.nlargest(10, 'balance_score')
x_pos = np.arange(len(top_10))
width = 0.25

# One bar group per parameter set: do-nothing / buy / sell side by side
axes[1].bar(x_pos - width, top_10['pct_0'], width, label='何もしない', alpha=0.8)
axes[1].bar(x_pos, top_10['pct_1'], width, label='買い', alpha=0.8)
axes[1].bar(x_pos + width, top_10['pct_2'], width, label='売り', alpha=0.8)

axes[1].set_xlabel('上位10パラメータセット')
axes[1].set_ylabel('クラス割合 (%)')
axes[1].set_title('バランス上位10パラメータのクラス分布')
axes[1].legend()
axes[1].set_xticks(x_pos)
# Tick labels read "th1/th2" in pips
axes[1].set_xticklabels([f"{row['th1_pips']:.1f}/{row['th2_pips']:.1f}" 
                          for _, row in top_10.iterrows()], rotation=45)

# 3. Threshold ratio vs. balance score (shown up to the 75th percentile of the ratio)
results_df['th_ratio'] = results_df['th2'] / results_df['th1']

# Compute the 75th percentile of the ratio
ratio_75th_percentile = results_df['th_ratio'].quantile(0.75)
# The message names the 75th percentile to match the 0.75 quantile above.
print(f"閾値比率の75パーセンタイル: {ratio_75th_percentile:.2f}")

# Plot only the rows at or below the 75th percentile
filtered_data = results_df[results_df['th_ratio'] <= ratio_75th_percentile]

axes[2].scatter(filtered_data['th_ratio'], filtered_data['balance_score'], alpha=0.6)
axes[2].set_xlabel('th2/th1 比率')
axes[2].set_ylabel('バランススコア')
axes[2].set_title(f'閾値比率とバランススコアの関係（75パーセンタイル={ratio_75th_percentile:.1f}まで）')
axes[2].set_xlim(0, ratio_75th_percentile * 1.05)  # leave a little headroom

plt.tight_layout()
plt.show()