In [None]:
!pip install yfinance pandas numpy tqdm

In [1]:
# ============================================
# セル1: ライブラリと日経225完全リスト
# ============================================

import yfinance as yf
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import os
import warnings
warnings.filterwarnings('ignore')

# 日経225構成銘柄 完全リスト（2024年1月時点）
NIKKEI_225_STOCKS = [
    # 水産・農林業 (2)
    '1332', '1333',
    # 鉱業 (1)
    '1605',
    # 建設業 (9)
    '1721', '1801', '1802', '1803', '1808', '1812', '1925', '1928', '1963',
    # 食料品 (11)
    '2002', '2269', '2282', '2501', '2502', '2503', '2531', '2801', '2802', '2871', '2914',
    # 繊維製品 (4)
    '3101', '3103', '3401', '3402',
    # パルプ・紙 (2)
    '3861', '3863',
    # 化学 (18)
    '3405', '3407', '4004', '4005', '4021', '4042', '4043', '4061', '4063', '4183',
    '4188', '4208', '4452', '4631', '4901', '4911', '6988', '8113',
    # 医薬品 (9)
    '4151', '4502', '4503', '4506', '4507', '4519', '4523', '4568', '4578',
    # 石油・石炭製品 (2)
    '5019', '5020',
    # ゴム製品 (2)
    '5101', '5108',
    # ガラス・土石製品 (8)
    '5201', '5202', '5214', '5232', '5233', '5301', '5332', '5333',
    # 鉄鋼 (4)
    '5401', '5406', '5411', '5541',
    # 非鉄金属 (11)
    '5703', '5706', '5707', '5711', '5713', '5714', '5715', '5801', '5802', '5803', '3436',
    # 金属製品 (1)
    '5901',
    # 機械 (15)
    '6103', '6113', '6301', '6302', '6305', '6326', '6361', '6367', '6471', '6472',
    '6473', '7004', '7011', '7012', '7013',
    # 電気機器 (29)
    '4902', '6479', '6501', '6502', '6503', '6504', '6506', '6508', '6645', '6674',
    '6701', '6702', '6703', '6724', '6752', '6753', '6758', '6762', '6767', '6770',
    '6841', '6857', '6902', '6952', '6954', '6971', '6976', '7735', '7751', '7752',
    # 輸送用機器 (12)
    '7003', '7201', '7202', '7203', '7205', '7211', '7261', '7267', '7269', '7270',
    '7272', '7313',
    # 精密機器 (5)
    '4543', '7731', '7733', '7741', '7762',
    # その他製品 (4)
    '7832', '7911', '7912', '7951',
    # 電気・ガス業 (10)
    '9501', '9502', '9503', '9504', '9506', '9507', '9508', '9509', '9531', '9532',
    # 陸運業 (10)
    '9001', '9005', '9007', '9008', '9009', '9020', '9021', '9022', '9062', '9064',
    # 海運業 (3)
    '9101', '9104', '9107',
    # 空運業 (2)
    '9201', '9202',
    # 倉庫・運輸関連業 (1)
    '9301',
    # 情報・通信業 (9)
    '3659', '4324', '4689', '4704', '4755', '9432', '9433', '9434', '9613',
    # 卸売業 (7)
    '2768', '8001', '8002', '8015', '8031', '8053', '8058',
    # 小売業 (8)
    '3086', '3099', '3382', '7453', '8233', '8267', '8270', '9983',
    # 銀行業 (7)
    '7186', '8304', '8306', '8308', '8309', '8316', '8411',
    # 証券・商品先物取引業 (2)
    '8601', '8604',
    # 保険業 (5)
    '8630', '8725', '8750', '8766', '8795',
    # その他金融業 (3)
    '8253', '8591', '8697',
    # 不動産業 (5)
    '8801', '8802', '8803', '8804', '8830',
    # サービス業 (11)
    '2413', '4661', '4732', '6098', '6178', '9602', '9681', '9735', '9766', '9984', '2432'
]

# リストの検証
NIKKEI_225_STOCKS = list(set(NIKKEI_225_STOCKS))[:225]
print(f"日経225銘柄数: {len(NIKKEI_225_STOCKS)}")

# パラメータ設定
START_DATE = "2000-01-01"
END_DATE = "2020-12-31"
UP_THRESHOLD = 0.1
DOWN_THRESHOLD = -0.1
OUTPUT_DIR = 'nikkei225_data'
GNMINER_DIR = 'nikkei225_data/gnminer_individual'

日経225銘柄数: 225


In [None]:
# ============================================
# セル2: データ取得と変化率計算
# ============================================

def download_and_process_data():
    """データ取得から変化率計算まで一括処理"""
    
    # ディレクトリ作成
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    os.makedirs(GNMINER_DIR, exist_ok=True)
    
    # データ取得
    print("日経225銘柄データをダウンロード中...")
    all_data = {}
    failed = []
    
    for code in tqdm(NIKKEI_225_STOCKS, desc="取得進捗"):
        try:
            ticker = f"{code}.T"
            stock = yf.Ticker(ticker)
            df = stock.history(start=START_DATE, end=END_DATE)
            if not df.empty:
                all_data[code] = df['Close']
            else:
                failed.append(code)
        except:
            failed.append(code)
        time.sleep(0.1)
    
    if failed:
        print(f"警告: {len(failed)}銘柄の取得失敗: {failed[:5]}...")
    
    # DataFrame化
    raw_df = pd.DataFrame(all_data)
    raw_df = raw_df.fillna(method='ffill').fillna(method='bfill')
    raw_df = raw_df.reset_index()
    raw_df = raw_df.rename(columns={'Date': 'T'})
    
    # 生データ保存（Tはそのまま）
    raw_df.to_csv(f'{OUTPUT_DIR}/nikkei225_raw.csv', index=False)
    print(f"✓ 生データ保存: {OUTPUT_DIR}/nikkei225_raw.csv ({raw_df.shape})")
    
    # 変化率計算用のDataFrame（Tを文字列形式で保持）
    returns_df = pd.DataFrame()
    returns_df['T'] = pd.to_datetime(raw_df['T']).dt.strftime('%Y-%m-%d')[1:]
    
    for col in raw_df.columns:
        if col != 'T':
            returns = raw_df[col].pct_change() * 100
            returns_df[col] = returns.iloc[1:].values
    
    return returns_df, list(all_data.keys())

# 実行
returns_df, stock_codes = download_and_process_data()
print(f"変化率計算完了: {returns_df.shape}")

日経225銘柄データをダウンロード中...


取得進捗:   6%|▌         | 14/225 [00:15<03:31,  1.00s/it]$9681.T: possibly delisted; no timezone found
取得進捗:  15%|█▌        | 34/225 [00:35<02:57,  1.07it/s]$8270.T: possibly delisted; no price data found  (1d 2000-01-01 -> 2020-12-31)
取得進捗:  31%|███       | 70/225 [01:08<02:25,  1.06it/s]Exception ignored from cffi callback <function buffer_callback at 0x113c90c20>:
Traceback (most recent call last):
  File "/Users/suzukiyasuhiro/Desktop/study/ts-itemsbs/.venv/lib/python3.12/site-packages/curl_cffi/curl.py", line 100, in buffer_callback
    @ffi.def_extern()
    
KeyboardInterrupt: 
$5232.T: possibly delisted; no price data found  (1d 2000-01-01 -> 2020-12-31)
取得進捗:  36%|███▌      | 81/225 [01:20<02:32,  1.06s/it]$8750.T: possibly delisted; no price data found  (1d 2000-01-01 -> 2020-12-31)
取得進捗:  36%|███▋      | 82/225 [06:24<3:39:14, 91.99s/it]Failed to get ticker '8591.T' reason: Failed to perform, curl: (28) Operation timed out after 10002 milliseconds with 0 bytes received. See http

In [37]:
# ============================================
# セル3: 個別銘柄用GNMinerファイル生成
# ============================================

def create_all_gnminer_files(returns_df, stock_codes):
    """全銘柄のGNMinerファイルを一括生成"""
    
    print(f"\n個別銘柄用ファイル生成中（全{len(stock_codes)}ファイル）...")
    
    for target_code in tqdm(stock_codes, desc="生成進捗"):
        # 他銘柄を特定
        other_codes = [c for c in stock_codes if c != target_code]
        
        # 結果DataFrame初期化
        result = pd.DataFrame()
        
        # 各銘柄の3値分類を追加
        for code in other_codes:
            if code in returns_df.columns:
                returns_series = returns_df[code]
                
                # 3値分類（One-hot）
                result[f'{code}_Up'] = (returns_series > UP_THRESHOLD).astype(int)
                result[f'{code}_Stay'] = ((returns_series >= DOWN_THRESHOLD) & 
                                         (returns_series <= UP_THRESHOLD)).astype(int)
                result[f'{code}_Down'] = (returns_series < DOWN_THRESHOLD).astype(int)
        
        # 目標銘柄の変化率を追加
        if target_code in returns_df.columns:
            result['X'] = returns_df[target_code].round(2)
        
        # 時刻Tを最後の列に追加
        result['T'] = returns_df['timestamp']
        
        # ファイル保存
        result.to_csv(f'{GNMINER_DIR}/{target_code}.txt', index=False)
    
    print(f"✓ 全ファイル生成完了: {GNMINER_DIR}/")
    
    # サンプル検証
    sample_code = stock_codes[0]
    sample_df = pd.read_csv(f'{GNMINER_DIR}/{sample_code}.txt')
    
    print(f"\n【生成ファイル例】 {sample_code}.txt")
    print(f"  T形式: {sample_df['T'].iloc[0]}")
    print(f"  レコード数: {len(sample_df)}")
    print(f"  属性数: {len(sample_df.columns)-2} (他銘柄×3値)")
    print(f"  Target統計: mean={sample_df['X'].mean():.2f}, "
          f"std={sample_df['X'].std():.2f}")
    
    # 3値分類の分布
    up_count = sample_df[[c for c in sample_df.columns if c.endswith('_Up')]].sum().sum()
    stay_count = sample_df[[c for c in sample_df.columns if c.endswith('_Stay')]].sum().sum()
    down_count = sample_df[[c for c in sample_df.columns if c.endswith('_Down')]].sum().sum()
    total = up_count + stay_count + down_count
    
    print(f"  3値分布: Up={up_count/total:.1%}, Stay={stay_count/total:.1%}, "
          f"Down={down_count/total:.1%}")
    
    return len(sample_df.columns) - 2

# 実行
num_attributes = create_all_gnminer_files(returns_df, stock_codes)


個別銘柄用ファイル生成中（全220ファイル）...


生成進捗: 100%|██████████| 220/220 [01:28<00:00,  2.48it/s]

✓ 全ファイル生成完了: nikkei225_data/gnminer_individual/

【生成ファイル例】 3402.txt
  T形式: 2000-01-04
  レコード数: 5268
  属性数: 657 (他銘柄×3値)
  Target統計: mean=0.03, std=1.95
  3値分布: Up=34.9%, Stay=30.2%, Down=34.9%





In [None]:
# ============================================
# セル4: 最終サマリーとGNMiner設定情報
# ============================================

print("\n" + "="*60)
print("処理完了サマリー")
print("="*60)

print(f"\n【生成データ】")
print(f"期間: {START_DATE} ~ {END_DATE}")
print(f"銘柄数: {len(stock_codes)}")
print(f"レコード数: {len(returns_df)}")

print(f"\n【ファイル構成】")
print(f"├── nikkei225_raw.csv (生データ、Tはタイムゾーン付き)")
print(f"└── gnminer_individual/")
print(f"    ├── {stock_codes[0]}.txt (T: YYYY-MM-DD)")
print(f"    ├── {stock_codes[1]}.txt (T: YYYY-MM-DD)")
print(f"    └── ... ({len(stock_codes)}ファイル)")

print(f"\n【GNMiner設定】")
print(f"#define DATANAME \"gnminer_individual/7203.txt\"")
print(f"#define Nzk {num_attributes}")
print(f"#define Nrd {len(returns_df)}")
print(f"#define Minsup 0.04")
print(f"#define UP_THRESHOLD {UP_THRESHOLD}")
print(f"#define DOWN_THRESHOLD {DOWN_THRESHOLD}")

print(f"\n処理完了！")


処理完了サマリー

【生成データ】
期間: 2000-01-01 ~ 2020-12-31
銘柄数: 220
レコード数: 5268

【ファイル構成】
├── nikkei225_raw.csv (生データ)
└── gnminer_individual/
    ├── 3402.txt
    ├── 9001.txt
    └── ... (220ファイル)

【GNMiner設定】
#define DATANAME "gnminer_individual/7203.txt"
#define Nzk 657
#define Nrd 5268
#define Minsup 0.04
#define UP_THRESHOLD 0.5
#define DOWN_THRESHOLD -0.5

処理完了！
