In [None]:
import requests
from datetime import datetime, timedelta
import pandas as pd
from dotenv import load_dotenv
import os
from tqdm import tqdm
import zipfile
from pathlib import Path

In [None]:
# Call python-dotenv's loader first (expected to populate the process
# environment from a local .env file), then read the EDINET API key.
# Indexing os.environ directly means a missing key fails fast with KeyError.
load_dotenv()
_subscription_key_var = 'EDINET_SUBSCRIPTION_KEY'
edinet_subscription_key = os.environ[_subscription_key_var]

In [None]:
# EDINET API v2 endpoint queried once per calendar day for that day's documents.
url = "https://api.edinet-fsa.go.jp/api/v2/documents.json"

# Look-back window: every day from 365 days ago through today, inclusive.
end_date = datetime.now()
start_date = end_date - timedelta(days=365)

date_range = [
    start_date + timedelta(days=offset)
    for offset in range((end_date - start_date).days + 1)
]

# Flat list of document records across all days; each record is tagged with
# the date it was requested for.
all_documents = []

# One session for the whole loop so the HTTP connection is reused across the
# several hundred sequential requests.
with requests.Session() as session:
    for current_date in tqdm(date_range, desc='データ取得中', unit='日'):
        date_str = current_date.strftime("%Y-%m-%d")

        # Request parameters.
        params = {
            "date": date_str,  # the day being queried (not necessarily today)
            'type': 2,  # NOTE(review): presumably "metadata + document list" - confirm against the API spec
            "Subscription-Key": edinet_subscription_key,
        }

        # A timeout keeps a single stalled connection from hanging the whole
        # run; 30 s matches the per-document download timeout used later.
        response = session.get(url, params=params, timeout=30)
        response.raise_for_status()

        data = response.json()
        results = data.get('results')
        if results is None:
            # Fail fast, but with the API's own payload in the message instead
            # of a bare KeyError, so the cause is visible.
            raise RuntimeError(
                f"EDINET response for {date_str} has no 'results' field: {data!r}"
            )

        # Tag every record with the date it was listed under.
        for document in results:
            document['date'] = date_str

        all_documents.extend(results)

In [None]:
# Tabulate every fetched record, then keep only the rows that satisfy all
# three conditions below and export them to an Excel sheet.
all_documents_df = pd.DataFrame(all_documents)

_has_sec_code = all_documents_df['secCode'].notna()       # securities code present
_is_doc_type_120 = all_documents_df['docTypeCode'] == '120'
_is_form_030000 = all_documents_df['formCode'] == '030000'

target_documents_df = all_documents_df[_has_sec_code & _is_doc_type_120 & _is_form_030000]
target_documents_df.to_excel('document_list.xlsx', index=False)

In [None]:
# Directory (relative to the working directory) that receives the downloaded
# documents. Creating it is idempotent: an existing directory is left alone.
save_path = Path('documents')
save_path.mkdir(exist_ok=True, parents=True)

In [None]:
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
import requests
import zipfile
from pathlib import Path
from tqdm import tqdm
import time

def download_and_extract(doc_id, type_, save_path, edinet_subscription_key, max_retries=3):
    """
    Download one EDINET document archive and unpack it.

    The response body is written to ``<save_path>/<doc_id>_type<type_>.zip``,
    extracted into ``<save_path>/<doc_id>_type<type_>/``, and the temporary
    zip file is removed again whether or not extraction succeeded.

    Args:
        doc_id: Document ID, inserted into the request URL.
        type_: Value sent as the ``type`` query parameter.
        save_path: ``pathlib.Path`` of the directory to write into.
        edinet_subscription_key: Value sent as the ``Subscription-Key``
            query parameter.
        max_retries: Maximum number of attempts. Values below 1 make no
            request at all and yield a failure result.

    Returns:
        A tuple ``(doc_id, type_, succeeded, error)`` where ``error`` is
        ``None`` on success and the message of the last exception otherwise.
        Exceptions are never propagated to the caller.
    """
    url = f'https://api.edinet-fsa.go.jp/api/v2/documents/{doc_id}'
    params = {
        'type': type_,
        'Subscription-Key': edinet_subscription_key
    }

    # Reported when the loop below never runs (max_retries < 1), so the caller
    # always receives a well-formed tuple rather than None.
    last_error = f'no download attempted (max_retries={max_retries})'

    for attempt in range(max_retries):
        try:
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()

            zip_filename = save_path / f'{doc_id}_type{type_}.zip'
            extract_dir = save_path / f'{doc_id}_type{type_}'
            try:
                with open(zip_filename, 'wb') as f:
                    f.write(response.content)

                with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
                    zip_ref.extractall(extract_dir)
            finally:
                # Always drop the temporary archive, even when the body was
                # not a valid zip, so failed attempts leave no stray files.
                if zip_filename.exists():
                    zip_filename.unlink()

            return (doc_id, type_, True, None)

        except Exception as e:
            last_error = str(e)

        # Short pause before the next attempt; skipped after the final one.
        if attempt < max_retries - 1:
            time.sleep(1)

    return (doc_id, type_, False, last_error)

def process_documents_parallel(target_documents_df, save_path, edinet_subscription_key, max_workers=10, doc_types=(1, 5)):
    """
    Download and extract many documents concurrently.

    One task is scheduled per (document ID, type) pair and run through
    ``download_and_extract`` on a thread pool, with a progress bar that
    advances as each task finishes. Failures are printed as they occur.

    Args:
        target_documents_df: Object whose ``'docID'`` entry iterates over the
            document IDs to fetch (e.g. a DataFrame with a ``docID`` column).
        save_path: Destination directory, forwarded to ``download_and_extract``.
        edinet_subscription_key: API key, forwarded to ``download_and_extract``.
        max_workers: Size of the thread pool.
        doc_types: ``type`` values to request for every document ID.
            Defaults to ``(1, 5)``.

    Returns:
        ``(results, failed)``: two lists of ``(doc_id, type_, succeeded, error)``
        tuples, for the successful and the failed tasks respectively, in
        completion order.
    """
    # De-duplicate IDs (keeping first-seen order): two concurrent tasks for
    # the same (doc_id, type) would write to the same zip/extract paths.
    doc_ids = list(dict.fromkeys(target_documents_df['docID']))
    doc_types = tuple(doc_types)
    tasks = [(doc_id, type_) for doc_id in doc_ids for type_ in doc_types]

    results = []
    failed = []

    with tqdm(total=len(tasks), desc='データ取得中', unit='件') as pbar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit everything up front; remember which task each future is.
            future_to_task = {
                executor.submit(
                    download_and_extract,
                    doc_id,
                    type_,
                    save_path,
                    edinet_subscription_key
                ): (doc_id, type_)
                for doc_id, type_ in tasks
            }

            # Handle tasks in the order they finish, not the order submitted.
            for future in concurrent.futures.as_completed(future_to_task):
                doc_id, type_ = future_to_task[future]
                try:
                    result = future.result()
                    if result[2]:  # success flag
                        results.append(result)
                    else:
                        failed.append(result)
                        print(f"\n失敗: {doc_id} (type={type_}): {result[3]}")
                except Exception as e:
                    # The worker raised, or returned something that is not a
                    # result tuple; record it as a failure and carry on.
                    failed.append((doc_id, type_, False, str(e)))
                    print(f"\n予期しないエラー: {doc_id} (type={type_}): {e}")
                finally:
                    pbar.update(1)

    return results, failed

# Example run: download every target document.
results, failed = process_documents_parallel(
    target_documents_df,
    save_path,
    edinet_subscription_key,
    max_workers=10  # concurrency level; tune to the API's rate limits
)

# Retry once if anything failed.
if failed:
    print(f"\n{len(failed)}件のダウンロードに失敗しました。")
    # Collect every document ID with at least one failed download, whatever
    # the type. Filtering on type 1 only would silently skip documents whose
    # sole failure was another type. process_documents_parallel fetches all
    # of its types for each ID, so these documents are re-downloaded in full.
    retry_ids = list(dict.fromkeys(f[0] for f in failed))
    retry_df = pd.DataFrame({'docID': retry_ids})
    if not retry_df.empty:
        print("再試行中...")
        retry_results, retry_failed = process_documents_parallel(
            retry_df,
            save_path,
            edinet_subscription_key,
            max_workers=5  # be gentler on the API when retrying
        )
