In [0]:
import json
import os
import threading
import time
import urllib.parse
from concurrent.futures import ThreadPoolExecutor

import requests
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Obtain the Spark session: getOrCreate() reuses an already-active session and
# only builds a new one when none exists. `spark` is used further down to
# create DataFrames and to read back the written tables.
spark = SparkSession.builder.getOrCreate()

# Banner printed once, at module execution time, before any extraction starts.
print("=== DUR INGREDIENT APIs - COMPLETE EXTRACTION (ALL 7 APIs) ===")

# Base configuration
#
# NOTE(security): a service key is committed in this file. Set the
# DUR_SERVICE_KEY environment variable to supply the key without editing the
# source; the embedded value is kept only as a backward-compatible fallback
# and should be rotated and removed from version control.
ENCODED_SERVICE_KEY = (
    os.environ.get("DUR_SERVICE_KEY")
    or "h9Dbf2cz0HOrqZb5BIqrfrti%2FD5zZLTYAxFpQuywAB7ZUx3yb67jBDuD5uNlHvAszz9c14NffOmMNQjGv5FzwA%3D%3D"
)
# The decoded form is what gets passed as the `serviceKey` query parameter
# (requests percent-encodes query parameters itself). unquote() only touches
# %XX escapes, so a key that is already decoded passes through unchanged --
# presumably keys are base64 text without '%'; verify if the format changes.
SERVICE_KEY = urllib.parse.unquote(ENCODED_SERVICE_KEY)
BASE_URL = "https://apis.data.go.kr/1471000/DURIrdntInfoService03"

# API Configuration for ALL 7 DUR Ingredient APIs
#
# One row per API, in api_id order:
#   (name, api_id, endpoint, expected_records, table_name, description)
_API_ROWS = (
    ("병용금기", 1, "getUsjntTabooInfoList02", 1587,
     "main.default.dur_ingredient_interaction_bronze",
     "Drug Interaction Contraindications"),
    ("임부금기", 2, "getPwnmTabooInfoList02", 1426,
     "main.default.dur_ingredient_pregnancy_bronze",
     "Pregnancy Contraindications"),
    ("용량주의", 3, "getCpctyAtentInfoList02", 706,
     "main.default.dur_ingredient_dosage_bronze",
     "Dosage Precautions"),
    ("투여기간주의", 4, "getMdctnPdAtentInfoList02", 98,
     "main.default.dur_ingredient_duration_bronze",
     "Administration Period Precautions"),
    ("노인주의", 5, "getOdsnAtentInfoList02", 112,
     "main.default.dur_ingredient_elderly_bronze",
     "Elderly Precautions"),
    ("특정연령대금기", 6, "getSpcifyAgrdeTabooInfoList02", 226,
     "main.default.dur_ingredient_age_bronze",
     "Specific Age Group Contraindications"),
    ("효능군중복", 7, "getEfcyDplctInfoList02", 404,
     "main.default.dur_ingredient_efficacy_bronze",
     "Efficacy Group Duplication"),
)

# Mapping of API name -> settings dict, built from the rows above so that the
# key order of both the outer and the inner dicts is fixed.
API_CONFIGS = {
    row_name: {
        "api_id": row_id,
        "endpoint": row_endpoint,
        "expected_records": row_expected,
        "table_name": row_table,
        "description": row_description,
    }
    for (row_name, row_id, row_endpoint, row_expected, row_table, row_description)
    in _API_ROWS
}

# Module-wide lock that serializes console output across worker threads
log_lock = threading.Lock()


def safe_print(message):
    """Print ``message`` while holding ``log_lock``.

    The lock is always released again, even if ``print`` raises.
    """
    log_lock.acquire()
    try:
        print(message)
    finally:
        log_lock.release()

def make_api_call(api_name, endpoint, page_no, num_rows=100):
    """Request one page from a DUR ingredient endpoint and return its JSON.

    Args:
        api_name: Label of the API; used only in the error log line.
        endpoint: Operation name appended to ``BASE_URL``.
        page_no: Page number, sent as the ``pageNo`` query parameter.
        num_rows: Page size, sent as the ``numOfRows`` query parameter.

    Returns:
        The object produced by ``response.json()`` for a successful call.

    Raises:
        RuntimeError: If the response ``header`` does not carry
            ``resultCode == "00"``.
        Exception: Any error raised by ``requests`` (connection, timeout,
            HTTP status) or by JSON decoding is logged and re-raised as is.
    """
    url = f"{BASE_URL}/{endpoint}"
    params = {
        "serviceKey": SERVICE_KEY,
        "pageNo": page_no,
        "numOfRows": num_rows,
        "type": "json"
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'application/json, */*',
        'Accept-Language': 'ko-KR,ko;q=0.9,en;q=0.8'
    }

    try:
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()

        data = response.json()

        # Check API response status
        header = data.get("header", {})
        if header.get("resultCode") != "00":
            raise RuntimeError(f"API Error: {header.get('resultMsg', 'Unknown error')}")

        return data

    except Exception as e:
        # requests puts the request URL -- query string included -- into many
        # of its error messages, so mask the service key (both its raw and its
        # percent-encoded form) before the message reaches the log.
        detail = str(e)
        for secret in (SERVICE_KEY, urllib.parse.quote_plus(SERVICE_KEY)):
            if secret:
                detail = detail.replace(secret, "***")
        safe_print(f"   ❌ {api_name} Page {page_no} Error: {detail}")
        raise

def extract_api_data(api_name, config):
    """Fetch every page of one DUR ingredient API and collect its records.

    Pages of 100 rows are requested one after another (with a 0.3 s pause
    between requests) until the reported ``totalCount`` has been collected,
    a page comes back without items, or the page cap is reached. A failure on
    the first page aborts the extraction; a failure on a later page is
    logged, the page is skipped, and paging continues up to the page cap.

    Args:
        api_name: Label of the API; used only in log output.
        config: Settings dict for the API. ``endpoint``, ``description`` and
            ``expected_records`` are read here.

    Returns:
        A ``(records, total_count, config)`` tuple: the list of collected
        item dicts, the ``totalCount`` reported by the first page (0 when the
        first page could not be read), and the ``config`` that was passed in.
    """
    safe_print(f"\n🚀 Starting {api_name} ({config['description']})")
    safe_print(f"   Expected records: {config['expected_records']:,}")

    page_size = 100
    # Lower bound for the page cap. The cap is raised to the page count
    # implied by totalCount once that is known: a fixed cap of 15 pages stops
    # at 1,500 rows, which is less than the 1,587 records listed for 병용금기.
    min_page_cap = 15

    all_records = []
    skipped_pages = []
    page_no = 1
    total_count = None
    max_pages = min_page_cap

    while True:
        # Respectful delay
        if page_no > 1:
            time.sleep(0.3)

        try:
            data = make_api_call(api_name, config['endpoint'], page_no, page_size)

            body = data.get("body", {})
            if total_count is None:
                # Tolerate a missing/null count or a numeric string.
                total_count = int(body.get("totalCount", 0) or 0)
                estimated_pages = (total_count + page_size - 1) // page_size
                # Plain comparison instead of max(): the wildcard import of
                # pyspark.sql.functions in this file shadows that builtin.
                if estimated_pages > max_pages:
                    max_pages = estimated_pages
                safe_print(f"   📊 {api_name}: {total_count:,} records ({estimated_pages} pages)")

            items = body.get("items", [])
            if not items:
                break

            # Extract records from nested structure: an entry is either the
            # record itself or a wrapper holding it under "item".
            page_records = [
                wrapper["item"] if "item" in wrapper else wrapper
                for wrapper in items
            ]
            all_records.extend(page_records)

            safe_print(f"   ✅ {api_name} Page {page_no}: +{len(page_records)} | Total: {len(all_records):,}")

            if len(all_records) >= total_count:
                break

        except Exception:
            safe_print(f"   ❌ {api_name}: Error on page {page_no}, continuing...")
            if page_no == 1:  # If first page fails, stop
                break
            skipped_pages.append(page_no)

        page_no += 1

        # Safety check. It runs after failed pages as well, so an endpoint
        # that keeps failing cannot keep this loop running forever.
        if page_no > max_pages:
            safe_print(f"   ⚠️  {api_name}: Safety break at page {page_no}")
            break

    if total_count is None:
        # The first page never produced a count; use 0 so that the ``:,``
        # format below does not raise on None.
        total_count = 0

    if skipped_pages:
        safe_print(f"   ⚠️  {api_name}: Skipped pages after errors: {skipped_pages}")

    safe_print(f"   🎉 {api_name}: Extracted {len(all_records):,}/{total_count:,} records")
    return all_records, total_count, config

def create_bronze_table(api_name, records, expected_count, config):
    """Write extracted records to a Delta bronze table and report the result.

    The records are turned into a Spark DataFrame and saved to
    ``config['table_name']`` in overwrite mode (schema overwrite enabled).
    The table is then read back and counted to verify the write.

    Args:
        api_name: Label of the API; used in log output and in the result.
        records: Records to persist; nothing is written when this is empty.
        expected_count: Record count to compare the written row count with.
        config: Settings dict for the API; ``table_name`` is read here.

    Returns:
        A dict with ``api_name``, ``table_name``, ``records``, ``expected``,
        ``completeness`` and ``columns``, or ``None`` when there were no
        records or the write failed (the error is logged, not raised).
    """
    # Guard clause: nothing to persist
    if not records:
        safe_print(f"   ❌ {api_name}: No records to save")
        return None

    try:
        safe_print(f"   💾 {api_name}: Creating bronze table...")

        # Build the DataFrame and configure the Delta writer
        df = spark.createDataFrame(records)
        writer = df.write.format("delta").mode("overwrite").option("overwriteSchema", "true")

        # Persist, then count the rows of the stored table as verification
        target_table = config['table_name']
        writer.saveAsTable(target_table)
        verified_count = spark.table(target_table).count()

        # Percentage of the expected rows that were stored (0 if unknown)
        if expected_count > 0:
            completeness = verified_count / expected_count * 100
        else:
            completeness = 0

        safe_print(f"   ✅ {api_name}: Bronze table created!")
        safe_print(f"      Table: {target_table}")
        safe_print(f"      Records: {verified_count:,} | Completeness: {completeness:.1f}%")

        summary = {
            'api_name': api_name,
            'table_name': target_table,
            'records': verified_count,
            'expected': expected_count,
            'completeness': completeness,
            'columns': len(df.columns),
        }
        return summary

    except Exception as err:
        safe_print(f"   ❌ {api_name}: Error creating table - {str(err)}")
        return None

def process_single_api(api_item):
    """Run extraction and bronze-table creation for one API.

    Args:
        api_item: An ``(api_name, config)`` pair.

    Returns:
        The result dict from ``create_bronze_table`` with an added
        ``processing_time`` entry (seconds), or ``None`` when no table was
        created or any step raised (the error is logged, not raised).
    """
    api_name, config = api_item
    started_at = time.time()

    try:
        # Step 1: pull all pages from the API
        records, total_count, config = extract_api_data(api_name, config)

        # Step 2: persist them as a bronze table
        result = create_bronze_table(api_name, records, total_count, config)

        elapsed = time.time() - started_at

        if not result:
            return result

        result['processing_time'] = elapsed
        safe_print(f"   ⏱️  {api_name}: Completed in {elapsed:.1f}s")
        return result

    except Exception as err:
        safe_print(f"   💥 {api_name}: Failed - {str(err)}")
        return None

def main():
    """Run every configured DUR ingredient extraction and print a summary.

    Each entry of ``API_CONFIGS`` is submitted to ``process_single_api`` on a
    three-worker thread pool. Falsy results (failed or empty extractions) are
    dropped; the remaining ones are listed in a table sorted by ``api_id``,
    followed by overall totals and the list of created tables.
    """
    overall_start = time.time()

    print(f"📋 Processing ALL {len(API_CONFIGS)} DUR Ingredient APIs:")
    for api_name, config in API_CONFIGS.items():
        print(f"   {config['api_id']}. {api_name} ({config['description']}) - {config['expected_records']:,} records")

    print("\n🔄 Starting parallel extraction...")

    # Process APIs in parallel (limited concurrency to be respectful)
    results = []
    with ThreadPoolExecutor(max_workers=3) as executor:
        futures = [executor.submit(process_single_api, item) for item in API_CONFIGS.items()]

        # Collected in submission order; result() blocks until the job is done.
        for future in futures:
            result = future.result()
            if result:
                results.append(result)

    # Final summary
    overall_time = time.time() - overall_start

    print(f"\n🎉 === ALL {len(API_CONFIGS)} DUR INGREDIENT APIs COMPLETED ===")
    print(f"⏱️  Total processing time: {overall_time:.1f} seconds")
    print(f"✅ Successful extractions: {len(results)}/{len(API_CONFIGS)}")

    if not results:
        print("❌ No extractions completed successfully")
        return

    print("\n📊 Complete Summary Report:")
    print(f"{'API Name':<15} {'Records':<10} {'Expected':<10} {'Complete':<10} {'Columns':<8} {'Time':<8}")
    print(f"{'-'*75}")

    total_records = 0
    total_expected = 0

    # Sort results by API ID for better readability
    sorted_results = sorted(results, key=lambda x: API_CONFIGS[x['api_name']]['api_id'])

    for result in sorted_results:
        total_records += result['records']
        total_expected += result['expected']

        print(f"{result['api_name']:<15} "
              f"{result['records']:<10,} "
              f"{result['expected']:<10,} "
              f"{result['completeness']:<9.1f}% "
              f"{result['columns']:<8} "
              f"{result['processing_time']:<7.1f}s")

    # 'expected' is the total reported by the API and may be 0 for every
    # result; report 0.0% in that case instead of raising ZeroDivisionError.
    if total_expected > 0:
        overall_completeness = total_records / total_expected * 100
    else:
        overall_completeness = 0.0

    print(f"{'-'*75}")
    print(f"{'TOTAL':<15} {total_records:<10,} {total_expected:<10,} {overall_completeness:<9.1f}%")

    print("\n💾 ALL Bronze Tables Created:")
    for result in sorted_results:
        api_id = API_CONFIGS[result['api_name']]['api_id']
        print(f"   {api_id}. ✅ {result['table_name']}")

    print("\n🏆 DUR Ingredient APIs Complete!")
    print(f"   📊 Total drug safety records: {total_records:,}")
    print(f"   🎯 Overall completeness: {overall_completeness:.1f}%")
    print("   🚀 Ready for silver layer transformations!")

# Execute the extraction only when this file is run directly, i.e. when
# __name__ is "__main__"; importing it as a module does not start a run.
if __name__ == "__main__":
    main()