# In [0]:
import requests
import json
import time
import urllib.parse
from pyspark.sql import SparkSession

# Initialize Spark
# getOrCreate() hands back the active session when one already exists,
# otherwise it builds a new one.
spark = SparkSession.builder.getOrCreate()

print("=== DUR PRODUCT INFO - ALL 9 APIs EXTRACTION (SPARK CONNECT COMPATIBLE) ===")

# Configuration
# NOTE(review): this service key is a credential committed in plain text.
# Consider loading it from a secret scope / environment variable and rotating
# the value that is stored here.
ENCODED_SERVICE_KEY = "h9Dbf2cz0HOrqZb5BIqrfrti%2FD5zZLTYAxFpQuywAB7ZUx3yb67jBDuD5uNlHvAszz9c14NffOmMNQjGv5FzwA%3D%3D"
# The key above is stored percent-encoded; decode it once so that it is not
# encoded a second time when make_api_call passes it as a query parameter
# (presumably the HTTP client applies its own encoding — TODO confirm).
SERVICE_KEY = urllib.parse.unquote(ENCODED_SERVICE_KEY)
# Root URL of the service; make_api_call appends the endpoint name to it.
BASE_URL = "https://apis.data.go.kr/1471000/DURPrdlstInfoService03"

# All 9 API configurations (ordered by size: small → large)
#
# Every entry carries the same six keys:
#   name             - Korean display name used in log output
#   api_id           - numeric id printed in the per-API banner
#   endpoint         - operation name appended to BASE_URL
#   expected_records - record count the dataset is expected to hold
#   table_name       - fully qualified Delta table the data is written to
#   description      - English description of the dataset
API_CONFIGS = [
    dict(
        name="투여기간주의",
        api_id=6,
        endpoint="getMdctnPdAtentInfoList03",
        expected_records=642,
        table_name="main.default.dur_product_duration_bronze",
        description="Administration Period Precautions",
    ),
    dict(
        name="노인주의",
        api_id=2,
        endpoint="getOdsnAtentInfoList03",
        expected_records=2052,
        table_name="main.default.dur_product_elderly_bronze",
        description="Elderly Precautions",
    ),
    dict(
        name="서방정분할주의",
        api_id=8,
        endpoint="getSeobangjeongPartitnAtentInfoList03",
        expected_records=2157,
        table_name="main.default.dur_product_extended_bronze",
        description="Extended Release Division Precautions",
    ),
    dict(
        name="특정연령대금기",
        api_id=4,
        endpoint="getSpcifyAgrdeTabooInfoList03",
        expected_records=2692,
        table_name="main.default.dur_product_age_bronze",
        description="Specific Age Group Contraindications",
    ),
    dict(
        name="용량주의",
        api_id=5,
        endpoint="getCpctyAtentInfoList03",
        expected_records=6782,
        table_name="main.default.dur_product_dosage_bronze",
        description="Dosage Precautions",
    ),
    dict(
        name="효능군중복",
        api_id=7,
        endpoint="getEfcyDplctInfoList03",
        expected_records=7118,
        table_name="main.default.dur_product_efficacy_bronze",
        description="Efficacy Group Duplication",
    ),
    dict(
        name="임부금기",
        api_id=9,
        endpoint="getPwnmTabooInfoList03",
        expected_records=16353,
        table_name="main.default.dur_product_pregnancy_bronze",
        description="Pregnancy Contraindications",
    ),
    dict(
        name="DUR품목정보",
        api_id=3,
        endpoint="getDurPrdlstInfoList03",
        expected_records=24065,
        table_name="main.default.dur_product_info_bronze",
        description="General DUR Product Info",
    ),
    dict(
        name="병용금기",
        api_id=1,
        endpoint="getUsjntTabooInfoList03",
        expected_records=240873,
        table_name="main.default.dur_product_interaction_bronze",
        description="Drug Interaction Contraindications",
    ),
]

def clean_record(record):
    """Return a copy of ``record`` whose values are all strings.

    ``None`` is mapped to the empty string; every other value is passed
    through ``str()``. Keys are kept unchanged.
    """
    return {
        field: ("" if raw is None else str(raw))
        for field, raw in record.items()
    }

def make_api_call(endpoint, page_no, num_rows=100):
    """Request a single JSON page from one DUR endpoint.

    Args:
        endpoint: Operation name appended to ``BASE_URL``.
        page_no: Page number sent as ``pageNo``.
        num_rows: Page size sent as ``numOfRows`` (default 100).

    Returns:
        The decoded JSON payload.

    Raises:
        requests.HTTPError: When the HTTP status signals an error
            (via ``raise_for_status``).
        Exception: When the payload's ``header.resultCode`` is not ``"00"``;
            the message carries ``header.resultMsg``.
    """
    response = requests.get(
        f"{BASE_URL}/{endpoint}",
        params={
            "serviceKey": SERVICE_KEY,
            "pageNo": page_no,
            "numOfRows": num_rows,
            "type": "json",
        },
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        },
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json()

    # The service reports its own status in the payload header, independently
    # of the HTTP status code checked above.
    result_header = payload.get("header", {})
    if result_header.get("resultCode") == "00":
        return payload

    raise Exception(f"API Error: {result_header.get('resultMsg')}")

def extract_data(config):
    """Download every page of one DUR endpoint and return the cleaned records.

    Pages of 100 rows are requested sequentially through ``make_api_call``
    until the API-reported ``totalCount`` has been collected, an empty page
    comes back, or one of the safety limits below trips.

    Failure handling:
        * A failure on page 1 aborts immediately, because the dataset size
          is still unknown at that point.
        * A failure on a later page is logged and the page is skipped, so the
          result can be incomplete; skipped pages are reported at the end.
        * After several failed pages in a row the extraction stops instead of
          walking page numbers forever against a service that is down.
        * The page cap is enforced after every page, failed or not. It is
          derived from the larger of ``expected_records`` and the reported
          ``totalCount``, so a dataset that has grown is not truncated.

    Args:
        config: One entry of ``API_CONFIGS``; the keys ``name``, ``endpoint``
            and ``expected_records`` are read.

    Returns:
        ``(records, total_count)``. ``records`` is a list of dicts as produced
        by ``clean_record``. ``total_count`` is the count reported by the API,
        or ``None`` when the first page could not be processed.
    """
    page_size = 100                # rows requested per call
    page_slack = 20                # extra pages tolerated beyond the estimate
    max_consecutive_failures = 5   # give up after this many failed pages in a row

    expected = config['expected_records']

    print(f"\n🚀 Extracting {config['name']} data...")
    print(f"   Expected: {expected:,} records")

    # Pause between calls: larger datasets get a slightly longer delay.
    delay = 0.3 if expected < 5000 else 0.5

    # Progress cadence and format depend on the dataset size.
    if expected > 50000:
        report_every = 100
    elif expected > 10000:
        report_every = 50
    else:
        report_every = 10
    show_percentage = expected > 10000

    all_records = []
    skipped_pages = []
    consecutive_failures = 0
    total_count = None
    max_pages = (expected + page_size - 1) // page_size + page_slack
    page_no = 1

    while True:
        try:
            if page_no > 1:
                time.sleep(delay)

            data = make_api_call(config['endpoint'], page_no, num_rows=page_size)
            body = data.get("body", {})

            if total_count is None:
                total_count = int(body.get("totalCount") or 0)
                estimated_pages = (total_count + page_size - 1) // page_size
                print(f"   📊 Total records: {total_count:,} ({estimated_pages:,} pages)")
                # Trust the live count when it exceeds the configured estimate.
                max_pages = max(max_pages, estimated_pages + page_slack)

            items = body.get("items", [])
            if not items:
                break

            all_records.extend(clean_record(item) for item in items)
            consecutive_failures = 0

            finished = len(all_records) >= total_count
            if page_no % report_every == 0 or finished:
                if show_percentage:
                    progress = len(all_records) / total_count * 100 if total_count > 0 else 0
                    print(f"   ✅ Page {page_no}: {len(all_records):,}/{total_count:,} ({progress:.1f}%)")
                else:
                    print(f"   ✅ Page {page_no}: +{len(items)} | Total: {len(all_records):,}")

            if finished:
                break

        except Exception as e:
            print(f"   ❌ Error on page {page_no}: {str(e)}")
            if page_no == 1:
                break

            # Best effort: skip the page, but remember it and stop once the
            # failures are clearly not transient.
            skipped_pages.append(page_no)
            consecutive_failures += 1
            if consecutive_failures >= max_consecutive_failures:
                print(f"   ⚠️  Aborting after {consecutive_failures} consecutive failed pages")
                break

        page_no += 1

        # Safety check (also reached after a failed page)
        if page_no > max_pages:
            print(f"   ⚠️  Safety break at page {page_no}")
            break

    if skipped_pages:
        print(f"   ⚠️  Skipped {len(skipped_pages)} page(s) due to errors: {skipped_pages}")

    completeness = len(all_records) / expected * 100 if expected > 0 else 0.0
    print(f"   🎉 Extraction complete: {len(all_records):,} records ({completeness:.1f}%)")
    return all_records, total_count

def create_bronze_table(config, records):
    """Persist extracted records as a Delta bronze table and verify the write.

    Two strategies are tried in order:

    1. ``DataFrame.write`` in overwrite mode with ``saveAsTable``.
    2. Fallback: register the data as a temporary view and run
       ``CREATE OR REPLACE TABLE ... AS SELECT``.

    The temporary view name is derived from the last component of
    ``config['table_name']`` and reduced to ASCII letters, digits and
    underscores. ``config['name']`` is not used for it because the display
    names in this file are Korean, which is not usable as an unquoted SQL
    identifier. The view is dropped again once the fallback finishes.

    Args:
        config: One entry of ``API_CONFIGS``; the keys ``name``,
            ``table_name`` and ``expected_records`` are read.
        records: List of dicts to store (one dict per row).

    Returns:
        A summary dict with the keys ``name``, ``table_name``, ``records``,
        ``expected`` and ``columns`` on success; ``False`` when ``records``
        is empty; ``None`` when both strategies fail. Callers only need a
        truthiness check.
    """
    if not records:
        print(f"   ❌ No records to process for {config['name']}")
        return False

    table_name = config['table_name']

    def verify_and_summarise(column_count):
        # Re-read the table so the reported count reflects what was stored.
        verified_count = spark.table(table_name).count()
        print(f"   📊 Verification: {verified_count:,} records in table")
        return {
            'name': config['name'],
            'table_name': table_name,
            'records': verified_count,
            'expected': config['expected_records'],
            'columns': column_count,
        }

    try:
        print(f"   💾 Creating bronze table for {config['name']}...")

        # Primary method: Direct DataFrame creation (Spark Connect compatible)
        df = spark.createDataFrame(records)

        print(f"   ✅ DataFrame created: {df.count():,} records, {len(df.columns)} columns")

        # Write to Delta table
        df.write \
            .format("delta") \
            .mode("overwrite") \
            .option("overwriteSchema", "true") \
            .saveAsTable(table_name)

        print(f"   ✅ Bronze table created: {table_name}")

        return verify_and_summarise(len(df.columns))

    except Exception as e:
        print(f"   ❌ Error creating table for {config['name']}: {str(e)}")
        print(f"   🔄 Trying SQL approach...")

    # Fallback: SQL approach
    short_name = table_name.split('.')[-1]
    temp_view_name = "temp_" + "".join(
        ch if (ch.isalnum() and ord(ch) < 128) else "_" for ch in short_name
    )
    view_created = False

    try:
        temp_df = spark.createDataFrame(records)
        temp_df.createOrReplaceTempView(temp_view_name)
        view_created = True

        spark.sql(f"""
            CREATE OR REPLACE TABLE {table_name}
            USING DELTA
            AS SELECT * FROM {temp_view_name}
        """)

        print(f"   ✅ Bronze table created via SQL: {table_name}")

        return verify_and_summarise(len(temp_df.columns))

    except Exception as e2:
        print(f"   ❌ SQL approach also failed for {config['name']}: {str(e2)}")
        return None

    finally:
        # The view is only a staging step; do not leave it in the session.
        if view_created:
            try:
                spark.sql(f"DROP VIEW IF EXISTS {temp_view_name}")
            except Exception as cleanup_error:
                # Cleanup must never hide the outcome of the write itself.
                print(f"   ⚠️  Could not drop temp view {temp_view_name}: {str(cleanup_error)}")

def process_api(config):
    """Run extraction and table creation for one API configuration.

    Prints a banner, pulls the data with ``extract_data``, stores it with
    ``create_bronze_table`` and, on success, adds the elapsed seconds to the
    summary under ``processing_time``.

    Returns:
        The summary from ``create_bronze_table`` extended with
        ``processing_time``, or ``None`` when nothing was extracted, the
        table could not be created, or any exception was raised.
    """
    started = time.time()
    divider = '=' * 70

    print(f"\n{divider}")
    print(f"Processing API {config['api_id']}/9: {config['name']}")
    print(f"Description: {config['description']}")
    print(f"{divider}")

    try:
        # Extract data (the reported total is not needed at this level)
        records, _reported_total = extract_data(config)
        if not records:
            print(f"   ❌ No data extracted for {config['name']}")
            return None

        # Create bronze table
        summary = create_bronze_table(config, records)
        if not summary:
            return None

        elapsed = time.time() - started
        summary['processing_time'] = elapsed

        rate = len(records) / elapsed if elapsed > 0 else 0
        print(f"   ⏱️  Completed in {elapsed/60:.1f} minutes ({rate:.0f} rec/sec)")
        return summary

    except Exception as e:
        print(f"   💥 {config['name']} failed: {str(e)}")
        return None

def main():
    """Process every entry of ``API_CONFIGS`` in order and print a report.

    The run has three phases: announce the plan, call ``process_api`` for
    each configuration (pausing three seconds between APIs), then print a
    summary covering per-API completeness, the created tables and throughput.
    """
    run_started = time.time()
    api_total = len(API_CONFIGS)
    expected_sum = sum(cfg['expected_records'] for cfg in API_CONFIGS)

    # --- Phase 1: announce the plan -------------------------------------
    print(f"📋 Processing ALL {api_total} DUR Product APIs:")
    print(f"   Total expected records: {expected_sum:,}")
    print(f"   Estimated processing time: {expected_sum//5000} minutes")

    for position, cfg in enumerate(API_CONFIGS, 1):
        expected = cfg['expected_records']
        if expected > 50000:
            size = "🔥 LARGE"
        elif expected > 5000:
            size = "📊 MEDIUM"
        else:
            size = "⚡ SMALL"
        print(f"   {position}. {cfg['name']} ({size}: {expected:,} records)")

    print("\n🔄 Starting sequential extraction (most stable approach)...")

    # --- Phase 2: run the APIs one at a time ----------------------------
    succeeded = []
    failed_names = []

    for position, cfg in enumerate(API_CONFIGS, 1):
        outcome = process_api(cfg)

        if not outcome:
            failed_names.append(cfg['name'])
        else:
            succeeded.append(outcome)

        # Brief pause between APIs (except after the last one)
        if position < api_total:
            print("\n   ⏸️  Brief pause before next API...")
            time.sleep(3)

    # --- Phase 3: final report ------------------------------------------
    elapsed = time.time() - run_started
    banner = '=' * 80

    print(f"\n{banner}")
    print("🎉 === ALL DUR PRODUCT APIs PROCESSING COMPLETED ===")
    print(banner)
    print(f"⏱️  Total processing time: {elapsed/60:.1f} minutes")
    print(f"✅ Successful extractions: {len(succeeded)}/{api_total}")

    if failed_names:
        print(f"❌ Failed APIs: {', '.join(failed_names)}")

    if not succeeded:
        print("\n❌ No APIs were processed successfully")
        print("💡 Check API connectivity and service key")
        return

    rule = '-' * 85
    print("\n📊 Detailed Summary Report:")
    print(f"{'API Name':<20} {'Records':<12} {'Expected':<12} {'Complete':<10} {'Columns':<8} {'Time':<8}")
    print(rule)

    records_sum = 0
    expected_final = 0

    for row in succeeded:
        records_sum += row['records']
        expected_final += row['expected']
        pct = row['records'] / row['expected'] * 100
        minutes = row['processing_time'] / 60

        print(f"{row['name']:<20} {row['records']:<12,} {row['expected']:<12,} {pct:<9.1f}% {row['columns']:<8} {minutes:<7.1f}m")

    print(rule)
    overall_pct = records_sum / expected_final * 100 if expected_final > 0 else 0
    print(f"{'TOTAL':<20} {records_sum:<12,} {expected_final:<12,} {overall_pct:<9.1f}%")

    print("\n💾 Bronze Tables Successfully Created:")
    for position, row in enumerate(succeeded, 1):
        print(f"   {position}. ✅ {row['table_name']}")

    print("\n🏆 Final Success Summary:")
    print(f"   📊 Total records extracted: {records_sum:,}")
    print(f"   🎯 Overall completeness: {overall_pct:.1f}%")
    print(f"   ⚡ Average extraction speed: {records_sum/(elapsed/60):.0f} records/minute")
    print("   🏗️  Bronze tables ready for silver layer transformations!")

    # Performance insights (only meaningful with at least two results)
    if len(succeeded) <= 1:
        return

    def throughput(entry, when_untimed):
        # Records per second; `when_untimed` stands in when no time elapsed.
        seconds = entry['processing_time']
        return entry['records'] / seconds if seconds > 0 else when_untimed

    fastest = max(succeeded, key=lambda entry: throughput(entry, 0))
    slowest = min(succeeded, key=lambda entry: throughput(entry, float('inf')))

    print("\n⚡ Performance Insights:")
    print(f"   Fastest: {fastest['name']} ({fastest['records']/fastest['processing_time']:.0f} rec/sec)")
    print(f"   Slowest: {slowest['name']} ({slowest['records']/slowest['processing_time']:.0f} rec/sec)")

# Execute the comprehensive extraction.
# The guard runs main() only when this module's __name__ is "__main__",
# so importing the file does not start the extraction.
if __name__ == "__main__":
    main()