# In [0]:
import requests
import json
import time
import urllib.parse
import re
from pyspark.sql import SparkSession

# Initialize Spark (getOrCreate reuses an already-active session if there is one)
spark = SparkSession.builder.getOrCreate()

print("=== DUR PRODUCT API 1 - FINAL CHUNKED EXTRACTION ===")

# Configuration
# NOTE(review): this service key is a credential committed in source. Consider
# loading it from a secret scope / environment variable and rotating the key.
ENCODED_SERVICE_KEY = "h9Dbf2cz0HOrqZb5BIqrfrti%2FD5zZLTYAxFpQuywAB7ZUx3yb67jBDuD5uNlHvAszz9c14NffOmMNQjGv5FzwA%3D%3D"
# The key is stored percent-encoded; it is decoded once here because it is
# later passed as a query parameter, which gets encoded again on the way out.
SERVICE_KEY = urllib.parse.unquote(ENCODED_SERVICE_KEY)

# API 1 Configuration
API_CONFIG = {
    "name": "병용금기",
    "api_id": 1,
    "endpoint": "getUsjntTabooInfoList03",
    "expected_records": 240873,
    "table_name": "main.default.dur_product_interaction_bronze",
    "description": "Drug Interaction Contraindications"
}

# The request URL is derived from API_CONFIG so the endpoint name is defined
# in exactly one place.
BASE_URL = f"https://apis.data.go.kr/1471000/DURPrdlstInfoService03/{API_CONFIG['endpoint']}"

# Chunking configuration
ROWS_PER_PAGE = 100  # Records requested per API page
PAGES_PER_CHUNK = 50  # API pages fetched before each write to the table
# Derived rather than hard-coded so it always matches the two values above
# (50 pages * 100 records = 5000 records per chunk).
CHUNK_SIZE = PAGES_PER_CHUNK * ROWS_PER_PAGE

def clean_column_names(df):
    """Return ``df`` with column names reduced to ``[A-Za-z0-9_]`` and made unique.

    Each name is stripped, every character outside ``[a-zA-Z0-9_]`` becomes an
    underscore, runs of underscores are collapsed and leading/trailing
    underscores are removed. Names that start with a digit get a ``col_``
    prefix; names that end up empty become ``col_<position>``.

    Different source names can sanitize to the same result (``"a b"`` and
    ``"a_b"``), so repeated names receive a numeric suffix (``_2``, ``_3``, ...).
    Uniqueness is checked case-insensitively because Spark resolves column
    names case-insensitively unless ``spark.sql.caseSensitive`` is enabled.

    Args:
        df: Object exposing ``columns`` and ``toDF(*names)`` (a Spark DataFrame).

    Returns:
        ``df`` itself when no name needs to change, otherwise a new DataFrame
        with the same columns in the same order under the cleaned names.
    """
    original_names = list(df.columns)
    cleaned_names = []
    taken = set()  # lower-cased names already assigned

    for position, raw_name in enumerate(original_names):
        name = re.sub(r'[^a-zA-Z0-9_]', '_', raw_name.strip())
        name = re.sub(r'_+', '_', name).strip('_')

        if name and name[0].isdigit():
            name = 'col_' + name
        if not name:
            name = f'col_{position}'

        # Resolve collisions by appending the first free numeric suffix.
        candidate = name
        suffix = 1
        while candidate.lower() in taken:
            suffix += 1
            candidate = f'{name}_{suffix}'

        taken.add(candidate.lower())
        cleaned_names.append(candidate)

    if cleaned_names == original_names:
        return df

    # Rename positionally in one step: renaming one column at a time by name
    # becomes ambiguous as soon as a new name equals another existing column.
    return df.toDF(*cleaned_names)

def clean_record(record):
    """Return a copy of ``record`` in which every value is a string.

    ``None`` becomes the empty string; any other value is passed through
    ``str()``. Keys are kept unchanged.
    """
    return {
        field: ("" if raw is None else str(raw))
        for field, raw in record.items()
    }

def make_api_call(page_no, num_rows=100):
    """Fetch one page from the API and return the decoded JSON payload.

    Transport errors and JSON decoding errors are retried up to three times in
    total, sleeping 2s and then 4s between attempts; the last failure is
    re-raised. A payload whose ``header.resultCode`` is not ``"00"`` raises a
    plain ``Exception`` immediately and is not retried.

    Args:
        page_no: Page number sent as ``pageNo``.
        num_rows: Page size sent as ``numOfRows``.

    Returns:
        The parsed JSON response.
    """
    query = {
        "serviceKey": SERVICE_KEY,  # decoded form of the key
        "pageNo": page_no,
        "numOfRows": num_rows,
        "type": "json"
    }
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'application/json, */*'
    }

    max_retries = 3
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(BASE_URL, params=query, headers=request_headers, timeout=30)
            response.raise_for_status()

            payload = response.json()

            # The API reports its own status inside the payload header.
            result_header = payload.get("header", {})
            if result_header.get("resultCode") != "00":
                raise Exception(f"API Error: {result_header.get('resultMsg')}")

            return payload

        except (requests.RequestException, json.JSONDecodeError) as e:
            if attempt == max_retries:
                raise

            # Request-level failures take precedence when an error is both.
            label = "Retry" if isinstance(e, requests.RequestException) else "JSON retry"
            print(f"      ⚠️  {label} {attempt}/3 for page {page_no}: {str(e)}")
            time.sleep(attempt * 2)

def extract_chunk(start_page, end_page, chunk_num, total_chunks):
    """Download pages ``start_page``..``end_page`` (inclusive) and collect their rows.

    Pages are requested one after another with a 0.3s pause between requests.
    A page that raises is counted, logged and skipped; after five failures the
    chunk is abandoned. An empty ``body.items`` also ends the chunk early.

    Args:
        start_page: First page number of the chunk.
        end_page: Last page number of the chunk (inclusive).
        chunk_num: Chunk index, used for log output only.
        total_chunks: Total chunk count, used for log output only.

    Returns:
        Tuple ``(records, errors)``: the cleaned records gathered so far and
        the number of pages that raised.
    """
    print(f"   📥 Extracting chunk {chunk_num}/{total_chunks}: pages {start_page}-{end_page}...")

    records = []
    failures = 0
    failure_limit = 5
    started = time.time()

    for page_no in range(start_page, end_page + 1):
        try:
            # Respectful delay before every request except the first one
            if page_no != start_page:
                time.sleep(0.3)

            payload = make_api_call(page_no)
            items = payload.get("body", {}).get("items", [])

            if not items:
                print(f"      ⚠️  No items on page {page_no}, chunk may be complete")
                break

            # Build the cleaned page completely before adding it, so a record
            # that fails to clean discards the whole page rather than part of it.
            records += [clean_record(item) for item in items]

            # Report every 10th page and the last page of the chunk
            if page_no == end_page or page_no % 10 == 0:
                elapsed = time.time() - started
                done = page_no - start_page + 1
                remaining = end_page - page_no
                eta_chunk = elapsed / done * remaining

                print(f"      ✅ Page {page_no}: +{len(items)} | Chunk: {len(records)} | ETA: {eta_chunk:.0f}s")

        except Exception as e:
            failures += 1
            print(f"      ❌ Error on page {page_no}: {str(e)}")

            if failures >= failure_limit:
                print(f"      🛑 Too many errors in chunk {chunk_num}, stopping chunk")
                break
            # otherwise skip this page and move on to the next one

    duration = time.time() - started
    print(f"   ✅ Chunk {chunk_num} extracted: {len(records)} records in {duration:.1f}s (errors: {failures})")
    return records, failures

def _write_chunk_via_sql(chunk_records, chunk_number, is_first_chunk):
    """Fallback writer: load the chunk through a temp view and a SQL statement.

    Returns True when the statement succeeded, False when it raised.
    """
    try:
        print(f"      🔄 Trying SQL fallback for chunk {chunk_number}...")

        temp_df = spark.createDataFrame(chunk_records)
        cleaned_temp_df = clean_column_names(temp_df)

        temp_view_name = f"temp_chunk_{chunk_number}"
        cleaned_temp_df.createOrReplaceTempView(temp_view_name)

        if is_first_chunk:
            spark.sql(f"""
                CREATE OR REPLACE TABLE {API_CONFIG['table_name']}
                USING DELTA
                AS SELECT * FROM {temp_view_name}
            """)
        else:
            spark.sql(f"""
                INSERT INTO {API_CONFIG['table_name']}
                SELECT * FROM {temp_view_name}
            """)

        print(f"      ✅ Chunk {chunk_number} written via SQL fallback")
        return True

    except Exception as e2:
        print(f"      ❌ SQL fallback also failed for chunk {chunk_number}: {str(e2)}")
        return False


def write_chunk_to_table(chunk_records, chunk_number, is_first_chunk, total_records_so_far):
    """Write one chunk of records to the Delta table named in ``API_CONFIG``.

    The chunk is written with the DataFrame writer (overwrite for the first
    chunk, append afterwards). Only if that write raises is the SQL fallback
    attempted. The row-count check afterwards is informational: it runs in its
    own ``try`` so that a failure while counting can never trigger the
    fallback and insert an already-committed chunk a second time.

    Args:
        chunk_records: List of record dicts to write.
        chunk_number: Chunk index, used for logging and the temp view name.
        is_first_chunk: True to (re)create the table, False to append.
        total_records_so_far: Rows written by earlier chunks, used to compute
            the expected table count.

    Returns:
        True if the chunk was written (directly or via the fallback), False if
        there was nothing to write or both write paths failed.
    """
    if not chunk_records:
        print(f"   ⚠️  Chunk {chunk_number}: No records to write")
        return False

    table_name = API_CONFIG['table_name']

    try:
        print(f"   💾 Writing chunk {chunk_number}: {len(chunk_records)} records...")

        # Create DataFrame
        df_start = time.time()
        df = spark.createDataFrame(chunk_records)

        # Clean column names (important for API 1)
        cleaned_df = clean_column_names(df)
        df_time = time.time() - df_start

        # One row per input record, so len() gives the row count without
        # launching a separate Spark count job just for this log line.
        print(f"      📊 DataFrame ready: {len(chunk_records)} records, {len(cleaned_df.columns)} columns ({df_time:.1f}s)")

        # Write to Delta table
        write_start = time.time()

        if is_first_chunk:
            # First chunk: overwrite to create/reset table
            print(f"      🔄 Creating new table (overwrite mode)")
            cleaned_df.write \
                .format("delta") \
                .mode("overwrite") \
                .option("overwriteSchema", "true") \
                .saveAsTable(table_name)
        else:
            # Subsequent chunks: append
            print(f"      ➕ Appending to existing table")
            cleaned_df.write \
                .format("delta") \
                .mode("append") \
                .saveAsTable(table_name)

    except Exception as e:
        print(f"      ❌ Error writing chunk {chunk_number}: {str(e)}")

        # Try SQL approach as fallback
        if not _write_chunk_via_sql(chunk_records, chunk_number, is_first_chunk):
            return False

    else:
        # Success logging lives outside the try body so nothing that runs
        # after the commit can be mistaken for a failed write.
        if is_first_chunk:
            print(f"      ✅ New table created: {table_name}")
        write_time = time.time() - write_start
        print(f"      ✅ Chunk {chunk_number} written successfully ({write_time:.1f}s)")

    # Verification (best effort; the data is already committed at this point)
    try:
        current_count = spark.table(table_name).count()
    except Exception as e:
        print(f"      ⚠️  Could not verify table count after chunk {chunk_number}: {str(e)}")
        return True

    expected_count = total_records_so_far + len(chunk_records)
    print(f"      📊 Table verification: {current_count:,} records (expected: {expected_count:,})")

    if current_count != expected_count:
        print(f"      ⚠️  Count mismatch - expected {expected_count:,}, got {current_count:,}")

    return True

def main():
    """Run the chunked extraction end to end and print a summary.

    Steps:
      1. Ask the API for ``totalCount`` (falls back to
         ``API_CONFIG['expected_records']`` if that call fails).
      2. Split the page range into chunks of ``PAGES_PER_CHUNK`` pages.
      3. For each chunk, extract the pages and write them to the table. The
         first chunk that is actually written replaces the table; all later
         ones append.
      4. Print final counts, completeness and a small data sample.

    Failures are reported through printed messages and the failed-chunk
    counter; they do not stop the loop.
    """
    overall_start = time.time()

    print(f"🎯 Target: {API_CONFIG['name']} ({API_CONFIG['description']})")
    print(f"   Expected records: {API_CONFIG['expected_records']:,}")
    print(f"   Table: {API_CONFIG['table_name']}")
    print(f"   Strategy: Chunked processing ({CHUNK_SIZE:,} records per chunk)")

    # Get actual total count
    print(f"\n📊 Verifying API connectivity and total count...")
    try:
        data = make_api_call(1, 1)  # Test call
        body = data.get("body", {})
        # Coerce to int so a numeric string from the API still works in the
        # page arithmetic below; a non-numeric value falls through to except.
        total_count = int(body.get("totalCount", 0))

        print(f"   ✅ API connectivity confirmed")
        print(f"   📊 API reports: {total_count:,} records")

    except Exception as e:
        print(f"   ❌ API connectivity test failed: {str(e)}")
        print(f"   🔄 Using expected count as fallback")
        total_count = API_CONFIG['expected_records']

    # Calculate chunks (ceiling divisions: 100 records per page)
    total_pages = (total_count + 99) // 100
    total_chunks = (total_pages + PAGES_PER_CHUNK - 1) // PAGES_PER_CHUNK
    estimated_time_min = total_chunks * 2  # ~2 minutes per chunk

    print(f"   📄 Total pages: {total_pages:,}")
    print(f"   📦 Total chunks: {total_chunks}")
    print(f"   ⏱️  Estimated time: {estimated_time_min} minutes")

    print(f"\n🚀 Starting chunked extraction...")
    print(f"   💡 Each chunk processes {PAGES_PER_CHUNK} pages (~{CHUNK_SIZE:,} records)")
    print(f"   🧠 Memory efficient: Only ~5MB per chunk vs 500MB+ for full load")

    # Process chunks
    successful_chunks = 0
    failed_chunks = 0
    total_records_written = 0
    total_errors = 0

    for chunk_num in range(1, total_chunks + 1):
        chunk_overall_start = time.time()

        # Calculate page range for this chunk
        start_page = (chunk_num - 1) * PAGES_PER_CHUNK + 1
        end_page = min(chunk_num * PAGES_PER_CHUNK, total_pages)

        print(f"\n{'='*70}")
        print(f"Processing Chunk {chunk_num}/{total_chunks}")
        print(f"Pages {start_page}-{end_page} ({end_page - start_page + 1} pages)")
        overall_progress = (chunk_num - 1) / total_chunks * 100
        print(f"Overall Progress: {overall_progress:.1f}% | Records so far: {total_records_written:,}")
        print(f"{'='*70}")

        try:
            # Extract chunk
            chunk_records, chunk_errors = extract_chunk(start_page, end_page, chunk_num, total_chunks)
            total_errors += chunk_errors

            if chunk_records:
                # Write chunk. The table is reset by the first chunk that is
                # actually written, not by chunk number 1: if chunk 1 fails,
                # the next chunk must still replace whatever an earlier run
                # left in the table instead of appending to it.
                write_success = write_chunk_to_table(
                    chunk_records,
                    chunk_num,
                    is_first_chunk=(successful_chunks == 0),
                    total_records_so_far=total_records_written
                )

                if write_success:
                    successful_chunks += 1
                    total_records_written += len(chunk_records)

                    # Progress update
                    chunk_overall_time = time.time() - chunk_overall_start
                    overall_elapsed = time.time() - overall_start
                    overall_progress_updated = chunk_num / total_chunks * 100

                    # ETA calculation
                    chunks_remaining = total_chunks - chunk_num
                    avg_time_per_chunk = overall_elapsed / chunk_num
                    eta_minutes = (chunks_remaining * avg_time_per_chunk) / 60

                    print(f"   🎉 Chunk {chunk_num} COMPLETED in {chunk_overall_time:.1f}s")
                    print(f"   📊 Progress: {overall_progress_updated:.1f}% | Total records: {total_records_written:,}")
                    print(f"   ⏱️  ETA: {eta_minutes:.0f} minutes remaining")

                else:
                    failed_chunks += 1
                    print(f"   ❌ Chunk {chunk_num} extraction succeeded but write failed")
            else:
                print(f"   ⚠️  Chunk {chunk_num}: No data extracted")
                failed_chunks += 1

            # Brief pause between chunks
            if chunk_num < total_chunks:
                print(f"   ⏸️  Brief pause before next chunk...")
                time.sleep(1)

        except Exception as e:
            print(f"   💥 Chunk {chunk_num} failed completely: {str(e)}")
            failed_chunks += 1
            continue

    # Final comprehensive summary
    overall_time = time.time() - overall_start

    print(f"\n{'='*80}")
    print(f"🎉 === CHUNKED EXTRACTION COMPLETED ===")
    print(f"{'='*80}")

    # Final verification
    try:
        final_table_count = spark.table(API_CONFIG['table_name']).count()
        completeness = (final_table_count / total_count * 100) if total_count > 0 else 0
        # Guard the rate against a zero elapsed time.
        records_per_minute = (final_table_count / (overall_time / 60)) if overall_time > 0 else 0

        print(f"📊 Final Results:")
        print(f"   ✅ Successful chunks: {successful_chunks}/{total_chunks}")
        print(f"   ❌ Failed chunks: {failed_chunks}")
        print(f"   📊 Records written: {total_records_written:,}")
        print(f"   📊 Final table count: {final_table_count:,}")
        print(f"   🎯 Completeness: {completeness:.1f}%")
        print(f"   ⚠️  Total API errors: {total_errors}")
        print(f"   ⏱️  Total processing time: {overall_time/60:.1f} minutes")
        print(f"   ⚡ Average speed: {records_per_minute:.0f} records/minute")

        print(f"\n💾 Bronze Table: {API_CONFIG['table_name']}")

        if final_table_count > 0:
            print(f"   ✅ Table created successfully with {final_table_count:,} records!")
            print(f"   📈 Ready for silver layer transformations")

            # Show schema info
            table_df = spark.table(API_CONFIG['table_name'])
            print(f"   📋 Schema: {len(table_df.columns)} columns")

            # Quick sample
            print(f"\n📝 Sample Data (first 3 records):")
            table_df.show(3, truncate=True)

            # Success assessment
            if completeness >= 95:
                print(f"\n🏆 EXTRACTION HIGHLY SUCCESSFUL!")
                print(f"   ✅ {completeness:.1f}% completeness achieved")
                print(f"   ✅ Memory-efficient chunked approach worked perfectly")
                print(f"   ✅ Ready for analytics and silver layer processing")
            else:
                print(f"\n⚠️  EXTRACTION PARTIALLY SUCCESSFUL")
                print(f"   📊 {completeness:.1f}% completeness - some data missing")
                print(f"   💡 Consider re-running failed chunks if needed")

        else:
            print(f"   ❌ No data in final table - extraction failed")

        # Memory efficiency summary
        print(f"\n🧠 Memory Efficiency Achieved:")
        print(f"   📦 Processed in {total_chunks} chunks of ~{CHUNK_SIZE:,} records each")
        print(f"   💾 Peak memory: ~5MB per chunk (vs ~500MB for full load)")
        print(f"   🔄 Sustainable approach for large datasets")

    except Exception as e:
        print(f"❌ Error in final verification: {str(e)}")

# Execute the chunked extraction only when this file is run as the main module;
# importing it does not trigger main().
if __name__ == "__main__":
    main()