In [0]:
import requests
import json
import time
import urllib.parse
import re
from pyspark.sql import SparkSession
from concurrent.futures import ThreadPoolExecutor
import threading

# Initialize Spark
# Module-level session handle; create_test_bronze_table() uses it for
# createDataFrame(), table reads and SQL statements.
spark = SparkSession.builder.getOrCreate()

# Banner printed once, when the module is executed.
print("=== DUR PRODUCT APIs 1 & 3 - TEST EXTRACTION (300 records each, parallel) ===")

# Configuration
# NOTE(review): the service key below is a credential committed in plain text.
# Consider loading it from a secret scope or environment variable instead.
# The key is stored percent-encoded and decoded once here; make_api_call()
# passes the decoded value as a query parameter (presumably so the HTTP client
# encodes it exactly once -- confirm).
ENCODED_SERVICE_KEY = "h9Dbf2cz0HOrqZb5BIqrfrti%2FD5zZLTYAxFpQuywAB7ZUx3yb67jBDuD5uNlHvAszz9c14NffOmMNQjGv5FzwA%3D%3D"
SERVICE_KEY = urllib.parse.unquote(ENCODED_SERVICE_KEY)
# Service root; make_api_call() appends "/<endpoint>" to it.
BASE_URL = "https://apis.data.go.kr/1471000/DURPrdlstInfoService03"

# Test configuration - Only APIs 1 and 3 with 300 records each
#
# One dict per API under test. Keys read elsewhere in this file:
#   name             - label used as a prefix in log lines and in the summary
#   api_id           - numeric id; shown in logs and used to name the temp view
#   endpoint         - path segment appended to BASE_URL
#   expected_records - total size of the full dataset (only printed for context)
#   test_records     - number of records to pull in this test run
#   table_name       - fully qualified Delta table written by the test
#   description      - English description of the dataset (not read by the
#                      code visible here)
API_CONFIGS = [
    {
        "name": "병용금기",
        "api_id": 1,
        "endpoint": "getUsjntTabooInfoList03",
        "expected_records": 240873,
        "test_records": 300,  # Only extract 300 for testing
        "table_name": "main.default.dur_product_interaction_bronze_test",
        "description": "Drug Interaction Contraindications"
    },
    {
        "name": "DUR품목정보",
        "api_id": 3,
        "endpoint": "getDurPrdlstInfoList03",
        "expected_records": 24065,
        "test_records": 300,  # Only extract 300 for testing
        "table_name": "main.default.dur_product_info_bronze_test",
        "description": "General DUR Product Info"
    }
]

# One lock shared by all worker threads; every log line goes through it
log_lock = threading.Lock()


def safe_print(message):
    """Print ``message`` while holding ``log_lock``.

    Only one thread at a time can be inside the ``print`` call, so lines
    emitted by concurrent workers are written one after another.

    Args:
        message: Object handed to ``print`` unchanged.
    """
    with log_lock:
        # The lock is released on exit even if print() raises.
        print(message)

def clean_column_names(df):
    """Rename the columns of ``df`` to unique, identifier-safe names.

    Each column name is normalised in this order:

    * every character outside ``[a-zA-Z0-9_]`` is replaced by ``_``;
    * runs of ``_`` collapse to a single ``_`` and leading/trailing ``_``
      are stripped;
    * a name that starts with a digit is prefixed with ``col_``;
    * a name that ends up empty becomes ``col_<position>``;
    * a name already produced for an earlier column gets a numeric ``_<n>``
      suffix, so two source columns never share one output name.

    Args:
        df: Object exposing ``columns`` and ``toDF(*names)``; callers in this
            file pass a Spark DataFrame.

    Returns:
        A ``(renamed_df, mapping)`` tuple. ``mapping`` maps every original
        column name to its cleaned name, including columns that did not
        change. ``df`` itself is returned when no name changed.
    """
    old_columns = list(df.columns)
    new_columns = []
    taken = set()

    for position, col_name in enumerate(old_columns):
        # Replace invalid characters, then collapse/strip underscores
        clean_name = re.sub(r'[^a-zA-Z0-9_]', '_', col_name)
        clean_name = re.sub(r'_+', '_', clean_name).strip('_')
        # Ensure it doesn't start with a number
        if clean_name and clean_name[0].isdigit():
            clean_name = 'col_' + clean_name
        # Names made only of invalid characters (e.g. non-ASCII) are empty here
        if not clean_name:
            clean_name = f'col_{position}'

        # Different source names can sanitise to the same result
        # ("a b" and "a-b" both give "a_b"); suffix later ones to keep them
        # distinct.
        # NOTE(review): compared case-insensitively on the assumption that
        # Spark/Delta resolve column names case-insensitively -- confirm.
        candidate = clean_name
        suffix = 1
        while candidate.lower() in taken:
            candidate = f'{clean_name}_{suffix}'
            suffix += 1
        taken.add(candidate.lower())
        new_columns.append(candidate)

    # Rename by position in one step; renaming one column at a time by name
    # is ambiguous as soon as an intermediate name matches another column.
    if new_columns != old_columns:
        df = df.toDF(*new_columns)

    return df, dict(zip(old_columns, new_columns))

def clean_record(record):
    """Return a new dict with the keys of ``record`` and string-only values.

    ``None`` is mapped to the empty string; any other value is converted
    with ``str()``. The input dict is not modified.
    """
    return {
        field: ("" if raw is None else str(raw))
        for field, raw in record.items()
    }

def make_api_call(endpoint, page_no, num_rows=100):
    """Fetch one page from a DUR endpoint and return the decoded JSON object.

    Args:
        endpoint: Path segment appended to ``BASE_URL``.
        page_no: Value sent as the ``pageNo`` query parameter.
        num_rows: Value sent as the ``numOfRows`` query parameter.

    Returns:
        The decoded JSON response as a dict. It is only returned when its
        ``header.resultCode`` equals ``"00"``.

    Raises:
        requests.exceptions.RequestException: Transport failure, timeout
            (30 s) or a non-success HTTP status (via ``raise_for_status``).
        Exception: The body is not a JSON object, or ``resultCode`` is not
            ``"00"``. The message starts with ``"API Error:"``.
    """
    url = f"{BASE_URL}/{endpoint}"
    params = {
        "serviceKey": SERVICE_KEY,
        "pageNo": page_no,
        "numOfRows": num_rows,
        "type": "json"
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    response = requests.get(url, params=params, headers=headers, timeout=30)
    response.raise_for_status()

    try:
        data = response.json()
    except ValueError as err:
        # A successful status with a body that is not JSON (for example an
        # XML or HTML error page) would otherwise surface as a bare decode
        # error; include the start of the body so the cause is visible.
        snippet = response.text[:200]
        raise Exception(f"API Error: non-JSON response: {snippet!r}") from err

    if not isinstance(data, dict):
        raise Exception(
            f"API Error: unexpected JSON payload of type {type(data).__name__}"
        )

    # "or {}" also covers a header that is present but null
    header = data.get("header") or {}
    if header.get("resultCode") != "00":
        raise Exception(f"API Error: {header.get('resultMsg')}")

    return data

def extract_test_data(config, page_size=100, max_pages=10):
    """Download up to ``config['test_records']`` cleaned records from one API.

    Pages are requested one after another through ``make_api_call`` until the
    target is reached, the API returns no items, page 1 fails, or
    ``max_pages`` pages have been attempted. A failure on a later page is
    logged and that page is skipped.

    Args:
        config: One entry of ``API_CONFIGS``; reads ``name``, ``endpoint``,
            ``test_records`` and ``expected_records``.
        page_size: ``numOfRows`` sent with every request.
        max_pages: Highest page number that will be requested. This bounds
            the loop even when every request fails.

    Returns:
        A list of at most ``config['test_records']`` dicts, each passed
        through ``clean_record``. May be shorter or empty when the run stops
        early.

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError("page_size must be at least 1")

    safe_print(f"\n🚀 Extracting {config['name']} TEST data...")
    safe_print(f"   Target: {config['test_records']} records (from {config['expected_records']:,} total)")

    target_records = config['test_records']
    all_records = []
    page_no = 1

    # The page cap is part of the loop condition so that it also applies when
    # a page fails and the body below is left through the except branch.
    while len(all_records) < target_records and page_no <= max_pages:
        try:
            if page_no > 1:
                time.sleep(0.3)  # short pause between consecutive requests

            # Always request the same page size and trim the surplus after
            # the loop.
            # NOTE(review): pageNo/numOfRows paging presumably derives the
            # offset from both values, so shrinking numOfRows on the last
            # page would re-read earlier records -- confirm against the API
            # guide.
            data = make_api_call(config['endpoint'], page_no, page_size)

            body = data.get("body") or {}
            if page_no == 1:
                total_available = body.get("totalCount", 0)
                safe_print(f"   📊 {config['name']}: {total_available:,} records available (extracting {target_records})")

            items = body.get("items", [])
            if not items:
                safe_print(f"   ⚠️  {config['name']}: No more items at page {page_no}")
                break

            all_records.extend(clean_record(item) for item in items)
            safe_print(f"   ✅ {config['name']} Page {page_no}: +{len(items)} | Total: {len(all_records)}")

        except Exception as e:
            safe_print(f"   ❌ {config['name']}: Error on page {page_no}: {str(e)}")
            # If the very first page fails, stop instead of probing further
            # pages.
            if page_no == 1:
                break

        page_no += 1

    if len(all_records) < target_records and page_no > max_pages:
        safe_print(f"   ⚠️  {config['name']}: Safety break at page {page_no}")

    # Trim to exact target (the last page may have overshot it)
    del all_records[target_records:]

    safe_print(f"   🎉 {config['name']}: Test extraction complete - {len(all_records)} records")
    return all_records

def _bronze_table_summary(config, verified_count, column_count, column_mapping):
    """Build the result dict returned by ``create_test_bronze_table``."""
    return {
        'name': config['name'],
        'table_name': config['table_name'],
        'records': verified_count,
        'target': config['test_records'],
        'columns': column_count,
        'column_mapping': column_mapping,
    }


def create_test_bronze_table(config, records):
    """Write ``records`` to the Delta table named in ``config``.

    The records become a Spark DataFrame, the column names are sanitised with
    ``clean_column_names`` and the table is overwritten (schema included).
    If that fails, a second attempt registers the DataFrame as a temp view
    and runs ``CREATE OR REPLACE TABLE ... AS SELECT``. After a successful
    write the table is read back and counted.

    Args:
        config: One entry of ``API_CONFIGS``; reads ``name``, ``api_id``,
            ``table_name`` and ``test_records``.
        records: List of dicts, one per row.

    Returns:
        A dict with ``name``, ``table_name``, ``records`` (row count read
        back from the table), ``target``, ``columns`` and ``column_mapping``
        (original name -> cleaned name for every column). Returns ``None``
        when ``records`` is empty or both write attempts fail. Errors are
        logged, not raised.
    """
    if not records:
        safe_print(f"   ❌ {config['name']}: No records to process")
        return None

    try:
        safe_print(f"   💾 {config['name']}: Creating test bronze table...")

        df = spark.createDataFrame(records)
        cleaned_df, column_mapping = clean_column_names(df)

        # The mapping lists every column; report only the ones that changed.
        renamed = {old: new for old, new in column_mapping.items() if old != new}
        if renamed:
            safe_print(f"   🔧 {config['name']}: Cleaned {len(renamed)} column names")
            # Show a few examples of renamed columns
            for old, new in list(renamed.items())[:3]:
                safe_print(f"      '{old}' → '{new}'")

        # One DataFrame row per input record, so len(records) gives the row
        # count without launching a Spark job just for this log line.
        safe_print(f"   ✅ {config['name']}: DataFrame created - {len(records)} records, {len(cleaned_df.columns)} columns")

        cleaned_df.write \
            .format("delta") \
            .mode("overwrite") \
            .option("overwriteSchema", "true") \
            .saveAsTable(config['table_name'])

        safe_print(f"   ✅ {config['name']}: Test bronze table created - {config['table_name']}")

        # Verification: count what actually landed in the table
        verified_count = spark.table(config['table_name']).count()
        safe_print(f"   📊 {config['name']}: Verification - {verified_count} records in table")

        return _bronze_table_summary(
            config, verified_count, len(cleaned_df.columns), column_mapping
        )

    except Exception as e:
        safe_print(f"   ❌ {config['name']}: Error creating table - {str(e)}")

    # Only reached when the DataFrame writer path above failed.
    temp_view_name = f"temp_api_{config['api_id']}"
    try:
        safe_print(f"   🔄 {config['name']}: Trying SQL approach...")

        temp_df = spark.createDataFrame(records)
        cleaned_temp_df, fallback_mapping = clean_column_names(temp_df)
        cleaned_temp_df.createOrReplaceTempView(temp_view_name)

        spark.sql(f"""
            CREATE OR REPLACE TABLE {config['table_name']}
            USING DELTA
            AS SELECT * FROM {temp_view_name}
        """)

        safe_print(f"   ✅ {config['name']}: Test table created via SQL - {config['table_name']}")

        verified_count = spark.table(config['table_name']).count()

        # Same cleaning as the primary path, so report the same mapping.
        return _bronze_table_summary(
            config, verified_count, len(cleaned_temp_df.columns), fallback_mapping
        )

    except Exception as e2:
        safe_print(f"   ❌ {config['name']}: SQL approach also failed - {str(e2)}")
        return None

    finally:
        # Best-effort cleanup so the session does not keep the temp view.
        # A failure here must not replace the result being returned above.
        try:
            spark.catalog.dropTempView(temp_view_name)
        except Exception as cleanup_err:
            safe_print(f"   ⚠️  {config['name']}: Could not drop temp view {temp_view_name} - {cleanup_err}")

def process_api_test(config):
    """Run extraction and table creation for one API and time the whole run.

    Args:
        config: One entry of ``API_CONFIGS``.

    Returns:
        The dict produced by ``create_test_bronze_table`` with an added
        ``processing_time`` entry (seconds). Returns ``None`` when no records
        were extracted, the table could not be created, or an exception was
        raised along the way.
    """
    started_at = time.time()

    divider = '=' * 50
    safe_print(f"\n{divider}")
    safe_print(f"Testing API {config['api_id']}: {config['name']}")
    safe_print(divider)

    try:
        # Step 1: pull the sample from the API
        records = extract_test_data(config)
        if not records:
            safe_print(f"   ❌ {config['name']}: No test data extracted")
            return None

        # Step 2: persist the sample as a bronze table
        result = create_test_bronze_table(config, records)
        if not result:
            return None

        elapsed = time.time() - started_at
        result['processing_time'] = elapsed

        # Guard against a zero elapsed time before dividing
        throughput = len(records) / elapsed if elapsed > 0 else 0
        safe_print(f"   ⏱️  {config['name']}: Test completed in {elapsed:.1f}s ({throughput:.0f} rec/sec)")

        return result

    except Exception as e:
        safe_print(f"   💥 {config['name']}: Test failed - {e}")
        return None

def main():
    """Run every configured API test in parallel and print a summary report.

    Each entry of ``API_CONFIGS`` is handed to ``process_api_test`` on a
    two-worker thread pool. Successful results are collected in configuration
    order and reported as a table, followed by an overall assessment.
    """
    overall_start = time.time()

    print("🧪 Testing problematic APIs with limited data:")
    for api in API_CONFIGS:
        print(f"   API {api['api_id']}: {api['name']} (extracting {api['test_records']} from {api['expected_records']:,})")
        print(f"      Test table: {api['table_name']}")

    print("\n🔄 Starting parallel test extraction...")

    # Process both APIs in parallel; failed runs come back as None and are dropped
    with ThreadPoolExecutor(max_workers=2) as executor:
        results = [
            outcome
            for outcome in executor.map(process_api_test, API_CONFIGS)
            if outcome
        ]

    overall_time = time.time() - overall_start

    # Summary
    banner = '=' * 60
    print(f"\n{banner}")
    print("🎉 === TEST EXTRACTION COMPLETED ===")
    print(banner)
    print(f"⏱️  Total processing time: {overall_time:.1f} seconds")
    print(f"✅ Successful tests: {len(results)}/{len(API_CONFIGS)}")

    if not results:
        print("❌ No tests completed successfully")
        print("💡 Review API connectivity and schema issues")
        return

    print("\n📊 Test Results:")
    header_cells = [
        ('API Name', 15),
        ('Records', 8),
        ('Target', 8),
        ('Success', 8),
        ('Columns', 8),
        ('Time', 8),
    ]
    print(" ".join(f"{title:<{width}}" for title, width in header_cells))
    print('-' * 60)

    for entry in results:
        success_rate = entry['records'] / entry['target'] * 100
        time_sec = entry['processing_time']
        print(
            f"{entry['name']:<15} {entry['records']:<8} {entry['target']:<8} "
            f"{success_rate:<7.1f}% {entry['columns']:<8} {time_sec:<7.1f}s"
        )

    print("\n💾 Test Tables Created:")
    for entry in results:
        print(f"   ✅ {entry['table_name']}")

    print("\n🎯 Test Assessment:")
    if all(entry['records'] == entry['target'] for entry in results):
        print("   ✅ All tests passed perfectly!")
        print("   🚀 Ready to scale to full extraction")
        print("   🔧 Column cleaning approach validated")
    else:
        print("   ⚠️  Some tests had partial success")
        print("   🔍 Review results before full extraction")

    # Column mapping info
    for entry in results:
        if entry.get('column_mapping'):
            print(f"\n   📋 {entry['name']} column mapping applied")


# Execute the test
if __name__ == "__main__":
    main()