In [0]:
import requests
import json
import time
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Create the Spark session, or reuse the one already active in this process.
spark = SparkSession.builder.getOrCreate()

print("=== DEBUG DUR API EXTRACTION ===")

# API configuration.
# SERVICE_KEY is stored here in percent-encoded form (note the %2F and %3D
# escapes); test_different_approaches() also tries the decoded form.
# NOTE(review): the credential is hard-coded in source — consider loading it
# from a secret store or an environment variable instead.
SERVICE_KEY = "h9Dbf2cz0HOrqZb5BIqrfrti%2FD5zZLTYAxFpQuywAB7ZUx3yb67jBDuD5uNlHvAszz9c14NffOmMNQjGv5FzwA%3D%3D"
# Endpoint queried by every request in this cell.
BASE_URL = "https://apis.data.go.kr/1471000/DURIrdntInfoService03/getUsjntTabooInfoList02"

def _print_json_structure(data):
    """Print a summary of the header/body/items layout of a parsed payload.

    Every level is type-checked before it is inspected, so an unexpected
    shape (``null``, a list, a string body, ...) is reported instead of
    raising.
    """
    print("\n6. JSON Structure Analysis:")

    if not isinstance(data, dict):
        print("   Top-level keys: Not a dictionary")
        return
    print(f"   Top-level keys: {list(data.keys())}")

    if 'header' in data:
        print(f"   Header: {data['header']}")

    if 'body' not in data:
        return

    body = data['body']
    if not isinstance(body, dict):
        print("   Body keys: Not a dictionary")
        return
    print(f"   Body keys: {list(body.keys())}")

    if 'totalCount' in body:
        print(f"   Total count: {body['totalCount']}")

    if 'items' in body:
        items = body['items']
        print(f"   Items count: {len(items) if isinstance(items, list) else 'Not a list'}")

        if isinstance(items, list) and items:
            first_item = items[0]
            print(f"   First item keys: {list(first_item.keys()) if isinstance(first_item, dict) else 'Not a dictionary'}")
            print(f"   First item preview: {str(first_item)[:200]}...")


def debug_api_call():
    """Probe the API with a one-row request and print step-by-step diagnostics.

    Sends a single GET to ``BASE_URL`` (page 1, one row, ``type=json``) with
    browser-like headers, then prints the status code, content type, body
    size, a 500-character preview of the body and, when the body parses as
    JSON, a summary of its structure.

    Returns:
        The parsed JSON payload, or ``None`` when the request fails, the
        status code is not 200, or the body is not valid JSON.
    """
    print("\n1. Testing API connectivity...")
    print(f"   URL: {BASE_URL}")

    # Test with minimal parameters first: a single record is enough to see
    # whether the endpoint answers at all.
    params = {
        "serviceKey": SERVICE_KEY,
        "pageNo": 1,
        "numOfRows": 1,
        "type": "json"
    }

    print("\n2. API Parameters:")
    for key, value in params.items():
        if key == "serviceKey":
            # Only the first 20 and last 10 characters of the key are shown.
            print(f"   {key}: {value[:20]}...{value[-10:]}")
        else:
            print(f"   {key}: {value}")

    # Headers that might be required by the gateway.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'ko-KR,ko;q=0.9,en;q=0.8'
    }

    print("\n3. Making API request...")
    try:
        response = requests.get(BASE_URL, params=params, headers=headers, timeout=30)
    except requests.RequestException as e:
        print(f"   ❌ Request failed: {str(e)}")
        return None

    print(f"   Status Code: {response.status_code}")
    print(f"   Content Type: {response.headers.get('content-type', 'Not specified')}")
    print(f"   Content Length: {len(response.content)} bytes")

    print("\n4. Response Preview:")
    response_text = response.text
    print("   First 500 characters:")
    print(f"   {repr(response_text[:500])}")

    if response.status_code != 200:
        print(f"\n❌ HTTP Error {response.status_code}")
        print(f"   Response: {response_text}")
        return None

    print("\n5. Attempting JSON parsing...")
    try:
        data = response.json()
    except ValueError as e:
        # Every decoder error Response.json() can raise (stdlib json,
        # simplejson, requests' own JSONDecodeError) is a ValueError.
        print(f"   ❌ JSON parsing failed: {str(e)}")
        print(f"   Raw response: {response_text}")
        return None

    print("   ✅ JSON parsing successful!")
    _print_json_structure(data)
    return data

def _fetch_json_or_none(url, params, success_message):
    """GET *url* and return its parsed JSON body, or ``None`` if unusable.

    Prints the status code and a 200-character body preview. Any failure
    (request error, non-200 status, non-JSON body) is reported on stdout and
    turned into ``None`` so the caller can move on to the next approach.
    """
    try:
        response = requests.get(url, params=params, timeout=30)
        print(f"   Status: {response.status_code}")
        print(f"   Response preview: {response.text[:200]}")

        if response.status_code == 200:
            try:
                data = response.json()
            except ValueError:
                # Narrow on purpose: a bare ``except`` would also swallow
                # KeyboardInterrupt / SystemExit.
                print("   ❌ Still not JSON")
            else:
                print(success_message)
                return data
    except Exception as e:
        # Best-effort probe: report the problem and let the next approach run.
        print(f"   ❌ Failed: {str(e)}")
    return None


def test_different_approaches():
    """Try alternative ways of passing the service key to the API.

    Approaches, in order:
      1. Percent-decode the key and let ``requests`` encode it via ``params``.
      2. Append the key exactly as stored to a hand-built URL.
      3. Print basic format facts about the key (no request is made).

    Returns:
        The parsed JSON payload from the first approach that yields one, or
        ``None`` if none does.
    """
    import urllib.parse

    print("\n=== TESTING DIFFERENT APPROACHES ===")

    # Approach 1: Without URL encoding in service key
    print("\n1. Testing with decoded service key...")
    params_decoded = {
        "serviceKey": urllib.parse.unquote(SERVICE_KEY),
        "pageNo": 1,
        "numOfRows": 1,
        "type": "json"
    }
    data = _fetch_json_or_none(BASE_URL, params_decoded, "   ✅ Success with decoded key!")
    if data is not None:
        return data

    # Approach 2: Test the exact URL from documentation
    print("\n2. Testing exact URL from documentation...")
    # Built by hand so the key goes into the URL exactly as stored.
    test_url = f"{BASE_URL}?serviceKey={SERVICE_KEY}&pageNo=1&numOfRows=1&type=json"
    data = _fetch_json_or_none(test_url, None, "   ✅ Success with manual URL!")
    if data is not None:
        return data

    # Approach 3: Report the key's format so an obviously malformed key stands out
    print("\n3. Testing service key validity...")
    print("   Service key format check:")
    print(f"   Length: {len(SERVICE_KEY)}")
    print(f"   Contains %3D: {'%3D' in SERVICE_KEY}")
    print(f"   Contains %2F: {'%2F' in SERVICE_KEY}")

    return None

def main():
    """Run the diagnostics and print either the payload or troubleshooting hints.

    The basic probe runs first; the alternative approaches are tried only
    when it yields nothing.
    """
    # Step 1: basic probe; Step 2: fallbacks if the probe gave no payload.
    outcome = debug_api_call()
    if outcome is None:
        outcome = test_different_approaches()

    if outcome is None:
        print("\n=== TROUBLESHOOTING SUGGESTIONS ===")
        for hint in (
            "1. ✅ Check if the service key is still valid",
            "2. ✅ Verify the API endpoint URL is correct",
            "3. ✅ Confirm API service is operational",
            "4. ✅ Test from a different network/IP address",
            "5. ✅ Contact API provider for support",
        ):
            print(hint)

        print("\n📧 API Provider: data.go.kr")
        print(f"🔑 Service Key (partial): {SERVICE_KEY[:20]}...{SERVICE_KEY[-10:]}")
        return

    print("\n=== SUCCESS! ===")
    print("API is working. Here's the successful response structure:")
    # Cap the pretty-printed payload at 1000 characters.
    pretty = json.dumps(outcome, indent=2, ensure_ascii=False)
    print(pretty[:1000])

    # A usable payload means the full extraction can go ahead.
    print("\n✅ Ready to proceed with full data extraction!")

# Run the diagnostics only when this code is executed as the main module
# (it is skipped when the code is imported from elsewhere).
if __name__ == "__main__":
    main()

In [0]:
import requests
import json
import time
import urllib.parse
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Create the Spark session, or reuse the one already active in this process.
spark = SparkSession.builder.getOrCreate()

print("=== DUR API EXTRACTION - DRUG INTERACTION CONTRAINDICATIONS ===")

# API configuration.
# The key is stored percent-encoded and decoded once here: requests
# URL-encodes values passed through `params`, so sending the stored form
# would encode it a second time.
# NOTE(review): the credential is hard-coded in source — consider loading it
# from a secret store or an environment variable instead.
ENCODED_SERVICE_KEY = "h9Dbf2cz0HOrqZb5BIqrfrti%2FD5zZLTYAxFpQuywAB7ZUx3yb67jBDuD5uNlHvAszz9c14NffOmMNQjGv5FzwA%3D%3D"
SERVICE_KEY = urllib.parse.unquote(ENCODED_SERVICE_KEY)  # Decoded form is what make_api_call() sends
BASE_URL = "https://apis.data.go.kr/1471000/DURIrdntInfoService03/getUsjntTabooInfoList02"
# Fully qualified name of the Delta table written by create_bronze_table().
BRONZE_TABLE_NAME = "main.default.dur_ingredient_interaction_bronze"

print(f"\nTarget API: {BASE_URL}")
print(f"Bronze table: {BRONZE_TABLE_NAME}")
# Informational only: the literal below is not compared against anything;
# the extraction reads the real total from the API response.
print(f"Expected records: 1,587")

def make_api_call(page_no, num_rows=100):
    """Fetch one page from the API and return the validated JSON payload.

    Args:
        page_no: Page number, sent as ``pageNo``.
        num_rows: Page size, sent as ``numOfRows`` (default 100).

    Returns:
        The decoded JSON object whose ``header.resultCode`` equals ``"00"``.

    Raises:
        requests.RequestException: Transport failure or an HTTP error status.
        ValueError: The body is not valid JSON (with recent ``requests``
            versions the raised error is also a ``RequestException``).
        RuntimeError: The payload is not a JSON object, or the API reported
            a result code other than ``"00"``.
    """
    params = {
        "serviceKey": SERVICE_KEY,  # Using decoded key; requests encodes it
        "pageNo": page_no,
        "numOfRows": num_rows,
        "type": "json"
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'application/json, */*',
        'Accept-Language': 'ko-KR,ko;q=0.9,en;q=0.8'
    }

    print(f"   📡 API Call - Page {page_no} (requesting {num_rows} records)")

    # Each failure mode has its own try block. A single try with ordered
    # handlers mislabels JSON errors, because requests' JSONDecodeError is
    # also a RequestException and would hit the network handler first.
    try:
        response = requests.get(BASE_URL, params=params, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"   ❌ Network error: {str(e)}")
        raise

    try:
        data = response.json()
    except ValueError as e:
        print(f"   ❌ JSON parsing error: {str(e)}")
        print(f"   Raw response: {response.text[:200]}")
        raise

    # Check API response status
    error = None
    if not isinstance(data, dict):
        error = f"API Error: unexpected payload type {type(data).__name__}"
    else:
        header = data.get("header", {})
        if not isinstance(header, dict):
            header = {}
        if header.get("resultCode") != "00":
            error = f"API Error: {header.get('resultMsg', 'Unknown error')}"

    if error is not None:
        print(f"   ❌ API call failed: {error}")
        raise RuntimeError(error)

    return data

def extract_all_data(page_size=100, max_retries=3):
    """Page through the API and collect every record.

    Pages are requested sequentially (0.5 s apart) until the reported total
    has been collected, an empty page is returned, the page cap is reached,
    or an error stops the run.

    Args:
        page_size: Records requested per page (default 100).
        max_retries: How many times in a row a page is retried after an error
            whose message mentions "timeout" or "connection" (default 3).
            The counter resets after every successful page.

    Returns:
        ``(records, total_count)``: the list of item dicts collected so far
        and the ``totalCount`` reported by the first successful response
        (``None`` if no call ever succeeded). Partial results are returned
        when the run stops early.

    Raises:
        ValueError: ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError("page_size must be at least 1")

    print("\n🚀 Starting data extraction...")

    all_records = []
    page_no = 1
    pages_fetched = 0
    total_count = None
    max_pages = 20  # safety floor; raised below once the real total is known
    retries_left = max_retries

    while True:
        try:
            # Respectful delay between API calls
            if page_no > 1:
                time.sleep(0.5)

            data = make_api_call(page_no, page_size)
            retries_left = max_retries  # the page succeeded; reset the budget

            # Get total count from first response
            body = data.get("body", {})
            if total_count is None:
                total_count = body.get("totalCount", 0)
                estimated_pages = (total_count + page_size - 1) // page_size
                # Leave a few pages of slack, never below the floor, so a
                # dataset larger than the floor allows is not cut short.
                max_pages = max(max_pages, estimated_pages + 4)
                print(f"   📊 Total records available: {total_count:,}")
                print(f"   📄 Estimated pages needed: {estimated_pages}")

            # Extract items from response
            items = body.get("items", [])

            if not items:
                print(f"   ✅ No more data found at page {page_no}")
                break

            # Entries may be wrapped as {"item": {...}}; unwrap when they are.
            page_records = [
                entry["item"] if "item" in entry else entry
                for entry in items
            ]

            all_records.extend(page_records)
            pages_fetched += 1
            records_collected = len(all_records)

            print(f"   ✅ Page {page_no}: +{len(page_records)} records | Total: {records_collected:,}/{total_count:,}")

            # Stop on the reported total only when one was actually reported;
            # with a missing/zero total keep paging until an empty page.
            if total_count and records_collected >= total_count:
                print("   🎉 All records collected successfully!")
                break

            page_no += 1

            # Safety check against a response stream that never ends
            if page_no > max_pages:
                print(f"   ⚠️  Safety break at page {page_no}")
                break

        except Exception as e:
            print(f"   ❌ Error on page {page_no}: {str(e)}")

            # Retry the same page for transient errors, but only a bounded
            # number of times so a dead endpoint cannot loop forever.
            message = str(e).lower()
            is_transient = "timeout" in message or "connection" in message
            if is_transient and retries_left > 0:
                retries_left -= 1
                print("   🔄 Retrying in 2 seconds...")
                time.sleep(2)
                continue

            print("   🛑 Stopping extraction due to persistent error")
            break

    print("\n📋 Extraction Summary:")
    print(f"   Pages processed: {pages_fetched}")
    print(f"   Records extracted: {len(all_records):,}")
    if total_count:
        print(f"   Completeness: {len(all_records)/total_count*100:.1f}%")
    else:
        print("   Completeness: N/A")

    return all_records, total_count

def create_bronze_table(records, expected_count):
    """Write the extracted records to the bronze Delta table and report on it.

    Builds a Spark DataFrame from ``records``, prints its schema and a
    sample, overwrites ``BRONZE_TABLE_NAME`` (schema included), reads the
    table back to verify the row count, and prints a few grouped summaries.

    Args:
        records: List of item dicts returned by the extraction.
        expected_count: Total reported by the API; may be ``0`` or ``None``
            when unknown, in which case completeness is shown as "N/A".

    Returns:
        A DataFrame over the written table, or ``None`` if ``records`` is
        empty.

    Raises:
        Exception: Any Spark error is printed and re-raised.
    """
    print("\n🏗️  Creating bronze table...")

    if not records:
        print("   ❌ No records to process")
        return None

    try:
        # Convert to Spark DataFrame
        print(f"   📊 Converting {len(records):,} records to Spark DataFrame...")
        df = spark.createDataFrame(records)

        record_count = df.count()
        column_count = len(df.columns)

        print("   ✅ DataFrame created successfully!")
        print(f"      Records: {record_count:,}")
        print(f"      Columns: {column_count}")

        # Display schema
        print("\n   📋 Schema:")
        df.printSchema()

        # Show sample data
        print("\n   📝 Sample Records (first 3):")
        df.select("INGR_KOR_NAME", "INGR_ENG_NAME", "MIXTURE_INGR_KOR_NAME", "PROHBT_CONTENT").show(3, truncate=False)

        # Write to bronze table, replacing both data and schema
        print(f"\n   💾 Writing to bronze table: {BRONZE_TABLE_NAME}")

        df.write \
            .format("delta") \
            .mode("overwrite") \
            .option("overwriteSchema", "true") \
            .saveAsTable(BRONZE_TABLE_NAME)

        print("   ✅ Bronze table created successfully!")

        # Verify the table by reading it back
        print("\n🔍 Verification:")
        bronze_df = spark.table(BRONZE_TABLE_NAME)
        verified_count = bronze_df.count()

        print(f"   Records in bronze table: {verified_count:,}")
        if expected_count is None:
            print("   Expected records: unknown")
        else:
            print(f"   Expected records: {expected_count:,}")
        print(f"   Data integrity: {'✅ Perfect' if verified_count == len(records) else '⚠️ Check needed'}")
        # The table is already written here, so an unknown or zero total
        # must not turn a successful write into a reported failure.
        if expected_count:
            print(f"   Completeness: {verified_count/expected_count*100:.1f}%")
        else:
            print("   Completeness: N/A")

        # Business intelligence preview
        print("\n📊 Data Insights:")

        # Mix type distribution
        print("   Mix Types:")
        bronze_df.groupBy("MIX_TYPE").count().orderBy(desc("count")).show()

        # Top drug interactions (ordered by frequency; only the pair is shown)
        print("   Most Common Drug Interactions:")
        bronze_df.groupBy("INGR_KOR_NAME", "MIXTURE_INGR_KOR_NAME") \
                .count() \
                .orderBy(desc("count")) \
                .select("INGR_KOR_NAME", "MIXTURE_INGR_KOR_NAME") \
                .show(5, truncate=False)

        # Drug classes involved
        print("   Drug Classes Distribution:")
        bronze_df.groupBy("CLASS").count().orderBy(desc("count")).show(5, truncate=False)

        return bronze_df

    except Exception as e:
        print(f"   ❌ Error creating bronze table: {str(e)}")
        raise

def main():
    """Run the full extraction: fetch all pages, write the bronze table, report.

    Raises:
        Exception: Any failure from the extraction or the table write is
            printed with a hint and re-raised.
    """
    start_time = time.time()

    try:
        print(f"⏰ Started at: {time.strftime('%Y-%m-%d %H:%M:%S')}")

        # Extract all data from API
        records, expected_count = extract_all_data()

        if not records:
            print("\n❌ EXTRACTION FAILED: No data retrieved")
            return

        # Create bronze table
        create_bronze_table(records, expected_count)

        # Final summary
        processing_time = time.time() - start_time

        print("\n🎉 === EXTRACTION COMPLETED SUCCESSFULLY ===")
        print("📋 Summary:")
        print(f"   ✅ Bronze table: {BRONZE_TABLE_NAME}")
        print(f"   📊 Records extracted: {len(records):,}")
        # The reported total can be 0/None when the API omits it; dividing
        # by it would crash after the table has already been written.
        if expected_count:
            print(f"   📈 Completeness: {len(records)/expected_count*100:.1f}%")
        else:
            print("   📈 Completeness: N/A")
        print(f"   ⏱️ Processing time: {processing_time:.1f} seconds")
        print(f"   🚀 Performance: {len(records)/processing_time:.0f} records/second")
        print("   💾 Ready for silver layer transformations!")

    except Exception as e:
        print(f"\n💥 EXTRACTION FAILED: {str(e)}")
        print("🔧 Check API connectivity and service key validity")
        raise

# Run the extraction only when this code is executed as the main module
# (it is skipped when the code is imported from elsewhere).
if __name__ == "__main__":
    main()