In [0]:
import json
import os
import time
import urllib.parse

import requests
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Initialize Spark (reuses the active session when one already exists)
spark = SparkSession.builder.getOrCreate()

print("=== DUR PRODUCT INFO - FIXED API 6: ADMINISTRATION PERIOD PRECAUTIONS ===")

# API Configuration
# NOTE(review): a service key is committed in plain text below. It is kept only
# as a backward-compatible fallback; prefer supplying the percent-encoded key
# through the DUR_ENCODED_SERVICE_KEY environment variable, then rotate and
# remove the literal.
ENCODED_SERVICE_KEY = os.environ.get(
    "DUR_ENCODED_SERVICE_KEY",
    "h9Dbf2cz0HOrqZb5BIqrfrti%2FD5zZLTYAxFpQuywAB7ZUx3yb67jBDuD5uNlHvAszz9c14NffOmMNQjGv5FzwA%3D%3D",
)
# The key is stored percent-encoded; decode it once here because it is later
# passed through requests' `params=`, which applies URL encoding itself.
SERVICE_KEY = urllib.parse.unquote(ENCODED_SERVICE_KEY)
BASE_URL = "https://apis.data.go.kr/1471000/DURPrdlstInfoService03/getMdctnPdAtentInfoList03"
BRONZE_TABLE_NAME = "main.default.dur_product_duration_bronze"

print(f"\nTarget API: {BASE_URL}")
print(f"Bronze table: {BRONZE_TABLE_NAME}")
print(f"Expected records: 642")

def make_api_call(page_no, num_rows=100):
    """Request one page from the DUR endpoint and return the decoded JSON payload.

    Args:
        page_no: Page number sent to the API as ``pageNo``.
        num_rows: Page size sent to the API as ``numOfRows``.

    Returns:
        The decoded JSON object (a dict) whose ``header.resultCode`` is ``"00"``.

    Raises:
        requests.RequestException: On transport errors, timeouts, or a non-2xx
            HTTP status.
        RuntimeError: When the body is not a JSON object, or the API header
            reports a result code other than ``"00"``.
    """
    params = {
        "serviceKey": SERVICE_KEY,
        "pageNo": page_no,
        "numOfRows": num_rows,
        "type": "json"
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'application/json, */*',
        'Accept-Language': 'ko-KR,ko;q=0.9,en;q=0.8'
    }

    try:
        response = requests.get(BASE_URL, params=params, headers=headers, timeout=30)
        response.raise_for_status()

        try:
            data = response.json()
        except ValueError as err:
            # A 2xx response can still carry a non-JSON body (presumably an
            # XML/HTML error page from the gateway — TODO confirm). Surface a
            # snippet of it instead of an opaque decoder message.
            snippet = response.text[:200]
            raise RuntimeError(
                f"Non-JSON response (HTTP {response.status_code}): {snippet!r}"
            ) from err

        if not isinstance(data, dict):
            raise RuntimeError(
                f"Unexpected JSON payload type: {type(data).__name__}"
            )

        # `or {}` also covers an explicit `"header": null`.
        header = data.get("header") or {}
        if header.get("resultCode") != "00":
            raise RuntimeError(f"API Error: {header.get('resultMsg', 'Unknown error')}")

        return data

    except Exception as e:
        # Log with page context, then let the caller decide how to proceed.
        print(f"   ❌ API call error on page {page_no}: {str(e)}")
        raise

def clean_record(record):
    """Return a copy of ``record`` in which every value is a string.

    ``None`` becomes the empty string; any other value is passed through
    ``str()`` so that all fields share one type. Keys are left unchanged.
    """
    return {
        field: "" if raw is None else str(raw)
        for field, raw in record.items()
    }

def extract_all_data(max_pages=15):
    """Page through the API and collect every record, cleaned to string values.

    Pages are requested 100 rows at a time, starting at page 1, with a 0.5 s
    pause before every page after the first. Paging stops when a page has no
    items, when the number of collected records reaches the reported total,
    when ``max_pages`` pages have been fetched, or when any page raises (the
    records gathered so far are kept).

    Args:
        max_pages: Upper bound on the number of pages to request. The first
            page is always requested.

    Returns:
        A ``(records, total_count)`` tuple: the list of cleaned record dicts
        and the total reported by the first successful page. ``total_count``
        is ``0`` when no page could be read.
    """
    print(f"\n📥 Extracting all data with schema fixing...")

    all_records = []
    page_no = 1
    total_count = None

    while True:
        try:
            # Throttle follow-up requests; the first one goes out immediately.
            if page_no > 1:
                time.sleep(0.5)

            data = make_api_call(page_no, 100)

            body = data.get("body") or {}
            if total_count is None:
                # Coerce to int so a numeric string cannot break the
                # `>=` comparison below.
                total_count = int(body.get("totalCount") or 0)
                print(f"   📊 Total records: {total_count:,}")

            items = body.get("items") or []

            if not items:
                break

            # Clean each record to ensure consistent schema
            all_records.extend(clean_record(item) for item in items)

            print(f"   ✅ Page {page_no}: +{len(items)} records | Total: {len(all_records):,}")

            if len(all_records) >= total_count:
                break

            page_no += 1

            if page_no > max_pages:
                # Make the truncation visible instead of stopping silently.
                print(f"   ⚠️ Page limit ({max_pages}) reached; extraction may be incomplete")
                break

        except Exception as e:
            print(f"   ❌ Error on page {page_no}: {str(e)}")
            break

    # If the very first page failed, no total was ever read; fall back to 0 so
    # the summary below (and callers) always receive an int.
    if total_count is None:
        total_count = 0

    print(f"   🎉 Extraction complete: {len(all_records):,}/{total_count:,} records")
    return all_records, total_count

def create_explicit_schema():
    """Build the fixed bronze-layer schema.

    Every listed column is declared as a nullable ``StringType`` field, in the
    order given below.
    """
    column_names = (
        "TYPE_NAME",
        "MIX_TYPE",
        "INGR_CODE",
        "INGR_ENG_NAME",
        "INGR_NAME",
        "MIX_INGR",
        "FORM_NAME",
        "ITEM_SEQ",
        "ITEM_NAME",
        "ITEM_PERMIT_DATE",
        "ENTP_NAME",
        "CHART",
        "CLASS_CODE",
        "CLASS_NAME",
        "ETC_OTC_NAME",
        "MAIN_INGR",
        "NOTIFICATION_DATE",
        "PROHBT_CONTENT",
        "REMARK",
        "INGR_ENG_NAME_FULL",
        "CHANGE_DATE",
    )
    string_fields = [StructField(name, StringType(), True) for name in column_names]
    return StructType(string_fields)

def analyze_data_structure(records):
    """Turn the cleaned records into a Spark DataFrame and print a profile of it.

    The DataFrame is first built with the explicit schema. Only if that build
    (or its first count) fails is the fallback used, which serialises the
    records to JSON strings and lets Spark infer the schema. Problems while
    printing the profile are reported but do not discard the DataFrame.

    Args:
        records: List of record dicts; an empty or ``None`` value is rejected.

    Returns:
        The DataFrame, or ``None`` when there are no records or both build
        strategies fail.
    """
    if not records:
        print("   ❌ No records to analyze")
        return None

    print(f"\n🔬 Analyzing data structure with fixed schema...")

    try:
        # Method 1: Use explicit schema
        schema = create_explicit_schema()
        df = spark.createDataFrame(records, schema)
        # Count once and reuse it: every count() call is a separate Spark
        # action. Doing it inside this try also means a DataFrame that cannot
        # be evaluated triggers the fallback.
        total_rows = df.count()

    except Exception as e:
        print(f"   ❌ Error with explicit schema: {str(e)}")

        # Method 2: Alternative approach with JSON
        try:
            print(f"   🔄 Trying alternative approach with JSON strings...")

            # Convert records to JSON strings and back to handle schema issues
            json_strings = [json.dumps(record) for record in records]
            json_rdd = spark.sparkContext.parallelize(json_strings)
            df = spark.read.json(json_rdd)

            print(f"   ✅ Alternative method successful!")
            print(f"   📊 Records: {df.count():,}")
            print(f"   📋 Columns: {len(df.columns)}")

            df.printSchema()
            return df

        except Exception as e2:
            print(f"   ❌ Alternative method also failed: {str(e2)}")
            return None

    print(f"   ✅ DataFrame created successfully!")
    print(f"   📊 Records: {total_rows:,}")
    print(f"   📋 Columns: {len(df.columns)}")

    # Profiling is informational only; a failure here must not throw away a
    # DataFrame that was built successfully with the explicit schema.
    try:
        print(f"\n   📋 Schema:")
        df.printSchema()

        print(f"\n   📝 Sample records (first 3):")
        df.select("ITEM_NAME", "ENTP_NAME", "INGR_NAME", "FORM_NAME", "CLASS_NAME").show(3, truncate=False)

        # Data quality analysis
        print(f"\n   🔍 Data Quality Analysis:")
        key_fields = ["ITEM_NAME", "ENTP_NAME", "INGR_NAME", "FORM_NAME"]

        for field in key_fields:
            non_empty_count = df.filter((col(field) != "") & col(field).isNotNull()).count()
            pct = non_empty_count / total_rows * 100 if total_rows else 0.0
            print(f"      {field}: {non_empty_count}/{total_rows} non-empty ({pct:.1f}%)")

    except Exception as e:
        print(f"   ⚠️ Data profiling failed (DataFrame is still returned): {str(e)}")

    return df

def create_bronze_table(records, df=None):
    """Overwrite the bronze Delta table with ``df`` and print a verification report.

    The write and the reporting are handled separately: only a failed write
    makes the function return ``None``. Verification and the summary tables
    are best effort, so a problem there (for example a column missing from the
    DataFrame) is reported without masking a successful write.

    Args:
        records: The source records; only their count is used, to compare with
            the row count read back from the table.
        df: DataFrame to persist. ``None`` means there is nothing to save.

    Returns:
        ``BRONZE_TABLE_NAME`` when the table was written, otherwise ``None``.
    """
    print(f"\n💾 Creating bronze table...")

    if df is None:
        print("   ❌ No DataFrame to save")
        return None

    try:
        # Write to bronze table (replaces both data and schema)
        df.write \
            .format("delta") \
            .mode("overwrite") \
            .option("overwriteSchema", "true") \
            .saveAsTable(BRONZE_TABLE_NAME)
    except Exception as e:
        print(f"   ❌ Error creating bronze table: {str(e)}")
        return None

    print(f"   ✅ Bronze table created: {BRONZE_TABLE_NAME}")

    try:
        # Verify by reading the table back
        bronze_df = spark.table(BRONZE_TABLE_NAME)
        verified_count = bronze_df.count()
        expected_count = len(records) if records is not None else 0

        print(f"\n   📊 Verification:")
        print(f"      Records in table: {verified_count:,}")
        print(f"      Columns: {len(bronze_df.columns)}")
        print(f"      Data integrity: {'✅ Perfect' if verified_count == expected_count else '⚠️ Check needed'}")

        # Business insights
        print(f"\n   📈 Business Insights:")

        insights = (
            ("Top enterprises", "ENTP_NAME"),
            ("Drug forms", "FORM_NAME"),
            ("Drug classes", "CLASS_NAME"),
        )
        available_columns = set(bronze_df.columns)
        for label, column in insights:
            # The DataFrame handed in may not carry every column, so skip a
            # breakdown rather than fail the whole report.
            if column not in available_columns:
                print(f"      {label}: skipped (column {column} not in table)")
                continue
            print(f"      {label}:")
            bronze_df.groupBy(column) \
                    .count() \
                    .orderBy(desc("count")) \
                    .show(5, truncate=False)

    except Exception as e:
        print(f"   ⚠️ Post-write verification failed (table was written): {str(e)}")

    return BRONZE_TABLE_NAME

def main():
    """Run the pipeline end to end: extract, build the DataFrame, write the table.

    Returns:
        ``True`` when records were extracted, a DataFrame was built and the
        bronze table was written; ``False`` when any of those steps produced
        nothing or an unexpected exception was raised. A partial extraction
        still returns ``True`` but is labelled as partial in the summary.
    """
    start_time = time.time()

    print(f"🔧 FIXED VERSION: Handling schema inference issues")

    try:
        # Extract all data
        all_records, total_count = extract_all_data()

        if not all_records:
            print(f"\n❌ No data extracted.")
            return False

        # Analyze data structure with fixes
        df = analyze_data_structure(all_records)

        if df is None:
            print(f"\n❌ Could not create DataFrame.")
            return False

        # Create bronze table
        bronze_table = create_bronze_table(all_records, df)

        # A None result means the write failed; do not report success.
        if bronze_table is None:
            print(f"\n❌ Could not create bronze table.")
            return False

        # Final results
        processing_time = time.time() - start_time

        # Guard the ratio: a missing/zero total would otherwise raise
        # ZeroDivisionError after the work had already succeeded.
        if total_count:
            completeness = f"{len(all_records)/total_count*100:.1f}%"
            extraction_ok = len(all_records) >= total_count
        else:
            completeness = "n/a (API reported no total)"
            extraction_ok = False

        print(f"\n🎉 === FIXED TEST COMPLETED SUCCESSFULLY ===")
        print(f"📊 Results:")
        if extraction_ok:
            print(f"   ✅ API extraction: Perfect ({len(all_records):,} records)")
        else:
            print(f"   ⚠️ API extraction: Partial ({len(all_records):,} records)")
        print(f"   ✅ Schema handling: Fixed")
        print(f"   ✅ Bronze table: {bronze_table}")
        print(f"   📈 Completeness: {completeness}")
        print(f"   ⏱️ Processing time: {processing_time:.1f} seconds")

        print(f"\n🚀 Next Steps:")
        print(f"   1. ✅ Schema issues resolved")
        print(f"   2. ✅ Product API pattern established")
        print(f"   3. 🚀 Ready for comprehensive extraction of all 9 APIs")
        print(f"   4. 📋 Use explicit schema approach for consistency")

        return True

    except Exception as e:
        # Top-level boundary: report and signal failure instead of raising.
        print(f"\n💥 FIXED TEST FAILED: {str(e)}")
        return False

# Script entry point: run the pipeline once and print a one-line verdict.
if __name__ == "__main__":
    success = main()

    print(
        f"\n✅ SCHEMA ISSUES FIXED - Ready for comprehensive extraction!"
        if success
        else f"\n❌ Still need to debug schema issues"
    )