In [1]:
# This notebook parses a specific KMZ file to extract a subset of the dataset for analysis

In [2]:
import zipfile
import xml.etree.ElementTree as ET
import os
from typing import List, Set, Dict, Any
import re

In [3]:
def parse_kmz_file(kmz_path: str) -> Dict[str, Any]:
    """
    Parse a KMZ file and extract column names from its dataset.
    
    Args:
        kmz_path (str): Path to the KMZ file
        
    Returns:
        Dict containing column names and other metadata
    """
    results = {
        'column_names': set(),
        'placemark_count': 0,
        'folders': [],
        'schemas': []
    }
    
    try:
        with zipfile.ZipFile(kmz_path, 'r') as kmz:
            # List all files in the KMZ
            file_list = kmz.namelist()
            
            # Find the main KML file (usually doc.kml or the largest .kml file)
            kml_files = [f for f in file_list if f.endswith('.kml')]
            
            if not kml_files:
                raise ValueError("No KML files found in the KMZ archive")
            
            # Use the first KML file or doc.kml if it exists
            main_kml = 'doc.kml' if 'doc.kml' in kml_files else kml_files[0]
            
            # Extract and parse the KML content
            with kmz.open(main_kml) as kml_file:
                kml_content = kml_file.read().decode('utf-8')
                
            # Parse the XML
            root = ET.fromstring(kml_content)
            
            # Define KML namespace
            namespace = {'kml': 'http://www.opengis.net/kml/2.2'}
            
            # Extract column names from different sources
            column_names = extract_column_names(root, namespace)
            results['column_names'] = column_names
            
            # Count placemarks
            placemarks = root.findall('.//kml:Placemark', namespace)
            results['placemark_count'] = len(placemarks)
            
            # Extract folder information
            folders = root.findall('.//kml:Folder', namespace)
            results['folders'] = [get_element_text(folder, 'kml:name', namespace) for folder in folders]
            
            # Extract schema information
            schemas = root.findall('.//kml:Schema', namespace)
            results['schemas'] = extract_schema_info(schemas, namespace)
            
    except Exception as e:
        print(f"Error parsing KMZ file: {e}")
        return results
    
    return results

def extract_column_names(root: ET.Element, namespace: Dict[str, str]) -> Set[str]:
    """Extract column names from various sources in the KML."""
    column_names = set()
    
    # 1. Extract from Schema definitions
    schemas = root.findall('.//kml:Schema', namespace)
    for schema in schemas:
        simple_fields = schema.findall('.//kml:SimpleField', namespace)
        for field in simple_fields:
            name = field.get('name')
            if name:
                column_names.add(name)
    
    # 2. Extract from ExtendedData
    extended_data_elements = root.findall('.//kml:ExtendedData', namespace)
    for extended_data in extended_data_elements:
        # SchemaData approach
        schema_data_elements = extended_data.findall('.//kml:SchemaData', namespace)
        for schema_data in schema_data_elements:
            simple_data_elements = schema_data.findall('.//kml:SimpleData', namespace)
            for simple_data in simple_data_elements:
                name = simple_data.get('name')
                if name:
                    column_names.add(name)
        
        # Data elements approach
        data_elements = extended_data.findall('.//kml:Data', namespace)
        for data in data_elements:
            name = data.get('name')
            if name:
                column_names.add(name)
    
    # 3. Extract from description fields (common in Google Earth exports)
    descriptions = root.findall('.//kml:description', namespace)
    for desc in descriptions:
        if desc.text:
            # Look for HTML table headers or common patterns
            html_columns = extract_columns_from_html_description(desc.text)
            column_names.update(html_columns)
    
    return column_names

def extract_columns_from_html_description(description_text: str) -> Set[str]:
    """Extract potential column names from HTML descriptions."""
    column_names = set()
    
    # Look for table headers
    th_pattern = r'<th[^>]*>(.*?)</th>'
    th_matches = re.findall(th_pattern, description_text, re.IGNORECASE | re.DOTALL)
    for match in th_matches:
        clean_text = re.sub(r'<[^>]+>', '', match).strip()
        if clean_text:
            column_names.add(clean_text)
    
    # Look for common key-value patterns
    # Pattern like "Name: Value" or "Name = Value"
    kv_pattern = r'(?:^|\n|<br[^>]*>)\s*([^:=\n]+)[:=]\s*[^:\n]+'
    kv_matches = re.findall(kv_pattern, description_text, re.IGNORECASE | re.MULTILINE)
    for match in kv_matches:
        clean_text = re.sub(r'<[^>]+>', '', match).strip()
        if clean_text and len(clean_text) < 50:  # Reasonable field name length
            column_names.add(clean_text)
    
    return column_names

def extract_schema_info(schemas: List[ET.Element], namespace: Dict[str, str]) -> List[Dict[str, Any]]:
    """Extract detailed schema information."""
    schema_info = []
    
    for schema in schemas:
        schema_data = {
            'name': schema.get('name'),
            'id': schema.get('id'),
            'fields': []
        }
        
        simple_fields = schema.findall('.//kml:SimpleField', namespace)
        for field in simple_fields:
            field_info = {
                'name': field.get('name'),
                'type': field.get('type'),
                'displayName': get_element_text(field, 'kml:displayName', namespace)
            }
            schema_data['fields'].append(field_info)
        
        schema_info.append(schema_data)
    
    return schema_info

def get_element_text(element: ET.Element, tag: str, namespace: Dict[str, str]) -> str:
    """Safely get text content from an XML element."""
    found_element = element.find(tag, namespace)
    return found_element.text if found_element is not None and found_element.text else ""

def print_results(results: Dict[str, Any]):
    """Print the parsing results in a formatted way."""
    print("="*60)
    print("KMZ FILE ANALYSIS RESULTS")
    print("="*60)
    
    print(f"\nColumn Names Found ({len(results['column_names'])}):")
    print("-" * 40)
    if results['column_names']:
        for i, column in enumerate(sorted(results['column_names']), 1):
            print(f"{i:2d}. {column}")
    else:
        print("No column names found")
    
    print(f"\nPlacemarks: {results['placemark_count']}")
    
    if results['folders']:
        print(f"\nFolders ({len(results['folders'])}):")
        for folder in results['folders']:
            print(f"  - {folder}")
    
    if results['schemas']:
        print(f"\nSchemas ({len(results['schemas'])}):")
        for schema in results['schemas']:
            print(f"  - {schema['name']} (ID: {schema['id']})")
            for field in schema['fields']:
                print(f"    • {field['name']} ({field['type']})")

def main():
    """Main function to run the KMZ parser."""
    kmz_file_path = r"C:\Users\MIKE.MK-DESKTOP\Documents\Career\Data_Analytics\Projects\patterns_within_the_bible\data\geo\all.kmz"
    
    if not os.path.exists(kmz_file_path):
        print(f"Error: File '{kmz_file_path}' not found.")
        return
    
    if not kmz_file_path.lower().endswith('.kmz'):
        print("Warning: File doesn't have .kmz extension. Proceeding anyway...")
    
    print(f"Parsing KMZ file: {kmz_file_path}")
    results = parse_kmz_file(kmz_file_path)
    
    print_results(results)
    
    # Return just the column names as a list for easy use
    return sorted(list(results['column_names']))

Parsing KMZ file: C:\Users\MIKE.MK-DESKTOP\Documents\Career\Data_Analytics\Projects\patterns_within_the_bible\data\geo\all.kmz
KMZ FILE ANALYSIS RESULTS

Column Names Found (45):
----------------------------------------
 1. 1Chr <a href
 2. 1Kgs <a href
 3. 1Pet <a href
 4. 1Sam <a href
 5. 2Chr <a href
 6. 2Kgs <a href
 7. 2Sam <a href
 8. 2Tim <a href
 9. Acts <a href
10. Amos <a href
11. Col <a href
12. Dan <a href
13. Deut <a href
14. Esth <a href
15. Exod <a href
16. Ezek <a href
17. Ezra <a href
18. Gal <a href
19. Gen <a href
20. Hab <a href
21. Hos <a href
22. Isa <a href
23. Jer <a href
24. Job <a href
25. Joel <a href
26. John <a href
27. Josh <a href
28. Judg <a href
29. Lev <a href
30. Luke <a href
31. Mark <a href
32. Matt <a href
33. Mic <a href
34. Nah <a href
35. Neh <a href
36. Num <a href
37. Obad <a href
38. Ps <a href
39. Rev <a href
40. Rom <a href
41. Ruth <a href
42. Song <a href
43. Titus <a href
44. Zech <a href
45. Zeph <a href

Placemarks: 7141

Folders (1500

In [None]:
if __name__ == "__main__":
    column_names = main()
    print(f"\nColumn names list: {column_names}")

results = parse_kmz_file(r"C:\Users\MIKE.MK-DESKTOP\Documents\Career\Data_Analytics\Projects\patterns_within_the_bible\data\geo\all.kmz")
column_names = sorted(list(results['column_names']))
print(column_names)