In [1]:
import zipfile
import xml.etree.ElementTree as ET
import pandas as pd
from pathlib import Path

In [None]:
kmz_file = r"C:\Users\MIKE.MK-DESKTOP\Documents\Career\Data_Analytics\Projects\patterns_within_the_bible\data\geo\all.kmz"  # Update this path

In [12]:
def extract_kmz_data(kmz_file_path):
    """
    Extract location data from a KMZ file and return as a pandas DataFrame.
    
    Args:
        kmz_file_path (str): Path to the KMZ file
    
    Returns:
        pandas.DataFrame: Table with location data and all placemark tags
    """
    
    # Step 1: Extract KML from KMZ (which is a ZIP file)
    with zipfile.ZipFile(kmz_file_path, 'r') as kmz:
        # Find the main KML file (usually doc.kml or similar)
        kml_files = [f for f in kmz.namelist() if f.endswith('.kml')]
        if not kml_files:
            raise ValueError("No KML file found in the KMZ archive")
        
        # Use the first KML file found
        kml_content = kmz.read(kml_files[0])
    
    # Step 2: Parse the KML XML
    root = ET.fromstring(kml_content)
    
    # Handle KML namespace
    namespace = {'kml': 'http://www.opengis.net/kml/2.2'}
    
    # Step 3: Find all placemarks
    placemarks = root.findall('.//kml:Placemark', namespace)
    
    # Step 4: Collect all unique tag names across all placemarks
    all_tags = set()
    placemark_data = []
    
    # First pass: collect all unique tags
    for placemark in placemarks:
        for child in placemark:
            tag_name = child.tag.replace('{http://www.opengis.net/kml/2.2}', '')  # Remove namespace
            all_tags.add(tag_name)
    
    # Add standard columns
    all_tags.update(['folder_name', 'coordinates'])
    
    # Step 5: Build a map of placemarks to their parent folders
    folder_map = {}
    
    # Find all folders and their placemarks
    folders = root.findall('.//kml:Folder', namespace)
    for folder in folders:
        folder_name_elem = folder.find('kml:name', namespace)
        folder_name = folder_name_elem.text if folder_name_elem is not None else ""
        
        # Find all placemarks within this folder
        folder_placemarks = folder.findall('.//kml:Placemark', namespace)
        for pm in folder_placemarks:
            folder_map[pm] = folder_name
    
    # Step 6: Extract data from each placemark
    for placemark in placemarks:
        placemark_dict = {}
        
        # Initialize all tags with empty strings
        for tag in all_tags:
            placemark_dict[tag] = ""
        
        # Get folder name from the map
        placemark_dict['folder_name'] = folder_map.get(placemark, "")
        
        # Extract all tag values for this placemark
        for child in placemark:
            tag_name = child.tag.replace('{http://www.opengis.net/kml/2.2}', '')
            
            if tag_name == 'Point':
                # Extract coordinates from Point geometry
                coordinates = child.find('kml:coordinates', namespace)
                if coordinates is not None and coordinates.text:
                    coord_text = coordinates.text.strip()
                    # Parse longitude,latitude,altitude and keep longitude,latitude format
                    parts = coord_text.split(',')
                    if len(parts) >= 2:
                        longitude = parts[0].strip()
                        latitude = parts[1].strip()
                        placemark_dict['coordinates'] = f"{longitude},{latitude}"
            elif tag_name == 'LineString' or tag_name == 'Polygon':
                # Handle other geometry types - extract all coordinates
                coordinates = child.find('kml:coordinates', namespace)
                if coordinates is not None and coordinates.text:
                    coord_text = coordinates.text.strip()
                    # Parse multiple coordinates and keep longitude,latitude format
                    coord_pairs = []
                    # Split by whitespace and newlines to handle different formatting
                    coord_points = coord_text.replace('\n', ' ').split()
                    for point in coord_points:
                        if point.strip():  # Skip empty strings
                            parts = point.split(',')
                            if len(parts) >= 2:
                                longitude = parts[0].strip()
                                latitude = parts[1].strip()
                                coord_pairs.append(f"{longitude},{latitude}")
                    
                    if coord_pairs:
                        placemark_dict['coordinates'] = " ".join(coord_pairs)
            else:
                # Handle regular tags
                if child.text is not None:
                    placemark_dict[tag_name] = child.text
                elif child.tag == '{http://www.opengis.net/kml/2.2}ExtendedData':
                    # Handle ExtendedData specially
                    extended_data = []
                    for data in child.findall('kml:Data', namespace):
                        name = data.get('name', '')
                        value_elem = data.find('kml:value', namespace)
                        value = value_elem.text if value_elem is not None else ""
                        extended_data.append(f"{name}: {value}")
                    placemark_dict[tag_name] = "; ".join(extended_data)
        
        placemark_data.append(placemark_dict)
    
    # Step 7: Create DataFrame
    df = pd.DataFrame(placemark_data)
    
    # Reorder columns to put standard ones first
    standard_cols = ['folder_name', 'name', 'coordinates']
    other_cols = [col for col in df.columns if col not in standard_cols]
    column_order = [col for col in standard_cols if col in df.columns] + sorted(other_cols)
    
    return df[column_order]

def save_to_csv(df, output_path):
    """Save the DataFrame to a CSV file."""
    df.to_csv(output_path, index=False)
    print(f"Data saved to {output_path}")

def save_to_excel(df, output_path):
    """Save the DataFrame to an Excel file."""
    df.to_excel(output_path, index=False)
    print(f"Data saved to {output_path}")



In [13]:
# Example usage
if __name__ == "__main__":
    # Specify your KMZ file path
    #kmz_file = "path/to/your/file.kmz"  # Update this path
    
    try:
        # Extract data from KMZ file
        print("Extracting data from KMZ file...")
        data_df = extract_kmz_data(kmz_file)
        
        # Display basic info
        print(f"\nFound {len(data_df)} locations")
        print(f"Columns: {list(data_df.columns)}")
        
        # Display first few rows
        print("\nFirst 5 rows:")
        print(data_df.head())
        
        # Save to files
        output_csv = "kmz_locations.csv"
        output_excel = "kmz_locations.xlsx"
        
        save_to_csv(data_df, output_csv)
        save_to_excel(data_df, output_excel)
        
    except Exception as e:
        print(f"Error processing KMZ file: {e}")

Extracting data from KMZ file...

Found 7141 locations
Columns: ['folder_name', 'name', 'coordinates', 'LineString', 'Point', 'Polygon', 'description', 'styleUrl']

First 5 rows:
      folder_name                     name  \
0  Abana (Abanah)     Abana / Barada River   
1  Abana (Abanah)     Abana / Barada River   
2          Abarim  Abarim (50% confidence)   
3          Abarim  Abarim (40% confidence)   
4          Abarim  Abarim (30% confidence)   

                                         coordinates LineString Point Polygon  \
0  36.239828,33.540154 36.238717,33.539496 36.238...                            
1                                   36.305,33.513542                            
2                                                                               
3                                                                               
4                                                                               

  description                   styleUrl  
0             

In [14]:
data_df

Unnamed: 0,folder_name,name,coordinates,LineString,Point,Polygon,description,styleUrl
0,Abana (Abanah),Abana / Barada River,"36.239828,33.540154 36.238717,33.539496 36.238...",,,,,#water
1,Abana (Abanah),Abana / Barada River,"36.305,33.513542",,,,,#waterrepresentativepoint
2,Abarim,Abarim (50% confidence),,,,,,#landisobands
3,Abarim,Abarim (40% confidence),,,,,,#landisobands
4,Abarim,Abarim (30% confidence),,,,,,#landisobands
...,...,...,...,...,...,...,...,...
7136,Zuph,Zuph / about 10 km around Nabi Samwil,"35.18015,31.832733",,,,about 10 km around Ramathaim-zophim,#landrepresentativepoint
7137,Zuph,Zuph / about 10 km around Ramat Rahel,,,,,about 10 km around Ramathaim-zophim,#region
7138,Zuph,Zuph / about 10 km around Ramat Rahel,"35.216896,31.739901",,,,about 10 km around Ramathaim-zophim,#landrepresentativepoint
7139,Zuph,Zuph / about 10 km around Al Bira,,,,,about 10 km around Ramathaim-zophim,#region


In [9]:
data_df.LineString.values

array(['', '', '', ..., '', '', ''], dtype=object)