# EGMS DATA DOWNLOADER for BUILDING ANOMALY DETECTION

## Load Dependencies

In [1]:
import os
import osmnx as ox
import json
import zipfile
import uuid
import requests
import shutil
import glob
import pickle
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
from io import BytesIO
from joblib import Parallel, delayed
from shapely.geometry import Point, Polygon, box
from shapely.ops import unary_union
from shapely.wkt import dumps, loads
from tqdm import tqdm
import warnings
from pyrosm import get_data, OSM
import concurrent.futures

# Configure inline plotting for Jupyter Notebook
%matplotlib inline

# Suppress specific warnings from libraries
warnings.filterwarnings("ignore", category=UserWarning)


In [5]:

# Assuming your JSON data is in a file named 'data.json'
input_file_path = 'data/insar_data_download_updated.json'
output_file_path = 'data/insar_data_download_updated2.json'

# Load the JSON data from a file
with open(input_file_path, 'r') as file:
    data = json.load(file)

# Function to add a bbox attribute and an empty intersection to each zip file URL in the JSON data
def add_bbox_and_intersection_to_urls(data):
    for country, cities in data.items():
        for city, directions in cities.items():
            for direction, urls in directions.items():
                new_url_entries = []
                for url in urls:
                    new_url_entries.append({
                        "url": url,
                        "bbox": []  # Initialize bbox as empty list
                    })
                directions[direction] = new_url_entries
            # Initialize an empty Intersection key at the same level as Ascending/Descending
            directions['Intersection'] = ""  # Initialize intersection as an empty string

# Modify the data
add_bbox_and_intersection_to_urls(data)

# Save the modified data back to a new JSON file
with open(output_file_path, 'w') as file:
    json.dump(data, file, indent=4)

print("Updated JSON has been saved to", output_file_path)


Updated JSON has been saved to data/insar_data_download_updated2.json


## Download and Extract InSAR Data

In [8]:
def download_and_extract_insar_data(base_dir, json_config_path, download_id):
    
    def _download_file(url, filename, extract_dir, base_name):
        
        def _is_valid_download(extract_dir, base_name):
            subfolder_path = os.path.join(extract_dir, base_name)
            if os.path.exists(subfolder_path) and os.path.isdir(subfolder_path):
                xml_file = os.path.join(subfolder_path, base_name + ".xml")
                csv_file = os.path.join(subfolder_path, base_name + ".csv")
                return os.path.exists(xml_file) and os.path.exists(csv_file) and \
                       os.path.getsize(xml_file) > 0 and os.path.getsize(csv_file) > 0
            return False
        
        if _is_valid_download(extract_dir, base_name):
            progress.set_postfix_str(f"Files for {base_name} already downloaded and valid. Skipping...")
            return "Skipped"

        try:
            response = requests.get(url, stream=True)
            with open(filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
            if os.path.getsize(filename) < 1024:
                with open(filename, 'r') as file:
                    content = file.read()
                if "valid download token" in content:
                    os.remove(filename)
                    return False
            return True
        
        except requests.RequestException as e:
            tqdm.write(f"Error downloading {url}: {e}")
            if os.path.exists(filename):
                os.remove(filename)
            return False
    
    def _extract_to_subfolder(zip_file_name, extract_dir):
        if not os.path.exists(zip_file_name):
            tqdm.write(f"No zip file to extract: {zip_file_name}")
            return False
        base_name = os.path.splitext(os.path.basename(zip_file_name))[0]
        subfolder_path = os.path.join(extract_dir, base_name)
        os.makedirs(subfolder_path, exist_ok=True)
        with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
            zip_ref.extractall(subfolder_path)
        return True
    
    with open(json_config_path, 'r') as file:
        json_data = json.load(file)

    #total_urls = sum(len(urls) for _, cities in json_data.items() for _, directions in cities.items() for _, urls in directions.items())
    #total_urls = sum(len(direction['urls']) for _, cities in json_config.items() for _, directions in cities.items() for direction in directions.values())
    total_urls = sum(len(urls) for cities in json_data.values() for directions in cities.values() for key, urls in directions.items() if key != 'Intersection')

    progress = tqdm(total=total_urls, desc='Overall Progress')

    os.makedirs(base_dir, exist_ok=True)
    for country, cities in json_data.items():
        country_dir = os.path.join(base_dir, country)
        os.makedirs(country_dir, exist_ok=True)
        for city, directions in cities.items():
            city_dir = os.path.join(country_dir, city)
            os.makedirs(city_dir, exist_ok=True)
            for direction, urls in directions.items():
                if 'Intersection' in direction: continue
                direction_dir = os.path.join(city_dir, direction)
                os.makedirs(direction_dir, exist_ok=True)
                for url_entry in urls:
                    url = url_entry['url']  # Access the URL from the new JSON structure
                    url_with_id = f"{url}?id={download_id}"
                    zip_file_name = os.path.join(direction_dir, os.path.basename(url).split('?')[0])
                    base_name = os.path.splitext(os.path.basename(zip_file_name))[0]
                    download_result = _download_file(url_with_id, zip_file_name, direction_dir, base_name)
                    if download_result == True:
                        if not _extract_to_subfolder(zip_file_name, direction_dir):
                            os.remove(zip_file_name)
                    progress.update(1)

    progress.close()
    tqdm.write("Download and extraction complete.")

In [9]:
data_root = "data/insar_downloads"  
json_file_path = 'data/insar_data_download_updated.json'
    
download_id = "e3b17ad10e024423b43c509624a05463"
download_and_extract_insar_data(data_root, json_file_path, download_id)

Overall Progress: 100%|██████████| 72/72 [00:00<00:00, 1221.23it/s, Files for EGMS_L2a_007_0850_IW1_VV already downloaded and valid. Skipping...]

Download and extraction complete.





## Convert Downloaded InSAR CSV Files into GeoPackage Files

In [135]:
import os
import json
import geopandas as gpd
import pandas as pd
from tqdm import tqdm

def convert_csv_to_geopackage(data_root, json_config_path):
    # Load JSON configuration
    with open(json_config_path, 'r') as file:
        json_config = json.load(file)
        #print(json_config)

    total_iterations = sum(len(urls) for _, cities in json_config.items() for _, directions in cities.items() for urls in directions.values())

    progress = tqdm(total=total_iterations, desc='Overall Progress')

    for country, cities in json_config.items():
        for city, directions in cities.items():
            for direction, url_entries in directions.items():
                for url_entry in url_entries:
                    url = url_entry['url']
                    bbox = url_entry['bbox']  # This will store the bounding box
                    folder_name = os.path.basename(url).split('.')[0]
                    folder_path = os.path.join(data_root, country, city, direction, folder_name)

                    if not bbox:  # If bbox is empty, processing is needed
                        if os.path.exists(folder_path) and os.path.isdir(folder_path):
                            for filename in os.listdir(folder_path):
                                if filename.endswith(".csv"):
                                    file_without_extension = os.path.splitext(filename)[0]
                                    file_path_without_extension = os.path.join(folder_path, file_without_extension)
                                    try:
                                        gdf = read_csv_or_geopackage_to_geopandas(file_path_without_extension)
                                        if not gdf.empty:
                                            # Successfully converted and not previously saved bbox
                                            bbox = gdf.total_bounds  # Get the bounding box of the GeoDataFrame
                                            url_entry['bbox'] = bbox.tolist()  # Update the bbox in json data
                                            # Save the updated JSON configuration back to the file
                                            with open(json_config_path, 'w') as f:
                                                json.dump(json_config, f, indent=4)
                                        progress.set_postfix_str(f"Converted {filename} in {city}/{direction}")
                                    except FileNotFoundError as e:
                                        progress.set_postfix_str(str(e))
                        # Update progress after each URL is processed
                    else:
                        progress.set_postfix_str(f"Skipped {folder_name} in {city}/{direction}")
                    progress.update(1)

    progress.set_postfix_str(f"Complete")
    progress.close()


def read_csv_or_geopackage_to_geopandas(file_name,verbose=False):
    """
    Reads a GeoJSON or CSV file and converts it to a GeoDataFrame.

    Parameters:
        file_name (str): The name of the file without the file extension.

    Returns:
        gdf (GeoDataFrame): The GeoDataFrame containing the data from the file.

    Raises:
        FileNotFoundError: If no GeoJSON or CSV file is found with the given file name.
    """
    def _displacement2float32(displacement_str):
        """Converts  displacements to float32 precision."""
        # Convert the string to a dictionary
        displacement_dict = json.loads(displacement_str)
        # Convert each value in the dictionary to float32
        return {date: round(value, 3) for date, value in displacement_dict.items()}

    geo_path = f"{file_name}.geojson"
    csv_path = f"{file_name}.csv"

    # TODO: activate it later, it was deactivated to overwrite everything once.
    # Check if GeoJSON file exists
    #if os.path.exists(geo_path):
    #    if verbose: print("File is loading: {} ...".format(geo_path))
    #    gdf = gpd.read_file(geo_path, engine="pyogrio")
    # Check if CSV file exists
    if os.path.exists(csv_path):
        if verbose: print("File is loading: {} ...".format(csv_path))
        df = pd.read_csv(csv_path,engine='c')

        # Define the target columns for the GeoDataFrame
        target_cols = ['latitude', 'longitude']
        displacements_cols = df.columns[df.columns.str.isdigit()].tolist()

        # Create a new column 'displacements' containing dictionaries of non-null values from date columns
        df['displacements'] = df[displacements_cols].apply(lambda x: x.dropna().to_dict(), axis=1)
        
        # Drop the individual date columns
        df.drop(columns=displacements_cols, inplace=True)
        
        df['displacements'] = df['displacements'].apply(_displacement2float32)

        # Create a GeoDataFrame from the DataFrame
        geometry = gpd.points_from_xy(df['longitude'], df['latitude'])
        gdf = gpd.GeoDataFrame(df, geometry=geometry)
        # Convert the GeoDataFrame to a GeoJSON file
        gdf.to_file(geo_path, driver='GeoJSON')
    else:
        raise FileNotFoundError(f"No GeoJSON or CSV file found for '{file_name}'")

    return gdf


In [None]:
data_root = "data/insar_downloads"  
json_file_path = 'data/insar_data_download_updated.json'
    
convert_csv_to_geopackage(data_root, json_file_path)

## Download Eubucco Building Footprints for Cities

In [2]:
def download_and_extract_eubucco_for_countries(data_root, json_config_path, osm_type_path):
    def _find_gpkg_in_directory(directory):
        """Helper function to find the first .gpkg file in a specified directory."""
        for file in os.listdir(directory):
            if file.endswith(".gpkg"):
                return os.path.join(directory, file)
        return None  # Return None if no .gpkg file is found

    # Load the JSON configuration
    with open(json_config_path, 'r') as file:
        json_config = json.load(file)

    # First, get the list of countries from the EUBUCCO API
    eubucco_api_url = "https://api.eubucco.com/v0.1/countries"
    response = requests.get(eubucco_api_url)
    if response.status_code != 200:
        print("Failed to retrieve country list from EUBUCCO")
        return

    available_countries = response.json()
    available_country_names = {country['name'].lower(): country for country in available_countries}

    # Create the root directory if it doesn't exist
    os.makedirs(data_root, exist_ok=True)

    # Initialize tqdm progress bar
    pbar = tqdm(total=len(json_config), desc="Downloading and extracting data", unit="country")
    
    # Loop through the countries specified in the JSON config
    for country, cities in json_config.items():
        extract_dir = os.path.join(data_root, country)
        gpkg_path = _find_gpkg_in_directory(extract_dir)
        if gpkg_path and os.path.exists(gpkg_path):
            pbar.set_postfix_str(f"{country} GPKG already exists, skipping download.")
        else:
            country_lower = country.lower()
            if country_lower in available_country_names:
                # Get the download link for the country's GPKG file
                gpkg_url = available_country_names[country_lower]['gpkg']['download_link']
                zip_file_path = os.path.join(extract_dir, 'eubucco_country.zip')

                # Create directory for the country
                os.makedirs(extract_dir, exist_ok=True)

                # Download the GPKG zip file
                zip_response = requests.get(gpkg_url, stream=True)
                if zip_response.status_code == 200:
                    downloaded = 0
                    with open(zip_file_path, 'wb') as f:
                        for chunk in zip_response.iter_content(chunk_size=1024*1024):  # Larger chunk size
                            f.write(chunk)
                            downloaded += len(chunk)
                            pbar.set_postfix_str(f"Downloading {country}: {downloaded / 1024**2:.2f} MB")

                    # Extract the GPKG file from the ZIP
                    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                        zip_ref.extractall(extract_dir)
                        pbar.set_postfix_str(f"Extracting {country}")

                    # Optionally remove the ZIP file after extraction
                    os.remove(zip_file_path)
                    pbar.set_postfix_str(f"Completed {country}")
                else:
                    pbar.set_postfix_str(f"Failed to download GPKG for {country}")
            else:
                pbar.set_postfix_str(f"{country} not available in EUBUCCO. OSM data is to be downloaded ...")
                fp = get_data(country,directory=extract_dir)
                osm = OSM(fp)
                buildings = osm.get_buildings()
                
                type_mapping = pd.read_csv(osm_type_path)
                # Map 'type_source' to 'type_db' using a dictionary for faster lookup
                type_dict = pd.Series(type_mapping['type_db'].values, index=type_mapping['type_source']).to_dict()
    
                buildings=clean_buildings_osm(buildings, type_dict)
                buildings=buildings.to_crs(epsg=3035)
                buildings.to_file(os.path.join(extract_dir,f'{country}.gpkg'),driver='GPKG')
                

        # Update tqdm progress bar after finishing or skipping a country
        pbar.update(1)

    pbar.set_postfix_str("Complete")
    pbar.close()

    

def clean_buildings_osm(osm_gdf, type_dict):
    def _replace_nan_with_uuid(x):
        if pd.isna(x):
            return uuid.uuid4().hex
        else:
            return x
        
    def _add_floor_as_height(df, floor_height):
        return [x if str(x) != 'nan' else y * floor_height for x, y in zip(df.height, df.floors)]

        
    # List of columns to clean
    columns_to_clean = ['height', 'building:levels', 'start_date']
    
    # Cleaning each specified column if it exists
    for col in columns_to_clean:
        if col in osm_gdf.columns:
            osm_gdf[col] = osm_gdf[col].replace({None: np.nan, '': np.nan})
            osm_gdf[col] = osm_gdf[col].astype(str).str.extract("(\\d+\\.?(\\d+)?)")[0].astype(float)  

    # Check if type info is present and replace 'yes' with None
    osm_gdf.replace({'yes': None}, inplace=True)

    # Determine 'type_source' based on available columns
    def get_type_source(row):
        for col in ['building', 'building:use', 'amenity']:
            if col in row.index and pd.notna(row[col]) and row[col] != '':
                return row[col]
        return None

    # Apply the function across rows to determine 'type_source'
    osm_gdf['type_source'] = osm_gdf.apply(get_type_source, axis=1)
    
    # Map 'type_source' to 'type'
    osm_gdf['type'] = osm_gdf['type_source'].map(type_dict).fillna('unknown')
    
    # Rename columns if they exist
    if 'osmid' in osm_gdf.columns:
        osm_gdf.rename(columns={'osmid': 'id'}, inplace=True)
    if 'start_date' in osm_gdf.columns:
        osm_gdf.rename(columns={'start_date': 'age'}, inplace=True)
    if 'building:levels' in osm_gdf.columns:
        osm_gdf.rename(columns={'building:levels': 'floors'}, inplace=True)
        
    # Add heights using floor count if applicable
    if 'floors' in osm_gdf.columns and 'height' in osm_gdf.columns:
        osm_gdf['height'] = _add_floor_as_height(osm_gdf, floor_height=2.5)  # assuming floor height is 2.5 meters

        
        
    # Define the target columns to include in the filtered GeoDataFrame
    target_cols = ['id','height','age','type','type_source','geometry']
    osm_gdf = osm_gdf.loc[:,target_cols]
    
    osm_gdf = osm_gdf[osm_gdf.geometry.type.isin(['Polygon', 'MultiPolygon'])]
    osm_gdf = osm_gdf[~osm_gdf["geometry"].geom_type.isin(["LineString", "Point"])]
    
    osm_gdf = osm_gdf.drop_duplicates().reset_index(drop=True)
    osm_gdf['id'] = osm_gdf['id'].apply(_replace_nan_with_uuid)
    

    return osm_gdf

In [3]:
data_root = "data/insar_downloads"  
json_file_path = 'data/insar_data_download_updated.json'
osm_type_path = 'data/osm_type_matches.csv'

download_and_extract_eubucco_for_countries(data_root, json_file_path,osm_type_path)

Downloading and extracting data:  56%|█████▌    | 5/9 [00:00<00:00, 1337.47country/s, Great Britain not available in EUBUCCO. OSM data is to be downloaded ...]

Downloaded Protobuf data 'great-britain-latest.osm.pbf' (1667.83 MB) to:
'/p/scratch/hai_eo_hype/kuzu/building_anomaly/building_anomaly_detection/data/insar_downloads/Great Britain/great-britain-latest.osm.pbf'


Downloading and extracting data: 100%|██████████| 9/9 [1:11:31<00:00, 476.87s/country, Complete]                                                                


## Prepare District Building Footprint Maps

In [34]:
def prepare_building_footprints_for_selected_ROIs(data_root, json_config_path):
    
    def _calculate_intersection(ascending, descending):
        union_ascending = unary_union([box(*url['bbox']) for url in ascending if 'bbox' in url])
        union_descending = unary_union([box(*url['bbox']) for url in descending if 'bbox' in url])
        intersection=union_ascending.intersection(union_descending)
        return intersection
    
    def _find_gpkg_in_directory(directory):
        """Helper function to find the first .gpkg file in a specified directory."""
        for file in os.listdir(directory):
            if file.endswith(".gpkg"):
                return os.path.join(directory, file)
        return None  # Return None if no .gpkg file is found

    # Load the JSON configuration
    with open(json_config_path, 'r') as file:
        json_config = json.load(file)
    
    # Determine the total number of cities to process for setting up the tqdm progress bar
    total_cities = sum(len(cities) for _, cities in json_config.items())

    # Initialize tqdm with the total number of cities
    pbar = tqdm(total=total_cities, desc="Processing Cities")
    
    # Iterate through each country and its cities in the JSON configuration
    for country, cities in json_config.items():
        
        for city, attributes in cities.items():
            # Update tqdm description for current city
            pbar.set_description(f"Processing {city} in {country}")
            
            if 'Intersection' in attributes and attributes['Intersection']:  # Skip if intersection is already calculated
                 pbar.set_postfix_str(f"Skipped footprints for {city} in {country}. (Already extracted!)")
                
            # Calculate intersection if not already calculated
            elif 'Intersection' not in attributes or not attributes['Intersection']:
                intersection_area = _calculate_intersection(attributes['Ascending'], attributes['Descending'])
                mask_polygon = gpd.GeoSeries([intersection_area], crs='EPSG:4326')

                # Construct the full path to the GeoPackage file
                gpkg_file=_find_gpkg_in_directory(os.path.join(data_root, country))

                # Save the filtered GeoDataFrame under each city folder as building_footprint.gpkg
                output_folder = os.path.join(data_root, country, city)
                if not os.path.exists(output_folder):
                    os.makedirs(output_folder)
                output_path = os.path.join(output_folder, 'footprint_buildings.gpkg')
                    
                # Check if the GeoPackage file is available
                if gpkg_file and os.path.exists(gpkg_file):
                    # Load the GeoPackage file with a mask
                    pbar.set_postfix_str(f"Reading footprints for {city} in {country}.")
                    
                    geometry = mask_polygon.to_crs(epsg=3035).iloc[0]
                    df = gpd.read_file(gpkg_file, mask=geometry).to_crs(epsg=4326)

                    df.to_file(output_path, driver="GPKG")
                    
                    attributes['Intersection'] = dumps(intersection_area)  # Save as WKT
                    with open(json_config_path, 'w') as f:
                        json.dump(json_config, f, indent=4)
                        pbar.set_postfix_str(f"Processed and saved footprints for {city} in {country}.")
                    
                else:
                    # TODO: Implement additional error handling or alternative actions here
                    tqdm.write(f'No building footprint info available for {city}/{country}')
                          

            # Update the progress bar after processing each city
            pbar.update(1)
    
    # Close the tqdm progress bar after all processing is complete
    pbar.close()




In [35]:
data_root = "data/insar_downloads"  
json_file_path = 'data/insar_data_download_updated.json'
prepare_building_footprints_for_selected_ROIs(data_root, json_file_path)

Processing Athens in Greece: 100%|██████████| 11/11 [00:00<00:00, 1394.97it/s, Skipped footprints for Athens in Greece. (Already extracted!)]               


## Find Points within Buildings

In [2]:
def find_points_within_buildings(json_config_path, data_root):
    
    def _process_points_within_building(building, points, folder_name, city_dir, building_id, origin, pbar=None):
        
        def _displacement2float32(displacement_str): #TODO this is temporary, later remove it
            """Converts  displacements to float32 precision."""
            # Convert the string to a dictionary
            displacement_dict = json.loads(displacement_str)
            # Convert each value in the dictionary to float32
            return {date: round(value, 3) for date, value in displacement_dict.items()}

        #points=geo_state_points[geo_state_points.geometry.within(building.geometry)]
        if not points.empty:
            
            output_dir = os.path.join(city_dir, f'buildings_{folder_name.lower()}')
            os.makedirs(output_dir, exist_ok=True)
            output_file = os.path.join(output_dir, f"{building_id}_points.geojson")

            # Add additional information to the points
            points = points.copy()
            points['b_id'] = building.id
            points['b_height'] = building.height
            points['b_age'] = building.age
            points['b_type'] = building.type
            points['b_type_source'] = building.type_source
            points['origin'] = origin  # add the origin attribute
            points['displacements'] = points['displacements'].apply(_displacement2float32)

            # Check if the file already exists
            if os.path.exists(output_file):
                # Read the existing GeoJSON file
                existing_points = gpd.read_file(output_file)
                # Append new points to the existing points
                combined_points = gpd.GeoDataFrame(pd.concat([existing_points, points], ignore_index=True), crs=existing_points.crs)
                # Remove duplicates based on the geometry column
                combined_points = combined_points.drop_duplicates(subset=['geometry'])
            else:
                combined_points = points

            # Save or overwrite the GeoJSON file with updated points
            combined_points.to_file(output_file, driver='GeoJSON')
            if pbar:
                pbar.update(1)
                pbar.set_postfix(file=f"Saved {building_id}", refresh=True)
            
    with open(json_config_path, 'r') as file:
        json_config = json.load(file)

    for country, cities in json_config.items():
        for city, attributes in cities.items():
            city_dir = os.path.join(data_root, country, city)
            building_file = os.path.join(city_dir, 'footprint_buildings.gpkg')
            if not os.path.exists(building_file):
                print(f"No building footprint file found for {city} in {country}")
                continue

            buildings = gpd.read_file(building_file, engine="pyogrio")
            intersection_area = loads(attributes['Intersection'])
            
            # Create a map from building ID to index for quick lookup
            building_index_map = {building.id: idx for idx, building in enumerate(buildings.itertuples())}


            for direction in ['Ascending', 'Descending']:
                file_list = attributes.get(direction, [])
                for file_info in file_list:
                    
                    url = file_info['url']
                    origin = os.path.basename(url).split('.')[0]
                    folder_path = os.path.join(city_dir, direction, origin)
                    points_file = os.path.join(folder_path, f"{origin}.geojson")
                    
                    processed = file_info.get('processed')
                    if processed == 'complete':
                        tqdm.write(f"Skipping {origin} points in buildings in {city}/{direction} (Already processed!)")
                        continue  # Skip already processed files

                    tqdm.write(f"Processing file {origin} for {city}/{direction}...")
                    geo_state_points = gpd.read_file(points_file, engine="pyogrio", bbox=box(*intersection_area.bounds))

                    starting_index = building_index_map.get(processed, 0)
                    with tqdm(buildings.itertuples(), total=len(buildings), desc=f"Processing {origin} points in buildings in {city}/{direction}") as pbar:
                        
                        #with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
                        #    futures = [executor.submit(_process_points_within_building, building, geo_state_points, direction, city_dir, building.id, origin, pbar) 
                        #               for building in buildings.itertuples() if building.Index >= starting_index]

                        
                        for building in pbar:
                            if pbar.n < starting_index:
                                continue  # Skip buildings until reaching the starting index
                            points_within_building = geo_state_points[geo_state_points.geometry.within(building.geometry)]
                            _process_points_within_building(building, points_within_building, direction, city_dir, building.id, origin)
                            file_info['processed'] = building.id
                            with open(json_config_path, 'w') as file:
                                json.dump(json_config, file, indent=4)


                    # Update the file info to mark as processed
                    file_info['processed'] = 'complete'
                    geo_state_points=None
                    # Write the updated JSON configuration back to the file
                    with open(json_config_path, 'w') as file:
                        json.dump(json_config, file, indent=4)
            
            buildings=None


In [None]:
# Example usage
data_root = "data/insar_downloads"  
json_file_path = 'data/insar_data_download_updated.json'
find_points_within_buildings(json_file_path, data_root)

Skipping EGMS_L2a_146_0238_IW1_VV points in buildings in Pertusillo/Ascending (Already processed!)
Skipping EGMS_L2a_124_0835_IW1_VV points in buildings in Pertusillo/Descending (Already processed!)
Skipping EGMS_L2a_124_0835_IW2_VV points in buildings in Pertusillo/Descending (Already processed!)
Skipping EGMS_L2a_124_0836_IW1_VV points in buildings in Pertusillo/Descending (Already processed!)
Skipping EGMS_L2a_117_0246_IW2_VV points in buildings in Rome/Ascending (Already processed!)
Skipping EGMS_L2a_117_0247_IW2_VV points in buildings in Rome/Ascending (Already processed!)
Skipping EGMS_L2a_095_0825_IW1_VV points in buildings in Rome/Descending (Already processed!)
Skipping EGMS_L2a_095_0826_IW1_VV points in buildings in Rome/Descending (Already processed!)
Skipping EGMS_L2a_044_0284_IW1_VV points in buildings in Munich/Ascending (Already processed!)
Skipping EGMS_L2a_044_0285_IW1_VV points in buildings in Munich/Ascending (Already processed!)
Skipping EGMS_L2a_044_0286_IW1_VV poi

Processing EGMS_L2a_146_0311_IW1_VV points in buildings in Berlin/Ascending: 100%|█████████▉| 1362044/1362842 [28:52<00:31, 25.07it/s]    

Processing file EGMS_L2a_146_0311_IW2_VV for Berlin/Ascending...


Processing EGMS_L2a_146_0311_IW2_VV points in buildings in Berlin/Ascending:   1%|          | 16623/1362842 [1:09:11<90:38:11,  4.13it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Processing EGMS_L2a_146_0311_IW2_VV points in buildings in Berlin/Ascending:   5%|▍         | 63444/1362842 [4:19:58<87:05:32,  4.14it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Processing EGMS_L2a_146_0311_IW2_VV points in buildings in Berlin/Ascending:   8%|▊       

## Experimental

### Getting building footprints for the region of interest

In [137]:
def get_buildings_by_geometry(base_dir, geometry, osm_type_path):
    def _replace_nan_with_uuid(x):
        if pd.isna(x):
            return uuid.uuid4().hex
        else:
            return x
    """
    Retrieves and filters building geometries based on specified geometry using a tiling approach,
    applies a buffer, and writes the filtered data to a GeoJSON file.

    Args:
        geometry (Polygon): The polygon geometry within which to filter buildings.
        file_name (str): Name of the GeoJSON file to save the filtered buildings.
        basedir (str): Directory path where the data files are stored.

    Returns:
        GeoDataFrame: Filtered building geometries as a GeoDataFrame.
    """
    
    type_mapping = pd.read_csv(osm_type_path)
    # Map 'type_source' to 'type_db' using a dictionary for faster lookup
    type_dict = pd.Series(type_mapping['type_db'].values, index=type_mapping['type_source']).to_dict()
    
    
    
    # Get the bounding box of the geometry
    minx, miny, maxx, maxy = geometry.bounds
        
    width = maxx - minx
    height = maxy - miny

        # Calculate the desired width and height of each rectangle
    desired_width = 0.125
    desired_height = 0.125

        # Calculate the number of rows and columns needed to split the bbox into rectangles
    num_rows = int(height / desired_height)
    num_cols = int(width / desired_width)

        # Calculate the actual width and height of each rectangle based on the number of rows and columns
    actual_width = width / num_cols
    actual_height = height / num_rows
        

        # Create a grid of bounding boxes within the geometry
    all_buildings = []
        
        # Create the temporary directory path
    temp_dir = os.path.join(base_dir, "temp")

        # Create the temporary directory if it doesn't exist
    os.makedirs(temp_dir, exist_ok=True)
    #print("Temporary Directory Created: {}".format(temp_dir))
        
    rectangles = []
    for i in range(num_rows):
        for j in range(num_cols):
                # Calculate the bounds of the smaller bbox
            rect_minx = minx + (j * actual_width) - (actual_width * 0.01)
            rect_miny = miny + (i * actual_height) - (actual_height * 0.01)
            rect_maxx = rect_minx + actual_width + (actual_width * 0.02)
            rect_maxy = rect_miny + actual_height + (actual_height * 0.02)
                
            polygon = Polygon([(rect_minx, rect_miny), (rect_maxx, rect_miny), (rect_maxx, rect_maxy), (rect_minx, rect_maxy)])
                
            if polygon.intersects(geometry):
                rectangles.append(polygon)
        
    for index, polygon in tqdm(enumerate(rectangles), total=len(rectangles), desc='Processing'):
        temp_file = os.path.join(temp_dir, "{}_temp.gpkg".format(index))
        if os.path.exists(temp_file):
            temp_buildings = gpd.read_file(temp_file)
            all_buildings.append(temp_buildings)
        
                
        else:
            temp_file = os.path.join(temp_dir, "{}_temp.gpkg".format(index))
                # Check if the bbox intersects with the input geometry
            if polygon.intersects(geometry):
                    # Fetch buildings from osmnx within this bbox
                    #try:
                temp_buildings = ox.features_from_polygon(polygon, tags={'building': True})
                
                
               # print(temp_buildings.columns)
               # with open('temp.txt', 'w') as file:
               #     file.write('\n'.join(temp_buildings.columns.astype(str).tolist()))
               #     temp_buildings.to_csv('output.csv', index=True)
                    
                temp_buildings = temp_buildings[temp_buildings.geometry.type.isin(['Polygon', 'MultiPolygon'])]
                temp_buildings = temp_buildings[~temp_buildings["geometry"].geom_type.isin(["LineString", "Point"])]
                temp_buildings = temp_buildings.droplevel("element_type")
                    
                temp_buildings=clean_buildings_osm(temp_buildings, type_dict)
                
                
                if not temp_buildings.empty:
                    temp_buildings.to_file(temp_file, driver='GPKG')	
                    all_buildings.append(temp_buildings)
                    #except Exception as e:
                    #print(f"Failed to retrieve data for polygon ({polygon}): {str(e)}")

        # Buffer the input geometry and filter buildings
    buffer_distance = 10  # meters
    buffered_geometry = geometry.buffer(buffer_distance)
    all_buildings = all_buildings[all_buildings.intersects(buffered_geometry)]

        # Clean up the data
    all_buildings=pd.concat(all_buildings)
    all_buildings = all_buildings.drop_duplicates().reset_index(drop=True)
    all_buildings['id'] = all_buildings['id'].apply(_replace_nan_with_uuid)
        #shutil.rmtree(temp_dir)
        
    return all_buildings

#data_root = "data/insar_downloads"  
#json_file_path = 'data/insar_data_download_updated.json'        
#results = merge_and_clip_geopackages_for_all_cities(data_root, json_file_path)
#print(results)


### Merge ascending and descending tiles

In [None]:


def merge_and_clip_geopackages_for_all_cities(data_root, json_config_path):
    
    def _calculate_intersection(ascending, descending):
        union_ascending = unary_union([box(*url['bbox']) for url in ascending if 'bbox' in url])
        union_descending = unary_union([box(*url['bbox']) for url in descending if 'bbox' in url])
        intersection=union_ascending.intersection(union_descending)
        return intersection

    
    def _process_and_store_gdf(data_root, country, city_name, direction, files, intersection_area):
        merged_gdf = gpd.GeoDataFrame()
        for file in files:
            url = file['url']
            folder_name = os.path.basename(url).split('.')[0]
            folder_path = os.path.join(data_root, country, city_name, direction, folder_name)
            file_path_without_extension = os.path.join(folder_path, folder_name)

            # Load data using the intersection area as a mask
            progress.set_postfix_str(f"Reading {folder_name} in {city_name}/{direction}")
            gdf = gpd.read_file(f"{file_path_without_extension}.geojson", engine="pyogrio", bbox = box(*intersection_area.bounds))
             
            # Concatenate to the merged GeoDataFrame
            merged_gdf = pd.concat([merged_gdf, gdf[gdf.intersects(intersection_area)]], ignore_index=True)
            progress.update(1)
           
        # Save clipped GeoJSON files
        clipped_file_path = os.path.join(data_root, country, city_name, f"{direction.lower()}_clipped.geojson")
        if not merged_gdf.empty:
            merged_gdf.to_file(clipped_file_path, driver='GeoJSON')
            
        # Clear memory
        merged_gdf = None

        return clipped_file_path
    
   
    # Load the JSON configuration
    with open(json_config_path, 'r') as file:
        json_config = json.load(file)

    # Set up progress tracking
    results=[]
    #total_iterations = sum(len(urls) for cities in json_config.values() for directions in cities.values() for urls in directions.values())
    total_iterations = sum(len(urls) for cities in json_config.values() for directions in cities.values() for key, urls in directions.items() if key != 'Intersection')

    progress = tqdm(total=total_iterations, desc='Processing Cities')
    
    for country, cities in json_config.items():
        for city_name, directions in cities.items():
            if 'Intersection' in directions and directions['Intersection']:  # Skip if intersection is already calculated
                progress.update(len(directions['Ascending']) + len(directions['Descending']))
                continue
                
            # Calculate intersection if not already calculated
            if 'Intersection' not in directions or not directions['Intersection']:
                intersection_area = _calculate_intersection(directions['Ascending'], directions['Descending'])
                directions['Intersection'] = dumps(intersection_area)  # Save as WKT
                
            else:
                intersection_area = loads(directions['Intersection'])

            # Process Ascending
            ascending_result = _process_and_store_gdf(data_root, country, city_name, 'Ascending', directions['Ascending'], intersection_area)
            descending_result = _process_and_store_gdf(data_root, country, city_name, 'Descending', directions['Descending'], intersection_area)
            
            # Save updated JSON configuration after each city is processed
            with open(json_config_path, 'w') as f:
                json.dump(json_config, f, indent=4)
                
            results.append({"ascending_clipped": ascending_result,"descending_clipped": descending_result,"intersection_area": intersection_area})


    progress.set_postfix_str(f"Complete")
    progress.close()
    return results

#TODO: incluse origing as filename and simplify generation by skipping clipping here move calculate intersection to next block