In [86]:
import geopandas as gpd
import pandas as pd
import fiona 
from fiona.errors import DriverError
import os


In [87]:
# Global Vars
# Every input and output of this notebook lives under one project folder,
# so the folder is spelled out once and the other paths are derived from it.
PROJECT_DIR = r'C:\PERSONAL\UK PHD\NEOM_PROJECT'

# directories that are searched recursively for geodatabases
ROOT_DIRS = [PROJECT_DIR]

# CSV outputs: the geodatabase -> layer listing, and the per-layer metadata table
OUTPUT_LAYERS_CSV_PATH = rf'{PROJECT_DIR}\gdb_layers.csv'
OUTPUT_LAYERS_METADATA_PATH = rf'{PROJECT_DIR}\gdb_layer_metadata.csv'

In [11]:
# function to return geodatabase / files paths in list 
def get_geodbs_to_list(root_dirs, files_endwith='gdb'):
    """Walk each root directory and collect every geodatabase folder found.

    A geodatabase is any *directory* whose name ends with ``files_endwith``;
    regular files are never matched. The walk also descends into matched
    directories, so a geodatabase nested inside another one is reported too.

    Parameters
    ----------
    root_dirs : iterable of str
        Directories to search recursively.
    files_endwith : str, default 'gdb'
        Suffix a directory name must end with to be collected.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(db_names, db_paths)``, index-aligned. Each name is the name of the
        folder that *contains* the geodatabase (not the geodatabase's own
        name), or ``''`` when the normalised path has no parent component.
    """
    suffix = f"{files_endwith}"
    db_names = []
    db_paths = []

    for root_dir in root_dirs:
        for dirpath, dirnames, _filenames in os.walk(root_dir):
            for dirname in dirnames:
                if not dirname.endswith(suffix):
                    continue

                db_path = os.path.join(dirpath, dirname)

                # normpath collapses "./x.gdb" to "x.gdb", leaving a single
                # component; indexing [-2] unguarded would raise IndexError.
                parts = os.path.normpath(db_path).split(os.sep)
                parent_name = parts[-2] if len(parts) > 1 else ""

                db_names.append(parent_name)
                db_paths.append(db_path)

    return db_names, db_paths


In [33]:
# Get geo dbs to list
gdb_names, gdb_paths = get_geodbs_to_list(root_dirs=ROOT_DIRS)

# show the containing-folder names first, then the full geodatabase paths
print(gdb_names, gdb_paths, sep="\n")

['Habitat map', '04_NEOM_FinalData_GDB_20211209', 'Complete dataset for birds and megafauna', '2025-05-11_All Marine Megafauna & Avifauna Geodatabase KMZ', 'FINAL', 'KBD.gdb', 'Summer Interim Report GIS', '20231217_Final Neom Schema Submission', '2025-05-11_All Marine Megafauna & Avifauna Geodatabase KMZ', 'FINAL', '5. GIS data', 'FINAL']
['C:\\PERSONAL\\UK PHD\\NEOM_PROJECT\\HABITAT MAP\\Habitat map\\NEOM_Combined_Habitat_Map_Ver02.gdb', 'C:\\PERSONAL\\UK PHD\\NEOM_PROJECT\\MARINE DATA\\MARINE DATA\\Birds and Turtles Surveys AECOM\\305. Raw Data\\04_NEOM_FinalData_GDB_20211209\\NEOM_Turtle_Bird_Survey_2021_Data.gdb', 'C:\\PERSONAL\\UK PHD\\NEOM_PROJECT\\MARINE DATA\\MARINE DATA\\Complete dataset for birds and megafauna\\KBD.gdb', 'C:\\PERSONAL\\UK PHD\\NEOM_PROJECT\\MARINE DATA\\MARINE DATA\\Complete dataset for birds and megafauna\\2025-05-11_All Marine Megafauna & Avifauna Geodatabase KMZ\\KBD.gdb', 'C:\\PERSONAL\\UK PHD\\NEOM_PROJECT\\MARINE DATA\\MARINE DATA\\Complete dataset for 

In [34]:
# how many geodatabases were found (gdb_paths holds one path per geodatabase)
len(gdb_paths)

12

In [66]:
# find layers in gdb

def get_gdb_layers(gdb_paths, output_layers_csv_path):
    """List the layers of every geodatabase and save the listing as a CSV.

    Parameters
    ----------
    gdb_paths : iterable of str
        Paths of the geodatabases to inspect. A path that occurs more than
        once is listed only once.
    output_layers_csv_path : str
        Where the listing is written, with columns ``geodatabase`` and
        ``layer`` (the header is written even when there are no rows).

    Returns
    -------
    list of dict
        One ``{"geodatabase": path, "layer": name}`` record per layer. A
        geodatabase that cannot be opened, or that has no layers, contributes
        a single record whose ``layer`` is ``None`` so it stays on record.
    """
    layers_dict = {}

    for gdb_path in gdb_paths:
        try:
            layers_dict[gdb_path] = list(fiona.listlayers(gdb_path))
        except Exception as e:
            # best-effort: report the unreadable geodatabase and carry on
            print(f"Skipping {gdb_path}: {e}")
            layers_dict[gdb_path] = []  # keep record of failed GDBs

    rows = []
    for gdb_path, layer_names in layers_dict.items():
        # an empty listing still yields one placeholder row (layer=None)
        for layer_name in layer_names or [None]:
            rows.append({
                "geodatabase": gdb_path,
                "layer": layer_name
            })

    # Pin the columns: without them an empty `rows` produces a header-less
    # CSV that pd.read_csv cannot parse.
    df = pd.DataFrame(rows, columns=["geodatabase", "layer"])
    df.to_csv(f"{output_layers_csv_path}", index=False)

    return rows


In [67]:
# list the layers of every geodatabase found above; the call also writes the
# listing to OUTPUT_LAYERS_CSV_PATH and prints a "Skipping ..." line for each
# geodatabase that could not be opened
layers = get_gdb_layers(gdb_paths, OUTPUT_LAYERS_CSV_PATH)
layers

Skipping C:\PERSONAL\UK PHD\NEOM_PROJECT\MARINE DATA\MARINE DATA\Complete dataset for birds and megafauna\KBD.gdb: Failed to open dataset (flags=68): C:\PERSONAL\UK PHD\NEOM_PROJECT\MARINE DATA\MARINE DATA\Complete dataset for birds and megafauna\KBD.gdb
Skipping C:\PERSONAL\UK PHD\NEOM_PROJECT\MARINE DATA\__MARINE DATA\Complete dataset for birds and megafauna\2025-05-11_All Marine Megafauna & Avifauna Geodatabase KMZ\FINAL\KBD.gdb: Failed to open dataset (flags=68): C:\PERSONAL\UK PHD\NEOM_PROJECT\MARINE DATA\__MARINE DATA\Complete dataset for birds and megafauna\2025-05-11_All Marine Megafauna & Avifauna Geodatabase KMZ\FINAL\KBD.gdb


[{'geodatabase': 'C:\\PERSONAL\\UK PHD\\NEOM_PROJECT\\HABITAT MAP\\Habitat map\\NEOM_Combined_Habitat_Map_Ver02.gdb',
  'layer': 'NEOM_COMBINED_Habitat_Map_Ver_02_20250128'},
 {'geodatabase': 'C:\\PERSONAL\\UK PHD\\NEOM_PROJECT\\HABITAT MAP\\Habitat map\\NEOM_Combined_Habitat_Map_Ver02.gdb',
  'layer': 'NEOM_Combined_Habitat_Coastline_Ver01_20241202'},
 {'geodatabase': 'C:\\PERSONAL\\UK PHD\\NEOM_PROJECT\\MARINE DATA\\MARINE DATA\\Birds and Turtles Surveys AECOM\\305. Raw Data\\04_NEOM_FinalData_GDB_20211209\\NEOM_Turtle_Bird_Survey_2021_Data.gdb',
  'layer': 'Turtle_Observations'},
 {'geodatabase': 'C:\\PERSONAL\\UK PHD\\NEOM_PROJECT\\MARINE DATA\\MARINE DATA\\Birds and Turtles Surveys AECOM\\305. Raw Data\\04_NEOM_FinalData_GDB_20211209\\NEOM_Turtle_Bird_Survey_2021_Data.gdb',
  'layer': 'Osprey_SootyFalcon_Observations'},
 {'geodatabase': 'C:\\PERSONAL\\UK PHD\\NEOM_PROJECT\\MARINE DATA\\MARINE DATA\\Birds and Turtles Surveys AECOM\\305. Raw Data\\04_NEOM_FinalData_GDB_20211209\\NEO

In [69]:
# read the geodatabase/layer listing back from the CSV written by get_gdb_layers
lyrs_df = pd.read_csv(OUTPUT_LAYERS_CSV_PATH)
lyrs_df.head()

Unnamed: 0,geodatabase,layer
0,C:\PERSONAL\UK PHD\NEOM_PROJECT\HABITAT MAP\Ha...,NEOM_COMBINED_Habitat_Map_Ver_02_20250128
1,C:\PERSONAL\UK PHD\NEOM_PROJECT\HABITAT MAP\Ha...,NEOM_Combined_Habitat_Coastline_Ver01_20241202
2,C:\PERSONAL\UK PHD\NEOM_PROJECT\MARINE DATA\MA...,Turtle_Observations
3,C:\PERSONAL\UK PHD\NEOM_PROJECT\MARINE DATA\MA...,Osprey_SootyFalcon_Observations
4,C:\PERSONAL\UK PHD\NEOM_PROJECT\MARINE DATA\MA...,Raptor_Satelite_Tagging


### READ EACH LAYER AND DERIVE ITS METADATA
- crs
- spatial extent / bbox
- geometry types
- field names
- field data types


In [73]:
# read one database layer to check: the first row of the layer listing
first_gdb = lyrs_df.loc[0, "geodatabase"]
first_layer = lyrs_df.loc[0, "layer"]

gdf = gpd.read_file(first_gdb, layer=first_layer)
gdf.head()

  return ogr_read(


Unnamed: 0,Join_Count,fid_neom_combined_habitat_map_2,core_name,core_group,core_hab,bioregion,habitat_group,Ha,hab_code_sup,hab_name_sup,Shape_Length,Shape_Area,geometry
0,0,1,Abraded pavement of the Hisma Plateau,6415,TERRESTRIAL,Upper Valley,"Mountains, hills, rocky terrain and wadis",0.832771,,,479.093473,8327.710597,"MULTIPOLYGON Z (((759748.822 3193801.157 0, 75..."
1,0,2,Abraded pavement of the Hisma Plateau,6415,TERRESTRIAL,Upper Valley,"Mountains, hills, rocky terrain and wadis",0.670714,,,516.562659,6707.140234,"MULTIPOLYGON Z (((760551.452 3193212.033 0, 76..."
2,0,3,Abraded pavement of the Hisma Plateau,6415,TERRESTRIAL,Upper Valley,"Mountains, hills, rocky terrain and wadis",6.321505,,,1340.762745,63215.05322,"MULTIPOLYGON Z (((742650.071 3226915.771 0, 74..."
3,0,4,Abraded pavement of the Hisma Plateau,6415,TERRESTRIAL,Upper Valley,"Mountains, hills, rocky terrain and wadis",9.968409,,,1967.691933,99684.094971,"MULTIPOLYGON Z (((744806.573 3224983.642 0, 74..."
4,0,5,Abraded pavement of the Hisma Plateau,6415,TERRESTRIAL,Upper Valley,"Mountains, hills, rocky terrain and wadis",0.587627,,,387.943237,5876.268688,"MULTIPOLYGON Z (((753409.936 3211311.867 0, 75..."


Extract metadata for all layers

In [88]:
# function to extract layers meta data

# Fixed output schema, so the CSV has the same columns whether every layer,
# some layers, or no layer at all could be read.
_LAYER_METADATA_COLUMNS = [
    "geodatabase", "layer", "status", "error",
    "crs", "epsg", "geometry_types", "bbox",
    "feature_count", "has_geometry",
    "field_count", "field_names", "field_types",
    "memory_mb", "has_z",
]


def _fill_layer_metadata(meta, gdf):
    """Add the metadata of one loaded layer to ``meta`` in place."""
    has_geometry = "geometry" in gdf.columns
    # A non-spatial table may be returned as a plain DataFrame without `.crs`.
    crs = getattr(gdf, "crs", None)

    # ---- Spatial metadata ----
    meta["crs"] = str(crs) if crs is not None else None
    meta["epsg"] = crs.to_epsg() if crs is not None else None
    if has_geometry:
        # Null geometries have a geometry type of None; drop them, because
        # sorting a mix of None and str raises TypeError.
        geom_types = gdf.geom_type.dropna().unique()
        meta["geometry_types"] = ", ".join(sorted(geom_types))
        # plain floats, so the CSV text does not depend on the NumPy repr
        meta["bbox"] = [float(v) for v in gdf.total_bounds]
    else:
        meta["geometry_types"] = None
        meta["bbox"] = None

    # ---- Feature-level metadata ----
    meta["feature_count"] = len(gdf)
    meta["has_geometry"] = has_geometry

    # ---- Attribute metadata ----
    meta["field_count"] = len(gdf.columns)
    meta["field_names"] = ", ".join(gdf.columns)
    meta["field_types"] = ", ".join(
        f"{col}:{dtype}"
        for col, dtype in gdf.dtypes.items()
    )

    # ---- Derived metadata ----
    meta["memory_mb"] = round(
        gdf.memory_usage(deep=True).sum() / (1024 ** 2), 3
    )

    # ---- Z detection (best-effort) ----
    try:
        meta["has_z"] = bool(gdf.geometry.has_z.any())
    except Exception:
        meta["has_z"] = None


def extract_gdb_layer_metadata(
    layers_df,
    output_csv
):
    """
    Reads geodatabase layers and extracts metadata safely.

    Each row of ``layers_df`` is read in full with ``gpd.read_file`` and
    summarised into one record; a layer that cannot be read never aborts the
    run, it is recorded with a non-success ``status`` instead.

    Parameters
    ----------
    layers_df : pd.DataFrame
        Must contain columns: ['geodatabase', 'layer']. A row whose layer is
        missing (None/NaN) is recorded as "skipped" without attempting a read.
    output_csv : str
        Path to save metadata CSV

    Returns
    -------
    pd.DataFrame
        One row per input row, always with the same columns:
        geodatabase, layer, status, error, crs, epsg, geometry_types, bbox,
        feature_count, has_geometry, field_count, field_names, field_types,
        memory_mb, has_z. ``status`` is "success", "skipped" (missing layer
        name, DriverError or PermissionError) or "failed" (any other error);
        ``error`` holds the message for the latter two.
    """

    records = []

    for _, row in layers_df.iterrows():
        gdb = row["geodatabase"]
        layer = row["layer"]

        meta = {
            "geodatabase": gdb,
            "layer": layer,
            "status": "success",
            "error": None
        }

        # Nothing to read without a layer name; asking the reader for a
        # NaN layer would only produce a confusing "failed" record.
        if pd.isna(layer):
            meta["status"] = "skipped"
            meta["error"] = "no layer name for this geodatabase"
            records.append(meta)
            continue

        try:
            gdf = gpd.read_file(gdb, layer=layer)
            _fill_layer_metadata(meta, gdf)

        except (DriverError, PermissionError) as e:
            meta["status"] = "skipped"
            meta["error"] = str(e)

        except Exception as e:
            meta["status"] = "failed"
            meta["error"] = str(e)

        records.append(meta)

    # ---- Create DataFrame ----
    meta_df = pd.DataFrame(records, columns=_LAYER_METADATA_COLUMNS)

    # ---- Save CSV ----
    meta_df.to_csv(output_csv, index=False, encoding="utf-8")

    return meta_df



In [89]:
# extract meta data for each geo database layer and save csv
metadata_df = extract_gdb_layer_metadata(lyrs_df, OUTPUT_LAYERS_METADATA_PATH)

# preview the first few metadata rows
metadata_df.head()

  return ogr_read(
  return ogr_read(
  return ogr_read(
  return ogr_read(


Unnamed: 0,geodatabase,layer,status,error,crs,epsg,geometry_types,bbox,feature_count,has_geometry,field_count,field_names,field_types,memory_mb,has_z
0,C:\PERSONAL\UK PHD\NEOM_PROJECT\HABITAT MAP\Ha...,NEOM_COMBINED_Habitat_Map_Ver_02_20250128,success,,EPSG:32636,32636.0,MultiPolygon,"[645600.9989, 3026996.5185000002, 857646.47649...",357255.0,True,13.0,"Join_Count, fid_neom_combined_habitat_map_2, c...","Join_Count:int32, fid_neom_combined_habitat_ma...",129.156,True
1,C:\PERSONAL\UK PHD\NEOM_PROJECT\HABITAT MAP\Ha...,NEOM_Combined_Habitat_Coastline_Ver01_20241202,success,,EPSG:32636,32636.0,MultiLineString,"[654343.9362000003, 3037335.329399999, 756140....",70.0,True,4.0,"LEFT_FID, RIGHT_FID, Shape_Length, geometry","LEFT_FID:int32, RIGHT_FID:int32, Shape_Length:...",0.002,True
2,C:\PERSONAL\UK PHD\NEOM_PROJECT\MARINE DATA\MA...,Turtle_Observations,success,,EPSG:4326,4326.0,Point,"[34.65184900000003, 27.64983200000006, 35.4674...",158.0,True,27.0,"IslandMain, IslandName, FINAL_X, FINAL_Y, Unde...","IslandMain:object, IslandName:object, FINAL_X:...",0.146,False
3,C:\PERSONAL\UK PHD\NEOM_PROJECT\MARINE DATA\MA...,Osprey_SootyFalcon_Observations,success,,EPSG:4326,4326.0,Point,"[34.650691059844576, 27.648746670322623, 35.28...",190.0,True,16.0,"Species, TrackUnit, Name, Sex, NestStatus, Cat...","Species:object, TrackUnit:object, Name:object,...",0.123,False
4,C:\PERSONAL\UK PHD\NEOM_PROJECT\MARINE DATA\MA...,Raptor_Satelite_Tagging,success,,EPSG:4326,4326.0,Point,"[35.162930920025985, 27.64960854999464, 35.282...",8.0,True,17.0,"Species, Comments, NestPresent, Number, TrackU...","Species:object, Comments:object, NestPresent:o...",0.004,False


Questions
- What do habitat groups, core groups and core names each mean?
- How do core groups, core names and habitat groups differ from one another?

Read Layer 2

### Build a metadata table for all CSV and Excel sheets

In [15]:
# load the Sea_Shore_Bird_Observations sheet that sits in the Excel folder
# next to the survey geodatabase (.xls file, read with the xlrd engine)
kdb_df = pd.read_excel(r"C:\PERSONAL\UK PHD\NEOM_PROJECT\MARINE DATA\MARINE DATA\Birds and Turtles Surveys AECOM\305. Raw Data\04_NEOM_FinalData_GDB_20211209\Excel\Sea_Shore_Bird_Observations.xls", engine="xlrd")
kdb_df

Unnamed: 0,OBJECTID,CommonName,ScientificName,Number,Category,Nest,Behaviour,Notes,CreationDate,Creator,EditDate,Editor,IslandMain,IslandName,FINAL_X,FINAL_Y
0,1,Bridled Tern,Onychoprion anaethetus,80,,Colony,Nesting,Nests under shrubs,2021-08-30,Robert.Conohan@aecom.com_aecom,2021-08-30,Robert.Conohan@aecom.com_aecom,Umm Shujayrat,Umm Shujayrat,35.090280,28.065897
1,2,Bridled Tern,Onychoprion anaethetus,18,,Colony,Nesting,Sparsely vegetated backshore,2021-08-30,Robert.Conohan@aecom.com_aecom,2021-08-30,Robert.Conohan@aecom.com_aecom,Umm Shujayrat,Umm Shujayrat,35.087717,28.060908
2,3,Brown Booby,Sula leucogaster,1,,Nest,Nesting,Sandy outcrop,2021-07-07,Hanneke.VanLavieren@aecom.com_aecom,2021-07-07,Hanneke.VanLavieren@aecom.com_aecom,Sindalah,Sindalah,34.718132,28.052980
3,4,Brown Booby,Sula leucogaster,1,,Nest,,,2021-07-07,Hanneke.VanLavieren@aecom.com_aecom,2021-07-07,Hanneke.VanLavieren@aecom.com_aecom,Sindalah,Sindalah,34.718132,28.052980
4,5,Brown Booby,Sula leucogaster,1,Fledgling,Nest,Nesting,Rock island,2021-07-13,Carla.Korpijaakko@aecom.com_aecom,2021-07-13,Carla.Korpijaakko@aecom.com_aecom,Silah,Silah B,35.272002,27.663195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,117,White-eyed Gull,Ichthyaetus leucophthalmus,7,,Colony,Nesting,Small wadi depression,2021-07-18,Carla.Korpijaakko@aecom.com_aecom,2021-07-18,Carla.Korpijaakko@aecom.com_aecom,Sindalah,Sindalah Twin South,34.700017,28.033510
117,118,White-eyed Gull,Ichthyaetus leucophthalmus,35,,Colony,Nesting,Gravel plain,2021-07-18,Carla.Korpijaakko@aecom.com_aecom,2021-07-18,Carla.Korpijaakko@aecom.com_aecom,Sindalah,Sindalah Twin South,34.700887,28.035013
118,119,White-eyed Gull,Ichthyaetus leucophthalmus,13,,Colony,Nesting,Plateau,2021-07-18,Carla.Korpijaakko@aecom.com_aecom,2021-07-18,Carla.Korpijaakko@aecom.com_aecom,Sindalah,Sindalah Twin South,34.700226,28.034256
119,120,White-eyed Gull,Ichthyaetus leucophthalmus,4,,Colony,Loafing,,2021-08-24,damian.smith02@aecom.com_aecom,2021-08-24,damian.smith02@aecom.com_aecom,Maktal Ali,Maktal Ali 1,34.656278,28.049552
