### Author: Sebastian Brubaker

In [11]:
import pandas as pd
import geopandas as gpd
import folium
import os
import re
import numpy as np
import zipfile
import shutil
import logging
import search_utilities as su
import normalization_utilities as nu
from sentence_transformers import SentenceTransformer, util
import fitz
from PIL import Image
from rapidfuzz import fuzz
from logger import setup_logger
from tqdm import tqdm

In [6]:
# Define globals

LOGGER = setup_logger("logger", log_file='logs/11-08-2025.log')

IMG_MODEL = SentenceTransformer("clip-ViT-B-32")
TXT_MODEL = SentenceTransformer("all-MiniLM-L6-v2")

dir_tree_df = pd.read_csv(r"experimental/eao_prj_drive_crawl_spatial.csv")
dir_tree_folders_df = dir_tree_df[dir_tree_df["is_folder"] == True]

dir_tree_2002_df = dir_tree_df[dir_tree_df["act_year"] == 2002]
dir_tree_2002_folders_df = dir_tree_2002_df[dir_tree_2002_df["is_folder"] == True]

dir_tree_2018_df = dir_tree_df[dir_tree_df["act_year"] == 2018]
dir_tree_2018_folders_df = dir_tree_2018_df[dir_tree_2018_df["is_folder"] == True]

dir_tree_spatial_df = dir_tree_df[dir_tree_df["act_year"] == 'spatial']
dir_tree_spatial_folders_df = dir_tree_spatial_df[dir_tree_spatial_df["is_folder"] == True]

print(dir_tree_df.shape)
print(dir_tree_folders_df.shape)
print(dir_tree_2002_df.shape)
print(dir_tree_2002_folders_df.shape)
print(dir_tree_2018_df.shape)
print(dir_tree_2018_folders_df.shape)
print(dir_tree_spatial_df.shape)
print(dir_tree_spatial_folders_df.shape)

(538630, 5)
(85841, 5)
(280811, 5)
(49342, 5)
(243477, 5)
(34711, 5)
(5732, 5)
(398, 5)


  dir_tree_df = pd.read_csv(r"experimental/eao_prj_drive_crawl_spatial.csv")


# BATCH LOGIC

In [7]:
# Pack it all into one function for batch processing

def prep_project(project_name:str, uid:int, drive_locations:list[str], top_k_images:int=10,
                 dest_dir:str=r"path/to/dest_dir",
                 search_images=True
                 ) -> None:
    """
    Do it all!
    """
    dump_dir_tree = su.make_dump_dir("data", root_name=project_name)

    descendants = []
    for loc in drive_locations:
        descendants.extend(su.get_descendants(dir_tree=dir_tree_df, root=loc))

    if not descendants:
        LOGGER.critical(f"No files found for {project_name} in any provided drive location.")
        return

    descendants_df = pd.DataFrame(descendants)
    descendants_path_list = list(descendants_df["full_path"])

    LOGGER.info(f"Gathering files for {project_name}...")
    su.gather_files(descendants_path_list, dump_dir_tree["all_files"], file_ext=su.ALL_TARGET_EXTENSIONS)
    su.gather_files(descendants_path_list, dump_dir_tree["zip_files"], file_ext={".zip"})

    zip_list = os.listdir(dump_dir_tree["zip_files"])
    if (zip_list):
        LOGGER.info(f"Searching {project_name} zips...")
        zip_hits_df = su.search_type_zips(dump_dir_tree["zip_files"], su.ALL_TARGET_EXTENSIONS)

        zip_hits_path = os.path.join(dump_dir_tree["root"], "zip_hits.csv")
        zip_hits_df.to_csv(zip_hits_path)

        if not zip_hits_df.empty:
            zip_paths_list = list(zip_hits_df["zip_path"])
            members_list = list(zip_hits_df["member"])
            LOGGER.info(f"Extracting files from {project_name} zips...")
            su.extract_members_from_zip(zip_paths_list, members_list, out_dir=dump_dir_tree["all_files"])

    local_files = os.listdir(dump_dir_tree["all_files"])
    local_file_paths = [os.path.join(dump_dir_tree["all_files"], f) for f in local_files]

    su.gather_files(local_file_paths, dump_dir_tree["shapefiles"], file_ext=su.SHAPEFILE_EXTENSIONS)
    su.gather_files(local_file_paths, dump_dir_tree["kml_files"], file_ext={".kml"})
    su.gather_files(local_file_paths, dump_dir_tree["kmz_files"], file_ext={".kmz"})
    su.gather_files(local_file_paths, dump_dir_tree["tabular_files"], file_ext=su.TABULAR_EXTENSIONS)
    su.gather_files(local_file_paths, dump_dir_tree["images"], file_ext=su.IMAGE_EXTENSIONS)
    su.gather_files(local_file_paths, dump_dir_tree["pdfs"], file_ext={".pdf"})

    LOGGER.info(f"{project_name} data copied")

    LOGGER.info(f"Converting {project_name} KMZs/KMLs to Shapefiles")
    nu.kmzs_to_kmls(dump_dir_tree["kmz_files"], dump_dir_tree["kml_files"])
    nu.kmls_to_shapefiles(dump_dir_tree["kml_files"], dump_dir_tree["shapefiles"])
    
    LOGGER.info(f"Tagging {project_name} Shapefiles")
    nu.append_id_to_shapefiles(dump_dir_tree["shapefiles"], dump_dir_tree["prepped_shapefiles"],
                                            "EPC_PP_SYD", uid=uid)
    
    LOGGER.info(f"Reprojecting {project_name} Shapefiles")
    nu.reproject_shps(dump_dir_tree["prepped_shapefiles"], dump_dir_tree["prepped_shapefiles"], epsg=3005)

    # Render PDFs
    if search_images:
        LOGGER.info(f"Rendering {project_name} PDFs")
        pdf_path_list = su.get_dir_item_paths(dump_dir_tree["pdfs"])
        i = 1
        for path in pdf_path_list:
            su.render_pdf_pages(path, out_dir=dump_dir_tree["pdf_images"])
            LOGGER.debug(f"Rendering: {i}/{len(pdf_path_list)} for {project_name}")
            i += 1

        LOGGER.info(f'Semantically searching {project_name} images...')
        image_corpus_paths = su.get_dir_item_paths(dump_dir_tree["images"])
        image_corpus_paths.extend(su.get_dir_item_paths(dump_dir_tree["pdf_images"]))
                                                                
        image_hits = su.semantic_search_images(image_corpus_paths,
                                            query="A map of a proposed project",
                                            model=IMG_MODEL, top_k=top_k_images,
                                            batch_size=1000,
                                            )
        
        # Get paths for copying data
        image_hit_paths = [i["file_path"] for i in image_hits]
        su.gather_files(image_hit_paths, dump_dir_tree["image_hits"]) # Copy search results locally

    prepped_shapefile_paths = su.get_dir_item_paths(dump_dir_tree["prepped_shapefiles"])

    # Create staging area
    staging_area_dict = su.make_staging_dir(dest_dir, project_name, ';'.join(drive_locations))

    # Copy data to staging area
    LOGGER.info(f'Copying {project_name} data to network drive...')
    if search_images:
        su.gather_files(image_hit_paths, staging_area_dict["image_folder"])

    su.gather_files(prepped_shapefile_paths, staging_area_dict["shapefile_folder"])

In [8]:
# Make a data structure to iterate through for batch processing projects

projects = {
            'project_name_200':\
                (200, ['path/to/drive_location1', 'path/to/drive_location2', ...]),
            }
projects

{'project_name_200': (200,
  ['path/to/drive_location1', 'path/to/drive_location2', Ellipsis])}

In [None]:
for prj_name, prj_info in tqdm(projects.items(), desc='Processing '):
    try:
        prep_project(project_name=prj_name, uid=prj_info[0], drive_locations=prj_info[1],
                    dest_dir=r'path/to/staging/area',
                    search_images=False,
                    )
    except Exception as e:
        print(f"Failed to prep project {prj_name}: {e}")
        LOGGER.critical(f"Failed to prep project {prj_name}: {e}")

Processing :   0%|          | 0/1 [00:00<?, ?it/s]





