In [None]:
from arcgis.gis import GIS
# Open the GIS connection used by the rest of the notebook.
# NOTE(review): "home" presumably resolves to the portal of the active ArcGIS Pro / Notebook session — confirm.
gis = GIS("home")

In [None]:
%matplotlib inline
#ArcGIS packages
import arcpy
#from arcgis.mapping import WebScene
from arcgis.gis import GIS
from arcgis.features import FeatureLayer
from IPython.display import display
from arcgis.features import GeoAccessor
from arcgis import *
from arcpy.sa import Int
# Raster processing for dataframe
from rasterstats import zonal_stats
import rasterio

# basic packages
import csv
import math  # used by the mission-distance calculations (math.ceil / math.pi)
import numpy as np
import os
import timeit
import random
import string
from playsound import playsound
import gc # Force Garbage Collection. This helps reduce memory leaks in long loops.
import warnings
from pathlib import Path
import time
import threading
from collections import defaultdict
from tqdm import tqdm #Bar status

# Data management
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point  # to get points from long lat

# Request service
#from requests import Request
import json
import re
from functools import reduce
#from owslib.wfs import WebFeatureService
import sqlite3

# Plotting packages
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns

In [None]:
# Keep only the hazard attributes used by the rest of the notebook.
# NOTE(review): final_df must already exist in the kernel when this cell runs.
ghz_columns = [
    "HazardID",
    "latitude",
    "longitude",
    "HazardType",
    "distance",
    "intensity",
    "economic_loss_million",
    "duration_minutes",
    "pop",
]
ghz_df = final_df.loc[:, ghz_columns].copy()
ghz_df.info()

In [None]:
distance_col = ghz_df['distance']

# Invalid distance = missing (NaN) or non-positive
invalid_mask = distance_col.isna() | distance_col.le(0)

# Rows whose distance is exactly 0 are kept out of the "invalid" set.
# NOTE(review): earlier comments said Earthquake rows are excluded too, but no
# HazardType filter is applied here — confirm which behaviour is intended.
excluded_mask = distance_col.eq(0)

# Net effect: rows with NaN or strictly negative distance
invalid_distance_df = ghz_df.loc[invalid_mask & ~excluded_mask].copy()

print("Filtered invalid distances:", len(invalid_distance_df))
print(invalid_distance_df.head())

In [None]:
# Export the invalid-distance rows. The original path was missing the separator
# between the "06_Infrastructure" folder and the file name, so the CSV was
# written into D:\NDIS_Database under a mangled name.
invalid_distance_csv = r"D:\NDIS_Database\06_Infrastructure\invalid_distance_df.csv"
os.makedirs(os.path.dirname(invalid_distance_csv), exist_ok=True)
invalid_distance_df.to_csv(invalid_distance_csv, index=False)

In [None]:
# Number of invalid-distance rows per HazardType.
# NOTE(review): invalid_distance_df only drops distance == 0 rows; no Earthquake filter is applied upstream.
hazard_type_col = invalid_distance_df.loc[:, 'HazardType']
invalid_counts = hazard_type_col.value_counts()

print(invalid_counts)

In [None]:
start_time = timeit.default_timer()


def _first_value_by_key(table, key_field, value_field):
    """Read `table` once and return {key: value}, keeping the first value seen per key."""
    lookup = {}
    with arcpy.da.SearchCursor(table, [key_field, value_field]) as lookup_cursor:
        for key, value in lookup_cursor:
            lookup.setdefault(key, value)
    return lookup


# Paths
project_folder = r"D:\ArcGISProjects\GeohazardDB"

ndis_gdb = os.path.join(project_folder, "ghzgdb_Legacy.gdb")
road_gdb = os.path.join(project_folder, "GeohazardDB.gdb")
road_layer_template = os.path.join(road_gdb, "roads")
country_layer = os.path.join(road_gdb, "eez_country")

invalid_layer = os.path.join(road_gdb, "invalid_distance")  # input geohazard points

# Near analysis preparation
ghz_list = []      # per-country clipped point feature classes
near_tables = []   # per-country near tables

total_countries = int(arcpy.GetCount_management(country_layer)[0])

with arcpy.da.SearchCursor(country_layer, ["ISO_TER1", "SHAPE@"]) as country_cursor:
    for index, country_row in enumerate(country_cursor, start=1):
        iso = country_row[0]
        shape = country_row[1]
        print(f"‚è≥ Processing {iso} ({index}/{total_countries})...")

        inv_clip = os.path.join(ndis_gdb, f"invdist_{iso}")
        # NOTE(review): road_{iso} is expected to exist already in road_gdb; it is not created here.
        road_clip = os.path.join(road_gdb, f"road_{iso}")
        near_table = os.path.join(ndis_gdb, f"near_invdist_{iso}")

        # Clip invalid geohazard points to the country polygon
        if arcpy.Exists(inv_clip):
            arcpy.Delete_management(inv_clip)
        arcpy.analysis.Clip(invalid_layer, shape, inv_clip)

        # Run Near analysis (closest road feature only, geodesic distance)
        if arcpy.Exists(near_table):
            arcpy.Delete_management(near_table)
        arcpy.analysis.GenerateNearTable(
            in_features=inv_clip,
            near_features=road_clip,
            out_table=near_table,
            location="LOCATION",
            angle="ANGLE",
            closest="CLOSEST",
            method="GEODESIC"
        )
        print(f"  ‚úÖ Near table created: {near_table}")
        near_tables.append(near_table)

        # Add 'distance' to inv_clip
        if "distance" not in [f.name for f in arcpy.ListFields(inv_clip)]:
            arcpy.AddField_management(inv_clip, "distance", "DOUBLE")

        # One pass over the near table instead of rescanning it for every point.
        near_dist_by_fid = _first_value_by_key(near_table, "IN_FID", "NEAR_DIST")
        with arcpy.da.UpdateCursor(inv_clip, ["OBJECTID", "distance"]) as up_cursor:
            for up_row in up_cursor:
                oid = up_row[0]
                if oid in near_dist_by_fid:
                    up_row[1] = near_dist_by_fid[oid]
                    up_cursor.updateRow(up_row)

        # Add 'HazardID' to near table
        if "HazardID" not in [f.name for f in arcpy.ListFields(near_table)]:
            arcpy.AddField_management(near_table, "HazardID", "TEXT")

        # Same idea in the other direction: OBJECTID -> HazardID read once.
        hazard_id_by_oid = _first_value_by_key(inv_clip, "OBJECTID", "HazardID")
        with arcpy.da.UpdateCursor(near_table, ["IN_FID", "HazardID"]) as near_update:
            for near_row in near_update:
                if near_row[0] in hazard_id_by_oid:
                    near_row[1] = hazard_id_by_oid[near_row[0]]
                    near_update.updateRow(near_row)

        ghz_list.append(inv_clip)

# Merge clipped geohazard points
merged_output = os.path.join(ndis_gdb, "invalid_dist_merged")
if arcpy.Exists(merged_output):
    arcpy.Delete_management(merged_output)
arcpy.Merge_management(ghz_list, merged_output)
print(f"‚úÖ All invalid geohazard points merged into {merged_output}")

# Merge near tables
compiled_near_table = os.path.join(ndis_gdb, "compiled_near_table_invalid")
if arcpy.Exists(compiled_near_table):
    arcpy.Delete_management(compiled_near_table)

arcpy.CreateTable_management(ndis_gdb, "compiled_near_table_invalid")
for field in [("FROM_X", "DOUBLE"), ("FROM_Y", "DOUBLE"), ("NEAR_X", "DOUBLE"),
              ("NEAR_Y", "DOUBLE"), ("NEAR_FID", "LONG"), ("HazardID", "TEXT")]:
    arcpy.AddField_management(compiled_near_table, field[0], field[1])

compiled_fields = ["FROM_X", "FROM_Y", "NEAR_X", "NEAR_Y", "NEAR_FID", "HazardID"]
with arcpy.da.InsertCursor(compiled_near_table, compiled_fields) as insert_cursor:
    for table in near_tables:
        with arcpy.da.SearchCursor(table, compiled_fields) as cursor:
            for near_record in cursor:
                insert_cursor.insertRow(near_record)

# Create line layer (hazard point -> nearest road location)
line_fc = os.path.join(ndis_gdb, "compiled_near_lines_invalid")
if arcpy.Exists(line_fc):
    arcpy.Delete_management(line_fc)

arcpy.XYToLine_management(
    compiled_near_table, line_fc,
    "FROM_X", "FROM_Y", "NEAR_X", "NEAR_Y"
)
print(f"‚úÖ Lines created: {line_fc}")

elapsed = timeit.default_timer() - start_time
print(f"‚úÖ All invalid distance near analysis completed in {elapsed/60:.2f} minutes")


------

# PreProcessing for Staggered Decision v3.8.0

In [None]:
# Load the merged (recalculated) invalid-distance points into a DataFrame.
merged_fc = r"D:\ArcGISProjects\GeohazardDB\ghzgdb_Legacy.gdb\invalid_dist_merged"

# Attribute fields only (skip geometry and object-id columns) ...
fields = [fld.name for fld in arcpy.ListFields(merged_fc) if fld.type not in ("Geometry", "OID")]

# ... plus the point coordinates as an (x, y) tuple
fields.append("SHAPE@XY")

# Pull every row out of the feature class in one go
with arcpy.da.SearchCursor(merged_fc, fields) as cursor:
    data = list(cursor)

invalid_dist_df = pd.DataFrame(data, columns=fields)

invalid_dist_df.info()

In [None]:
# Rows for which the near analysis produced a distance
valid_distance_df = invalid_dist_df.dropna(subset=["distance"]).copy()

# Count and preview
print("Valid distances extracted:", len(valid_distance_df))
print(valid_distance_df.loc[:, ["HazardID", "distance"]].head())

In [None]:
# Reduce the recalculated rows to the same column set as ghz_df
kept_columns = [
    "HazardID",
    "latitude",
    "longitude",
    "HazardType",
    "distance",
    "intensity",
    "economic_loss_million",
    "duration_minutes",
    "pop",
]
valid_distance_df = valid_distance_df.loc[:, kept_columns].copy()
valid_distance_df.info()

In [None]:
# Nuclear hazards only
is_nuclear = ghz_df["HazardType"].eq("Nuclear")
nuclear = ghz_df.loc[is_nuclear]
nuclear.info()

In [None]:
# Every row whose HazardID occurs more than once (all occurrences are kept)
dup_mask = nuclear.duplicated(subset="HazardID", keep=False)
duplicates = nuclear.loc[dup_mask]

print("Total duplicated HazardIDs:", duplicates["HazardID"].nunique())
print("Duplicated entries:")
print(duplicates.sort_values(by="HazardID").head())


In [None]:
# Previously cleaned nuclear table, loaded from disk
cleaned_nuclear_path = r"D:\NDIS_Database\cleaned_nuclear_df.csv"
cleaned_nuclear_df = pd.read_csv(cleaned_nuclear_path)
cleaned_nuclear_df.info()

In [None]:
# Attributes that must all be present and strictly positive for a "fully valid" row
fields_to_check = ["distance", "pop", "economic_loss_million", "intensity", "duration_minutes"]
checked = nuclear[fields_to_check]

# Tier 1: fully valid rows; per HazardID keep the one with the largest economic loss
valid_mask = checked.notna().all(axis=1) & checked.gt(0).all(axis=1)
valid_df = nuclear.loc[valid_mask].sort_values("economic_loss_million", ascending=False)
valid_dedup = valid_df.drop_duplicates(subset="HazardID", keep="first")

# Tier 2: HazardIDs with no fully valid row; keep their first occurrence
has_valid_row = nuclear["HazardID"].isin(valid_dedup["HazardID"])
remaining_df = nuclear[~has_valid_row]
fallback_dedup = remaining_df.drop_duplicates(subset="HazardID", keep="first")

# Tier 1 followed by Tier 2
final_dedup = pd.concat([valid_dedup, fallback_dedup], ignore_index=True)

print("‚úÖ Final deduplicated nuclear count:", len(final_dedup))


In [None]:
# Step 1: Remove all existing Nuclear rows from ghz_df
ghz_df_no_nuclear = ghz_df[ghz_df["HazardType"] != "Nuclear"].copy()

# Step 2: Append final deduplicated nuclear DataFrame
ghz_updated = pd.concat([ghz_df_no_nuclear, final_dedup], ignore_index=True)

# Step 3: Overwrite matching rows with the corrected values from valid_distance_df,
# joined on HazardID. valid_distance_df itself is NOT modified (no in-place
# set_index / dtype change), so this cell can be re-run safely.
corrections = (
    valid_distance_df
    .assign(HazardID=valid_distance_df["HazardID"].astype(ghz_updated["HazardID"].dtype))
    .set_index("HazardID")
)
# DataFrame.update reindexes `corrections` onto ghz_updated's index and raises on
# duplicate labels, so keep only the first correction per HazardID.
corrections = corrections[~corrections.index.duplicated(keep="first")]

ghz_updated = ghz_updated.set_index("HazardID")

# Update all overlapping columns with the corrected (non-null) values
ghz_updated.update(corrections)

ghz_updated = ghz_updated.reset_index()

# Done
print("Final GHZ DataFrame shape:", ghz_updated.shape)


In [None]:
# Volcano hazards only
is_volcano = ghz_updated["HazardType"].eq("Volcano")
volcano = ghz_updated.loc[is_volcano]
volcano.info()

In [None]:
# Number of volcano rows that have a distance value
pd.notna(volcano['distance']).sum()

In [None]:
# Distinct null flags present in the distance column (True means some distances are missing)
pd.isna(ghz_updated['distance']).unique()

In [None]:
# Number of rows that have a distance value
pd.notna(ghz_updated['distance']).sum()

In [None]:
# Paths
project_folder = r"D:\ArcGISProjects\GeohazardDB"
gdb_name = "ndis3.gdb"
output_gdb = os.path.join(project_folder, gdb_name)
# CreateFileGDB fails when the geodatabase already exists, so only create it once.
if not arcpy.Exists(output_gdb):
    arcpy.CreateFileGDB_management(project_folder, gdb_name)
output_fc = os.path.join(output_gdb, "ghz_updated_fc")

# Rows with usable coordinates (lat/lon are written as WGS 1984 points)
sdf = ghz_updated.dropna(subset=["latitude", "longitude"]).copy()
spatial_ref = arcpy.SpatialReference(4326)  # WGS 1984

# Create Feature Class from scratch
if arcpy.Exists(output_fc):
    arcpy.Delete_management(output_fc)
arcpy.CreateFeatureclass_management(out_path=os.path.dirname(output_fc), out_name=os.path.basename(output_fc),
                                    geometry_type="POINT", spatial_reference=spatial_ref)

# Add one attribute field per DataFrame column (lat/lon become the geometry)
fields_to_add = [col for col in sdf.columns if col not in ["latitude", "longitude"]]
for field in fields_to_add:
    if pd.api.types.is_integer_dtype(sdf[field]):
        field_type = "LONG"
    elif pd.api.types.is_float_dtype(sdf[field]):
        field_type = "DOUBLE"
    else:
        field_type = "TEXT"
    arcpy.AddField_management(output_fc, field, field_type)


def _to_gdb_value(value):
    """Convert a pandas cell to a cursor-friendly value: missing -> None, numpy scalar -> Python scalar."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value.item() if isinstance(value, np.generic) else value


# Write to feature class; SHAPE@XY takes a plain (x, y) tuple
insert_fields = ["SHAPE@XY"] + fields_to_add
attribute_rows = sdf[fields_to_add].itertuples(index=False, name=None)
with arcpy.da.InsertCursor(output_fc, insert_fields) as cursor:
    for lon, lat, attributes in zip(sdf["longitude"], sdf["latitude"], attribute_rows):
        values = [(float(lon), float(lat))] + [_to_gdb_value(v) for v in attributes]
        cursor.insertRow(values)

print(f"‚úÖ Feature class created: {output_fc}")

In [None]:
# Rows where population or distance is missing or exactly zero
pop_unusable = ghz_updated["pop"].isna() | ghz_updated["pop"].eq(0)
distance_unusable = ghz_updated["distance"].isna() | ghz_updated["distance"].eq(0)
invalid_pop_dist = ghz_updated[pop_unusable | distance_unusable]

# Summary
print("‚ùó Rows with null or zero in pop or distance:", len(invalid_pop_dist))
print(invalid_pop_dist.loc[:, ["HazardID", "HazardType", "pop", "distance"]].head())

In [None]:
# Export the rows with unusable population / distance for inspection
invalid_pop_dist_csv = r"D:\NDIS_Database\19_PostProcessing\invalid_pop_dist.csv"
invalid_pop_dist.to_csv(invalid_pop_dist_csv, index=False)

------
# STAGGERED DECISION 3.8.0
------

In [None]:
def classify_road_distance(d):
    """Bin a road distance (metres) into a labelled range.

    Missing values (NaN / None) are labelled "Unknown". Previously NaN failed
    every comparison and was silently reported as ">50 km".
    """
    if pd.isna(d):
        return "Unknown"
    if d < 1000:
        return "<1 km"
    if d < 5000:
        return "1‚Äì5 km"
    if d < 20000:
        return "5‚Äì20 km"
    if d < 50000:
        return "20‚Äì50 km"
    return ">50 km"

# Label every hazard with its road-distance bin
ghz_updated["road_bin"] = ghz_updated["distance"].map(classify_road_distance)
ghz_updated.info()

In [None]:
def classify_population_bins(p):
    """Bin a population count into a labelled range.

    Missing values (NaN / None) are labelled "Unknown". Previously NaN failed
    every comparison and was silently reported as ">10M".
    """
    if pd.isna(p):
        return "Unknown"
    if p == 0:
        return "0"
    if p < 1000:
        return "<1k"
    if p < 10000:
        return "1k‚Äì10k"
    if p < 100000:
        return "10k‚Äì100k"
    if p < 1000000:
        return "100k‚Äì1M"
    if p < 10000000:
        return "1M‚Äì10M"
    return ">10M"

# Label every hazard with its population bin
ghz_updated["pop_bin"] = ghz_updated["pop"].map(classify_population_bins)
ghz_updated.info()

In [None]:
# Nuclear hazards from the updated table. Take an explicit copy: a later cell
# assigns into this frame, which on a filtered view triggers SettingWithCopyWarning.
nuclear = ghz_updated[(ghz_updated["HazardType"] == "Nuclear")].copy()
nuclear.info()

In [None]:
# Columns to take from the 'nuclear_df' dataframe, including Latitude and Longitude
# NOTE(review): nuclear_df must already exist in the kernel when this cell runs.
selected_columns = [
    "Country/Area",
    "Project Name",
    "Capacity (MW)",
    "Status",
    "Reactor Type",
    "Owner",
    "Wiki URL",
    "Latitude",  # Include Latitude for matching
    "Longitude"  # Include Longitude for matching
]

# Select the relevant columns and force float coordinates. astype() returns a
# new frame, so nothing is assigned into a slice (no SettingWithCopyWarning) and
# the dtype really changes (a `.loc[:, col] = ...` assignment may keep the old dtype).
nuclear_df_selected = nuclear_df[selected_columns].astype({"Latitude": float, "Longitude": float})
nuclear = nuclear.astype({"latitude": float, "longitude": float})

# Define a function to match latitude and longitude with a small tolerance
def match_lat_lon(row, df, tolerance=0.001):
    """Return the first row of ``df`` whose Latitude/Longitude lie within
    ``tolerance`` (inclusive, same units as the coordinates) of
    ``row['latitude']`` / ``row['longitude']``, or None when nothing matches.
    """
    lat, lon = row['latitude'], row['longitude']
    lat_ok = df['Latitude'].between(lat - tolerance, lat + tolerance)
    lon_ok = df['Longitude'].between(lon - tolerance, lon + tolerance)
    hits = df[lat_ok & lon_ok]
    if hits.empty:
        return None
    # First hit in df order, not necessarily the nearest one
    return hits.iloc[0]

# Pair each nuclear hazard with the first plant record at (nearly) the same coordinates
merged_rows = []
for _, hazard_row in nuclear.iterrows():
    plant_row = match_lat_lon(hazard_row, nuclear_df_selected)
    if plant_row is None:
        continue  # hazards without a matching plant are dropped
    # Hazard fields first, followed by the matched plant fields
    merged_rows.append(pd.concat([hazard_row, plant_row], axis=0))

# One DataFrame row per successful match
merged_nuclear = pd.DataFrame(merged_rows)
merged_nuclear.info()

In [None]:
# Destination folder for the per-hazard CSV files
output_folder = r"D:\NDIS_Database\20_PaperSimulation\csv_exports"
os.makedirs(output_folder, exist_ok=True)

# Spaces and hyphens in the hazard name become underscores in the file name
filename_table = str.maketrans(" -", "__")

# One CSV per hazard type
for hazard in ghz_updated["HazardType"].unique():
    df_hazard = ghz_updated.loc[ghz_updated["HazardType"] == hazard].copy()
    file_name = hazard.translate(filename_table) + ".csv"
    output_path = os.path.join(output_folder, file_name)

    df_hazard.to_csv(output_path, index=False)
    print(f"‚úÖ Exported: {output_path}")

In [None]:
# Group drones by their exact communication range (rows without a range are dropped first)
drones_with_range = drone_df.dropna(subset=["comm_range"])
grouped_exact = drones_with_range.groupby("comm_range")

# Number of drones per distinct range value
print(grouped_exact.size())

In [None]:
def classify_comm(m):
    """Bin a communication range (metres) into a labelled category; missing values map to "Unknown"."""
    if pd.isna(m):
        return "Unknown"
    # (exclusive upper bound in metres, label), checked in ascending order
    upper_bounds = (
        (1000, "<1km"),
        (5000, "1‚Äì5km"),
        (20000, "5‚Äì20km"),
        (50000, "20‚Äì50km"),
    )
    for upper, label in upper_bounds:
        if m < upper:
            return label
    return ">50km"

# Categorise every drone, then count per category in a fixed display order
drone_df["comm_category"] = drone_df["comm_range"].map(classify_comm)
category_order = ["<1km", "1‚Äì5km", "5‚Äì20km", "20‚Äì50km", ">50km", "Unknown"]
comm_count = drone_df["comm_category"].value_counts().reindex(category_order, fill_value=0)

print(comm_count)


In [None]:
# Drone counts per communication-range category.
# NOTE(review): these numbers are hard-coded rather than taken from comm_count — keep in sync manually.
comm_countsr = pd.Series(
    [3, 37, 47, 16, 15, 61],
    index=["<1km", "1‚Äì5km", "5‚Äì20km", "20‚Äì50km", ">50km", "Unknown"],
)

# Colorblind-safe palette, one colour per category (grey last, for "Unknown")
colors = [
    "#E69F00",
    "#56B4E9",
    "#009E73",
    "#F0E442",
    "#0072B2",
    "#999999",
]

# Pick a readable label colour for a given wedge colour
def get_text_color(hex_color):
    """Return 'black' for bright backgrounds (weighted RGB brightness above 0.6), otherwise 'white'."""
    luminance_weights = [0.299, 0.587, 0.114]
    brightness = np.dot(mcolors.hex2color(hex_color), luminance_weights)
    if brightness > 0.6:
        return 'black'
    return 'white'

# Donut chart of the communication-range categories
fig, ax = plt.subplots(figsize=(8, 6))
fig.subplots_adjust(top=0.9, bottom=0.2)
pie_options = dict(
    labels=comm_countsr.index,
    autopct='%1.1f%%',
    startangle=10,
    pctdistance=0.82,
    wedgeprops={"width": 0.4},
    colors=colors,
)
wedges, texts, autotexts = ax.pie(comm_countsr, **pie_options)

# Percentage labels: contrast colour chosen per wedge; all labels share one font size
for pct_label, wedge_color in zip(autotexts, colors):
    pct_label.set_color(get_text_color(wedge_color))
    pct_label.set_fontsize(19)
for category_label in texts:
    category_label.set_fontsize(19)

# White disc in the middle
centre_circle = plt.Circle((0, 0), 0.65, fc='white')
ax.add_artist(centre_circle)

# Title inside the hole
ax.text(
    0, 0, "Drone\nCommunication\nRange",
    ha='center', va='center', fontsize=24, weight='bold', color='black',
)

# Caption
#plt.figtext(0.5, 0.025,
#    "Distribution of drone communication range (including unknown values).",
#    wrap=True, horizontalalignment='center', fontsize=18)

ax.axis('equal')
plt.tight_layout()

# Save the figure before showing it
plt.savefig(r'D:\NDIS_Database\13_NDIS_Display\Page3\pictures\drone_comm2.png', dpi=300, transparent=True)

plt.show()

# STAGE 2: Assign Disaster Phase

In [None]:
# Distinct hazard types present in the updated table
ghz_updated["HazardType"].unique()

In [None]:
# Step 1: recommended sensors per hazard type and disaster phase
phase_sensors_by_hazard = {
    "Volcano": {
        "Pre-Event": ["Magnetometers", "Seismic", "Camera"],
        "During": ["Thermal Camera", "Camera", "LiDAR"],
        "Post-Event": ["LiDAR", "Camera", "Seismic"],
        "Clean-Up": ["LiDAR", "Camera", "Seismic"],
    },
    "Earthquake": {
        "Pre-Event": ["Seismic", "Magnetometers", "Camera"],
        "During": ["Seismic", "Camera", "LiDAR"],
        "Post-Event": ["LiDAR", "Camera", "Seismic"],
    },
    "Fault": {
        "Pre-Event": ["Seismic", "Magnetometers", "Camera"],
        "Post-Event": ["Seismic", "Camera", "LiDAR"],
    },
    "Landslide": {
        "Pre-Event": ["LiDAR", "GPR", "Camera"],
        "During": ["Camera", "Thermal Camera", "LiDAR"],
        "Post-Event": ["LiDAR", "Seismic", "Camera"],
        "Clean-Up": ["Camera", "LiDAR", "Seismic"],
    },
    "Tsunami": {
        "During": ["BPR", "Camera", "Seismic"],
        "Post-Event": ["BPR", "Camera", "LiDAR"],
        "Clean-Up": ["Camera", "LiDAR", "Thermal Camera"],
    },
    "Nuclear": {
        "Pre-Event": ["Thermal Camera", "Camera", "LiDAR"],
        "During": ["Thermal Camera", "Camera", "LiDAR"],
        "Post-Event": ["Camera", "LiDAR", "Gamma Spectrometer"],
        "Clean-Up": ["Camera", "LiDAR", "Gamma Spectrometer"],
    },
}

# Flatten to (HazardType, DisasterPhase, sensors) tuples, in the order written above
disaster_map = [
    (hazard_type, phase, sensors)
    for hazard_type, phases in phase_sensors_by_hazard.items()
    for phase, sensors in phases.items()
]

In [None]:
phase_columns = ["HazardType", "DisasterPhase", "RecommendedSensors"]
disaster_df = pd.DataFrame(disaster_map, columns=phase_columns)

# Step 2: attach every disaster phase of its hazard type to each HazardID
# (only the minimal fields are carried along to limit memory use)
ghz_core = ghz_updated[["HazardID", "HazardType", "latitude", "longitude"]]
ghz_expanded = pd.merge(ghz_core, disaster_df, on="HazardType", how="left")

# Step 3: one row per recommended sensor
ghz_expanded = ghz_expanded.explode("RecommendedSensors")
ghz_expanded = ghz_expanded.rename(columns={"RecommendedSensors": "RecommendedSensor"})
ghz_expanded.info()

# STAGE 3: Assign Sensor Weight

In [None]:
# Whitespace-stripped join keys on both sides (added as helper columns)
sensor_df["sensor_name_clean"] = sensor_df["sensor_name"].str.strip()
ghz_expanded["RecommendedSensor_clean"] = ghz_expanded["RecommendedSensor"].str.strip()

# Attach sensor weight and model to each recommended sensor
sensor_lookup = sensor_df[["sensor_name_clean", "sensor_weight", "model"]]
merged_df = pd.merge(
    ghz_expanded,
    sensor_lookup,
    left_on="RecommendedSensor_clean",
    right_on="sensor_name_clean",
    how="left",
)

# Drop the helper keys and rename model -> sensor_model
merged_df = merged_df.drop(columns=["sensor_name_clean", "RecommendedSensor_clean"])
merged_df = merged_df.rename(columns={"model": "sensor_model"})
merged_df.info()

In [None]:
# Distinct null flags of RecommendedSensor (True means some rows have no recommendation)
merged_df["RecommendedSensor"].isna().unique()

In [None]:
# Attach the hazard-to-road distance to every (hazard, phase, sensor) row.
# Any existing 'distance' column is dropped first so the cell can be re-run;
# otherwise a second run yields distance_x / distance_y and the lookup below fails.
distance_by_hazard = ghz_updated[["HazardID", "distance"]]
merged_df = (
    merged_df
    .drop(columns=["distance"], errors="ignore")
    .merge(distance_by_hazard, on="HazardID", how="left")
)
merged_df["distance"] = pd.to_numeric(merged_df["distance"], errors="coerce")
merged_df.info()

In [None]:
# Reproducible random preview. The sample size is capped at the frame length
# (sampling more rows than exist raises), and only preview columns that are
# already present are printed — 'mission_distance' is computed further down.
sample_size = min(100000, len(merged_df))
sample = merged_df.sample(sample_size, random_state=42)
preview_columns = [
    col for col in ["RecommendedSensor", "sensor_weight", "distance", "mission_distance"]
    if col in sample.columns
]
print(sample[preview_columns])
sample.info()

# Calculate Mission Distance (Mapping or Delivery)

In [None]:
# Survey footprint per sensor as (length, width) in meters; (None, None) = no footprint.
# "LiDAR" / "Thermal Camera" are the spellings used in disaster_map (and therefore
# in RecommendedSensor); the older "Lidar" / "Thermal_Camera" keys are kept as aliases.
sensor_default_area = {
    "Seismic": (None, None),
    "Magnetometers": (500, 200),
    "Lidar": (400, 400),
    "LiDAR": (400, 400),
    "GPR": (None, None),
    "Camera": (300, 300),
    "Thermal_Camera": (300, 300),
    "Thermal Camera": (300, 300),
    "Hyperspectral": (1000, 200),
    "Multispectral": (1000, 200),
    "EM": (400, 400),
    "Gravimeter": (500, 500),
    "BPR": (None, None),
    "Gamma Spectrometer": (500, 500)  # treated as mapping
}

# Spacing between survey lines per sensor (same units as the footprint)
sensor_spacing = {
    "Magnetometers": 5,
    "Lidar": 10,
    "LiDAR": 10,
    "EM": 10,
    "Gravimeter": 10,
    "Hyperspectral": 20,
    "Multispectral": 20,
    "Camera": 20,
    "Thermal_Camera": 20,
    "Thermal Camera": 20,
    "Gamma Spectrometer": 20
}

# Sensors that are delivered to the hazard location instead of flown over an area
direct_delivery_sensors = {"Seismic", "GPR", "BPR"}

In [None]:
def calculate_mission_distance_vectorized(row):
    """Return the mission distance (meters) for one row.

    Reads ``row["RecommendedSensor"]`` and ``row["distance"]`` and uses the
    module-level ``sensor_default_area``, ``sensor_spacing`` and
    ``direct_delivery_sensors`` tables.

    - distance not convertible to float -> NaN
    - direct-delivery sensor -> the hazard distance itself
    - sensor with a survey footprint -> ceil(width / spacing) * length
    - otherwise -> fallback estimate (see below)

    The sensor spellings used by ``disaster_map`` ("LiDAR", "Thermal Camera") are
    also resolved through their older table keys ("Lidar", "Thermal_Camera");
    previously they matched nothing and fell through to the hazard distance.
    """
    legacy_names = {"LiDAR": "Lidar", "Thermal Camera": "Thermal_Camera"}

    sensor = row["RecommendedSensor"]
    try:
        hazard_dist = float(row["distance"])
    except (TypeError, ValueError):
        return np.nan

    # Direct delivery: fly to the hazard, no area survey
    if sensor in direct_delivery_sensors:
        return hazard_dist

    # Area-based mapping: look up under the given name first, then the legacy spelling
    legacy = legacy_names.get(sensor, sensor)
    area = sensor_default_area.get(sensor) or sensor_default_area.get(legacy) or (None, None)
    spacing = sensor_spacing.get(sensor, sensor_spacing.get(legacy, 10))

    if area[0] is not None and area[1] is not None:
        area_length, area_width = area
        num_lines = math.ceil(area_width / spacing)
        return num_lines * area_length

    # No footprint defined: fallback estimates
    if legacy == "Magnetometers":
        return 20000
    if legacy == "Lidar":
        return math.pi * hazard_dist
    return hazard_dist


In [None]:
def batch_calculate_mission_distance(ghz_df, chunk_size=50000):
    """Return a copy of ``ghz_df`` with a ``mission_distance`` column, computed chunk by chunk.

    Parameters
    ----------
    ghz_df : DataFrame with ``RecommendedSensor`` and ``distance`` columns.
    chunk_size : number of rows processed (and reported) per chunk.

    Returns
    -------
    DataFrame with a fresh 0..n-1 index and the extra ``mission_distance`` column.
    A distance that cannot be converted to float yields NaN.

    Sensor names are matched case-insensitively with "_" and " " treated alike,
    so "LiDAR" / "Thermal Camera" (the spellings used in RecommendedSensor) hit
    the "Lidar" / "Thermal_Camera" table entries instead of silently falling
    through to the hazard distance.
    """
    def _key(name):
        # Canonical lookup key for a sensor name
        return str(name).strip().lower().replace("_", " ")

    # Survey footprint (length, width); (None, None) = no footprint
    sensor_default_area = {_key(name): area for name, area in {
        "Seismic": (None, None),
        "Magnetometers": (500, 200),
        "Lidar": (400, 400),
        "GPR": (None, None),
        "Camera": (300, 300),
        "Thermal_Camera": (300, 300),
        "Hyperspectral": (1000, 200),
        "Multispectral": (1000, 200),
        "EM": (400, 400),
        "Gravimeter": (500, 500),
        "Gamma Spectrometer": (500, 500),
        "BPR": (None, None)
    }.items()}

    # Spacing between survey lines
    sensor_spacing = {_key(name): gap for name, gap in {
        "Magnetometers": 5,
        "Lidar": 10,
        "EM": 10,
        "Gravimeter": 10,
        "Hyperspectral": 20,
        "Multispectral": 20,
        "Camera": 20,
        "Thermal_Camera": 20,
        "Gamma Spectrometer": 20
    }.items()}

    direct_delivery_sensors = {_key(name) for name in ("Seismic", "GPR", "BPR")}

    def get_mission_distance(sensor, hazard_dist):
        try:
            hazard_dist = float(hazard_dist)
        except (TypeError, ValueError):
            return np.nan

        name = _key(sensor)
        if name in direct_delivery_sensors:
            return hazard_dist

        area_length, area_width = sensor_default_area.get(name, (None, None))
        if area_length and area_width:
            num_lines = math.ceil(area_width / sensor_spacing.get(name, 10))
            return num_lines * area_length
        if name == "magnetometers":
            return 20000
        if name == "lidar":
            return math.pi * hazard_dist
        return hazard_dist

    # pd.concat([]) raises, so an empty input is handled up front
    if len(ghz_df) == 0:
        empty = ghz_df.reset_index(drop=True)
        empty["mission_distance"] = pd.Series(dtype="float64")
        return empty

    # Chunked application
    results = []
    for start in range(0, len(ghz_df), chunk_size):
        end = min(start + chunk_size, len(ghz_df))
        chunk = ghz_df.iloc[start:end].copy()
        print(f"\U0001f373 Calculating mission_distance for chunk {start} to {end}...")

        chunk["mission_distance"] = [
            get_mission_distance(sensor, dist)
            for sensor, dist in zip(chunk["RecommendedSensor"], chunk["distance"])
        ]
        results.append(chunk)

    return pd.concat(results, ignore_index=True)

## Mission Distance for CPM

In [None]:
def batch_calculate_mission_distance(ghz_df, chunk_size=50000):
    """Return a copy of ``ghz_df`` with a ``mission_distance`` column, computed chunk by chunk.

    Parameters
    ----------
    ghz_df : DataFrame with ``RecommendedSensor`` and ``distance`` columns.
    chunk_size : number of rows processed (and reported) per chunk.

    Returns
    -------
    DataFrame with a fresh 0..n-1 index and the extra ``mission_distance`` column.
    A missing or non-numeric hazard distance is replaced by 1000 before the
    calculation.

    Sensor names are matched case-insensitively with "_" and " " treated alike,
    so "LiDAR" / "Thermal Camera" (the spellings used in RecommendedSensor) hit
    the "Lidar" / "Thermal_Camera" table entries instead of silently falling
    through to the hazard distance.
    """
    def _key(name):
        # Canonical lookup key for a sensor name
        return str(name).strip().lower().replace("_", " ")

    # Survey footprint (length, width); (None, None) = no footprint
    sensor_default_area = {_key(name): area for name, area in {
        "Seismic": (None, None),
        "Magnetometers": (500, 200),
        "Lidar": (400, 400),
        "GPR": (None, None),
        "Camera": (300, 300),
        "Thermal_Camera": (300, 300),
        "Hyperspectral": (1000, 200),
        "Multispectral": (1000, 200),
        "EM": (400, 400),
        "Gravimeter": (500, 500),
        "Gamma Spectrometer": (500, 500),
        "BPR": (None, None)
    }.items()}

    # Spacing between survey lines
    sensor_spacing = {_key(name): gap for name, gap in {
        "Magnetometers": 5,
        "Lidar": 10,
        "EM": 10,
        "Gravimeter": 10,
        "Hyperspectral": 20,
        "Multispectral": 20,
        "Camera": 20,
        "Thermal_Camera": 20,
        "Gamma Spectrometer": 20
    }.items()}

    direct_delivery_sensors = {_key(name) for name in ("Seismic", "GPR", "BPR")}

    # Fallback when it's MOR environment, assume use boat and 1000 is safe distance
    fallback_distance = 1000

    def get_mission_distance(sensor, hazard_dist):
        try:
            hazard_dist = float(hazard_dist)
            if np.isnan(hazard_dist):
                hazard_dist = fallback_distance
        except (TypeError, ValueError):
            hazard_dist = fallback_distance

        name = _key(sensor)
        if name in direct_delivery_sensors:
            return hazard_dist

        area_length, area_width = sensor_default_area.get(name, (None, None))
        if area_length and area_width:
            num_lines = math.ceil(area_width / sensor_spacing.get(name, 10))
            return num_lines * area_length
        if name == "magnetometers":
            return 20000
        if name == "lidar":
            return math.pi * hazard_dist
        return hazard_dist

    # pd.concat([]) raises, so an empty input is handled up front
    if len(ghz_df) == 0:
        empty = ghz_df.reset_index(drop=True)
        empty["mission_distance"] = pd.Series(dtype="float64")
        return empty

    # Chunked application
    results = []
    for start in range(0, len(ghz_df), chunk_size):
        end = min(start + chunk_size, len(ghz_df))
        chunk = ghz_df.iloc[start:end].copy()
        print(f"\U0001f373 Calculating mission_distance for chunk {start} to {end}...")

        chunk["mission_distance"] = [
            get_mission_distance(sensor, dist)
            for sensor, dist in zip(chunk["RecommendedSensor"], chunk["distance"])
        ]
        results.append(chunk)

    return pd.concat(results, ignore_index=True)

In [None]:
# Add the mission_distance column to merged_df (default chunk size spelled out)
merged_df = batch_calculate_mission_distance(merged_df, chunk_size=50000)

In [None]:
# Sanity checks. Series.max() skips NaN, whereas the builtin max() over a Series
# containing NaN returns an order-dependent (unreliable) result.
print(drone_df['comm_range'].max())
print(merged_df['distance'].max())
# True when at least one mission_distance is missing
print(merged_df['mission_distance'].isna().any())

----
# CPM
----

In [None]:
def compute_cpm(row):
    """Compute the CPM time components (minutes) for one row.

    Reads ``row["distance"]`` and ``row["mission_distance"]`` (meters) and
    ``row["RecommendedSensor"]``.

    Returns a Series with ``travel_time``, ``monitor_time``, ``setup_buffer``
    and ``cpm_total_time`` (their sum). A NaN distance propagates into
    ``travel_time`` and ``cpm_total_time``.

    The magnetometer branch now matches "Magnetometers" — the spelling used in
    RecommendedSensor — as well as the former "Magnetometer"; previously those
    rows fell through to the generic 5-minute / 30-minute defaults.
    """
    distance = row["distance"]  # meters
    mission_distance = row["mission_distance"]  # meters
    sensor = row["RecommendedSensor"]

    # Step 1: Travel Time
    cruise_speed = 16  # m/s, average multicopter cruise speed
    travel_time = distance / cruise_speed / 60  # in minutes

    # Mapping sensors and their survey speed in m/s
    mapping_speeds = {
        "Camera": 5,
        "LiDAR": 5,
        "GPR": 3,
        "Thermal Camera": 3,
        "Magnetometers": 10,
        "Magnetometer": 10,
    }
    is_mapping = sensor in mapping_speeds

    # Step 2: Monitor Time
    if is_mapping:
        monitor_time = mission_distance / mapping_speeds[sensor] / 60
    elif sensor in ("Seismic", "BPR"):
        monitor_time = 3  # fixed minutes
    else:
        monitor_time = 5  # fallback default

    # Step 3: Setup Buffer
    if is_mapping:
        setup_buffer = 40  # mapping mission: GCPs, calibration
    elif mission_distance == distance:
        setup_buffer = 20  # delivery-style: point drop/pick
    else:
        setup_buffer = 30  # fallback intermediate buffer

    # Final CPM Total Time
    cpm_total_time = travel_time + monitor_time + setup_buffer

    return pd.Series({
        "travel_time": travel_time,
        "monitor_time": monitor_time,
        "setup_buffer": setup_buffer,
        "cpm_total_time": cpm_total_time
    })

In [None]:
def batch_compute_cpm(ghz_df, chunk_size=50000):
    """Apply ``compute_cpm`` to ``ghz_df`` in chunks of ``chunk_size`` rows.

    Returns the input rows with the CPM columns appended, re-indexed 0..n-1.
    """
    total_rows = len(ghz_df)
    pieces = []

    for first in range(0, total_rows, chunk_size):
        last = min(first + chunk_size, total_rows)
        part = ghz_df.iloc[first:last].copy()
        print(f"\u23F3 Computing CPM for chunk {first} to {last}...")

        # Row-wise CPM components, placed side by side with the source rows
        cpm_part = part.apply(compute_cpm, axis=1)
        pieces.append(pd.concat([part, cpm_part], axis=1))

    return pd.concat(pieces, ignore_index=True)

In [None]:
# Time the CPM computation over the whole table
start_time = timeit.default_timer()

ghz_with_cpm = batch_compute_cpm(merged_df)

elapsed = timeit.default_timer() - start_time
elapsed_minutes = elapsed/60
print("\u2705 All processing completed! Elapsed time: %s minutes"%str(elapsed_minutes))

# Assign Drone

In [None]:
def batch_assign_best_drones(merged_df, drone_df, chunk_size=50000):
    """Pick up to three drones per row of ``merged_df`` and annotate each pick.

    Parameters
    ----------
    merged_df : DataFrame with ``sensor_weight``, ``distance``, ``mission_distance``
        and ``RecommendedSensor`` columns.
    drone_df : DataFrame with ``mfc_model``, ``max_payload_weight``,
        ``distance_range`` and ``comm_range``; rows with any of these missing are ignored.
    chunk_size : number of rows processed (and reported) per chunk.

    Returns
    -------
    Copy of ``merged_df`` (index reset) with ``drone1..3`` / ``note1..3`` columns.
    Rows without a mission distance, or without any eligible drone, keep None.

    Selection
    ---------
    - Eligible drones: all for "Camera"; the top payload quartile when the
      sensor weight is unknown; otherwise drones whose payload >= sensor weight.
    - Tier by ``distance_range`` vs mission distance: >=100% -> 1, >=50% -> 2,
      >=25% -> 3, else 4. The three lowest tiers win; ties keep ``drone_df`` order.
    - A drone whose ``comm_range`` is below the hazard distance gets
      "; Comm range insufficient" appended to its note. (This warning used to be
      overwritten by the coverage note and was never reported.)
    - A missing hazard distance is treated as 999999 (offshore).

    NOTE(review): the former "Long-range mission" fallback could never trigger
    (a tier was always assigned before it was checked) and has been left out.
    """
    import numpy as np

    drone_cols = ["drone1", "drone2", "drone3"]
    note_cols = ["note1", "note2", "note3"]

    # pd.concat([]) raises, so an empty input is handled up front
    if len(merged_df) == 0:
        empty = merged_df.reset_index(drop=True)
        for col in drone_cols + note_cols:
            empty[col] = None
        return empty

    # Only keep necessary drone fields and drop NaNs
    drone_df = drone_df[["mfc_model", "max_payload_weight", "distance_range", "comm_range"]].dropna()
    models = drone_df["mfc_model"].to_numpy()
    payload = drone_df["max_payload_weight"].to_numpy(dtype=float)
    reach = drone_df["distance_range"].to_numpy(dtype=float)
    comm = drone_df["comm_range"].to_numpy(dtype=float)

    all_drones = np.arange(len(drone_df))
    # Unknown sensor weight: assume a heavy payload, use the top 25% of drones by payload
    heavy_drones = np.flatnonzero(payload >= drone_df["max_payload_weight"].quantile(0.75))

    coverage_notes = {
        1: "Full coverage",
        2: "Needs 2‚Äì3 flights",
        3: "Multiple sweeps",
        4: "Limited coverage",
    }

    results = []
    for start in range(0, len(merged_df), chunk_size):
        end = min(start + chunk_size, len(merged_df))
        chunk = merged_df.iloc[start:end].copy()
        print(f"\U0001f373 Assigning drones for chunk {start} to {end}...")

        # Make sure fields are numeric
        chunk["distance"] = pd.to_numeric(chunk["distance"], errors="coerce")
        chunk["mission_distance"] = pd.to_numeric(chunk["mission_distance"], errors="coerce")
        chunk["sensor_weight"] = pd.to_numeric(chunk["sensor_weight"], errors="coerce")

        # Result columns are collected as plain lists and attached once per chunk
        picks = {col: [None] * len(chunk) for col in drone_cols + note_cols}

        row_values = zip(
            chunk["RecommendedSensor"],
            chunk["sensor_weight"].to_numpy(dtype=float),
            chunk["distance"].to_numpy(dtype=float),
            chunk["mission_distance"].to_numpy(dtype=float),
        )
        for pos, (sensor, s_weight, h_dist, m_dist) in enumerate(row_values):
            if np.isnan(m_dist):
                continue  # mission distance must be defined
            if np.isnan(h_dist):
                h_dist = 999999  # treat as offshore

            # Determine candidates
            if sensor == "Camera":
                eligible = all_drones
            elif np.isnan(s_weight):
                eligible = heavy_drones
            else:
                eligible = np.flatnonzero(payload >= s_weight)
            if eligible.size == 0:
                continue

            # Coverage tier of every eligible drone, evaluated in one numpy pass
            reach_e = reach[eligible]
            tier = np.select(
                [reach_e >= m_dist, reach_e >= 0.5 * m_dist, reach_e >= 0.25 * m_dist],
                [1, 2, 3],
                default=4,
            )
            best = np.argsort(tier, kind="stable")[:3]

            for slot, k in enumerate(best):
                drone_idx = eligible[k]
                note = coverage_notes[int(tier[k])]
                if comm[drone_idx] < h_dist:
                    note += "; Comm range insufficient"
                picks[drone_cols[slot]][pos] = models[drone_idx]
                picks[note_cols[slot]][pos] = note

        for col, values in picks.items():
            chunk[col] = values

        results.append(chunk)

    return pd.concat(results, ignore_index=True)


In [None]:
def heartbeat():
    """Print a liveness message every 30 seconds, forever.

    The loop never returns, so this is only useful as the target of a
    background thread.
    """
    pause_seconds = 30
    message = "...still running..."
    while 1:
        print(message)
        time.sleep(pause_seconds)

In [None]:
# Run heartbeat() on a daemon thread so its endless loop cannot keep the process alive.
# NOTE(review): no handle or stop flag is kept, so every re-run of this cell adds another thread.
threading.Thread(target=heartbeat, daemon=True).start()

In [None]:
# Wall-clock timing of one full drone-assignment pass over merged_df.
start_time = timeit.default_timer()
final_df = batch_assign_best_drones(merged_df, drone_df, chunk_size=50000)
elapsed = timeit.default_timer() - start_time

# %s already applies str() to the float, so the printed text is unchanged.
print("\u2705 All processing completed! Elapsed time: %s minutes" % (elapsed / 60))

In [None]:
# Define the heartbeat thread function
def heartbeat(interval=3600):
    """Print a timestamped liveness message every ``interval`` seconds, forever.

    Parameters
    ----------
    interval : int or float
        Seconds to sleep between two messages (default: one hour).

    The loop never returns, so this is only useful as a background-thread target.
    """
    while 1:
        stamp = time.strftime('%Y-%m-%d %H:%M:%S')
        print(f"‚è≥ Heartbeat: still running at {stamp}")
        time.sleep(interval)

In [None]:
# Hourly heartbeat on a daemon thread while the long job runs.
threading.Thread(target=heartbeat, args=(3600,), daemon=True).start()

# Time the grouped top-3 assignment; its results are written under save_dir.
start_time = timeit.default_timer()
assign_top3_drones_grouped_fixed(merged_df, drone_df, save_dir="D:/NDIS_Database/19_PostProcessing")
elapsed = timeit.default_timer() - start_time

# %s already applies str() to the float, so the printed text is unchanged.
print("\u2705 All processing completed! Elapsed time: %s hour(s)" % (elapsed / 60 / 60))

In [None]:
def batch_assign_best_drones(merged_df, drone_df, chunk_size=50000, save_dir="D:/NDIS_Database/19_PostProcessing"):
    """Assign up to three drone models, each with a coverage note, to every row of ``merged_df``.

    The frame is processed in slices of ``chunk_size`` rows. Each finished slice is
    written to ``save_dir`` as ``drones_chunk_<start>_<end>.feather`` and all slices
    are concatenated for the return value.

    Candidate drones for a row are selected by payload:

    * ``RecommendedSensor == "Camera"``: every drone is a candidate;
    * ``sensor_weight`` missing: drones at or above the 75th percentile of
      ``max_payload_weight``;
    * otherwise: drones whose ``max_payload_weight`` is at least ``sensor_weight``.

    Candidates are ranked by tier (lower is better), derived from the drone's
    ``distance_range`` relative to the row's ``mission_distance``:

    1. range >= mission distance, 2. range >= 50 %, 3. range >= 25 %,
    4. anything less, 5. anything less while ``distance`` exceeds 500000.

    Tier 5 used to be unreachable because it was tested against a sentinel value
    that the preceding branches always overwrote. The comm-range warning was
    likewise always overwritten; it is now appended to the coverage note.

    Rows whose ``mission_distance`` is missing are left unassigned. A missing
    ``distance`` is treated as 999999.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Needs ``distance``, ``mission_distance``, ``sensor_weight`` (coerced with
        ``pd.to_numeric(errors="coerce")``) and ``RecommendedSensor``.
    drone_df : pandas.DataFrame
        Needs ``mfc_model``, ``max_payload_weight``, ``distance_range`` and
        ``comm_range``; rows with a missing value in any of them are ignored.
    chunk_size : int
        Number of rows per processed/saved slice.
    save_dir : str
        Directory for the intermediate feather files (created if absent).

    Returns
    -------
    pandas.DataFrame
        The input rows (fresh RangeIndex) plus ``drone1..3`` and ``note1..3``;
        unassigned slots hold ``None``.
    """
    os.makedirs(save_dir, exist_ok=True)

    drone_df = drone_df[["mfc_model", "max_payload_weight", "distance_range", "comm_range"]].dropna()
    drone_cols = ["drone1", "drone2", "drone3"]
    note_cols = ["note1", "note2", "note3"]
    heavy_payload_cutoff = None  # 75th-percentile payload; loop-invariant, computed on first use
    all_results = []

    def eval_row(dr, h_dist, m_dist):
        """Return ``[tier, note]`` for one candidate drone."""
        if dr["distance_range"] >= m_dist:
            tier, note = 1, "Full coverage"
        elif dr["distance_range"] >= 0.5 * m_dist:
            tier, note = 2, "Needs 2‚Äì3 flights"
        elif dr["distance_range"] >= 0.25 * m_dist:
            tier, note = 3, "Multiple sweeps"
        elif h_dist > 500000:
            # Fallback for long-range/offshore
            tier, note = 5, "Long-range mission ‚Äî consider boat or relay"
        else:
            tier, note = 4, "Limited coverage"

        # Always check comm range (for all missions) and keep the warning visible.
        if dr["comm_range"] < h_dist:
            note = f"{note}; Comm range insufficient"
        return pd.Series([tier, note])

    for start in range(0, len(merged_df), chunk_size):
        end = min(start + chunk_size, len(merged_df))
        chunk = merged_df.iloc[start:end].copy()
        print(f"\U0001f373 Assigning drones for chunk {start} to {end}...")

        for col in ("distance", "mission_distance", "sensor_weight"):
            chunk[col] = pd.to_numeric(chunk[col], errors="coerce")
        for col in drone_cols + note_cols:
            chunk[col] = None

        for i, row in chunk.iterrows():
            m_dist = row["mission_distance"]
            if np.isnan(m_dist):
                continue  # nothing to rank against

            h_dist = row["distance"]
            if np.isnan(h_dist):
                h_dist = 999999

            s_weight = row["sensor_weight"]
            if row["RecommendedSensor"] == "Camera":
                candidates = drone_df.copy()
            elif np.isnan(s_weight):
                if heavy_payload_cutoff is None:
                    heavy_payload_cutoff = drone_df["max_payload_weight"].quantile(0.75)
                candidates = drone_df[drone_df["max_payload_weight"] >= heavy_payload_cutoff].copy()
            else:
                candidates = drone_df[drone_df["max_payload_weight"] >= s_weight].copy()

            if candidates.empty:
                continue

            candidates[["tier", "note"]] = candidates.apply(eval_row, axis=1, args=(h_dist, m_dist))
            candidates = candidates[candidates["tier"] < 6].sort_values("tier").head(3)

            for j, (_, drone_row) in enumerate(candidates.iterrows()):
                chunk.at[i, drone_cols[j]] = drone_row["mfc_model"]
                chunk.at[i, note_cols[j]] = drone_row["note"]

        # Save intermediate result
        out_path = os.path.join(save_dir, f"drones_chunk_{start}_{end}.feather")
        chunk.reset_index(drop=True).to_feather(out_path)
        print(f"\u2705 Saved chunk {start} to {end} ‚Üí {out_path}")

        all_results.append(chunk)
        gc.collect()

    if not all_results:
        # Empty input: pd.concat([]) would raise, so return an empty frame with the output columns.
        empty = merged_df.copy()
        for col in drone_cols + note_cols:
            empty[col] = None
        return empty

    return pd.concat(all_results, ignore_index=True)

In [None]:
# Hourly heartbeat on a daemon thread while the long job runs.
threading.Thread(target=heartbeat, args=(3600,), daemon=True).start()

# Time the chunked assignment over the full merged_df.
start_time = timeit.default_timer()
final_df = batch_assign_best_drones(merged_df, drone_df, chunk_size=50000)
elapsed = timeit.default_timer() - start_time

# %s already applies str() to the float, so the printed text is unchanged.
print("\u2705 All processing completed! Elapsed time: %s minutes" % (elapsed / 60))

In [None]:
# Define the directory where feather files are saved
feather_dir = "D:/NDIS_Database/19_PostProcessing"
merged_output_path = os.path.join(feather_dir, "drones_full_result.feather")

# List the chunk files ordered by their numeric (start, end) row offsets.
# A plain sorted() on the names is lexicographic and would place
# "drones_chunk_100000_150000" before "drones_chunk_50000_100000",
# concatenating the chunks out of row order.
chunk_name_pattern = re.compile(r"drones_chunk_(\d+)_(\d+)\.feather")
feather_files = sorted(
    (f for f in os.listdir(feather_dir) if chunk_name_pattern.fullmatch(f)),
    key=lambda f: tuple(int(part) for part in chunk_name_pattern.fullmatch(f).groups()),
)

# Load and concatenate all feather chunks
# NOTE(review): every matching file in the directory is merged, including chunks left by earlier runs.
chunks = [pd.read_feather(os.path.join(feather_dir, f)) for f in feather_files]
final_df = pd.concat(chunks, ignore_index=True)

# Save the final merged file
final_df.to_feather(merged_output_path)
print(f"ü•ö Merged {len(chunks)} chunks and saved to ‚Üí {merged_output_path}")

In [None]:
# Sanity check: list the distinct values of the categorical inputs used by the assignment.
print(f"Sensor weights: {final_df.sensor_weight.unique()}")
print(f"RecommendedSensors: {final_df.RecommendedSensor.unique()}")
print(f"disaster phases: {final_df.DisasterPhase.unique()}")

In [None]:
# Shows whether mission_distance has missing values: the result contains True, False, or both.
final_df.mission_distance.isna().unique()

In [None]:
# Distinct values stored in the third-choice slot.
final_df.drone3.unique()

In [None]:
# Distinct values stored in the first-choice slot.
final_df.drone1.unique()

In [None]:
# Distinct values stored in the second-choice slot.
final_df.drone2.unique()

-----
# ver 3.8.1
-----

In [None]:
# Load the saved ghz_with_cpm table from its feather file and print its column/dtype summary.
ghz_with_cpm = pd.read_feather(r"D:\NDIS_Database\20_PaperSimulation\ghz_with_cpm_v1.feather")
ghz_with_cpm.info()

In [None]:
def batch_assign_best_drones(merged_df, drone_df, chunk_size=50000, save_dir="D:/NDIS_Database/19_PostProcessing"):
    """Assign up to three distinct drone models, each with a note, to every row of ``merged_df``.

    The frame is processed in slices of ``chunk_size`` rows. Each finished slice is
    written to ``save_dir`` as ``drones_chunk_<start>_<end>.feather`` and all slices
    are concatenated for the return value.

    Candidate drones for a row are selected by payload:

    * ``RecommendedSensor == "Camera"``: every drone is a candidate;
    * ``sensor_weight`` missing: drones at or above the 75th percentile of
      ``max_payload_weight``;
    * otherwise: drones whose ``max_payload_weight`` is at least ``sensor_weight``.

    Each candidate gets a tier (lower is better):

    1-3. ``distance_range`` covers 100 % / 50 % / 25 % of ``mission_distance``;
    4. less than 25 % while ``distance`` exceeds 300000;
    5. ``mission_distance`` unknown (payload match only);
    6. no match; the candidate is dropped.

    Survivors are ordered by tier, then by ``max_payload_weight``. If none survive
    and the sensor weight is known, the three lightest drones that can carry it
    are force-assigned. When ``distance`` is known and exceeds a drone's
    ``comm_range``, "; Comm range insufficient" is appended to its note. That
    warning used to be overwritten by the coverage note.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Needs ``distance``, ``mission_distance``, ``sensor_weight`` (coerced with
        ``pd.to_numeric(errors="coerce")``) and ``RecommendedSensor``.
    drone_df : pandas.DataFrame
        Needs ``mfc_model``, ``max_payload_weight``, ``distance_range`` and
        ``comm_range``; rows with a missing value in any of them are ignored.
    chunk_size : int
        Number of rows per processed/saved slice.
    save_dir : str
        Directory for the intermediate feather files (created if absent).

    Returns
    -------
    pandas.DataFrame
        The input rows (fresh RangeIndex) plus ``drone1..3`` and ``note1..3``;
        unassigned slots hold ``None``.
    """
    os.makedirs(save_dir, exist_ok=True)

    drone_df = drone_df[["mfc_model", "max_payload_weight", "distance_range", "comm_range"]].dropna()
    drone_cols = ["drone1", "drone2", "drone3"]
    note_cols = ["note1", "note2", "note3"]
    heavy_payload_cutoff = None  # 75th-percentile payload; loop-invariant, computed on first use
    all_results = []

    def to_float(value):
        """Convert to float, mapping unconvertible values to NaN."""
        try:
            return float(value)
        except (TypeError, ValueError):  # narrow on purpose: never swallow KeyboardInterrupt
            return np.nan

    def eval_row(dr, h_dist, m_dist):
        """Return ``[tier, note]`` for one candidate drone."""
        note = ""
        tier = 6
        if pd.isna(m_dist):
            tier = 5
            note = "Fallback: distance unknown, payload match only"
        elif dr["distance_range"] >= m_dist:
            tier = 1
            note = "Full coverage"
        elif dr["distance_range"] >= 0.5 * m_dist:
            tier = 2
            note = "Needs 2‚Äì3 flights"
        elif dr["distance_range"] >= 0.25 * m_dist:
            tier = 3
            note = "Multiple sweeps"
        elif h_dist > 300000:  # False when h_dist is NaN
            tier = 4
            note = "Fallback: long range mission, relay/boat likely"

        # Tier-6 rows are discarded by the caller, so only annotate kept ones.
        if tier < 6 and not pd.isna(h_dist) and dr["comm_range"] < h_dist:
            note = f"{note}; Comm range insufficient"
        return pd.Series([tier, note])

    for start in range(0, len(merged_df), chunk_size):
        end = min(start + chunk_size, len(merged_df))
        chunk = merged_df.iloc[start:end].copy()
        print(f"\U0001f373 Assigning drones for chunk {start} to {end}...")

        for col in ("distance", "mission_distance", "sensor_weight"):
            chunk[col] = pd.to_numeric(chunk[col], errors="coerce")
        for col in drone_cols + note_cols:
            chunk[col] = None

        for i, row in chunk.iterrows():
            s_weight = row["sensor_weight"]
            h_dist = to_float(row["distance"])
            m_dist = to_float(row["mission_distance"])

            if row["RecommendedSensor"] == "Camera":
                candidates = drone_df.copy()
            elif pd.isna(s_weight):
                if heavy_payload_cutoff is None:
                    heavy_payload_cutoff = drone_df["max_payload_weight"].quantile(0.75)
                candidates = drone_df[drone_df["max_payload_weight"] >= heavy_payload_cutoff].copy()
            else:
                candidates = drone_df[drone_df["max_payload_weight"] >= s_weight].copy()

            if candidates.empty:
                continue

            candidates[["tier", "note"]] = candidates.apply(eval_row, axis=1, args=(h_dist, m_dist))
            candidates = candidates[candidates["tier"] < 6].sort_values(by=["tier", "max_payload_weight"])

            # If no match at all, force fallback by payload match only
            if candidates.empty and not pd.isna(s_weight):
                fallback = drone_df[drone_df["max_payload_weight"] >= s_weight].copy()
                fallback["note"] = "Force-deployed; no range match"
                fallback["tier"] = 5
                candidates = fallback.sort_values("max_payload_weight").head(3)

            # Top 3 distinct drone models, best-ranked occurrence of each model first
            top3 = candidates.drop_duplicates("mfc_model").head(3)
            for j, (model, note) in enumerate(zip(top3["mfc_model"], top3["note"])):
                chunk.at[i, drone_cols[j]] = model
                chunk.at[i, note_cols[j]] = note

        # Save intermediate result
        out_path = os.path.join(save_dir, f"drones_chunk_{start}_{end}.feather")
        chunk.reset_index(drop=True).to_feather(out_path)
        print(f"\u2705 Saved chunk {start} to {end} ‚Üí {out_path}")

        all_results.append(chunk)
        gc.collect()

    if not all_results:
        # Empty input: pd.concat([]) would raise, so return an empty frame with the output columns.
        empty = merged_df.copy()
        for col in drone_cols + note_cols:
            empty[col] = None
        return empty

    return pd.concat(all_results, ignore_index=True)

In [None]:
# Draw a reproducible 50,000-row random subset (fixed random_state) for faster experiments.
sample = ghz_with_cpm.sample(50000, random_state=42)
sample.info()

In [None]:
def batch_assign_best_drones_with_usage(merged_df, drone_df, chunk_size=50000, save_dir=r"D:\NDIS_Database\20_PaperSimulation"):
    """Assign up to three drone models per row, preferring models that were used less often so far.

    The frame is processed in slices of ``chunk_size`` rows. Each finished slice is
    written to ``save_dir`` as ``drones_chunk_<start>_<end>.feather`` and all slices
    are concatenated for the return value.

    For every row with a known ``sensor_weight`` the drones able to carry it are
    given a coverage tier (lower is better):

    1-3. ``distance_range`` covers 100 % / 50 % / 25 % of ``mission_distance``;
    4. less than 25 % while ``distance`` exceeds 300000;
    5. less than 25 % otherwise.

    Candidates are ordered by tier, then by how often the model has already been
    assigned during this call, then by the smallest payload surplus.

    A missing ``distance`` is treated as 999999 and every note of that row is
    replaced by the "Likely ocean" marker. A missing ``mission_distance`` falls
    back to the distance value. When a drone's ``comm_range`` is below the
    distance, "; Comm range insufficient" is appended to its note. That warning
    used to be overwritten by the coverage note.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Needs ``distance``, ``mission_distance`` and ``sensor_weight`` (coerced
        with ``pd.to_numeric(errors="coerce")``).
    drone_df : pandas.DataFrame
        Needs ``mfc_model``, ``max_payload_weight``, ``distance_range`` and
        ``comm_range``; incomplete rows are ignored and only the first row per
        ``mfc_model`` is kept.
    chunk_size : int
        Number of rows per processed/saved slice.
    save_dir : str
        Directory for the intermediate feather files (created if absent).

    Returns
    -------
    pandas.DataFrame
        The input rows (fresh RangeIndex) plus ``drone1..3`` and ``note1..3``;
        unassigned slots hold ``None``.
    """
    os.makedirs(save_dir, exist_ok=True)

    drone_df = drone_df[["mfc_model", "max_payload_weight", "distance_range", "comm_range"]].dropna()
    drone_df = drone_df.drop_duplicates("mfc_model").reset_index(drop=True)

    drone_usage = defaultdict(int)  # Track how often each drone is used
    output_cols = ["drone1", "drone2", "drone3", "note1", "note2", "note3"]
    all_chunks = []

    def score_drone(dr, h_dist, m_dist):
        """Return ``[tier, note]`` for one candidate drone."""
        if dr["distance_range"] >= m_dist:
            tier, note = 1, "Full coverage"
        elif dr["distance_range"] >= 0.5 * m_dist:
            tier, note = 2, "2‚Äì3 swaths"
        elif dr["distance_range"] >= 0.25 * m_dist:
            tier, note = 3, "Multiple flights"
        elif h_dist > 300000:
            tier, note = 4, "Fallback: Consider boat/relay"
        else:
            tier, note = 5, "Fallback: Coverage limited"

        if dr["comm_range"] < h_dist:
            note = f"{note}; Comm range insufficient"
        return pd.Series([tier, note])

    for start in range(0, len(merged_df), chunk_size):
        end = min(start + chunk_size, len(merged_df))
        chunk = merged_df.iloc[start:end].copy()
        print(f"üöÅ Processing chunk {start} to {end}...")

        for col in ("distance", "mission_distance", "sensor_weight"):
            chunk[col] = pd.to_numeric(chunk[col], errors="coerce")
        for col in output_cols:
            chunk[col] = None

        for i, row in tqdm(chunk.iterrows(), total=len(chunk), desc=f"Matching drones {start}-{end}"):
            s_weight = row["sensor_weight"]
            if pd.isna(s_weight):
                continue

            h_dist = row["distance"]
            ocean_flag = bool(np.isnan(h_dist))
            if ocean_flag:
                h_dist = 999999

            m_dist = row["mission_distance"]
            if np.isnan(m_dist):
                m_dist = h_dist

            # Step 1: Filter by payload
            suitable_drones = drone_df[drone_df["max_payload_weight"] >= s_weight].copy()
            if suitable_drones.empty:
                continue

            # Step 2: Assign coverage tier
            suitable_drones[["tier", "note"]] = suitable_drones.apply(score_drone, axis=1, args=(h_dist, m_dist))

            # Step 3: Compute usage priority. The payload filter above guarantees a
            # non-negative gap, so no negative-gap guard is needed.
            suitable_drones["usage_count"] = suitable_drones["mfc_model"].map(drone_usage)
            suitable_drones["payload_gap"] = suitable_drones["max_payload_weight"] - s_weight

            # Step 4: Sort by tier, then usage count, then closest payload match.
            # drone_df holds one row per model, so the ranking needs no further de-duplication.
            top3 = suitable_drones.sort_values(by=["tier", "usage_count", "payload_gap"]).head(3)

            for j, (model, note) in enumerate(zip(top3["mfc_model"], top3["note"]), start=1):
                chunk.at[i, f"drone{j}"] = model
                chunk.at[i, f"note{j}"] = note if not ocean_flag else "Likely ocean ‚Äî distance unknown"
                drone_usage[model] += 1

        # Save
        out_path = os.path.join(save_dir, f"drones_chunk_{start}_{end}.feather")
        chunk.reset_index(drop=True).to_feather(out_path)
        print(f"‚úÖ Saved: {out_path}")
        all_chunks.append(chunk)
        gc.collect()

    if not all_chunks:
        # Empty input: pd.concat([]) would raise, so return an empty frame with the output columns.
        empty = merged_df.copy()
        for col in output_cols:
            empty[col] = None
        return empty

    final_df = pd.concat(all_chunks, ignore_index=True)
    return final_df


In [None]:
# Hourly heartbeat on a daemon thread while the long job runs.
threading.Thread(target=heartbeat, args=(3600,), daemon=True).start()

# Time the usage-balanced assignment on the sampled rows.
start_time = timeit.default_timer()
final_df = batch_assign_best_drones_with_usage(sample, drone_df, chunk_size=50000)
elapsed = timeit.default_timer() - start_time

# %s already applies str() to the float, so the printed text is unchanged.
print("\u2705 All processing completed! Elapsed time: %s minutes" % (elapsed / 60))

In [None]:
# Number of distinct values in the first-choice slot (unique() also counts a missing value).
len(final_df.drone1.unique())

In [None]:
# Distinct values stored in the first-choice slot.
final_df.drone1.unique()

In [None]:
# Number of distinct values in the second-choice slot (unique() also counts a missing value).
len(final_df.drone2.unique())

In [None]:
# Distinct values stored in the second-choice slot.
final_df.drone2.unique()

In [None]:
# Number of distinct values in the third-choice slot (unique() also counts a missing value).
len(final_df.drone3.unique())

In [None]:
# Distinct values stored in the third-choice slot.
final_df.drone3.unique()

In [None]:
# Count how often each model appears across all three slots (stack() drops missing slots).
drone_counts = final_df[["drone1", "drone2", "drone3"]].stack().value_counts()

# Display the 60 most used drones, most frequent first
print(drone_counts.head(60))

In [None]:
# Column names, dtypes and non-null counts of the drone catalogue.
drone_df.info()

-----
# ver 3.8.3

In [None]:
def assign_best_drones_batch(
    merged_df,
    drone_df,
    chunk_size=50000,
    save_dir=r"D:\NDIS_Database\20_PaperSimulation\FinalDroneAssignment",
    seed=None
):
    """Score every payload-capable drone per row and keep the three lowest-scoring models.

    The frame is processed in slices of ``chunk_size`` rows. Each finished slice is
    written to ``save_dir`` as ``drone_assign_chunk_<start>_<end>.feather`` and all
    slices are concatenated for the return value. Rows without a ``sensor_weight``
    are skipped.

    Score (lower is better) starts at 100 and is adjusted by:

    * +50 when ``distance`` is known and exceeds the drone's ``comm_range``;
    * -5 / +5 / +10 / +30 when ``distance_range`` covers 100 % / 50 % / 25 % / less
      of ``mission_distance``, or +20 when ``mission_distance`` is unknown;
    * +0.05 per unit of payload surplus over ``sensor_weight``;
    * +0.01 per previous assignment of the model during this call;
    * a uniform random jitter in [-0.5, 0.5].

    When the comm range is insufficient, "; Comm range insufficient" is appended
    to the coverage note. That warning used to be overwritten.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Needs ``distance``, ``mission_distance`` and ``sensor_weight`` (coerced
        with ``pd.to_numeric(errors="coerce")``).
    drone_df : pandas.DataFrame
        Needs ``mfc_model``, ``max_payload_weight``, ``distance_range`` and
        ``comm_range``; rows lacking model or payload are ignored.
    chunk_size : int
        Number of rows per processed/saved slice.
    save_dir : str
        Directory for the intermediate feather files (created if absent).
    seed : int or None
        Seed for the jitter. ``None`` (default) keeps the previous behaviour of
        drawing from the global ``random`` state; any other value makes the
        assignment reproducible without touching the global state.

    Returns
    -------
    pandas.DataFrame
        The input rows (fresh RangeIndex) plus ``drone1..3`` and ``note1..3``;
        unassigned slots hold ``None``.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Pre-clean drone_df
    drone_df = drone_df.dropna(subset=["mfc_model", "max_payload_weight"]).copy()
    drone_df["drone_id"] = drone_df["mfc_model"]
    for col in ("comm_range", "distance_range", "max_payload_weight"):
        drone_df[col] = pd.to_numeric(drone_df[col], errors="coerce")

    # Tracking usage
    usage_count = {model: 0 for model in drone_df["drone_id"].unique()}
    rng = random if seed is None else random.Random(seed)
    output_cols = ["drone1", "drone2", "drone3", "note1", "note2", "note3"]
    all_results = []

    def score_drone(dr, h_dist, m_dist, s_weight):
        """Return ``[score, note]`` for one candidate drone."""
        score = 100

        comm_short = (not pd.isna(h_dist)) and dr["comm_range"] < h_dist
        if comm_short:
            score += 50

        if pd.isna(m_dist):
            note = "Distance unknown ‚Äî likely offshore"
            score += 20
        elif dr["distance_range"] >= m_dist:
            score -= 5
            note = "Full coverage"
        elif dr["distance_range"] >= 0.5 * m_dist:
            score += 5
            note = "2‚Äì3 swaths needed"
        elif dr["distance_range"] >= 0.25 * m_dist:
            score += 10
            note = "Multiple passes"
        else:
            score += 30
            note = "Very limited range"

        if comm_short:
            note = f"{note}; Comm range insufficient"

        # Prefer payloads close to sensor weight
        score += abs(dr["max_payload_weight"] - s_weight) * 0.05

        # Soft randomness + usage count bias
        score += usage_count.get(dr["drone_id"], 0) * 0.01
        score += rng.uniform(-0.5, 0.5)

        return pd.Series([score, note])

    for start in range(0, len(merged_df), chunk_size):
        end = min(start + chunk_size, len(merged_df))
        chunk = merged_df.iloc[start:end].copy()
        print(f"\U0001f680 Processing chunk {start}-{end}")

        # Clean fields
        for col in ("distance", "mission_distance", "sensor_weight"):
            chunk[col] = pd.to_numeric(chunk[col], errors="coerce")
        for col in output_cols:
            chunk[col] = None

        for idx, row in tqdm(chunk.iterrows(), total=len(chunk)):
            s_weight = row["sensor_weight"]
            if pd.isna(s_weight):
                continue  # skip if no sensor

            # Primary filter: payload
            candidates = drone_df[drone_df["max_payload_weight"] >= s_weight].copy()
            if candidates.empty:
                continue

            candidates[["score", "note"]] = candidates.apply(
                score_drone, axis=1, args=(row["distance"], row["mission_distance"], s_weight)
            )

            # Best-scoring row per model, then the three best models
            top3 = candidates.sort_values("score").drop_duplicates("drone_id").head(3)

            for slot, (model, note) in enumerate(zip(top3["drone_id"], top3["note"]), start=1):
                chunk.at[idx, f"drone{slot}"] = model
                chunk.at[idx, f"note{slot}"] = note
                usage_count[model] += 1

        # Save chunk
        out_path = os.path.join(save_dir, f"drone_assign_chunk_{start}_{end}.feather")
        chunk.reset_index(drop=True).to_feather(out_path)
        all_results.append(chunk)
        gc.collect()

    if not all_results:
        # Empty input: pd.concat([]) would raise, so return an empty frame with the output columns.
        empty = merged_df.copy()
        for col in output_cols:
            empty[col] = None
        return empty

    return pd.concat(all_results, ignore_index=True)


In [None]:
# Hourly heartbeat on a daemon thread while the long job runs.
threading.Thread(target=heartbeat, args=(3600,), daemon=True).start()

# Time the score-based assignment on the sampled rows.
start_time = timeit.default_timer()
final_df = assign_best_drones_batch(sample, drone_df, chunk_size=50000)
elapsed = timeit.default_timer() - start_time

# %s already applies str() to the float, so the printed text is unchanged.
print("\u2705 All processing completed! Elapsed time: %s minutes" % (elapsed / 60))

In [None]:
# Long format: one row per (hazard row, slot) with the model in `drone_model`,
# then trim stray whitespace from both text columns.
melted = (
    final_df
    .melt(
        id_vars=['HazardType'],
        value_vars=['drone1', 'drone2', 'drone3'],
        var_name='drone_rank',
        value_name='drone_model'
    )
    .assign(
        HazardType=lambda frame: frame['HazardType'].str.strip(),
        drone_model=lambda frame: frame['drone_model'].str.strip(),
    )
)

# Assignments per (hazard type, model), most frequent first
drone_hazard_counts = (
    melted
    .groupby(['HazardType', 'drone_model'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

# Horizontal grouped bars on a logarithmic count axis
fig, ax = plt.subplots(figsize=(14, 10))
sns.barplot(data=drone_hazard_counts, x='count', y='drone_model', hue='HazardType', ax=ax)
ax.set_xscale('log')

ax.set_title('Drone Model Usage per Hazard Type (Log Scale)', fontsize=16)
ax.set_xlabel('Count (log scale)', fontsize=14)
ax.set_ylabel('Drone Model', fontsize=14)
ax.legend(title='Hazard Type', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

----
# Drone Rotation included

In [None]:
# Rename `model_name` to `mfc_model` in place; rename() ignores the mapping if `model_name` is absent.
# NOTE(review): mutates drone_df for every later cell; cells above already read `mfc_model`.
drone_df.rename(columns={"model_name": "mfc_model"}, inplace=True)

In [None]:
def assign_best_drones_with_blacklist_batch(
    merged_df,
    drone_df,
    chunk_size=50000,
    save_dir=r"D:\NDIS_Database\FinalDroneAssignment",
    blacklist_threshold=20000,
    seed=None
):
    """Score every payload-capable drone per row, rotating away from over-used models.

    The frame is processed in slices of ``chunk_size`` rows. Each finished slice is
    written to ``save_dir`` as ``drone_assign_chunk_<start>_<end>.feather`` and all
    slices are concatenated for the return value. Rows without a ``sensor_weight``
    are skipped.

    Score (lower is better) starts at 100 and is adjusted by:

    * +50 when ``distance`` is known and exceeds the drone's ``comm_range``;
    * -5 / +5 / +10 / +30 when ``distance_range`` covers 100 % / 50 % / 25 % / less
      of ``mission_distance``, or +20 when ``mission_distance`` is unknown;
    * +0.05 per unit of payload surplus over ``sensor_weight``;
    * +0.01 per previous assignment of the model, or +9999 once the model has
      been assigned ``blacklist_threshold`` times;
    * a uniform random jitter in [-0.5, 0.5].

    The +9999 penalty ranks a blacklisted model last but does not remove it:
    such a model is still assigned when fewer than three other candidates exist.

    When the comm range is insufficient, "; Comm range insufficient" is appended
    to the coverage note. That warning used to be overwritten.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Needs ``distance``, ``mission_distance`` and ``sensor_weight`` (coerced
        with ``pd.to_numeric(errors="coerce")``).
    drone_df : pandas.DataFrame
        Needs ``mfc_model``, ``max_payload_weight``, ``distance_range`` and
        ``comm_range``; rows lacking model or payload are ignored.
    chunk_size : int
        Number of rows per processed/saved slice.
    save_dir : str
        Directory for the intermediate feather files (created if absent).
    blacklist_threshold : int
        Assignment count at which a model receives the +9999 penalty.
    seed : int or None
        Seed for the jitter. ``None`` (default) keeps the previous behaviour of
        drawing from the global ``random`` state; any other value makes the
        assignment reproducible without touching the global state.

    Returns
    -------
    pandas.DataFrame
        The input rows (fresh RangeIndex) plus ``drone1..3`` and ``note1..3``;
        unassigned slots hold ``None``.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Clean drone_df
    drone_df = drone_df.dropna(subset=["mfc_model", "max_payload_weight"]).copy()
    drone_df["drone_id"] = drone_df["mfc_model"]
    for col in ("comm_range", "distance_range", "max_payload_weight"):
        drone_df[col] = pd.to_numeric(drone_df[col], errors="coerce")

    usage_count = {model: 0 for model in drone_df["drone_id"].unique()}
    rng = random if seed is None else random.Random(seed)
    output_cols = ["drone1", "drone2", "drone3", "note1", "note2", "note3"]
    all_results = []

    def score_drone(dr, h_dist, m_dist, s_weight):
        """Return ``[score, note]`` for one candidate drone."""
        score = 100

        comm_short = (not pd.isna(h_dist)) and dr["comm_range"] < h_dist
        if comm_short:
            score += 50

        if pd.isna(m_dist):
            note = "Distance unknown ‚Äî likely offshore"
            score += 20
        elif dr["distance_range"] >= m_dist:
            score -= 5
            note = "Full coverage"
        elif dr["distance_range"] >= 0.5 * m_dist:
            score += 5
            note = "2‚Äì3 swaths needed"
        elif dr["distance_range"] >= 0.25 * m_dist:
            score += 10
            note = "Multiple passes"
        else:
            score += 30
            note = "Very limited range"

        if comm_short:
            note = f"{note}; Comm range insufficient"

        # Payload proximity
        score += abs(dr["max_payload_weight"] - s_weight) * 0.05

        # Usage penalty
        uc = usage_count.get(dr["drone_id"], 0)
        if uc >= blacklist_threshold:
            score += 9999  # Blacklisted: ranked last, used only as a last resort
        else:
            score += uc * 0.01  # Soft bias

        score += rng.uniform(-0.5, 0.5)
        return pd.Series([score, note])

    for start in range(0, len(merged_df), chunk_size):
        end = min(start + chunk_size, len(merged_df))
        chunk = merged_df.iloc[start:end].copy()
        print(f"\U0001f680 Processing chunk {start}-{end}")

        for col in ("distance", "mission_distance", "sensor_weight"):
            chunk[col] = pd.to_numeric(chunk[col], errors="coerce")
        for col in output_cols:
            chunk[col] = None

        for idx, row in tqdm(chunk.iterrows(), total=len(chunk)):
            s_weight = row["sensor_weight"]
            if pd.isna(s_weight):
                continue

            candidates = drone_df[drone_df["max_payload_weight"] >= s_weight].copy()
            if candidates.empty:
                continue

            candidates[["score", "note"]] = candidates.apply(
                score_drone, axis=1, args=(row["distance"], row["mission_distance"], s_weight)
            )

            # Best-scoring row per model, then the three best models
            top3 = candidates.sort_values("score").drop_duplicates("drone_id").head(3)

            for slot, (model, note) in enumerate(zip(top3["drone_id"], top3["note"]), start=1):
                chunk.at[idx, f"drone{slot}"] = model
                chunk.at[idx, f"note{slot}"] = note
                usage_count[model] += 1

        # Save feather
        out_path = os.path.join(save_dir, f"drone_assign_chunk_{start}_{end}.feather")
        chunk.reset_index(drop=True).to_feather(out_path)
        all_results.append(chunk)
        gc.collect()

    if not all_results:
        # Empty input: pd.concat([]) would raise, so return an empty frame with the output columns.
        empty = merged_df.copy()
        for col in output_cols:
            empty[col] = None
        return empty

    return pd.concat(all_results, ignore_index=True)

In [None]:
def heartbeat(interval=3600):
    """Emit a timestamped liveness message through ``tqdm.write`` every ``interval`` seconds, forever.

    Parameters
    ----------
    interval : int or float
        Seconds to sleep between two messages (default: one hour).

    The loop never returns, so this is only useful as a background-thread target.
    """
    while 1:
        stamp = time.strftime('%Y-%m-%d %H:%M:%S')
        tqdm.write(f"‚è≥ Heartbeat: still running at {stamp}")
        time.sleep(interval)

In [None]:
# Hourly heartbeat on a daemon thread while the long job runs.
threading.Thread(target=heartbeat, args=(3600,), daemon=True).start()

# Time the blacklist-aware assignment on the sampled rows.
start_time = timeit.default_timer()
final_df = assign_best_drones_with_blacklist_batch(sample, drone_df, chunk_size=50000)
elapsed = timeit.default_timer() - start_time

# %s already applies str() to the float, so the printed text is unchanged.
print("\u2705 All processing completed! Elapsed time: %s minutes" % (elapsed / 60))

In [None]:
# Count how often each model appears across all three slots (stack() drops missing slots).
drone_counts = final_df[["drone1", "drone2", "drone3"]].stack().value_counts()

# Display the 60 most used drones, most frequent first
print(drone_counts.head(60))

In [None]:
# Distinct values stored in the first-choice slot.
final_df.drone1.unique()

In [None]:
# Number of distinct values in the first-choice slot (unique() also counts a missing value).
len(final_df.drone1.unique())

In [None]:
# Distinct values stored in the second-choice slot.
final_df.drone2.unique()

In [None]:
# Number of distinct values in the second-choice slot (unique() also counts a missing value).
len(final_df.drone2.unique())

In [None]:
# Distinct values stored in the third-choice slot.
final_df.drone3.unique()

In [None]:
# Number of distinct values in the third-choice slot (unique() also counts a missing value).
len(final_df.drone3.unique())

In [None]:
# Columns to remove from dronesensor.
cols_to_drop = [
    'country',
    'length',
    'width',
    'height',
    'max_speed',
    'max_alt',
    'power_source',
    'price',
    'image',
    'source',
    'configuration_harmonized'
]

# NOTE(review): drop() raises KeyError if any listed column is absent, so re-running this
# cell on the already-trimmed frame fails.
dronesensor = dronesensor.drop(columns=cols_to_drop)

In [None]:
# Distinct sensor labels used in dronesensor.
dronesensor.Sensor.unique()

In [None]:
# Distinct sensor names available in sensor_df.
sensor_df.sensor_name.unique()

In [None]:
# Distinct raw values of the `configuration` column.
drone_df.configuration.unique()

In [None]:
# Column names, dtypes and non-null counts of the drone catalogue.
drone_df.info()

In [None]:
# Column names, dtypes and non-null counts of the assignment result.
final_df.info()

In [None]:
# Map raw `configuration` values (string keys) to harmonized airframe labels.
# Several spellings collapse onto the same label.
config_map = {
    '0': 'Fixed Wing',
    '2': 'Helicopter / Bicopter',
    '3': 'Tricopter',
    '4': 'Quadcopter',
    '6': 'Hexacopter',
    '8': 'Octocopter',
    '5': 'Fixed Wing VTOL',
    '5 fixed wing VTOL': 'Fixed Wing VTOL',
    'fixed wing VTOL': 'Fixed Wing VTOL',
    'fixedwing VTOL': 'Fixed Wing VTOL',
    'fixed wing': 'Fixed Wing',
    'helicopter': 'Helicopter / Bicopter',
    'X8': 'X8 (Hybrid Octo)'
}

# Apply the mapping; values that are not keys of config_map become NaN.
# NOTE(review): the keys are strings, so this presumably expects `configuration` to hold strings; confirm the dtype.
drone_df["configuration_harmonized"] = drone_df["configuration"].map(config_map)

In [None]:
# Build whitespace-trimmed, lower-cased join keys on both frames so the model names match case-insensitively.
dronesensor['DroneModel_clean'] = dronesensor['DroneModel'].str.strip().str.lower()
drone_df['mfc_model_clean'] = drone_df['mfc_model'].str.strip().str.lower()

In [None]:
# Step 1: Clean up and filter drone_df first to save memory
exclude_fields = [
    'rpas_id', 'reg', 'def_payload', 'purpose', 'flight_cont',
    'materials', 'min_op_temp', 'max_op_temp', 'power',
    'gps_accuracy', 'engine', 'oas', 'flight_time', 'configuration'
]

# Drop extra fields if they exist (absent names are skipped instead of raising)
drone_extra = drone_df.drop(columns=[col for col in exclude_fields if col in drone_df.columns])

# Step 2: Merge only needed columns to avoid memory bloating
# Left join on the cleaned model name: every dronesensor row is kept; unmatched rows get NaN image.
# NOTE(review): if mfc_model_clean is not unique in drone_extra, matching rows are duplicated.
dronesensor_df = dronesensor.merge(
    drone_extra[['mfc_model_clean', 'image']],  # merge only relevant fields
    left_on='DroneModel_clean',
    right_on='mfc_model_clean',
    how='left',
    copy=False
)

# Step 3: Patch missing images manually (lightweight fix)
# Keys are compared against the raw DroneModel text, so the match is exact and case-sensitive.
manual_image_map = {
    'Dragandfly Innovations Inc Starling X.2': 'https://draganfly.com/wp-content/uploads/2023/07/Disaster-Response.webp',
    'Dragandfly Innovations Inc Heavy Lift Drone': 'https://candrone.com/cdn/shop/products/ScreenShot2022-06-03at11.55.59AM.png'
}
# Only rows that still lack an image and whose model has a manual URL are patched.
mask = dronesensor_df['image'].isna() & dronesensor_df['DroneModel'].isin(manual_image_map)
dronesensor_df.loc[mask, 'image'] = dronesensor_df.loc[mask, 'DroneModel'].map(manual_image_map)

# Step 4: Run garbage collection to clean up memory (recommended in ArcGIS Pro)
gc.collect()

# Optional: Save to disk or GDB/table
# dronesensor_df.to_feather(r"path_to\light_dronesensor.feather")  # very fast load/save

print("‚úÖ Done. Remaining NaN images:", dronesensor_df['image'].isna().sum())

In [None]:
# Step 1: Clean up and filter drone_df first to save memory
exclude_fields = [
    'rpas_id', 'reg', 'def_payload', 'purpose', 'flight_cont',
    'materials', 'min_op_temp', 'max_op_temp', 'power',
    'gps_accuracy', 'engine', 'oas', 'flight_time', 'configuration'
]

# Drop extra fields if they exist (absent names are skipped instead of raising)
drone_extra = drone_df.drop(columns=[col for col in exclude_fields if col in drone_df.columns])

# Step 2: Merge only needed columns to avoid memory bloating
# Left join on the model name: every final_df row is kept; unmatched rows get NaN image.
# NOTE(review): this overwrites final_df, so re-running the cell merges `image` a second time.
final_df = final_df.merge(
    drone_extra[['mfc_model', 'image']],  # merge only relevant fields
    left_on='DroneModel',
    right_on='mfc_model',
    how='left',
    copy=False
)

# Step 3: Patch missing images manually (lightweight fix)
manual_image_map = {
    'Dragandfly Innovations Inc Starling X.2': 'https://draganfly.com/wp-content/uploads/2023/07/Disaster-Response.webp',
    'Dragandfly Innovations Inc Heavy Lift Drone': 'https://candrone.com/cdn/shop/products/ScreenShot2022-06-03at11.55.59AM.png'
}
# Build the mask from final_df itself. It was previously taken from dronesensor_df['image'],
# a different frame whose index does not describe final_df's rows.
mask = final_df['image'].isna() & final_df['DroneModel'].isin(manual_image_map)
final_df.loc[mask, 'image'] = final_df.loc[mask, 'DroneModel'].map(manual_image_map)

# Step 4: Run garbage collection to clean up memory (recommended in ArcGIS Pro)
gc.collect()

# Optional: Save to disk or GDB/table
# final_df.to_feather(r"path_to\light_final_df.feather")  # very fast load/save

print("‚úÖ Done. Remaining NaN images:", final_df['image'].isna().sum())

In [None]:
# Column names, dtypes and non-null counts after the image merge.
dronesensor_df.info()

In [None]:
# Re-merge with selected non-excluded columns
columns_to_add = [
    'mfc_model_clean', 'country', 'manufacturer', 'mfc_model',
    'length', 'width', 'height', 'max_speed', 'max_alt',
    'power_source', 'price', 'image', 'source', 'configuration_harmonized'
]

# Filter out columns that don't exist in drone_df to avoid error
columns_to_add = [col for col in columns_to_add if col in drone_df.columns]

# Re-merge
# First remove any earlier copies of these columns (the join key is kept) so the merge
# does not create suffixed duplicates, then bring in the current values from drone_df.
dronesensor_df = dronesensor_df.drop(columns=[col for col in columns_to_add if col != 'mfc_model_clean'], errors='ignore')
# NOTE(review): the merge needs 'mfc_model_clean' in both frames; it raises if an earlier cell did not create it.
dronesensor_df = dronesensor_df.merge(
    drone_df[columns_to_add],
    on='mfc_model_clean',
    how='left'
)

In [None]:
# Step 1: Ensure DroneModel_clean is lowercase and stripped
# NOTE(review): astype(str) turns a missing DroneModel into the literal text "nan".
dronesensor_df["DroneModel_clean"] = dronesensor_df["DroneModel"].astype(str).str.strip().str.lower()

# Step 2: Prepare manual image mapping (lowercased keys)
manual_image_map = {
    "dragandfly innovations inc starling x.2": "https://draganfly.com/wp-content/uploads/2023/07/Disaster-Response.webp",
    "dragandfly innovations inc heavy lift drone": "https://candrone.com/cdn/shop/products/ScreenShot2022-06-03at11.55.59AM.png"
}

# Step 3: Create mapping Series
manual_map_series = pd.Series(manual_image_map)

# Step 4: Fill only rows with missing images using a boolean mask and `.map()`
# Models without a manual entry map to NaN, so their image stays missing.
missing_mask = dronesensor_df["image"].isna()
dronesensor_df.loc[missing_mask, "image"] = (
    dronesensor_df.loc[missing_mask, "DroneModel_clean"].map(manual_map_series)
)


In [None]:
# Column names, dtypes and non-null counts after the manual image patch.
dronesensor_df.info()

In [None]:
# Shows whether any image is still missing: the result contains True, False, or both.
dronesensor_df.image.isna().unique()

In [None]:
# Show DroneModel values where image is NaN after merge
# (distinct model names only, so each model is listed once)
missing_image_models = dronesensor_df[dronesensor_df['image'].isna()]['DroneModel'].unique()

print(f"üö® DroneModels with missing image after merge ({len(missing_image_models)}):")
print(missing_image_models)

In [None]:
# Mapping dictionary from Sensor column values to sensor_name values.
# (A repeated 'Thermal_Camera' key was removed: a dict literal silently keeps only the last duplicate.)
sensor_mapping = {
    'Thermal_Camera': 'Thermal Camera',
    'Camera': 'Camera',
    'Lidar': 'LiDAR',
    'Magnetometers': 'Magnetometers',
    'Seismic': 'Seismic'
}

# Standardize Sensor names in dronesensor; labels without a mapping keep their original value
dronesensor['Sensor'] = dronesensor['Sensor'].map(sensor_mapping).fillna(dronesensor['Sensor'])

# List of fields to exclude from sensor_df
exclude_sensor = [
    'sensor_id', 'parameters_measured', 'method'
]

# Filter sensor_df to include only desired fields
sensor_extra = sensor_df.drop(columns=exclude_sensor)

# Merge the dataframes on the consistent sensor names
# Column names present in both frames get the '_sensor' suffix on the sensor_df side.
# NOTE(review): this overwrites dronesensor, so re-running the cell merges the sensor columns again.
dronesensor = dronesensor.merge(
    sensor_extra,
    left_on='Sensor',
    right_on='sensor_name',
    how='left',
    suffixes=('', '_sensor')
)

# Check the resulting dataframe
dronesensor.info()

In [None]:
# List of columns to exclude from dronesensor before merge
# ('source_sensor' was listed twice; each label is now named once.)
exclude_dronesensor_columns = [
    'sensor_name',
    'sensor_weight',
    'model',
    'source_sensor',
    'sensor_name_sensor',
    'sensor_weight_sensor',
    'model_sensor'
]

# Drop the unwanted columns from dronesensor; raises KeyError if any of them is absent
dronesensor_df = dronesensor.drop(columns=exclude_dronesensor_columns)

In [None]:
# List of fields to exclude from sensor_df
exclude_sensor = [
    'sensor_id', 'parameters_measured', 'method'
]

# Filter sensor_df to include only desired fields
sensor_extra = sensor_df.drop(columns=exclude_sensor)

# Merge
# Left join on the sensor label: every dronesensor_df row is kept; column names present
# in both frames get the '_sensor' suffix on the sensor_df side.
# NOTE(review): this overwrites dronesensor_df, so re-running the cell merges the sensor columns again.
dronesensor_df = dronesensor_df.merge(
    sensor_extra,
    left_on='Sensor',
    right_on='sensor_name',
    how='left',
    suffixes=('', '_sensor')
)
dronesensor_df.info()

In [None]:
# Model-name and manufacturer columns to remove from dronesensor_df.
cols_to_drop = [
    "DroneModel",
    "mfc_model",
    "mfc_model_clean",
    "manufacturer"
]

# NOTE(review): drop() raises KeyError if a listed column is absent, so this cell cannot be re-run as is.
dronesensor_df = dronesensor_df.drop(columns=cols_to_drop)
dronesensor_df.info()

In [None]:
# Ranking and capability columns to remove from dronesensor_df.
cols_to_drop = [
    "DroneRank",
    "PayloadOverkill",
    "DistanceOverkill",
    "mtow",
    "distance_range",
    "max_payload_weight",
    "comm_range"
]

# NOTE(review): drop() raises KeyError if a listed column is absent, so this cell cannot be re-run as is.
dronesensor_df = dronesensor_df.drop(columns=cols_to_drop)

----
# SD ver 3.8.3
----

In [None]:
def heartbeat(interval=3600, stop_event=None):
    """Periodically report that the kernel is still alive.

    Writes a timestamped line through ``tqdm.write`` (so it does not break an
    active progress bar), then waits ``interval`` seconds, and repeats.

    Parameters
    ----------
    interval : float
        Seconds between two messages.
    stop_event : threading.Event-like, optional
        Any object with ``is_set()`` and ``wait(timeout)``.  When given, the
        loop ends as soon as the event is set, including in the middle of a
        wait.  When omitted the function loops forever, which is the previous
        behaviour; run it in a daemon thread in that case.

    Returns
    -------
    None
        Only returns when ``stop_event`` has been set.
    """
    while stop_event is None or not stop_event.is_set():
        tqdm.write(f"‚è≥ Heartbeat: still running at {time.strftime('%Y-%m-%d %H:%M:%S')}")
        if stop_event is None:
            time.sleep(interval)
        else:
            # Event.wait returns early when the event is set, so stopping
            # does not have to wait for the full interval.
            stop_event.wait(interval)

In [None]:
def assign_best_drones_with_blacklist_batch(
    merged_df,
    drone_df,
    chunk_size=50000,
    save_dir=r"D:\NDIS_Database\FinalDroneAssignment",
    blacklist_threshold=20000
):
    """Rank drones for every mission row and keep the three best models.

    For each row of ``merged_df`` every drone whose ``max_payload_weight`` can
    carry the row's ``sensor_weight`` gets a score (lower is better) built
    from communication range vs. ``distance``, flight range vs.
    ``mission_distance``, payload surplus, how often the model was already
    assigned, and a small random jitter.  The three lowest-scoring distinct
    models go to ``drone1..drone3`` with matching ``note1..note3``.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Needs the columns ``distance``, ``mission_distance`` and
        ``sensor_weight``; they are coerced to numbers on a copy.  The index
        is ignored (rows are handled by position), so duplicate index labels
        are safe.
    drone_df : pandas.DataFrame
        Needs ``mfc_model``, ``max_payload_weight``, ``comm_range`` and
        ``distance_range``.  Rows without model or payload are ignored.
    chunk_size : int
        Number of rows processed and saved per feather file.
    save_dir : str
        Directory for the ``drone_assign_chunk_<start>_<end>.feather`` files;
        created when missing.
    blacklist_threshold : int
        Once a model has been assigned this many times its score is raised by
        9999.  This is a penalty, not a filter: the model can still be picked
        when fewer than three other candidates exist.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated with a fresh 0..n-1 index.  An empty
        ``merged_df`` yields an empty frame that already carries the six
        result columns.

    Notes
    -----
    The jitter uses the ``random`` module and this function does not seed it,
    so repeated runs can rank near-ties differently.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Clean drone_df: usable rows only, numeric specs (unparseable -> NaN).
    drone_df = drone_df.dropna(subset=["mfc_model", "max_payload_weight"]).copy()
    drone_df["drone_id"] = drone_df["mfc_model"]
    for spec in ("comm_range", "distance_range", "max_payload_weight"):
        drone_df[spec] = pd.to_numeric(drone_df[spec], errors="coerce")

    usage_count = {model: 0 for model in drone_df["drone_id"].unique()}
    result_cols = ["drone1", "drone2", "drone3", "note1", "note2", "note3"]
    all_results = []

    for start in range(0, len(merged_df), chunk_size):
        end = min(start + chunk_size, len(merged_df))
        # A positional index makes the `.at` writes below hit exactly one row
        # even when merged_df carries duplicate index labels.  The saved and
        # returned frames were re-indexed anyway, so the output is unchanged.
        chunk = merged_df.iloc[start:end].reset_index(drop=True)
        print(f"\U0001f680 Processing chunk {start}-{end}")

        for col in ("distance", "mission_distance", "sensor_weight"):
            chunk[col] = pd.to_numeric(chunk[col], errors="coerce")

        for col in result_cols:
            chunk[col] = None

        for idx, row in tqdm(chunk.iterrows(), total=len(chunk)):
            s_weight = row["sensor_weight"]
            h_dist = row["distance"]
            m_dist = row["mission_distance"]

            # Without a sensor weight no payload check is possible.
            if pd.isna(s_weight):
                continue

            candidates = drone_df[drone_df["max_payload_weight"] >= s_weight].copy()
            if candidates.empty:
                continue

            def score_drone(dr):
                score = 100
                note = ""

                if not pd.isna(h_dist):
                    if dr["comm_range"] < h_dist:
                        score += 50
                        # NOTE(review): this text never reaches the output,
                        # every branch below assigns `note` again.  Left as is
                        # so the note vocabulary of existing results does not
                        # change - confirm whether it should be reported.
                        note = "Comm range insufficient"

                if not pd.isna(m_dist):
                    if dr["distance_range"] >= m_dist:
                        score -= 5
                        note = "Full coverage"
                    elif dr["distance_range"] >= 0.5 * m_dist:
                        score += 5
                        note = "2‚Äì3 swaths needed"
                    elif dr["distance_range"] >= 0.25 * m_dist:
                        score += 10
                        note = "Multiple passes"
                    else:
                        # Also reached when distance_range is NaN.
                        score += 30
                        note = "Very limited range"
                else:
                    note = "Distance unknown ‚Äî likely offshore"
                    score += 20

                # Payload proximity: prefer drones that are not oversized.
                payload_diff = dr["max_payload_weight"] - s_weight
                score += abs(payload_diff) * 0.05

                # Usage penalty
                uc = usage_count.get(dr["drone_id"], 0)
                if uc >= blacklist_threshold:
                    score += 9999  # pushes the model behind every other one
                else:
                    score += uc * 0.01  # soft bias towards less-used models

                score += random.uniform(-0.5, 0.5)
                return pd.Series([score, note])

            candidates[["score", "note"]] = candidates.apply(score_drone, axis=1)
            # Best row per model first, then the three best models.
            best = candidates.sort_values("score").drop_duplicates("drone_id").head(3)

            for rank, (d_id, d_note) in enumerate(zip(best["drone_id"], best["note"]), start=1):
                chunk.at[idx, f"drone{rank}"] = d_id
                chunk.at[idx, f"note{rank}"] = d_note
                usage_count[d_id] += 1

        # Save feather
        out_path = os.path.join(save_dir, f"drone_assign_chunk_{start}_{end}.feather")
        chunk.to_feather(out_path)
        all_results.append(chunk)
        gc.collect()

    if not all_results:
        # pd.concat refuses an empty list; hand back an empty, fully shaped frame.
        empty = merged_df.iloc[0:0].copy()
        for col in result_cols:
            empty[col] = None
        return empty

    return pd.concat(all_results, ignore_index=True)

In [None]:
# Background heartbeat: one message per hour from a daemon thread.
threading.Thread(target=heartbeat, kwargs={"interval": 3600}, daemon=True).start()

# Time the full assignment run.
start_time = time.perf_counter()

final_df = assign_best_drones_with_blacklist_batch(sample, drone_df, chunk_size=50000)

# `elapsed` is kept in seconds; the message reports minutes.
elapsed = time.perf_counter() - start_time
print("\u2705 All processing completed! Elapsed time: %s minutes" % (elapsed / 60,))

In [None]:
save_dir = r"D:\NDIS_Database\FinalDroneAssignment"


def _feather_sort_key(path):
    """Sort key: chunk files by numeric start offset, any other file afterwards by name."""
    match = re.search(r"_(\d+)_(\d+)\.feather$", os.path.basename(path))
    if match:
        return (0, int(match.group(1)), path)
    return (1, 0, path)


# List all feather files in the save directory.  Plain string sorting would
# put "chunk_100000_..." before "chunk_50000_...", so the files are ordered
# by their numeric start offset to restore the original row order.
# NOTE(review): every *.feather file in save_dir is loaded, including files
# left over from earlier runs - clear the directory between runs if needed.
feather_files = sorted(
    (os.path.join(save_dir, f) for f in os.listdir(save_dir) if f.endswith(".feather")),
    key=_feather_sort_key,
)

# Read and concatenate all feather chunks
all_chunks = [pd.read_feather(fp) for fp in feather_files]
final_df = pd.concat(all_chunks, ignore_index=True)

# Now `final_df` contains the full output
final_df.info()

In [None]:
# Number of distinct third-choice drones; missing values count as a value.
final_df["drone3"].nunique(dropna=False)

In [None]:
def assign_best_drones_with_hybrid_penalty(
    merged_df,
    drone_df,
    chunk_size=50000,
    save_dir=r"D:\NDIS_Database\FinalDroneAssignment",
    global_penalty=0.001,
    category_penalty=0.01
):
    """Assign up to three drones per row, spreading usage globally and per category.

    Scoring (lower is better) follows the same range / payload rules as the
    blacklist variant, but instead of a hard threshold each candidate is
    penalised by ``global_penalty`` times its total number of assignments and
    by ``category_penalty`` times its assignments within the same
    (hazard, sensor) category.

    NOTE: a later cell of this notebook defines another function with the
    same name (different weights, tuple return value); once that cell has
    run, this definition is no longer reachable.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Needs ``distance``, ``mission_distance`` and ``sensor_weight``.
        ``HazardType``, ``sensor_model`` and ``RecommendedSensor`` are used
        for the category key when present.  The index is ignored, so
        duplicate index labels are safe.
    drone_df : pandas.DataFrame
        Needs ``mfc_model``, ``max_payload_weight``, ``comm_range`` and
        ``distance_range``.
    chunk_size : int
        Rows per processed/saved chunk.
    save_dir : str
        Directory for ``drone_assign_chunk_<start>_<end>.feather`` files.
    global_penalty, category_penalty : float
        Score added per previous assignment (overall / within the category).

    Returns
    -------
    pandas.DataFrame
        Input rows plus ``drone1..3`` and ``note1..3`` with a fresh index; an
        empty input gives an empty frame with those columns.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Clean drone_df
    drone_df = drone_df.dropna(subset=["mfc_model", "max_payload_weight"]).copy()
    drone_df["drone_id"] = drone_df["mfc_model"]
    for spec in ("comm_range", "distance_range", "max_payload_weight"):
        drone_df[spec] = pd.to_numeric(drone_df[spec], errors="coerce")

    # Initialize usage tracking
    global_usage_count = {model: 0 for model in drone_df["drone_id"].unique()}
    usage_by_category = {}

    result_cols = ["drone1", "drone2", "drone3", "note1", "note2", "note3"]
    all_results = []

    for start in range(0, len(merged_df), chunk_size):
        end = min(start + chunk_size, len(merged_df))
        # Positional index: `.at` below must address exactly one row even if
        # merged_df has repeated index labels.
        chunk = merged_df.iloc[start:end].reset_index(drop=True)
        print(f"\U0001f680 Processing chunk {start}-{end}")

        for col in ("distance", "mission_distance", "sensor_weight"):
            chunk[col] = pd.to_numeric(chunk[col], errors="coerce")

        for col in result_cols:
            chunk[col] = None

        for idx, row in tqdm(chunk.iterrows(), total=len(chunk)):
            s_weight = row["sensor_weight"]
            h_dist = row["distance"]
            m_dist = row["mission_distance"]

            # Category key parts.  Missing values must not become keys: the
            # first non-null of sensor_model / RecommendedSensor is used, and
            # "Unknown" when neither is available.
            sensor = "Unknown"
            for col in ("sensor_model", "RecommendedSensor"):
                if col in row.index and pd.notna(row[col]):
                    sensor = row[col]
                    break
            hazard = row.get("HazardType")
            if pd.isna(hazard):
                hazard = "Unknown"

            if pd.isna(s_weight):
                continue

            candidates = drone_df[drone_df["max_payload_weight"] >= s_weight].copy()
            if candidates.empty:
                continue

            def score_drone(dr):
                score = 100
                note = ""

                if not pd.isna(h_dist):
                    if dr["comm_range"] < h_dist:
                        score += 50
                        # NOTE(review): always overwritten by the branches
                        # below, so it never appears in the output - confirm
                        # whether it should be reported.
                        note = "Comm range insufficient"

                if not pd.isna(m_dist):
                    if dr["distance_range"] >= m_dist:
                        score -= 5
                        note = "Full coverage"
                    elif dr["distance_range"] >= 0.5 * m_dist:
                        score += 5
                        note = "2‚Äì3 swaths needed"
                    elif dr["distance_range"] >= 0.25 * m_dist:
                        score += 10
                        note = "Multiple passes"
                    else:
                        # Also reached when distance_range is NaN.
                        score += 30
                        note = "Very limited range"
                else:
                    note = "Distance unknown ‚Äî likely offshore"
                    score += 20

                # Payload proximity
                payload_diff = dr["max_payload_weight"] - s_weight
                score += abs(payload_diff) * 0.05

                # Hybrid penalty: overall usage plus usage within this category.
                drone_id = dr["drone_id"]
                g_usage = global_usage_count.get(drone_id, 0)
                c_usage = usage_by_category.get((hazard, sensor, drone_id), 0)

                score += g_usage * global_penalty
                score += c_usage * category_penalty

                score += random.uniform(-0.5, 0.5)
                return pd.Series([score, note])

            candidates[["score", "note"]] = candidates.apply(score_drone, axis=1)
            # Best row per model first, then the three best models.
            best = candidates.sort_values("score").drop_duplicates("drone_id").head(3)

            for rank, (drone_id, d_note) in enumerate(zip(best["drone_id"], best["note"]), start=1):
                chunk.at[idx, f"drone{rank}"] = drone_id
                chunk.at[idx, f"note{rank}"] = d_note
                cat_key = (hazard, sensor, drone_id)

                global_usage_count[drone_id] += 1
                usage_by_category[cat_key] = usage_by_category.get(cat_key, 0) + 1

        # Save feather
        out_path = os.path.join(save_dir, f"drone_assign_chunk_{start}_{end}.feather")
        chunk.to_feather(out_path)
        all_results.append(chunk)
        gc.collect()

    if not all_results:
        # pd.concat refuses an empty list; hand back an empty, fully shaped frame.
        empty = merged_df.iloc[0:0].copy()
        for col in result_cols:
            empty[col] = None
        return empty

    return pd.concat(all_results, ignore_index=True)

In [None]:
# Background heartbeat: one message per hour from a daemon thread.
threading.Thread(target=heartbeat, kwargs={"interval": 3600}, daemon=True).start()

# Time the full assignment run.
start_time = time.perf_counter()

final_df = assign_best_drones_with_hybrid_penalty(sample, drone_df, chunk_size=50000)

# `elapsed` is kept in seconds; the message reports minutes.
elapsed = time.perf_counter() - start_time
print("\u2705 All processing completed! Elapsed time: %s minutes" % (elapsed / 60,))

In [None]:
# Distinct third-choice drones (missing values included).
final_df["drone3"].unique()

In [None]:
def assign_best_drones_with_hybrid_penalty(
    merged_df,
    drone_df,
    chunk_size=50000,
    save_dir=r"D:\NDIS_Database\20_PaperSimulation\FinalDroneAssignment",
    global_penalty=0.001,
    category_penalty=0.01
):
    """Assign up to three drones per row and return the usage counters as well.

    Every drone that can carry the row's ``sensor_weight`` is scored (lower is
    better):

    * +30 when ``comm_range`` is shorter than ``distance``;
    * -10 / +2 / +5 / +15 / +25 depending on how much of ``mission_distance``
      the drone's ``distance_range`` covers (full, >=75 %, >=50 %, >=25 %,
      less), +10 when the mission distance is unknown;
    * payload surplus times 0.01, capped at 10;
    * ``global_penalty`` per earlier assignment of the model and
      ``category_penalty`` per earlier assignment within the same
      (hazard, sensor) category;
    * a random jitter in [-0.25, 0.25] (``random`` is not seeded here).

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Needs ``distance``, ``mission_distance`` and ``sensor_weight``.
        ``HazardType``, ``sensor_model`` and ``RecommendedSensor`` are used
        for the category key when present.  The index is ignored, so
        duplicate index labels are safe.
    drone_df : pandas.DataFrame
        Needs ``mfc_model``, ``max_payload_weight``, ``comm_range`` and
        ``distance_range``.
    chunk_size : int
        Rows per processed/saved chunk.
    save_dir : str
        Directory for ``drone_assign_chunk_<start>_<end>.feather`` files.
    global_penalty, category_penalty : float
        Score added per previous assignment (overall / within the category).

    Returns
    -------
    tuple
        ``(result_df, global_usage_count, usage_by_category)``; the counters
        are dicts keyed by model and by ``(hazard, sensor, model)``.  An empty
        input gives an empty frame with the six result columns.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Clean drone_df
    drone_df = drone_df.dropna(subset=["mfc_model", "max_payload_weight"]).copy()
    drone_df["drone_id"] = drone_df["mfc_model"]
    for spec in ("comm_range", "distance_range", "max_payload_weight"):
        drone_df[spec] = pd.to_numeric(drone_df[spec], errors="coerce")

    # Usage trackers
    global_usage_count = {model: 0 for model in drone_df["drone_id"].unique()}
    usage_by_category = {}

    result_cols = ["drone1", "drone2", "drone3", "note1", "note2", "note3"]
    all_results = []

    for start in range(0, len(merged_df), chunk_size):
        end = min(start + chunk_size, len(merged_df))
        # Positional index: `.at` below must address exactly one row even if
        # merged_df has repeated index labels.
        chunk = merged_df.iloc[start:end].reset_index(drop=True)
        print(f"\U0001f680 Processing chunk {start}-{end}")

        for col in ("distance", "mission_distance", "sensor_weight"):
            chunk[col] = pd.to_numeric(chunk[col], errors="coerce")

        for col in result_cols:
            chunk[col] = None

        for idx, row in tqdm(chunk.iterrows(), total=len(chunk)):
            s_weight = row["sensor_weight"]
            h_dist = row["distance"]
            m_dist = row["mission_distance"]

            # `a or b` does not work as a fallback here because NaN is
            # truthy; pick the first value that is neither missing nor empty.
            sensor = "Unknown"
            for col in ("sensor_model", "RecommendedSensor"):
                if col in row.index and pd.notna(row[col]) and row[col] != "":
                    sensor = row[col]
                    break
            hazard = row.get("HazardType")
            if pd.isna(hazard):
                hazard = "Unknown"

            if pd.isna(s_weight):
                continue

            candidates = drone_df[drone_df["max_payload_weight"] >= s_weight].copy()
            if candidates.empty:
                continue

            def score_drone(dr):
                score = 100
                note = ""

                # Comm range penalty
                if not pd.isna(h_dist) and dr["comm_range"] < h_dist:
                    score += 30
                    # NOTE(review): always overwritten by the branches below,
                    # so it never appears in the output - confirm whether it
                    # should be reported.
                    note = "Comm range insufficient"

                # Mission distance penalty
                if not pd.isna(m_dist):
                    if dr["distance_range"] >= m_dist:
                        score -= 10
                        note = "Full coverage"
                    elif dr["distance_range"] >= 0.75 * m_dist:
                        score += 2
                        note = "Near full coverage"
                    elif dr["distance_range"] >= 0.5 * m_dist:
                        score += 5
                        note = "2‚Äì3 swaths needed"
                    elif dr["distance_range"] >= 0.25 * m_dist:
                        score += 15
                        note = "Multiple passes"
                    else:
                        # Also reached when distance_range is NaN.
                        score += 25
                        note = "Very limited range"
                else:
                    score += 10
                    note = "Distance unknown"

                # Payload proximity penalty (mild, capped)
                payload_diff = dr["max_payload_weight"] - s_weight
                score += min(abs(payload_diff) * 0.01, 10)

                # Hybrid penalties
                drone_id = dr["drone_id"]
                g_usage = global_usage_count.get(drone_id, 0)
                c_usage = usage_by_category.get((hazard, sensor, drone_id), 0)
                score += g_usage * global_penalty
                score += c_usage * category_penalty

                score += random.uniform(-0.25, 0.25)
                return pd.Series([score, note])

            candidates[["score", "note"]] = candidates.apply(score_drone, axis=1)
            # Best row per model first, then the three best models.
            best = candidates.sort_values("score").drop_duplicates("drone_id").head(3)

            for rank, (drone_id, d_note) in enumerate(zip(best["drone_id"], best["note"]), start=1):
                chunk.at[idx, f"drone{rank}"] = drone_id
                chunk.at[idx, f"note{rank}"] = d_note
                cat_key = (hazard, sensor, drone_id)
                global_usage_count[drone_id] += 1
                usage_by_category[cat_key] = usage_by_category.get(cat_key, 0) + 1

        # Save feather
        out_path = os.path.join(save_dir, f"drone_assign_chunk_{start}_{end}.feather")
        chunk.to_feather(out_path)
        all_results.append(chunk)
        gc.collect()

    if all_results:
        result = pd.concat(all_results, ignore_index=True)
    else:
        # pd.concat refuses an empty list; build an empty, fully shaped frame.
        result = merged_df.iloc[0:0].copy()
        for col in result_cols:
            result[col] = None

    return result, global_usage_count, usage_by_category

In [None]:
# Background heartbeat: one message per hour from a daemon thread.
threading.Thread(target=heartbeat, kwargs={"interval": 3600}, daemon=True).start()

# Time the full assignment run.
start_time = time.perf_counter()

# The assigner returns the result frame together with both usage counters.
result_df, global_usage_count, usage_by_category = assign_best_drones_with_hybrid_penalty(
    ghz_with_cpm,
    drone_df,
    chunk_size=50000,
)

# `elapsed` is kept in seconds; the message reports minutes.
elapsed = time.perf_counter() - start_time
print("\u2705 All processing completed! Elapsed time: %s minutes" % (elapsed / 60,))

In [None]:
# Column names, dtypes and non-null counts of the assignment result.
result_df.info()

In [None]:
# Number of distinct second-choice drones; missing values count as a value.
# NOTE(review): the run just above stores its output in `result_df`, while
# this inspects `final_df` from an earlier cell - confirm which frame is meant.
final_df["drone2"].nunique(dropna=False)

In [None]:
# --- 1) Helper: rebuild usage + detect finished chunks ---
import os, glob
import pandas as pd
from collections import defaultdict

def rebuild_usage_from_saved(save_dir):
    """Reconstruct the usage counters from chunk files that are already on disk.

    Reads every ``drone_assign_chunk_<start>_<end>.feather`` file in
    ``save_dir`` and counts each non-null ``drone1..drone3`` entry, both per
    model and per ``(hazard, sensor, model)`` category.

    The category key is derived per row: hazard is ``HazardType``; sensor is
    the first non-null of ``sensor_model`` / ``RecommendedSensor``; missing
    values (or missing columns) become ``"Unknown"``.  Model ids are compared
    as strings.

    Parameters
    ----------
    save_dir : str
        Directory containing the chunk files.

    Returns
    -------
    tuple
        ``(global_usage_count, usage_by_category, done_ranges)``: two plain
        dicts and the set of ``(start, end)`` ranges found on disk.

    Notes
    -----
    A file that matches the glob but whose name does not end in two integers
    is skipped with a message.  Its range is unknown, so its rows could never
    be recognised as done, and counting them would inflate the counters once
    those rows are processed again.
    """
    global_usage_count = defaultdict(int)
    usage_by_category = defaultdict(int)
    done_ranges = set()

    def _first_label(frame, cols):
        # Row-wise first non-null value among `cols`; "Unknown" otherwise.
        labels = np.full(len(frame), None, dtype=object)
        for col in cols:
            if col in frame.columns:
                values = frame[col].to_numpy(dtype=object)
                missing = pd.isna(labels)
                labels[missing] = values[missing]
        labels[pd.isna(labels)] = "Unknown"
        return labels

    pattern = os.path.join(save_dir, "drone_assign_chunk_*_*.feather")
    for fp in sorted(glob.glob(pattern)):
        stem = os.path.basename(fp).replace(".feather", "")
        try:
            start_txt, end_txt = stem.split("_")[-2:]
            chunk_range = (int(start_txt), int(end_txt))
        except ValueError:
            print(f"Ignoring {fp}: cannot read a chunk range from its name")
            continue
        done_ranges.add(chunk_range)

        df = pd.read_feather(fp)
        sens = _first_label(df, ("sensor_model", "RecommendedSensor"))
        haz = _first_label(df, ("HazardType",))

        for k in (1, 2, 3):
            dcol = f"drone{k}"
            if dcol not in df.columns:
                continue
            drones = df[dcol].to_numpy(dtype=object)
            assigned = ~pd.isna(drones)
            if not assigned.any():
                continue
            for h, s, d in zip(haz[assigned], sens[assigned], drones[assigned]):
                d_id = str(d)
                global_usage_count[d_id] += 1
                usage_by_category[(h, s, d_id)] += 1

    return dict(global_usage_count), dict(usage_by_category), done_ranges


# --- 2) Faster + resumable assigner (hybrid penalties, vectorized scoring) ---
import os, gc
import numpy as np
import pandas as pd
from tqdm import tqdm

def assign_best_drones_with_hybrid_penalty_resumable(
    merged_df,
    drone_df,
    chunk_size=50_000,
    save_dir=r"D:\NDIS_Database\20_PaperSimulation\FinalDroneAssignment",
    global_penalty=0.001,
    category_penalty=0.01,
    resume=True,
    return_results=False
):
    """Vectorised, resumable assignment of up to three drones per row.

    Every drone that can carry the row's ``sensor_weight`` is scored (lower is
    better): +30 when ``comm_range`` is shorter than ``distance``;
    -10 / +2 / +5 / +15 / +25 by how much of ``mission_distance`` the drone's
    ``distance_range`` covers (+10 when the mission distance is unknown);
    payload surplus times 0.01 capped at 10; ``global_penalty`` per earlier
    assignment of the model and ``category_penalty`` per earlier assignment in
    the same (hazard, sensor) category.  No random jitter is applied, ties
    keep the order of ``drone_df``.

    The three best *distinct* models are stored in ``drone1..3`` together with
    the range note in ``note1..3``.  Each chunk is written to
    ``drone_assign_chunk_<start>_<end>.feather`` via a temporary file and an
    atomic rename, so an interrupted run never leaves a half-written chunk.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Mission rows.  ``distance``, ``mission_distance``, ``sensor_weight``,
        ``HazardType``, ``sensor_model`` and ``RecommendedSensor`` are used
        when present; rows without a numeric ``sensor_weight`` get no drones.
        The index is ignored, so duplicate index labels are safe.
    drone_df : pandas.DataFrame
        Needs ``mfc_model``, ``max_payload_weight``, ``comm_range`` and
        ``distance_range``.
    chunk_size : int
        Rows per chunk file.
    save_dir : str
        Output directory, created when missing.
    global_penalty, category_penalty : float
        Score added per previous assignment (overall / within the category).
    resume : bool
        When true, the counters are rebuilt with
        ``rebuild_usage_from_saved(save_dir)`` and chunks whose exact
        ``(start, end)`` range is already on disk are skipped.  Resume with
        the same ``chunk_size`` as the interrupted run, otherwise existing
        files are not recognised.
    return_results : bool
        When true, the assignments of all chunks are returned; skipped chunks
        are read back from their feather files so the frame is complete.

    Returns
    -------
    tuple
        ``(result_df_or_None, global_usage_count, usage_by_category)``.
    """
    os.makedirs(save_dir, exist_ok=True)

    def _as_float(frame, col):
        # Column as float64 ndarray; NaN where missing, unparseable or absent.
        if col not in frame.columns:
            return np.full(len(frame), np.nan)
        return pd.to_numeric(frame[col], errors="coerce").astype("float64").to_numpy()

    def _first_label(frame, cols):
        # Row-wise first non-null value among `cols`; "Unknown" otherwise.
        labels = np.full(len(frame), None, dtype=object)
        for col in cols:
            if col in frame.columns:
                values = frame[col].to_numpy(dtype=object)
                missing = pd.isna(labels)
                labels[missing] = values[missing]
        labels[pd.isna(labels)] = "Unknown"
        return labels

    # Drone table as plain arrays for fast per-row masking.
    ddf = drone_df.dropna(subset=["mfc_model", "max_payload_weight"])
    d_ids = ddf["mfc_model"].astype(str).to_numpy(dtype=object)
    d_comm = pd.to_numeric(ddf["comm_range"], errors="coerce").astype("float64").to_numpy()
    d_dist = pd.to_numeric(ddf["distance_range"], errors="coerce").astype("float64").to_numpy()
    d_pay = pd.to_numeric(ddf["max_payload_weight"], errors="coerce").astype("float64").to_numpy()

    if resume:
        global_usage_count, usage_by_category, done_ranges = rebuild_usage_from_saved(save_dir)
    else:
        global_usage_count, usage_by_category, done_ranges = {}, {}, set()

    all_results = []
    n = len(merged_df)

    for start in range(0, n, chunk_size):
        end = min(start + chunk_size, n)
        out_path = os.path.join(save_dir, f"drone_assign_chunk_{start}_{end}.feather")

        if (start, end) in done_ranges:
            print(f"‚è≠Ô∏è  Skipping chunk {start}-{end} (already saved)")
            if return_results:
                # Keep the returned frame complete and in input order.
                all_results.append(pd.read_feather(out_path))
            continue

        print(f"üöÄ Processing chunk {start}-{end}")
        # Rows are addressed by position from here on; the saved file never
        # carried the original index.
        chunk = merged_df.iloc[start:end].reset_index(drop=True)
        n_rows = len(chunk)

        h_dist_arr = _as_float(chunk, "distance")
        m_dist_arr = _as_float(chunk, "mission_distance")
        s_weight_arr = _as_float(chunk, "sensor_weight")
        sensor_arr = _first_label(chunk, ("sensor_model", "RecommendedSensor"))
        hazard_arr = _first_label(chunk, ("HazardType",))

        # Results are collected in arrays and attached once per chunk.
        drone_out = [np.full(n_rows, None, dtype=object) for _ in range(3)]
        note_out = [np.full(n_rows, None, dtype=object) for _ in range(3)]

        for pos in tqdm(range(n_rows), total=n_rows):
            s_weight = s_weight_arr[pos]
            if np.isnan(s_weight):
                continue

            mask = d_pay >= s_weight
            if not mask.any():
                continue

            h_dist = h_dist_arr[pos]
            m_dist = m_dist_arr[pos]
            sensor = sensor_arr[pos]
            hazard = hazard_arr[pos]

            cand_ids = d_ids[mask]
            cand_comm = d_comm[mask]
            cand_dist = d_dist[mask]
            cand_pay = d_pay[mask]

            scores = np.full(cand_ids.size, 100.0)
            notes = np.full(cand_ids.size, "", dtype=object)

            if not np.isnan(h_dist):
                comm_bad = cand_comm < h_dist
                scores[comm_bad] += 30.0
                # NOTE(review): this text is always replaced by the range
                # note below, so it never reaches the output.  Left as is to
                # stay consistent with chunk files already on disk - confirm
                # whether it should be reported.
                notes[comm_bad] = "Comm range insufficient"

            if not np.isnan(m_dist):
                # Mutually exclusive coverage classes; a NaN range fails every
                # comparison and therefore lands in `very_low`.
                full = cand_dist >= m_dist
                near_full = (~full) & (cand_dist >= 0.75 * m_dist)
                half = (~full) & (~near_full) & (cand_dist >= 0.50 * m_dist)
                quarter = (~full) & (~near_full) & (~half) & (cand_dist >= 0.25 * m_dist)
                very_low = (~full) & (~near_full) & (~half) & (~quarter)

                scores[full] -= 10.0
                notes[full] = "Full coverage"
                scores[near_full] += 2.0
                notes[near_full] = "Near full coverage"
                scores[half] += 5.0
                notes[half] = "2‚Äì3 swaths needed"
                scores[quarter] += 15.0
                notes[quarter] = "Multiple passes"
                scores[very_low] += 25.0
                notes[very_low] = "Very limited range"
            else:
                scores += 10.0
                notes[:] = "Distance unknown"

            # Mild, capped preference for drones that are not oversized.
            scores += np.minimum(np.abs(cand_pay - s_weight) * 0.01, 10.0)

            g_vec = np.fromiter((global_usage_count.get(d, 0) for d in cand_ids), dtype=float, count=cand_ids.size)
            c_vec = np.fromiter((usage_by_category.get((hazard, sensor, d), 0) for d in cand_ids), dtype=float, count=cand_ids.size)
            scores += g_vec * global_penalty
            scores += c_vec * category_penalty

            # jitter can be enabled for tie shuffling:
            # scores += np.random.uniform(-0.25, 0.25, size=cand_ids.size)

            # Walk the candidates from best to worst and keep the first three
            # distinct models; drone_df may list a model more than once.
            picked = []
            seen = set()
            for cand_pos in np.argsort(scores, kind="stable"):
                d_id = cand_ids[cand_pos]
                if d_id in seen:
                    continue
                seen.add(d_id)
                picked.append(cand_pos)
                if len(picked) == 3:
                    break

            for rank, cand_pos in enumerate(picked):
                d_id = cand_ids[cand_pos]
                drone_out[rank][pos] = d_id
                note_out[rank][pos] = notes[cand_pos]
                global_usage_count[d_id] = global_usage_count.get(d_id, 0) + 1
                cat_key = (hazard, sensor, d_id)
                usage_by_category[cat_key] = usage_by_category.get(cat_key, 0) + 1

        for k in range(3):
            chunk[f"drone{k + 1}"] = drone_out[k]
            chunk[f"note{k + 1}"] = note_out[k]

        # Write to a temporary name first, then rename atomically.
        out_tmp = out_path + ".tmp"
        chunk.to_feather(out_tmp)
        os.replace(out_tmp, out_path)

        if return_results:
            all_results.append(chunk)

        del chunk
        gc.collect()

    if not return_results:
        return None, global_usage_count, usage_by_category

    if all_results:
        result = pd.concat(all_results, ignore_index=True)
    else:
        # Nothing to concatenate (empty input): return an empty frame.
        result = merged_df.iloc[0:0].copy()
    return result, global_usage_count, usage_by_category


In [None]:
save_dir = r"D:\NDIS_Database\20_PaperSimulation\FinalDroneAssignment"

# Hourly heartbeat from a daemon thread while the assignment runs.
threading.Thread(target=heartbeat, kwargs={"interval": 3600}, daemon=True).start()

start_time = time.perf_counter()

# resume=True skips chunks that are already saved and rebuilds the usage
# counters from them; return_results=False keeps RAM low because the results
# are on disk anyway.
_, global_usage_count, usage_by_category = assign_best_drones_with_hybrid_penalty_resumable(
    ghz_with_cpm,
    drone_df,
    chunk_size=50_000,
    save_dir=save_dir,
    resume=True,
    return_results=False,
)

# Unlike the earlier run cells, `elapsed` is stored in minutes here.
elapsed = (time.perf_counter() - start_time) / 60
print(f"‚úÖ Finished (resume). Elapsed minutes: {elapsed:.2f}")
#print("S900 usage:", global_usage_count.get("DJI S900", 0))

In [None]:
import glob, os


def _chunk_start(fp):
    """Numeric start offset encoded in a chunk file name (infinity if unreadable)."""
    try:
        return int(os.path.basename(fp)[: -len(".feather")].split("_")[-2])
    except ValueError:
        return float("inf")


# Plain string sorting would put "chunk_100000_..." before "chunk_50000_...";
# ordering by the numeric start offset restores the original row order.
feathers = sorted(
    glob.glob(os.path.join(save_dir, "drone_assign_chunk_*_*.feather")),
    key=lambda fp: (_chunk_start(fp), fp),
)
final_df = pd.concat((pd.read_feather(fp) for fp in feathers), ignore_index=True)
final_df.info()

----
# ExplodeData

---

In [None]:
# ------------------------------------------------------------

# ======= CONFIG =======
CHUNK_SIZE = 500_000  # input rows handled per part file; lower it when RAM is tight
OUT_DIR = r"D:\NDIS_Database\20_PaperSimulation\FinalExploded"
OUT_PREFIX = "part"

# Keyword arguments for DataFrame.to_parquet: a speed/size compromise meant
# for tables with 10M+ rows.
PARQUET_KW = {
    "engine": "pyarrow",
    "compression": "zstd",
    "compression_level": 3,        # 3-5 is the sweet spot
    "use_dictionary": True,        # repeated strings (e.g., models) compress well
    "write_statistics": True,
    "row_group_size": 1_000_000,   # ~1M rows per row group
}

def explode_drones(df: pd.DataFrame,
                   drone_cols=('drone1','drone2','drone3'),
                   note_cols=('note1','note2','note3'),
                   out_drone_col='drone_model',
                   out_note_col='note') -> pd.DataFrame:
    """Turn the wide drone1..N / note1..N columns into one row per assigned drone.

    Every input row yields up to ``len(drone_cols)`` output rows.  Each output
    row carries all other ("id") columns of the input plus ``drone_rank``
    (1-based position of the drone column), ``out_drone_col`` and
    ``out_note_col``.  Ranks whose drone is missing or an empty string are
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide assignment table.  Its index is ignored, so duplicate index
        labels cannot multiply rows.
    drone_cols, note_cols : sequence of str
        Paired column names, rank 1 first.  Any number of pairs is accepted.
    out_drone_col, out_note_col : str
        Names of the long-format output columns.

    Returns
    -------
    pandas.DataFrame
        Long table with a fresh index, stably sorted by whichever of
        ``HazardID``, ``RecommendedSensor``, ``DisasterPhase`` and
        ``drone_rank`` exist (``drone_rank`` always does).

    Raises
    ------
    ValueError
        If the two column lists differ in length, or an id column already
        uses one of the output column names.
    """
    drone_cols = list(drone_cols)
    note_cols = list(note_cols)
    if len(drone_cols) != len(note_cols):
        raise ValueError("drone_cols and note_cols must have the same length")

    # keep id columns
    id_cols = [c for c in df.columns if c not in (*drone_cols, *note_cols)]
    clashes = {'drone_rank', out_drone_col, out_note_col}.intersection(id_cols)
    if clashes:
        raise ValueError(f"output column(s) already present in df: {sorted(clashes)}")

    n_rows, n_ranks = len(df), len(drone_cols)

    # Repeat every id row once per rank: row0/rank1, row0/rank2, ..., row1/rank1, ...
    base = df[id_cols].reset_index(drop=True)
    out = base.iloc[np.repeat(np.arange(n_rows), n_ranks)].reset_index(drop=True)

    # Row-major flattening of the wide blocks gives the same row/rank order.
    rank_dtype = 'int8' if n_ranks <= 127 else 'int64'
    out['drone_rank'] = np.tile(np.arange(1, n_ranks + 1, dtype=rank_dtype), n_rows)
    out[out_drone_col] = df[drone_cols].to_numpy(dtype=object).ravel()
    out[out_note_col] = df[note_cols].to_numpy(dtype=object).ravel()

    # drop empty drones
    has_drone = out[out_drone_col].notna() & (out[out_drone_col].astype(str).str.len() > 0)
    out = out.loc[has_drone].reset_index(drop=True)

    # sort (stable) so rows of one hazard/sensor/phase stay together by rank
    sort_cols = [c for c in ['HazardID','RecommendedSensor','DisasterPhase','drone_rank'] if c in out.columns]
    if sort_cols:
        out = out.sort_values(sort_cols, kind='stable', ignore_index=True)
    return out



In [None]:
# ======= CHUNKED PROCESS =======
# Explode final_df in slices of CHUNK_SIZE rows and write each slice to its
# own parquet part file in OUT_DIR.
# NOTE(review): part files of an earlier run are overwritten by index but not
# removed, so a previous run with more chunks leaves stale parts behind.
os.makedirs(OUT_DIR, exist_ok=True)

n = len(final_df)
# Ceiling division with integers only: `math` is not among the notebook's
# imports, so math.ceil would fail on a fresh kernel.
num_chunks = (n + CHUNK_SIZE - 1) // CHUNK_SIZE
print(f"Processing {n:,} rows in {num_chunks} chunk(s) of {CHUNK_SIZE:,}...")

t0 = time.time()
for i in range(0, n, CHUNK_SIZE):
    j = min(i + CHUNK_SIZE, n)
    t_chunk0 = time.time()

    chunk = final_df.iloc[i:j]
    exploded = explode_drones(chunk)

    # Parts are numbered by chunk position: part_00000.parquet, part_00001.parquet, ...
    out_path = os.path.join(OUT_DIR, f"{OUT_PREFIX}_{i//CHUNK_SIZE:05d}.parquet")
    exploded.to_parquet(out_path, index=False, **PARQUET_KW)

    # Release the slice before the next one is built.
    del chunk, exploded
    gc.collect()

    print(f"‚úì {i//CHUNK_SIZE+1}/{num_chunks}  rows[{i:,}:{j:,}) -> {out_path}  ({time.time()-t_chunk0:.1f}s)")

print(f"Done in {(time.time()-t0)/60:.2f} min. Parts in: {OUT_DIR}")

# ---- OPTIONAL: FEATHER FOR MAX I/O SPEED (scratch only) ----
# exploded.to_feather(out_path.replace(".parquet", ".feather"), compression="lz4")