# NDIS v1.0 - Processed Using Decision Tree Algorithm

In [None]:
# Open a connection to the ArcGIS portal for this session.
# NOTE(review): "home" presumably resolves to the portal of the hosting ArcGIS
# Notebook / ArcGIS Pro session — confirm before running on a standalone kernel.
# `gis` is not referenced again in the visible cells.
from arcgis.gis import GIS
gis = GIS("home")

In [None]:
%matplotlib inline
# basic packages
import csv
import numpy as np
import os
import timeit
import random
import string
from playsound import playsound

# Data management
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point  # to get points from long lat

# Request service
#from requests import Request
import json
import re
from functools import reduce
#from owslib.wfs import WebFeatureService

# Plotting packages
import matplotlib.pyplot as plt
import seaborn as sns

## Select Valuable Input Features

In [None]:
# Set the path to this geodatabase
# NOTE: absolute, machine-specific Windows path; the feature-class path used by
# the arcpy loading cell is built from it.
gdb_path = r"D:\ArcGISProjects\GeohazardDB\GeohazardDB.gdb"  # This gdb path

### Load Datasets

In [None]:
# Load the cleaned geohazard feature class from the geodatabase into pandas.
# NOTE(review): `arcpy` is not imported in any visible cell — confirm the kernel
# provides it, or add `import arcpy` to the imports cell.
ghz_clean = "cleaned_geohazard_data"  # Geohazard feature class
ghz_clean_path = f"{gdb_path}\\{ghz_clean}"

# Field names of the feature class; they become the DataFrame columns
ghz_clean_fields = [f.name for f in arcpy.ListFields(ghz_clean_path)]

# Read every row once; each row becomes a {field name: value} record
with arcpy.da.SearchCursor(ghz_clean_path, ghz_clean_fields) as cursor:
    ghz_clean_data = [dict(zip(ghz_clean_fields, row)) for row in cursor]

# Passing `columns` keeps the schema even when the feature class has no rows.
# The `ghz_celan_df` spelling is kept because later cells reference that name.
ghz_celan_df = pd.DataFrame(ghz_clean_data, columns=ghz_clean_fields)
ghz_celan_df.info()

In [None]:
# Export the loaded geohazard table to CSV; index=False leaves the pandas row
# index out of the file.
ghz_celan_df.to_csv(r"D:\NDIS_Database\ghz_processed.csv", index=False)

In [None]:
# Keep only geohazards whose distance is at most 1,046,000 m (1,046 km).
# NOTE(review): 1,046,000 is also the largest value in the hard-coded drone
# range list further down — presumably the longest drone range; confirm.
filtered_geohazard = ghz_celan_df[ghz_celan_df["distance"] <= 1046000]

In [None]:
# Histogram of geohazard distances with variable-width bins.

# Set the purple-green palette BEFORE the axes are created: an Axes picks up its
# colour cycle at creation time, so setting it afterwards has no effect on this plot.
sns.set_palette("PRGn")
fig, ax = plt.subplots(figsize=(10, 6))

# Upper limit matches the filter applied to `filtered_geohazard`
max_distance = 1046000
# Vertical gap (in counts) between a bar top and its label
label_offset = 5000

# Bin edges: finer resolution for small distances, coarser for large ones.
# The final explicit edge closes the last bin at `max_distance`; without it the
# histogram would stop at 100 km and silently drop every larger distance.
adjusted_bins = np.concatenate([
    np.arange(0, 10000, 1000),              # 0 - 10 km: 1 km bins
    np.arange(10000, 50000, 5000),          # 10 - 50 km: 5 km bins
    np.arange(50000, 100000, 10000),        # 50 - 100 km: 10 km bins
    np.arange(100000, max_distance, 50000), # 100 km and above: 50 km bins
    [max_distance],                         # closing edge of the last bin
])

sns.histplot(filtered_geohazard["distance"], bins=adjusted_bins, kde=False,
             edgecolor="black", ax=ax)

# Customize the plot
ax.set_xlabel("Distance (m)", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
ax.set_title("Distribution of Geohazard Distances (Max 1046 km)", fontsize=14)
ax.grid(False)

# Format x and y axis labels to avoid scientific notation
ax.ticklabel_format(style='plain', axis='x')
ax.ticklabel_format(style='plain', axis='y')

# Label each non-empty bar with its frequency count
for bar in ax.patches:
    height = bar.get_height()
    if height > 0:
        ax.text(bar.get_x() + bar.get_width() / 2, height + label_offset,
                f"{int(height)}", ha='center', fontsize=10, color="black")

# Save the plot as PNG with transparent background, then show it
fig.savefig('D:/NDIS_Database/FE_Display/distance_dist.png', transparent=True)
plt.show()

### Pre-Processing With RPAS and Sensor Dataset

In [None]:
# Unique values of the drone table's `distance_range` column.
# NOTE(review): `drone_cleaned` is not defined in any visible cell (the drone CSV
# is loaded further down as `drone_data`) — confirm where it comes from,
# otherwise this cell fails on a fresh kernel.
dc = drone_cleaned.distance_range.unique()
dc

In [None]:
# Drone range values, hard-coded as floats
range_values = np.array([
    150000.0, 4000.0, 10000.0, 18000.0, 7000.0,
    2000.0, 15000.0, 8000.0, 5000.0, 1000.0,
    16000.0, 35000.0, 12000.0, 20000.0, 700.0,
    135000.0, 50000.0, 21.0, 24000.0, 300.0,
    450000.0, 30000.0, 27000.0, 1046000.0, 563000.0,
    104607.0, 52500.0, 40000.0, 30240.0, 45000.0,
    190000.0, 3000.0, 86000.0,
])

# Extremes of the range list define the outer bin edges
min_value, max_value = range_values.min(), range_values.max()

# 11 log-spaced edges -> 10 bins; log spacing copes with values that span
# several orders of magnitude
log_low = np.log10(min_value)
log_high = np.log10(max_value)
bins = np.logspace(log_low, log_high, num=11)

# Display the bin edges
bins

In [None]:
# Load drone and sensor data
drone_data = pd.read_csv("D:/NDIS_Database/rpas_nonan.csv")  # Shortlisted version of the RPAS gdb with no NaN data on distance
sensor_data = pd.read_csv("D:/NDIS_Database/sensor.csv")  # Geophysical sensor list

# Only the last expression of a cell is rendered automatically, so both tables
# are displayed explicitly (a bare `drone_data` mid-cell shows nothing).
display(drone_data)
display(sensor_data)

## Defining Rules for Selection

In [None]:
# Sample geohazard table; a later cell runs the drone selection on it using its
# `HazardType` and `distance` columns.
sample_data = pd.read_csv("D:/NDIS_Database/sample_data.csv")
sample_data

---

In [None]:
# Work on independent copies so the selection steps never touch the loaded tables
geohazard_df, drone_df, sensor_df = (
    frame.copy() for frame in (ghz_celan_df, drone_data, sensor_data)
)

### Sensor Selection

In [None]:
# Step 1: Sensor Selection (Updated with GPR priority for Landslide)
def select_best_sensor(hazard_type, distance, sensors=None):
    """Return the sensor name to use for one geohazard.

    Rules:
    * hazard types 1, 3, 4, 5 (Volcano, Tsunami, Fault, Earthquake) -> "Seismic"
    * hazard type 2 (Landslide) -> "GPR", falling back to "Lidar" when the
      sensor table has no GPR row
    * anything else, or no matching row -> "No suitable sensor found"

    When several rows match, one is chosen by ``sensor_weight`` (heaviest for
    types 4/5 and for landslides with ``distance`` not above 200, lightest
    otherwise). All matching rows share one ``sensor_name``, so this choice
    does not change the returned value.

    Parameters
    ----------
    hazard_type : int
        Hazard type code.
    distance : float
        Distance to the hazard; only consulted for landslides.
    sensors : pandas.DataFrame, optional
        Table with ``sensor_name`` and ``sensor_weight`` columns. Defaults to
        the notebook-level ``sensor_df``.

    Returns
    -------
    str
        The sensor name, or "No suitable sensor found".
    """
    if sensors is None:
        sensors = sensor_df

    if hazard_type in (1, 3, 4, 5):  # Volcano, Tsunami, Fault, Earthquake
        candidates = sensors[sensors["sensor_name"] == "Seismic"]
    elif hazard_type == 2:  # Landslide (prioritize GPR)
        candidates = sensors[sensors["sensor_name"] == "GPR"]
        if candidates.empty:  # If no GPR, fall back to Lidar
            candidates = sensors[sensors["sensor_name"] == "Lidar"]
    else:
        return "No suitable sensor found"

    if candidates.empty:
        return "No suitable sensor found"
    if len(candidates) == 1:
        return candidates.iloc[0]["sensor_name"]

    # Several rows qualify: break the tie by weight. Rows without a weight are
    # ignored so idxmin/idxmax never receives an all-NaN column.
    weights = candidates["sensor_weight"].dropna()
    if weights.empty:
        best_sensor = candidates.iloc[0]
    elif hazard_type in (4, 5) or (hazard_type == 2 and not distance > 200):
        # Underground hazards and nearby landslides -> heaviest sensor
        best_sensor = candidates.loc[weights.idxmax()]
    else:
        # Surface hazards (1, 3) and far landslides -> lightest sensor
        best_sensor = candidates.loc[weights.idxmin()]

    return best_sensor["sensor_name"]

# Run the rule-based sensor selection on every geohazard record
def _sensor_for_row(row):
    """Adapter: feed one DataFrame row to select_best_sensor."""
    return select_best_sensor(row["HazardType"], row["distance"])

geohazard_df["selected_sensor"] = geohazard_df.apply(_sensor_for_row, axis=1)
geohazard_df

In [None]:
# Step 2: Drone Selection (Updated matching logic with ±50% tolerance & special rule for large distances)
def select_best_drone(hazard_type, hazard_distance, filtered_drones=None):
    """Return the ``mfc_model`` of the drone chosen for one geohazard.

    Matching on ``distance_range``:
    1. drones whose range equals ``hazard_distance``;
    2. otherwise, for distances below 70,000, drones within +/-50% of it;
    3. otherwise, for distances of 70,000 or more, the drones with the next
       larger range.
    If several drones match, hazard types 1/3 take the shortest
    ``flight_time``, types 2/4/5 the longest, anything else the first match.

    Parameters
    ----------
    hazard_type : int
        Hazard type code.
    hazard_distance : float
        Distance to the hazard, in the same unit as ``distance_range``.
    filtered_drones : pandas.DataFrame, optional
        Candidate drones (``distance_range``, ``flight_time``, ``mfc_model``).
        Defaults to the notebook-level ``drone_df``; the optional argument keeps
        this signature compatible with the three-argument variant defined later.

    Returns
    -------
    str
        The selected model, or "No suitable drone found".
    """
    if filtered_drones is None:
        filtered_drones = drone_df
    if filtered_drones.empty:
        return "No suitable drone found"

    ranges = filtered_drones["distance_range"]

    # Step 1: exact matches
    matching_drones = filtered_drones[ranges == hazard_distance]

    # Step 2: +/-50% window, only for distances below 70,000
    if matching_drones.empty and hazard_distance < 70000:
        matching_drones = filtered_drones[
            ranges.between(hazard_distance * 0.5, hazard_distance * 1.5)
        ]

    # Step 3: distances of 70,000 or more take the next larger range
    if matching_drones.empty and hazard_distance >= 70000:
        sorted_drones = filtered_drones.sort_values("distance_range")
        longer = sorted_drones.loc[sorted_drones["distance_range"] > hazard_distance, "distance_range"]
        if not longer.empty:
            matching_drones = sorted_drones[sorted_drones["distance_range"] == longer.min()]

    if matching_drones.empty:
        return "No suitable drone found"
    if len(matching_drones) == 1:
        return matching_drones.iloc[0]["mfc_model"]

    # Step 4: several drones qualify -> decide by flight time. Missing flight
    # times are ignored so idxmin/idxmax never receives an all-NaN column.
    flight_times = matching_drones["flight_time"].dropna()
    if flight_times.empty:
        best_drone = matching_drones.iloc[0]
    elif hazard_type in (1, 3):  # Volcano, Tsunami -> min flight time
        best_drone = matching_drones.loc[flight_times.idxmin()]
    elif hazard_type in (2, 4, 5):  # Landslide, Fault, Earthquake -> max flight time
        best_drone = matching_drones.loc[flight_times.idxmax()]
    else:
        best_drone = matching_drones.iloc[0]

    return best_drone["mfc_model"]

# Run the rule-based drone selection on every geohazard record
def _drone_for_row(row):
    """Adapter: feed one DataFrame row to select_best_drone."""
    return select_best_drone(row["HazardType"], row["distance"])

geohazard_df["selected_drone"] = geohazard_df.apply(_drone_for_row, axis=1)
geohazard_df

In [None]:
# Try the drone selection on the sample table, one row at a time
def _sample_drone_for_row(row):
    """Adapter: feed one sample row to select_best_drone."""
    return select_best_drone(row["HazardType"], row["distance"])

sample_data["selected_drone"] = sample_data.apply(_sample_drone_for_row, axis=1)

# Show the sample table with the selected drones
sample_data

## Model Training Independent Approach

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Features shared by both classifiers, plus one target per classifier
X = geohazard_df[['HazardType', 'distance']].copy()
y_sensor = geohazard_df['selected_sensor']  # Target for sensor selection
y_drone = geohazard_df['selected_drone']    # Target for drone selection

# One 80/20 split for the features and both targets: every array is cut with the
# same row indices, so a single call replaces two identical splits.
(X_train, X_test,
 y_train_sensor, y_test_sensor,
 y_train_drone, y_test_drone) = train_test_split(
    X, y_sensor, y_drone, test_size=0.2, random_state=42
)

# Train one decision tree per target
sensor_clf = DecisionTreeClassifier(random_state=42)
sensor_clf.fit(X_train, y_train_sensor)

drone_clf = DecisionTreeClassifier(random_state=42)
drone_clf.fit(X_train, y_train_drone)

# Predict on the test set
y_pred_sensor = sensor_clf.predict(X_test)
y_pred_drone = drone_clf.predict(X_test)

# Evaluate model performance
sensor_accuracy = accuracy_score(y_test_sensor, y_pred_sensor)
drone_accuracy = accuracy_score(y_test_drone, y_pred_drone)

sensor_report = classification_report(y_test_sensor, y_pred_sensor, zero_division=0)
drone_report = classification_report(y_test_drone, y_pred_drone, zero_division=0)

# The reports are multi-line strings; print them so line breaks render instead
# of appearing as escaped "\n" inside a tuple repr.
print(f"Sensor accuracy: {sensor_accuracy:.4f}")
print(sensor_report)
print(f"Drone accuracy: {drone_accuracy:.4f}")
print(drone_report)

-----

## Model Training Combined Approach

### Define the Target Output and Decision Criteria (Splitting Conditions)

In [None]:
# Start the combined approach from a fresh copy of the full geohazard table.
# For a quick trial run, copy `sample_data` here instead of `ghz_celan_df`.
geohazard_df = ghz_celan_df.copy() # for sample data change to sample_data for real process ghz_celan_df

### Feature Selection: Select Sensor

In [None]:
# Step 1: Select the sensor (the chosen sensor later constrains which drones qualify)

# Function to select the best sensor based on hazard type and distance
def select_best_sensor_v3(hazard_type, distance):
    """Simplified sensor rule.

    Hazard type 2 (Landslide) gets "GPR" when ``distance`` is at most 100 and
    "Lidar" otherwise; every other hazard type gets "Seismic".
    """
    if hazard_type != 2:
        return "Seismic"
    if distance <= 100:
        return "GPR"
    return "Lidar"

In [None]:
# Re-run sensor selection with the simplified v3 rule
def _sensor_v3_for_row(row):
    """Adapter: feed one DataFrame row to select_best_sensor_v3."""
    return select_best_sensor_v3(row["HazardType"], row["distance"])

geohazard_df["selected_sensor"] = geohazard_df.apply(_sensor_v3_for_row, axis=1)
geohazard_df

### Define Target Output: Drone Selection

In [None]:
# Step 2: Drone Selection Based on Sensor Selection

# Function to filter drones based on sensor selection
def filter_drones_by_sensor(sensor_name, drones=None):
    """Return the drones able to carry the named sensor.

    * name containing "Seismic": every drone (checked first)
    * name containing "GPR": ``max_payload_weight`` >= 3500
    * name containing "Lidar": ``max_payload_weight`` >= 900
    * anything else, including a non-string name: every drone

    Parameters
    ----------
    sensor_name : str
        Sensor name; matched by substring.
    drones : pandas.DataFrame, optional
        Drone table with a ``max_payload_weight`` column. Defaults to the
        notebook-level ``drone_df``.

    Returns
    -------
    pandas.DataFrame
        The table itself when no filter applies, otherwise a filtered subset.
    """
    if drones is None:
        drones = drone_df

    # A missing name (e.g. NaN) cannot be substring-matched; treat it like an
    # unknown sensor instead of raising TypeError.
    if not isinstance(sensor_name, str) or "Seismic" in sensor_name:
        return drones
    if "GPR" in sensor_name:
        return drones[drones["max_payload_weight"] >= 3500]  # GPR needs high payload drones
    if "Lidar" in sensor_name:
        return drones[drones["max_payload_weight"] >= 900]  # Lidar needs medium payload drones
    return drones  # Unknown sensor: no filtering

# Filter the drone table once per distinct sensor and reuse that result for every
# row, instead of re-filtering (and allocating a new frame) for each geohazard.
drones_by_sensor = {
    sensor: filter_drones_by_sensor(sensor)
    for sensor in geohazard_df["selected_sensor"].unique()
}
geohazard_df["filtered_drones"] = geohazard_df["selected_sensor"].apply(
    lambda sensor: drones_by_sensor[sensor]
)

### Function to Select Drone

In [None]:
# Function to select the best drone after filtering
def select_best_drone(hazard_type, hazard_distance, filtered_drones=None):
    """Return the ``mfc_model`` of the drone chosen for one geohazard.

    Matching on ``distance_range``:
    1. drones whose range equals ``hazard_distance``;
    2. otherwise, for distances below 70,000, drones within +/-50% of it;
    3. otherwise, for distances of 70,000 or more, the drones with the next
       larger range.
    If several drones match, hazard types 1/3 take the shortest
    ``flight_time``, types 2/4/5 the longest, anything else the first match.

    Parameters
    ----------
    hazard_type : int
        Hazard type code.
    hazard_distance : float
        Distance to the hazard, in the same unit as ``distance_range``.
    filtered_drones : pandas.DataFrame, optional
        Candidate drones (``distance_range``, ``flight_time``, ``mfc_model``),
        normally already filtered by sensor payload. Defaults to the
        notebook-level ``drone_df`` so two-argument calls keep working.

    Returns
    -------
    str
        The selected model, or "No suitable drone found".
    """
    if filtered_drones is None:
        filtered_drones = drone_df
    if filtered_drones.empty:
        return "No suitable drone found"

    ranges = filtered_drones["distance_range"]

    # Step 1: exact matches
    matching_drones = filtered_drones[ranges == hazard_distance]

    # Step 2: +/-50% window, only for distances below 70,000
    if matching_drones.empty and hazard_distance < 70000:
        matching_drones = filtered_drones[
            ranges.between(hazard_distance * 0.5, hazard_distance * 1.5)
        ]

    # Step 3: distances of 70,000 or more take the next larger range
    if matching_drones.empty and hazard_distance >= 70000:
        sorted_drones = filtered_drones.sort_values("distance_range")
        longer = sorted_drones.loc[sorted_drones["distance_range"] > hazard_distance, "distance_range"]
        if not longer.empty:
            matching_drones = sorted_drones[sorted_drones["distance_range"] == longer.min()]

    if matching_drones.empty:
        return "No suitable drone found"
    if len(matching_drones) == 1:
        return matching_drones.iloc[0]["mfc_model"]

    # Step 4: several drones qualify -> decide by flight time. Missing flight
    # times are ignored so idxmin/idxmax never receives an all-NaN column.
    flight_times = matching_drones["flight_time"].dropna()
    if flight_times.empty:
        best_drone = matching_drones.iloc[0]
    elif hazard_type in (1, 3):  # Volcano, Tsunami -> min flight time
        best_drone = matching_drones.loc[flight_times.idxmin()]
    elif hazard_type in (2, 4, 5):  # Landslide, Fault, Earthquake -> max flight time
        best_drone = matching_drones.loc[flight_times.idxmax()]
    else:
        best_drone = matching_drones.iloc[0]

    return best_drone["mfc_model"]

In [None]:
# Choose a drone per geohazard from that row's pre-filtered candidates
def _drone_from_candidates(row):
    """Adapter: feed one row and its candidate drones to select_best_drone."""
    return select_best_drone(row["HazardType"], row["distance"], row["filtered_drones"])

geohazard_df["selected_drone"] = geohazard_df.apply(_drone_from_candidates, axis=1)

# The helper column of candidate tables is no longer needed
geohazard_df = geohazard_df.drop(columns=["filtered_drones"])
geohazard_df

In [None]:
# Function to select the best drone based on geohazard distance and payload capacity
def select_best_drone_v2(hazard_distance, sensor_weight, drones=None):
    """Return the shortest-range drone that reaches the hazard and lifts the sensor.

    A drone qualifies when ``distance_range >= hazard_distance`` and
    ``max_payload_weight >= sensor_weight``. Among qualifying drones the one
    with the smallest ``distance_range`` wins; ties keep table order.

    Parameters
    ----------
    hazard_distance : float
        Distance to the hazard, in the same unit as ``distance_range``.
    sensor_weight : float
        Payload to carry, in the same unit as ``max_payload_weight``.
    drones : pandas.DataFrame, optional
        Drone table (``distance_range``, ``max_payload_weight``, ``mfc_model``).
        Defaults to the notebook-level ``drone_df``.

    Returns
    -------
    str
        The selected ``mfc_model``, or "No suitable drone found".
    """
    if drones is None:
        drones = drone_df

    reaches_hazard = drones["distance_range"] >= hazard_distance
    lifts_sensor = drones["max_payload_weight"] >= sensor_weight
    feasible_drones = drones[reaches_hazard & lifts_sensor]

    # No separate "next closest distance" search is needed: any longer-range
    # drone that can lift the sensor is already part of `feasible_drones`.
    if feasible_drones.empty:
        return "No suitable drone found"

    # Sort explicitly so "lowest distance range" does not depend on row order
    best_drone = feasible_drones.sort_values("distance_range", kind="stable").iloc[0]
    return best_drone["mfc_model"]

In [None]:
# Build the sensor-name -> weight lookup once, instead of filtering `sensor_df`
# twice for every geohazard row. For a repeated name the first row wins, and
# unknown or missing names fall back to a weight of 0.
sensor_weight_by_name = (
    sensor_df.dropna(subset=["sensor_name"])
    .drop_duplicates("sensor_name")
    .set_index("sensor_name")["sensor_weight"]
    .to_dict()
)

# Apply drone selection based on refined logic
geohazard_df["selected_drone"] = geohazard_df.apply(
    lambda row: select_best_drone_v2(
        row["distance"], sensor_weight_by_name.get(row["selected_sensor"], 0)
    ),
    axis=1,
)

geohazard_df

______
# Optimized Vectorized Filtering

In [None]:
# Steps 1-3: vectorized sensor rule. Landslides (HazardType 2) get "GPR" up to a
# distance of 100 and "Lidar" above it; everything else, including landslides
# with a missing distance, stays "Seismic".
is_landslide = geohazard_df["HazardType"] == 2
geohazard_df["selected_sensor"] = np.select(
    [
        is_landslide & (geohazard_df["distance"] <= 100),
        is_landslide & (geohazard_df["distance"] > 100),
    ],
    ["GPR", "Lidar"],
    default="Seismic",
)

# Step 4: attach sensor properties. Only one row per sensor name is merged
# (the first), so a sensor table with repeated names cannot multiply geohazard rows.
# NOTE: re-running this cell merges again and adds suffixed duplicate columns.
sensor_properties = sensor_df.drop_duplicates(subset="sensor_name")
geohazard_df = geohazard_df.merge(
    sensor_properties, left_on="selected_sensor", right_on="sensor_name", how="left"
)

geohazard_df

In [None]:
# Step 1: Precompute sorted drone dataset
# Ordered by range, then payload, so the first row of any filtered subset is
# its shortest-range (then lowest-payload) drone.
drone_df_sorted = drone_df.sort_values(["distance_range", "max_payload_weight"])

# Step 2: Optimized drone selection function
def fast_select_drone_v2(geohazard_df, drone_df, sensors=None):
    """Pick, for every geohazard row, the shortest-range drone that can carry its sensor.

    Drones are ordered by ``distance_range`` then ``max_payload_weight``. For
    each row the first drone is taken whose range covers the hazard distance
    and whose payload covers the sensor weight. If none reaches far enough,
    the first drone that can at least carry the sensor is used.

    Parameters
    ----------
    geohazard_df : pandas.DataFrame
        Needs ``distance`` and ``selected_sensor`` columns. Not modified.
    drone_df : pandas.DataFrame
        Needs ``distance_range``, ``max_payload_weight`` and ``mfc_model``.
    sensors : pandas.DataFrame, optional
        Sensor table with ``sensor_name`` and ``sensor_weight``. Defaults to the
        notebook-level ``sensor_df``. Unknown sensors weigh 0.

    Returns
    -------
    pandas.DataFrame
        Copy of ``geohazard_df`` with a ``selected_drone`` column holding the
        ``mfc_model`` or "No suitable drone found".
    """
    if sensors is None:
        sensors = sensor_df

    result = geohazard_df.copy()

    # Loop-invariant work: sort the drones passed in (not a notebook global),
    # find the shortest range, and build the sensor weight lookup once.
    drones_sorted = drone_df.sort_values(["distance_range", "max_payload_weight"])
    shortest_range = drones_sorted["distance_range"].min()
    weight_by_sensor = (
        sensors.dropna(subset=["sensor_name"])
        .drop_duplicates("sensor_name")
        .set_index("sensor_name")["sensor_weight"]
        .to_dict()
    )

    def pick_drone(hazard_distance, sensor_name):
        """Return the model chosen for one hazard distance / sensor pair."""
        sensor_weight = weight_by_sensor.get(sensor_name, 0)

        # Very short distances: every drone is allowed
        if hazard_distance < shortest_range:
            reachable = drones_sorted
        else:
            reachable = drones_sorted[drones_sorted["distance_range"] >= hazard_distance]

        feasible = reachable[reachable["max_payload_weight"] >= sensor_weight]
        if feasible.empty:
            # Nothing reaches far enough: fall back to any drone that can carry the payload
            feasible = drones_sorted[drones_sorted["max_payload_weight"] >= sensor_weight]
        if feasible.empty:
            return "No suitable drone found"
        return feasible.iloc[0]["mfc_model"]

    # Assign the whole column at once; this also works with a non-unique index
    result["selected_drone"] = [
        pick_drone(distance, sensor)
        for distance, sensor in zip(result["distance"], result["selected_sensor"])
    ]
    return result

# Apply the optimized selection; the function returns a copy with a
# `selected_drone` column, which replaces `geohazard_df`.
geohazard_df = fast_select_drone_v2(geohazard_df, drone_df)
geohazard_df

In [None]:
# Precompute sorted drone dataset (same sort as the identical statement in the
# earlier selection cell; recomputed here so this cell can run on its own)
drone_df_sorted = drone_df.sort_values(["distance_range", "max_payload_weight"])

# Optimized function for drone selection
def fast_select_drone_v3(geohazard_df, drone_df, sensors=None):
    """Select a drone per geohazard using range matching plus a payload check.

    Range matching on ``distance_range``: exact match first; otherwise, for
    distances below 70,000, drones within +/-50%; otherwise, for distances of
    70,000 or more, the drones with the next larger range. The first matched
    drone able to carry the sensor weight is chosen. If none can, the first
    drone of the whole table able to carry it is used.

    Parameters
    ----------
    geohazard_df : pandas.DataFrame
        Needs ``distance`` and ``selected_sensor`` columns. Not modified.
    drone_df : pandas.DataFrame
        Needs ``distance_range``, ``max_payload_weight`` and ``mfc_model``.
    sensors : pandas.DataFrame, optional
        Sensor table with ``sensor_name`` and ``sensor_weight``. Defaults to the
        notebook-level ``sensor_df``. Unknown sensors weigh 0.

    Returns
    -------
    pandas.DataFrame
        Copy of ``geohazard_df`` with a ``selected_drone`` column holding the
        ``mfc_model`` or "No suitable drone found".
    """
    if sensors is None:
        sensors = sensor_df

    result = geohazard_df.copy()

    # Loop-invariant work: sort the drones passed in (not a notebook global)
    # and build the sensor weight lookup once.
    drones_sorted = drone_df.sort_values(["distance_range", "max_payload_weight"])
    drones_by_range = drones_sorted.sort_values("distance_range")
    weight_by_sensor = (
        sensors.dropna(subset=["sensor_name"])
        .drop_duplicates("sensor_name")
        .set_index("sensor_name")["sensor_weight"]
        .to_dict()
    )

    def pick_drone(hazard_distance, sensor_name):
        """Return the model chosen for one hazard distance / sensor pair."""
        sensor_weight = weight_by_sensor.get(sensor_name, 0)
        ranges = drones_sorted["distance_range"]

        # Step 1: exact matches
        matching = drones_sorted[ranges == hazard_distance]

        # Step 2: +/-50% window, only for distances below 70,000
        if matching.empty and hazard_distance < 70000:
            matching = drones_sorted[
                (ranges >= hazard_distance * 0.5) & (ranges <= hazard_distance * 1.5)
            ]

        # Step 3: distances of 70,000 or more take the next larger range
        if matching.empty and hazard_distance >= 70000:
            longer = drones_by_range.loc[
                drones_by_range["distance_range"] > hazard_distance, "distance_range"
            ]
            if not longer.empty:
                matching = drones_by_range[drones_by_range["distance_range"] == longer.min()]

        # Step 4: keep drones that can carry the sensor
        feasible = matching[matching["max_payload_weight"] >= sensor_weight]

        # Step 5: otherwise fall back to any drone that can carry the payload
        if feasible.empty:
            feasible = drones_sorted[drones_sorted["max_payload_weight"] >= sensor_weight]
        if feasible.empty:
            return "No suitable drone found"
        return feasible.iloc[0]["mfc_model"]

    # Assign the whole column at once; this also works with a non-unique index
    result["selected_drone"] = [
        pick_drone(distance, sensor)
        for distance, sensor in zip(result["distance"], result["selected_sensor"])
    ]
    return result

# Apply the v3 selection; the function returns a copy with a `selected_drone`
# column, which replaces `geohazard_df`.
geohazard_df = fast_select_drone_v3(geohazard_df, drone_df)
geohazard_df

In [None]:
# How many geohazards ended up without a drone?
has_no_drone = geohazard_df["selected_drone"].eq("No suitable drone found")
no_suitable_drones_count = has_no_drone.sum()

# Report the count
print(f"Number of geohazards with no suitable drone: {no_suitable_drones_count}")

In [None]:
# Share of geohazards without a drone, expressed as a percentage
total_geohazards = len(geohazard_df)
unassigned_share = no_suitable_drones_count / total_geohazards
no_suitable_drones_pct = unassigned_share * 100

# Report the percentage
print(f"Percentage of geohazards without a suitable drone: {no_suitable_drones_pct:.2f}%")

In [None]:
# Rows for which the selection returned "No suitable drone found"
unassigned_mask = geohazard_df["selected_drone"].eq("No suitable drone found")
no_suitable_drone_df = geohazard_df.loc[unassigned_mask]
no_suitable_drone_df

In [None]:
# Largest distance among geohazards without a drone. Series.max() skips NaN and
# returns NaN for an empty frame, whereas the builtin max() raises on empty input
# and gives order-dependent results when NaN is present.
no_suitable_drone_df["distance"].max()

In [None]:
# Step 1: Precompute sorted drone dataset (repeats the identical statement from
# the earlier selection cells)
drone_df_sorted = drone_df.sort_values(["distance_range", "max_payload_weight"])

# Step 2: Optimized drone selection function
def fast_select_drone_v2(geohazard_df, drone_df, sensors=None):
    """Pick, for every geohazard row, the shortest-range drone that can carry its sensor.

    Drones are ordered by ``distance_range`` then ``max_payload_weight``. For
    each row the first drone is taken whose range covers the hazard distance
    and whose payload covers the sensor weight. If none reaches far enough,
    the first drone that can at least carry the sensor is used.

    Parameters
    ----------
    geohazard_df : pandas.DataFrame
        Needs ``distance`` and ``selected_sensor`` columns. Not modified.
    drone_df : pandas.DataFrame
        Needs ``distance_range``, ``max_payload_weight`` and ``mfc_model``.
    sensors : pandas.DataFrame, optional
        Sensor table with ``sensor_name`` and ``sensor_weight``. Defaults to the
        notebook-level ``sensor_df``. Unknown sensors weigh 0.

    Returns
    -------
    pandas.DataFrame
        Copy of ``geohazard_df`` with a ``selected_drone`` column holding the
        ``mfc_model`` or "No suitable drone found".
    """
    if sensors is None:
        sensors = sensor_df

    result = geohazard_df.copy()

    # Loop-invariant work: sort the drones passed in (not a notebook global),
    # find the shortest range, and build the sensor weight lookup once.
    drones_sorted = drone_df.sort_values(["distance_range", "max_payload_weight"])
    shortest_range = drones_sorted["distance_range"].min()
    weight_by_sensor = (
        sensors.dropna(subset=["sensor_name"])
        .drop_duplicates("sensor_name")
        .set_index("sensor_name")["sensor_weight"]
        .to_dict()
    )

    def pick_drone(hazard_distance, sensor_name):
        """Return the model chosen for one hazard distance / sensor pair."""
        sensor_weight = weight_by_sensor.get(sensor_name, 0)

        # Very short distances: every drone is allowed
        if hazard_distance < shortest_range:
            reachable = drones_sorted
        else:
            reachable = drones_sorted[drones_sorted["distance_range"] >= hazard_distance]

        feasible = reachable[reachable["max_payload_weight"] >= sensor_weight]
        if feasible.empty:
            # Nothing reaches far enough: fall back to any drone that can carry the payload
            feasible = drones_sorted[drones_sorted["max_payload_weight"] >= sensor_weight]
        if feasible.empty:
            return "No suitable drone found"
        return feasible.iloc[0]["mfc_model"]

    # Assign the whole column at once; this also works with a non-unique index
    result["selected_drone"] = [
        pick_drone(distance, sensor)
        for distance, sensor in zip(result["distance"], result["selected_sensor"])
    ]
    return result

# Re-apply the v2 selection; this overwrites the `selected_drone` column
# produced by the v3 run above.
geohazard_df = fast_select_drone_v2(geohazard_df, drone_df)
geohazard_df

In [None]:
# Recount geohazards without a drone after the latest selection run
has_no_drone = geohazard_df["selected_drone"].eq("No suitable drone found")
no_suitable_drones_count = has_no_drone.sum()

# Report the count
print(f"Number of geohazards with no suitable drone: {no_suitable_drones_count}")

____
# Decision Tree Model Training (Combined Approach)

## Step 1: Prepare Features (X) and Target (y)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Model inputs: hazard type and distance; target: the rule-selected drone model
X = geohazard_df[["HazardType", "distance"]]
y = geohazard_df["selected_drone"]

# Turn drone model names into integer class ids
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 80% train / 20% test, reproducible through the fixed seed
split_parts = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## Step 2: Train the Decision Tree Model

In [None]:
# Initialize and train Decision Tree Classifier (Gini impurity, depth capped at
# 5, fixed seed). `fit` is the cell's last expression, so the fitted estimator is
# also displayed.
dt_model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
dt_model.fit(X_train, y_train)

## Step 3: Evaluate Model Performance

In [None]:
# Predict on test set
y_pred = dt_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Per-class report. Listing every encoded label keeps `target_names` aligned with
# the report rows even when a class appears in neither y_test nor y_pred; without
# `labels`, scikit-learn raises when the class count and name count differ.
all_labels = np.arange(len(label_encoder.classes_))
print(classification_report(y_test, y_pred, labels=all_labels,
                            target_names=label_encoder.classes_, zero_division=0))

In [None]:
# Initialize and train Decision Tree Classifier (same hyperparameters and data as
# the training cell above, so this refit reproduces the same model)
dt_model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
dt_model.fit(X_train, y_train)

In [None]:
# Predict on test set
y_pred = dt_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Per-class report. Listing every encoded label keeps `target_names` aligned with
# the report rows even when a class appears in neither y_test nor y_pred; without
# `labels`, scikit-learn raises when the class count and name count differ.
all_labels = np.arange(len(label_encoder.classes_))
print(classification_report(y_test, y_pred, labels=all_labels,
                            target_names=label_encoder.classes_, zero_division=0))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the current model on the test split
y_pred_final = dt_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_final)

# Heatmap without per-cell annotations
plt.figure(figsize=(12, 6))
heatmap_ax = sns.heatmap(cm, annot=False, fmt='d', cmap='Blues')
heatmap_ax.set_xlabel("Predicted Labels")
heatmap_ax.set_ylabel("True Labels")
heatmap_ax.set_title("Confusion Matrix")
plt.show()

In [None]:
# Additional Fix: Ensure All Classes Are in Training Data

In [None]:
# Re-split 80/20, stratified on the encoded drone label so class proportions are
# kept in both parts. This replaces the earlier split; any model fitted on the
# old X_train must be refitted before it is scored on the new X_test.
# NOTE(review): stratification presumably requires at least two samples per
# class — confirm against the class counts.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)

In [None]:
# Refit on the current training split before scoring. The model was last fitted
# on the previous, unstratified split, so rows of the new X_test may have been
# part of its training data and the scores would be inflated.
dt_model.fit(X_train, y_train)

# Predict on test set
y_pred = dt_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Per-class report; every encoded label is listed so `target_names` stays aligned
# even if a class is missing from the test split and from the predictions
all_labels = np.arange(len(label_encoder.classes_))
print(classification_report(y_test, y_pred, labels=all_labels,
                            target_names=label_encoder.classes_, zero_division=0))

## Step 4: (Optional) Visualize the Decision Tree

In [None]:
from sklearn.tree import export_text

# Render the fitted tree's split rules as indented text
rule_feature_names = ["HazardType", "distance"]
tree_rules = export_text(dt_model, feature_names=rule_feature_names)
print(tree_rules)

In [None]:
# Fix 1: Increase Tree Depth to Capture More Classes
dt_model = DecisionTreeClassifier(criterion="gini", max_depth=10, random_state=42)

# Fit the deeper tree and refresh the predictions; otherwise the following cells
# would still inspect `y_pred` from the previous, shallower model.
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)

In [None]:
# Classes the model actually predicted on the test set
predicted_classes = np.unique(y_pred)

# Classes present in the test set that were never predicted
missing_labels = set(np.unique(y_test)).difference(predicted_classes)
print(f"Missing predicted labels: {missing_labels}")

In [None]:
# Report on every label present in the test set. `target_names` must have one
# name per entry of `labels`, in the same order, so the names are taken from the
# encoder at exactly those label ids; passing the full class list would attach
# names to the wrong rows.
test_labels = np.unique(y_test)
print(classification_report(y_test, y_pred, labels=test_labels,
                            target_names=label_encoder.classes_[test_labels], zero_division=0))

In [None]:
# Re-split 90/10, stratified on the encoded drone label. This replaces the
# earlier split, so models must be refitted before being scored on it.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.1, stratify=y_encoded, random_state=42)

In [None]:
# Depth-10 decision tree fitted on the current training split
dt_model = DecisionTreeClassifier(criterion="gini", max_depth=10, random_state=42)
dt_model.fit(X_train, y_train)

# Predict on test set
y_pred = dt_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Per-class report; every encoded label is listed so `target_names` stays aligned
# even if a class is missing from the test split and from the predictions
all_labels = np.arange(len(label_encoder.classes_))
print(classification_report(y_test, y_pred, labels=all_labels,
                            target_names=label_encoder.classes_, zero_division=0))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the refitted model on the current test split
y_pred_final = dt_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_final)

# Heatmap without per-cell annotations
plt.figure(figsize=(12, 6))
sns.heatmap(cm, annot=False, fmt='d', cmap='Blues').set(
    xlabel="Predicted Labels",
    ylabel="True Labels",
    title="Confusion Matrix",
)
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the current tree configuration on the full data
cv_scores = cross_val_score(dt_model, X, y_encoded, cv=5)
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print(f"Cross-Validation Accuracy: {cv_mean:.2f} ± {cv_std:.2f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# `balanced_df` is otherwise only created in a later cell, so on a top-to-bottom
# run it would not exist yet. Build the downsampled frame here: every class is
# reduced to the size of the smallest class.
min_class_count = geohazard_df["selected_drone"].value_counts().min()
balanced_df = geohazard_df.groupby("selected_drone").sample(n=min_class_count, random_state=42)

# One bar chart per stage: class counts before and after balancing
for frame, stage in ((geohazard_df, "Before"), (balanced_df, "After")):
    class_counts = frame["selected_drone"].value_counts()
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=class_counts.index, y=class_counts.values, ax=ax)
    ax.set_xticks([])  # Hide x-axis labels for clarity
    ax.set_ylabel("Count")
    ax.set_title(f"Class Distribution {stage} Balancing")
    plt.show()

In [None]:
from sklearn.metrics import classification_report
import pandas as pd

# Classification report as a nested dict (one entry per class plus the averages)
report = classification_report(y_test, y_pred_final, output_dict=True, zero_division=0)

# One row per class / average, one column per metric
df_report = pd.DataFrame(report).transpose()

# Show the table as the cell's result. The previous `ace_tools` helper is not an
# installable package and raised ImportError here.
df_report

In [None]:
# Depth-15 tree with class weights inversely proportional to class frequency
dt_model = DecisionTreeClassifier(criterion="gini", max_depth=15, class_weight="balanced", random_state=42)
dt_model.fit(X_train, y_train)

# Predict on test set
y_pred = dt_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Per-class report; every encoded label is listed so `target_names` stays aligned
# even if a class is missing from the test split and from the predictions
all_labels = np.arange(len(label_encoder.classes_))
print(classification_report(y_test, y_pred, labels=all_labels,
                            target_names=label_encoder.classes_, zero_division=0))

In [None]:
# Number of geohazards assigned to each drone model (most frequent first)
print(geohazard_df["selected_drone"].value_counts())

In [None]:
# Depth-10 tree with class weights inversely proportional to class frequency
dt_model = DecisionTreeClassifier(criterion="gini", max_depth=10, class_weight="balanced", random_state=42)
dt_model.fit(X_train, y_train)

# Predict on test set
y_pred = dt_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Per-class report; every encoded label is listed so `target_names` stays aligned
# even if a class is missing from the test split and from the predictions
all_labels = np.arange(len(label_encoder.classes_))
print(classification_report(y_test, y_pred, labels=all_labels,
                            target_names=label_encoder.classes_, zero_division=0))

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the class-weighted tree on the full data
cv_scores = cross_val_score(dt_model, X, y_encoded, cv=5)
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print(f"Cross-Validation Accuracy: {cv_mean:.2f} ± {cv_std:.2f}")

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline: 100 class-weighted trees, depth capped at 15
rf_model = RandomForestClassifier(n_estimators=100, max_depth=15, class_weight="balanced", random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

# Per-class report; every encoded label is listed so `target_names` stays aligned
# even if a class is missing from the test split and from the predictions
all_labels = np.arange(len(label_encoder.classes_))
print(classification_report(y_test, y_pred_rf, labels=all_labels,
                            target_names=label_encoder.classes_, zero_division=0))

In [None]:
from sklearn.tree import export_text

# Text rendering of the current decision tree's split rules
rule_feature_names = ["HazardType", "distance"]
tree_rules = export_text(dt_model, feature_names=rule_feature_names)
print("Decision Tree Structure:")
print(tree_rules)

-----
### ----- Fix Issues -----

### Step 1: Downsample dominant drones to balance the dataset size.

In [None]:
from sklearn.utils import resample

In [None]:
# Size of the smallest class: every class is downsampled to this many rows
min_class_count = geohazard_df["selected_drone"].value_counts().min()

# Downsample each class to `min_class_count` rows. GroupBy.sample keeps every
# column (including `selected_drone`) and avoids groupby.apply acting on the
# grouping column, which newer pandas versions deprecate.
balanced_df = geohazard_df.groupby("selected_drone").sample(n=min_class_count, random_state=42)

In [None]:
# Use SMOTE
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the minority classes of the TRAINING split only; the test split is
# left untouched.
# NOTE(review): SMOTE presumably needs more samples per class than its default
# number of neighbours — confirm against the smallest class in y_train.
smote = SMOTE(sampling_strategy="auto", random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

In [None]:
# Retrain on the SMOTE-balanced training data
dt_model = DecisionTreeClassifier(criterion="gini", max_depth=10, class_weight="balanced", random_state=42)
dt_model.fit(X_train_balanced, y_train_balanced)

# Predict on the (unbalanced, untouched) test set
y_pred_balanced = dt_model.predict(X_test)

# Score the predictions of THIS model; scoring the older `y_pred` would report
# the previous model's results.
accuracy = accuracy_score(y_test, y_pred_balanced)
print(f"Model Accuracy: {accuracy:.2f}")

# Per-class report; every encoded label is listed so `target_names` stays aligned
# even if a class is missing from the test split and from the predictions
all_labels = np.arange(len(label_encoder.classes_))
print(classification_report(y_test, y_pred_balanced, labels=all_labels,
                            target_names=label_encoder.classes_, zero_division=0))