In [13]:
import pandas as pd
import geopandas as gpd
import numpy as np
from sklearn.cluster import KMeans
import folium
import os
import zipfile

In [14]:
# Locate the dataset archive and the folder it is unpacked into.
zip_file_path = r"C:\Users\gudal\Downloads\joyeetadey Optimal-Ambulance-Positioning-for-Road-Crashes main Datasets.zip"  # Change this to your ZIP file path
extract_folder = "extracted_data"

# Extract the ZIP file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

# Find the CSV file inside the extracted folder.
# Archives often wrap their contents in sub-directories, so walk the whole
# tree instead of looking only at the top level. The walk is top-down, which
# means a CSV sitting directly in `extract_folder` is still preferred.
# Directory and file names are sorted so the same file is chosen on every run,
# and the extension check is case-insensitive (".CSV" is accepted too).
csv_file = None
for current_dir, dirnames, filenames in os.walk(extract_folder):
    dirnames.sort()  # in-place sort fixes the order in which os.walk descends
    csv_names = sorted(name for name in filenames if name.lower().endswith(".csv"))
    if csv_names:
        csv_file = os.path.join(current_dir, csv_names[0])
        break

if csv_file is None:
    raise FileNotFoundError("No CSV file found in the extracted ZIP folder.")

# Load accident dataset
df = pd.read_csv(csv_file)

# Display first few rows to understand the structure
print(df.head())

   uid             datetime  latitude  longitude
0    1  2018-01-01 00:25:46 -1.188850  36.931382
1    2  2018-01-01 02:02:39 -0.662939  37.208730
2    3  2018-01-01 02:31:49 -0.662939  37.208730
3    4  2018-01-01 03:04:01 -1.288087  36.826583
4    5  2018-01-01 03:58:49 -1.188850  36.931382


In [15]:
# Missing-value handling, applied to `df` in place.
# Step 1: a record without both coordinates is unusable, so drop it.
coordinate_columns = ["latitude", "longitude"]
df.dropna(subset=coordinate_columns, inplace=True)

# Step 2: fill any remaining gaps in numeric columns with that column's median.
numeric_medians = df.median(numeric_only=True)
df.fillna(numeric_medians, inplace=True)

In [16]:
# Build one point per record (x = longitude, y = latitude) and wrap the
# records in a GeoDataFrame tagged with the EPSG:4326 coordinate reference system.
accident_points = gpd.points_from_xy(df["longitude"], df["latitude"])
gdf = gpd.GeoDataFrame(df, geometry=accident_points, crs="EPSG:4326")

In [17]:
# Group accident locations into clusters with K-Means.
num_clusters = 5  # Adjust as needed

# Features are the raw coordinates, in (latitude, longitude) column order.
coordinates = df[["latitude", "longitude"]]

# Fixed random_state keeps the cluster assignment repeatable between runs.
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(coordinates)
df["cluster"] = cluster_labels

# Persist the records together with their cluster label.
df.to_csv("processed_accident_data.csv", index=False)


In [18]:
# Visualize clusters on a map, centred on the mean accident location.
map_center = [df["latitude"].mean(), df["longitude"].mean()]
map_object = folium.Map(location=map_center, zoom_start=12)
colors = ["red", "blue", "green", "purple", "orange"]

# Iterate over the three needed columns directly rather than df.iterrows():
# iterrows() builds one Series per row with a single common dtype, so on an
# all-numeric frame the integer cluster label is upcast to float and cannot be
# used as a list index. Column-wise iteration keeps each column's own dtype and
# avoids creating a Series per row.
for lat, lon, cluster_id in zip(df["latitude"], df["longitude"], df["cluster"]):
    # The modulo wraps around the palette if there are more clusters than colors.
    marker_color = colors[int(cluster_id) % len(colors)]
    folium.CircleMarker(
        location=[lat, lon],
        radius=5,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.6,
    ).add_to(map_object)


In [19]:
# Write the interactive map to an HTML file.
map_object.save("accident_clusters_map.html")

# Report the names of the two output files.
completion_message = (
    "Data Preprocessing Completed! "
    "Processed file saved as 'processed_accident_data.csv' "
    "and map as 'accident_clusters_map.html'"
)
print(completion_message)

Data Preprocessing Completed! Processed file saved as 'processed_accident_data.csv' and map as 'accident_clusters_map.html'
