In [1]:
import pandas as pd
import folium
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import geodesic

def validate_coordinates(dataset):
    """Keep only the rows whose coordinates fall inside the valid lat/lon ranges.

    A row is kept when its 'latitude' lies in [-90, 90] and its 'longitude'
    lies in [-180, 180], both bounds inclusive. Rows with a missing coordinate
    fail the comparison and are therefore dropped as well.

    Returns the filtered rows of `dataset`; the input frame is not modified.
    """
    lat = dataset['latitude']
    lon = dataset['longitude']
    lat_in_range = lat.ge(-90) & lat.le(90)
    lon_in_range = lon.ge(-180) & lon.le(180)
    return dataset.loc[lat_in_range & lon_in_range]

def generate_color_palette(num_colors):
    """Return `num_colors` hex colour codes taken from seaborn's "hsv" palette.

    One colour is produced per cluster, so callers can index the result by
    cluster id.
    """
    hsv_palette = sns.color_palette("hsv", num_colors)
    return hsv_palette.as_hex()

def create_cluster_legend(map_obj, color_palette, cluster_count):
    """Attach a fixed-position HTML legend for the clusters to a folium map.

    Args:
        map_obj: folium map whose root HTML receives the legend element.
        color_palette: indexable sequence of CSS colour strings; entry ``i``
            is used for the bullet of cluster ``i``.
        cluster_count: number of legend rows to emit. Rows are labelled
            "Cluster 1" .. "Cluster <cluster_count>" (1-based).

    Returns:
        None. The map is modified in place.
    """
    # Opening markup: a white, grey-bordered box pinned to the bottom-left corner.
    legend_header = '''
    <div style="position: fixed; 
                bottom: 50px; left: 50px; 
                width: 200px; height: auto; 
                z-index:9999; 
                font-size:14px; 
                background-color:white;
                border:2px solid grey;
                padding: 10px;">
    <h4 style="margin: 0;">Cluster Legend</h4>
    <ul style="list-style-type: none; padding: 0;">
    '''

    # One list item per cluster: a coloured bullet followed by the 1-based label.
    legend_items = ''.join(
        f'<li><span style="color: {color_palette[idx]};">&#9679;</span> Cluster {idx + 1}</li>'
        for idx in range(cluster_count)
    )

    legend_markup = legend_header + legend_items + '</ul></div>'
    root_html = map_obj.get_root().html
    root_html.add_child(folium.Element(legend_markup))

def calculate_distance(coord1, coord2):
    """Return the geodesic distance, in miles, between two coordinate points.

    Both arguments are passed straight to geopy's ``geodesic``; the callers in
    this file supply ``(latitude, longitude)`` tuples.
    """
    separation = geodesic(coord1, coord2)
    return separation.miles

def _build_cluster_map(dataset, color_palette, cluster_count):
    """Draw one colour-coded circle marker per row, add the legend, and return the folium map."""
    # Centre the map on the mean latitude and longitude of the plotted points.
    map_center = [dataset['latitude'].mean(), dataset['longitude'].mean()]
    map_with_clusters = folium.Map(location=map_center, zoom_start=4)

    for lat, lon, cluster_label, state, location in zip(
            dataset['latitude'], dataset['longitude'], dataset['cluster_id'],
            dataset['state'], dataset['city']):
        marker_color = color_palette[cluster_label]
        folium.CircleMarker(
            [lat, lon],
            radius=3,
            color=marker_color,
            fill=True,
            fill_color=marker_color,
            fill_opacity=0.7,
            popup=f"State: {state}<br>Location: {location}"
        ).add_to(map_with_clusters)

    create_cluster_legend(map_with_clusters, color_palette, cluster_count)
    return map_with_clusters


def _centroid_distance_table(dataset, cluster_count):
    """Return a symmetric table of geodesic distances (miles) between cluster centroids."""
    # Keep cluster_id as the index so each lookup below is by cluster id rather
    # than by row position; a missing id then fails loudly instead of silently
    # pairing the wrong centroid.
    cluster_centroids = dataset.groupby('cluster_id')[['latitude', 'longitude']].mean()

    # Rows/columns are the 0-based cluster ids; the diagonal is left unset (NaN).
    distance_table = pd.DataFrame(index=range(cluster_count), columns=range(cluster_count))

    for i in range(cluster_count):
        coord1 = (cluster_centroids.loc[i, 'latitude'], cluster_centroids.loc[i, 'longitude'])
        for j in range(i + 1, cluster_count):
            coord2 = (cluster_centroids.loc[j, 'latitude'], cluster_centroids.loc[j, 'longitude'])
            distance = calculate_distance(coord1, coord2)
            distance_table.iloc[i, j] = distance
            distance_table.iloc[j, i] = distance  # Fill symmetric value

    return distance_table


def perform_clustering_and_visualization(dataset, cluster_count):
    """Cluster the points with K-means, save a map of the clusters, and print a summary.

    Steps:
      1. Drop rows with out-of-range coordinates (``validate_coordinates``).
      2. Fit ``KMeans(n_clusters=cluster_count, random_state=0)`` on the
         'latitude'/'longitude' columns and store the labels in 'cluster_id'.
      3. Save a folium map with one marker per row and a legend to
         ``cluster_visualization_<cluster_count>.html``.
      4. When ``cluster_count >= 2``, print the table of pairwise geodesic
         distances (miles) between cluster centroids.
      5. Print the largest cluster, its size, and its most common state.

    Args:
        dataset: DataFrame with 'latitude', 'longitude', 'state' and 'city'
            columns. It is not modified; the function works on its own copy.
        cluster_count: number of K-means clusters to fit.

    Returns:
        None. Results are written to the HTML file and to stdout.

    Raises:
        ValueError: if ``cluster_count`` is below 1 or fewer valid rows remain
            than clusters were requested.

    Note:
        The summary line and the map legend number clusters from 1, whereas
        the distance table is labelled with the 0-based cluster ids.
    """
    # Work on an independent copy: the validated frame is a boolean-filtered
    # selection, and adding a column to it directly triggers pandas'
    # chained-assignment warning whenever validation dropped rows.
    dataset = validate_coordinates(dataset).copy()

    # Fail early with a clear message instead of deep inside KMeans.
    if cluster_count < 1 or len(dataset) < cluster_count:
        raise ValueError(
            f"Cannot form {cluster_count} clusters from {len(dataset)} rows with valid coordinates."
        )

    # Apply K-means clustering on the coordinates only
    location_data = dataset[['latitude', 'longitude']]
    kmeans_model = KMeans(n_clusters=cluster_count, random_state=0)
    kmeans_model.fit(location_data)

    # Add cluster labels to the (private) DataFrame
    dataset['cluster_id'] = kmeans_model.labels_

    # Render and save the map; the file name encodes the number of clusters
    color_palette = generate_color_palette(cluster_count)
    map_with_clusters = _build_cluster_map(dataset, color_palette, cluster_count)
    map_with_clusters.save(f'cluster_visualization_{cluster_count}.html')

    # Largest cluster and the state that appears most often within it
    cluster_frequency = dataset['cluster_id'].value_counts()
    most_frequent_cluster = cluster_frequency.idxmax()
    frequent_cluster_data = dataset[dataset['cluster_id'] == most_frequent_cluster]
    most_represented_state = frequent_cluster_data['state'].value_counts().idxmax()

    # Pairwise centroid distances only make sense with at least two clusters
    if cluster_count >= 2:
        distance_table = _centroid_distance_table(dataset, cluster_count)
        print(f"Distance Table for {cluster_count} clusters:")
        print(distance_table)

    # Output summary information
    print(f"For {cluster_count} clusters:")
    print(f"- The most frequent cluster is Cluster {most_frequent_cluster + 1} with {cluster_frequency.max()} points.")
    print(f"- The most represented state in this cluster is {most_represented_state}.")

# Load the CSV and clean it in a single chain:
#   1. drop rows missing any of the key columns,
#   2. drop rows that duplicate another on those same key columns,
#   3. keep only rows whose coordinates are within valid ranges.
dataset = (
    pd.read_csv('fatal-police-shootings-data-continental-us.csv')
    .dropna(subset=['latitude', 'longitude', 'name', 'date'])
    .drop_duplicates(subset=['latitude', 'longitude', 'name', 'date'])
    .pipe(validate_coordinates)
)

# Cluster counts to analyse, one full run (map + printed summary) per value
cluster_counts = list(range(2, 7))

for cluster_count in cluster_counts:
    perform_clustering_and_visualization(dataset, cluster_count)


Distance Table for 2 clusters:
             0            1
0          NaN  1609.307259
1  1609.307259          NaN
For 2 clusters:
- The most frequent cluster is Cluster 2 with 3822 points.
- The most represented state in this cluster is TX.
Distance Table for 3 clusters:
             0            1            2
0          NaN  1972.757039  1145.005296
1  1972.757039          NaN   874.207726
2  1145.005296   874.207726          NaN
For 3 clusters:
- The most frequent cluster is Cluster 2 with 2544 points.
- The most represented state in this cluster is FL.
Distance Table for 4 clusters:
             0            1            2            3
0          NaN  2083.872862   634.165895  1419.701158
1  2083.872862          NaN  1476.014038   737.260978
2   634.165895  1476.014038          NaN   787.114941
3  1419.701158   737.260978   787.114941          NaN
For 4 clusters:
- The most frequent cluster is Cluster 2 with 2438 points.
- The most represented state in this cluster is FL.
Distance