In [4]:
import pandas as pd
import folium
from sklearn.cluster import KMeans
import seaborn as sns
from geopy.distance import geodesic

def generate_color_palette(num_colors):
    """Generate a color palette based on the number of clusters."""
    return sns.color_palette("hsv", num_colors).as_hex()

def create_cluster_legend(map_object, color_palette, cluster_count):
    """Create a legend for the clusters on the map."""
    legend_html = f'''
    <div style="position: fixed; 
                bottom: 50px; left: 50px; width: 200px; height: auto; 
                z-index: 9999; font-size:14px; 
                background-color: white; opacity: 0.8;
                border: 2px solid black; border-radius: 5px;">
    <strong>Cluster Legend</strong><br>
    '''
    
    for i in range(cluster_count):
        legend_html += f'<i style="background: {color_palette[i]}; width: 20px; height: 20px; display: inline-block; margin-right: 5px;"></i> Cluster {i + 1}<br>'

    legend_html += '</div>'
    map_object.get_root().html.add_child(folium.Element(legend_html))

def calculate_distance(coord1, coord2):
    """Calculate the geodesic distance between two latitude/longitude points."""
    return geodesic(coord1, coord2).miles

def cluster_and_plot(dataset):
    # Perform K-means clustering
    cluster_count = 6  # Set the number of clusters
    kmeans = KMeans(n_clusters=cluster_count, random_state=0)
    kmeans.fit(dataset[['latitude', 'longitude']])

    # Add cluster labels to the DataFrame
    dataset['cluster_label'] = kmeans.labels_

    # Initialize Folium map
    map_clusters = folium.Map(location=[dataset['latitude'].mean(), dataset['longitude'].mean()], zoom_start=4)

    # Generate a color palette for the clusters
    color_palette = generate_color_palette(cluster_count)

    # Add markers to the map
    for lat, lon, cluster_label, state, city in zip(
            dataset['latitude'], dataset['longitude'], dataset['cluster_label'], dataset['state'], dataset['city']):
        folium.CircleMarker(
            [lat, lon],
            radius=3,  
            color=color_palette[cluster_label],  
            fill=True,
            fill_color=color_palette[cluster_label], 
            fill_opacity=0.7,
            popup=f"State: {state}<br>City: {city}<br>Cluster: {cluster_label + 1}"  # Popup with state, city, and cluster number
        ).add_to(map_clusters)

    # Create a legend for clusters
    create_cluster_legend(map_clusters, color_palette, cluster_count)

    # Save the map with a specific filename
    map_clusters.save('K-Means_cluster_analysis_with_6_clusters.html')

    # Calculate cluster centroids
    centroids = dataset.groupby('cluster_label')[['latitude', 'longitude']].mean().reset_index()
    centroid_coords = [(row['latitude'], row['longitude']) for _, row in centroids.iterrows()]

    # Calculate distances between each pair of clusters
    distance_matrix = pd.DataFrame(index=range(cluster_count), columns=range(cluster_count))

    for i in range(cluster_count):
        for j in range(i + 1, cluster_count):
            distance = calculate_distance(centroid_coords[i], centroid_coords[j])
            distance_matrix.iloc[i, j] = distance
            distance_matrix.iloc[j, i] = distance  # Fill symmetric value

    # Display the distance matrix
    print("\nDistance Matrix (in miles):")
    print(distance_matrix)

    # Group the data by cluster, state, age, manner of death, and gender, and count occurrences
    correlation_data = dataset.groupby(['cluster_label', 'state', 'age', 'manner_of_death', 'gender']).size().reset_index(name='count')

    # Sort the data by count in descending order
    correlation_data.sort_values(by='count', ascending=False, inplace=True)

    # Display the correlations for each cluster
    for cluster in range(cluster_count):
        cluster_data = correlation_data[correlation_data['cluster_label'] == cluster]
        print(f"\nCluster {cluster + 1} (Color: {color_palette[cluster]}):")
        print("\n{:<10} {:<20} {:<20} {:<20} {:<10}".format('State', 'Age', 'Manner of Death', 'Gender', 'Count'))
        for state in cluster_data['state'].unique():
            state_cluster_data = cluster_data[cluster_data['state'] == state]
            age_range = f"{state_cluster_data['age'].min()} - {state_cluster_data['age'].max()}"
            genders = state_cluster_data['gender'].unique()
            gender_str = " & ".join(genders) if len(genders) > 1 else genders[0]
            print("{:<10} {:<20} {:<20} {:<20} {:<10}".format(state, age_range, state_cluster_data.iloc[0]['manner_of_death'], gender_str, state_cluster_data['count'].sum()))

# Load data from CSV file into a DataFrame
dataset = pd.read_csv('fatal-police-shootings-data-continental-us.csv')

# Remove duplicates and handle missing values
dataset.drop_duplicates(inplace=True)  # Remove duplicate rows
dataset.dropna(inplace=True)  # Drop rows with missing values, if needed

# Perform clustering and plotting
cluster_and_plot(dataset)



Distance Matrix (in miles):
             0            1            2            3            4  \
0          NaN  1703.314331   682.403511  1145.335274   597.771303   
1  1703.314331          NaN  1373.879675   611.261052  2300.787281   
2   682.403511  1373.879675          NaN   764.507592  1164.801391   
3  1145.335274   611.261052   764.507592          NaN  1738.839597   
4   597.771303  2300.787281  1164.801391  1738.839597          NaN   
5   671.357394  2098.853074   744.869565  1488.432137   673.695033   

             5  
0   671.357394  
1  2098.853074  
2   744.869565  
3  1488.432137  
4   673.695033  
5          NaN  

Cluster 1 (Color: #ffd500):

State      Age                  Manner of Death      Gender               Count     
IN         15.0 - 69.0          shot                 M & F                96        
KY         18.0 - 75.0          shot                 M & F                72        
IL         13.0 - 63.0          shot                 M & F                10