In [1]:
import pandas as pd
import folium
from sklearn.cluster import KMeans
import seaborn as sns
from geopy.distance import geodesic
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

def validate_coordinates(datasets):
    """Validate latitude and longitude values."""
    valid_latitude = datasets['latitude'].between(-90, 90)
    valid_longitude = datasets['longitude'].between(-180, 180)
    return datasets[valid_latitude & valid_longitude]

def generate_color_palette(num_colors):
    """Generate a color palette based on the number of clusters."""
    return sns.color_palette("hsv", num_colors).as_hex()

def create_cluster_legend(map_obj, color_palette, cluster_count):
    """Add a legend to the folium map."""
    legend_html = '''
    <div style="position: fixed; 
                bottom: 50px; left: 50px; 
                width: 200px; height: auto; 
                z-index:9999; 
                font-size:14px; 
                background-color:white;
                border:2px solid grey;
                padding: 10px;">
    <h4 style="margin: 0;">Cluster Legend</h4>
    <ul style="list-style-type: none; padding: 0;">
    '''

    for i in range(cluster_count):
        legend_html += f'<li><span style="color: {color_palette[i]};">&#9679;</span> Cluster {i + 1}</li>'

    legend_html += '</ul></div>'
    map_obj.get_root().html.add_child(folium.Element(legend_html))

def calculate_distance(coord1, coord2):
    """Calculate the geodesic distance between two latitude/longitude points."""
    return geodesic(coord1, coord2).miles

def cluster_and_plot_yearly_data(datasets):
    # Convert date column to datetime type
    datasets['date'] = pd.to_datetime(datasets['date'], errors='coerce')
    
    # Drop rows with missing or invalid dates
    datasets.dropna(subset=['date'], inplace=True)
    
    # Extract year from the date
    datasets['year'] = datasets['date'].dt.year

    # Drop duplicate rows
    datasets.drop_duplicates(inplace=True)

    # Drop rows with missing values in critical columns (latitude, longitude, state, city)
    datasets.dropna(subset=['latitude', 'longitude', 'state', 'city'], inplace=True)

    # Group data by year and state, count occurrences, and reset index
    year_state_counts = datasets.groupby(['year', 'state']).size().reset_index(name='count')

    # Find the top five states for each year
    top_states_by_year = year_state_counts.groupby('year').apply(lambda x: x.nlargest(5, 'count')).reset_index(drop=True)

    # Initialize an empty dictionary to store the top cities for each state and year
    top_cities_by_year = {}

    # Iterate over each row of the top states DataFrame
    for index, row in top_states_by_year.iterrows():
        state_data = datasets[(datasets['state'] == row['state']) & (datasets['year'] == row['year'])]
        top_cities_by_year.setdefault(row['year'], []).append((row['state'], state_data))

    # Generate maps for each year
    for year, state_data_list in top_cities_by_year.items():
        # Initialize Folium map centered on the United States
        map_clusters = folium.Map(location=[37.0902, -95.7129], zoom_start=4)

        # Perform K-means clustering for each state's data
        for state, state_data in state_data_list:
            # Validate coordinates
            state_data = validate_coordinates(state_data)
            state_data.dropna(subset=['latitude', 'longitude'], inplace=True)

            # Check if there are any remaining rows after dropping NaN values
            if not state_data.empty:
                # Select relevant features for clustering
                coordinates = state_data[['latitude', 'longitude']]

                # Perform K-means clustering
                cluster_count = 5  # Number of clusters
                kmeans = KMeans(n_clusters=cluster_count, random_state=0)
                kmeans.fit(coordinates)

                # Add cluster labels to the DataFrame
                state_data['cluster_label'] = kmeans.labels_

                # Generate a color palette for the clusters
                color_palette = generate_color_palette(cluster_count)

                # Add dots for each data point
                for lat, lon, cluster_label, city in zip(
                        state_data['latitude'], state_data['longitude'], state_data['cluster_label'], state_data['city']):
                    folium.CircleMarker(
                        location=[lat, lon],
                        radius=3, 
                        color=color_palette[cluster_label],
                        fill=True,
                        fill_color=color_palette[cluster_label],
                        fill_opacity=0.7,
                        popup=f"Year: {year}<br>State: {state}<br>City: {city}<br>Cluster: {cluster_label + 1}"  # Popup with state, city, and cluster number
                    ).add_to(map_clusters)

        # Create a legend for clusters
        create_cluster_legend(map_clusters, color_palette, cluster_count)

        # Calculate cluster centroids
        centroids = pd.DataFrame(kmeans.cluster_centers_, columns=['latitude', 'longitude'])

        # Calculate distances between each pair of centroids
        distance_matrix = pd.DataFrame(index=range(cluster_count), columns=range(cluster_count))

        for i in range(cluster_count):
            for j in range(i + 1, cluster_count):
                distance = calculate_distance(
                    (centroids.iloc[i]['latitude'], centroids.iloc[i]['longitude']),
                    (centroids.iloc[j]['latitude'], centroids.iloc[j]['longitude'])
                )
                distance_matrix.iloc[i, j] = distance
                distance_matrix.iloc[j, i] = distance  # Fill symmetric value

        # Display the distance matrix
        print(f"\nDistance Matrix for year {year} (in miles):")
        print(distance_matrix)

        # Save the map as an HTML file with the year
        map_clusters.save(f'clusters_map_{year}.html')

        # Print states and cities for each year
        print(f"\nStates and cities according to year {year}:")
        for state_data in state_data_list:
            state = state_data[0]
            cities = state_data[1]['city'].unique()[:3]  # Select top 3 cities
            incidents = state_data[1].shape[0]  # Count incidents
            print(f"{state} ({year}): {', '.join(cities)}; Incidents: {incidents}")

# Load data from CSV file into a DataFrame
datasets = pd.read_csv('fatal-police-shootings-data-continental-us.csv')

# Perform clustering and generate maps based on year
cluster_and_plot_yearly_data(datasets)



Distance Matrix for year 2015 (in miles):
            0           1           2           3           4
0         NaN   78.652316   89.060826  137.699073  117.610018
1   78.652316         NaN  144.087423   63.819611  110.531583
2   89.060826  144.087423         NaN  182.251448   96.055497
3  137.699073   63.819611  182.251448         NaN  112.764563
4  117.610018  110.531583   96.055497  112.764563         NaN

States and cities according to year 2015:
CA (2015): San Ysidro, Calexico, San Diego; Incidents: 190
TX (2015): Brownsville, Edinburg, Mission; Incidents: 100
FL (2015): Homestead, Miami, Little Havana; Incidents: 61
AZ (2015): Bisbee, Sierra Vista, Tucson; Incidents: 42
OK (2015): Colbert, Rufe, Ardmore; Incidents: 32

Distance Matrix for year 2016 (in miles):
            0           1           2           3           4
0         NaN  192.558602    96.20425   47.774471   46.024433
1  192.558602         NaN  201.770692  218.998079  156.996562
2    96.20425  201.770692         