In [None]:
import pandas as pd
import folium
from sklearn.cluster import KMeans
import seaborn as sns
from geopy.distance import geodesic
import warnings

# Suppress every warning for the rest of the process.
# NOTE(review): this is a blanket filter (no category/module restriction), so
# warnings emitted by the libraries imported above are hidden as well.
warnings.filterwarnings('ignore')

def generate_color_palette(num_colors):
    """Build one hex color per cluster from seaborn's "hsv" palette.

    Args:
        num_colors: How many colors to request from the palette.

    Returns:
        The result of ``as_hex()`` on the seaborn palette, i.e. the palette's
        colors expressed as hex strings.
    """
    hsv_palette = sns.color_palette("hsv", num_colors)
    return hsv_palette.as_hex()

def create_cluster_legend(map_obj, color_palette, cluster_count):
    """Attach a fixed-position HTML cluster legend to a folium map.

    Args:
        map_obj: Object whose ``get_root().html`` receives the legend element.
        color_palette: Indexable collection of CSS color strings; entry ``i``
            colors the bullet of the ``i``-th legend row.
        cluster_count: Number of legend rows to emit. Rows are labelled
            "Cluster 1" through "Cluster <cluster_count>".
    """
    # Opening markup: a white, bordered box pinned to the bottom-left corner.
    legend_open = '''
    <div style="position: fixed; 
                bottom: 50px; left: 50px; 
                width: 200px; height: auto; 
                z-index:9999; 
                font-size:14px; 
                background-color:white;
                border:2px solid grey;
                padding: 10px;">
    <h4 style="margin: 0;">Cluster Legend</h4>
    <ul style="list-style-type: none; padding: 0;">
    '''

    # One list item per cluster: a colored bullet followed by a 1-based label.
    legend_rows = [
        f'<li><span style="color: {color_palette[idx]};">&#9679;</span> Cluster {idx + 1}</li>'
        for idx in range(cluster_count)
    ]

    legend_markup = legend_open + ''.join(legend_rows) + '</ul></div>'
    root_html = map_obj.get_root().html
    root_html.add_child(folium.Element(legend_markup))

def calculate_distance(coord1, coord2):
    """Return the geodesic distance in miles between two points.

    Args:
        coord1: First point, forwarded unchanged to geopy's ``geodesic``.
        coord2: Second point, forwarded unchanged to geopy's ``geodesic``.

    Returns:
        The ``.miles`` value of the geodesic distance between the two points.
    """
    separation = geodesic(coord1, coord2)
    return separation.miles

# Load data from CSV file into a DataFrame
datasets = pd.read_csv('fatal-police-shootings-data-continental-us.csv')

# Print column names to check for 'race' and 'demographics'
print("Columns in the dataset:", datasets.columns.tolist())

# Convert date column to datetime type; unparseable values become NaT
datasets['date'] = pd.to_datetime(datasets['date'], errors='coerce')

# Drop rows with missing or invalid dates
datasets.dropna(subset=['date'], inplace=True)

# Extract year from the date
datasets['year'] = datasets['date'].dt.year

# Drop duplicate rows
datasets.drop_duplicates(inplace=True)

# Drop rows with missing values in critical columns (latitude, longitude, state, city, race)
datasets.dropna(subset=['latitude', 'longitude', 'state', 'city', 'race'], inplace=True)

# Group data by state and city, count the occurrences, and reset index
state_city_counts = datasets.groupby(['state', 'city']).size().reset_index(name='count')

# Find the top five cities overall (fewer if the data holds fewer cities)
top_cities = state_city_counts.nlargest(5, 'count')
cluster_count = len(top_cities)

# Configure K-means with one cluster per top city; the model is fitted below
kmeans = KMeans(n_clusters=cluster_count, random_state=0)

# Initialize Folium map centered on the United States
map_clusters = folium.Map(location=[37.0902, -95.7129], zoom_start=4)

# Prepare to collect output information
output_info = []

# 'demographics' is an optional column; check once instead of once per row
has_demographics = 'demographics' in datasets.columns

# Defaults so the legend, distance matrix and map export below still work
# when no row with usable coordinates survived the cleaning steps
color_palette = []
centroids = pd.DataFrame(columns=['latitude', 'longitude'])

# Fit K-means on the coordinates of ALL remaining incidents (not only those in
# the top cities); the top cities only determine the number of clusters
coordinates = datasets[['latitude', 'longitude']].dropna()
if not coordinates.empty:
    kmeans.fit(coordinates)

    # Generate a color palette for the clusters
    color_palette = generate_color_palette(cluster_count)

    for _, row in top_cities.iterrows():
        in_city = (datasets['state'] == row['state']) & (datasets['city'] == row['city'])
        # Non-inplace dropna returns a new frame, so the filtered slice of
        # `datasets` is never mutated
        city_data = datasets[in_city].dropna(subset=['latitude', 'longitude'])
        if city_data.empty:
            continue

        unique_races = city_data['race'].unique()  # Get unique race categories for the city

        # Assign every incident in the city to a cluster in one call, using the
        # same column names the model was fitted with
        cluster_labels = kmeans.predict(city_data[['latitude', 'longitude']])

        # Add one marker per incident, colored by its cluster
        for (_, city_row), cluster_label in zip(city_data.iterrows(), cluster_labels):
            popup_content = f"<b>City:</b> {city_row['city']}<br>" \
                            f"<b>State:</b> {city_row['state']}<br>" \
                            f"<b>Race:</b> {city_row['race']}<br>"

            # Include demographics if the column exists
            if has_demographics:
                popup_content += f"<b>Demographics:</b> {city_row['demographics']}<br>"

            # The incident count is the city-wide total, repeated on each marker
            popup_content += f"<b>Incidents:</b> {row['count']}"

            marker_color = color_palette[cluster_label]
            folium.CircleMarker(
                location=[city_row['latitude'], city_row['longitude']],
                radius=3,
                color=marker_color,
                fill=True,
                fill_color=marker_color,
                fill_opacity=0.7,
                popup=popup_content
            ).add_to(map_clusters)

        # Collect output information with unique races
        output_info.append(f"State: {row['state']}, City: {row['city']}, "
                           f"Incidents: {row['count']}, Races: {list(unique_races)}")

    # Cluster centroids are only available once the model has been fitted
    centroids = pd.DataFrame(kmeans.cluster_centers_, columns=['latitude', 'longitude'])

# Create a legend for clusters
create_cluster_legend(map_clusters, color_palette, cluster_count)

# Calculate distances between each pair of centroids. The matrix is float-typed
# and starts at 0.0, so the diagonal (a centroid to itself) reads as zero miles.
centroid_points = list(centroids.itertuples(index=False, name=None))  # (latitude, longitude) tuples
centroid_ids = range(len(centroid_points))
distance_matrix = pd.DataFrame(0.0, index=centroid_ids, columns=centroid_ids)

for i, point_a in enumerate(centroid_points):
    for j in range(i + 1, len(centroid_points)):
        distance = calculate_distance(point_a, centroid_points[j])
        distance_matrix.iat[i, j] = distance
        distance_matrix.iat[j, i] = distance  # Fill symmetric value

# Display the distance matrix
print("\nDistance Matrix (in miles):")
print(distance_matrix)

# Save the map as an HTML file
map_clusters.save('k_mean_clustering_based_on_race_demographics_and_incidents.html')

# Print collected output information
for info in output_info:
    print(info)
