In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import DBSCAN
from geopy.distance import great_circle

import folium
from folium import plugins
from string import Template 

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw events table from the CSV in the working directory
events_csv = "events.csv"
data = pd.read_csv(events_csv)

In [None]:
# Preview the first five rows
data.head(n=5)

In [None]:
# Number of missing entries in each column
print("Missing values:\n", data.isna().sum())

In [None]:
# Rows whose county is missing
data.loc[data["county"].isnull()]

In [None]:
# Rows whose city is missing
data.loc[data["city"].isnull()]

In [None]:
# Descriptive statistics for the numeric columns
summary_stats = data.describe()
print("Summary Statistics:\n", summary_stats)

In [None]:
# How many events fall into each event type
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='event-type', data=data, ax=ax)
ax.set_title('Distribution of Event Types')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Plot a small sample of the data, as rendering every event takes too long.
# The sample size is capped at the table length (DataFrame.sample raises when asked
# for more rows than exist) and seeded so the map is reproducible on re-run.
SAMPLE_SIZE = 10000
MIN_MARKER_RADIUS = 2  # pixels; count / 100 alone is sub-pixel for most cities
plot_df = data.sample(n=min(SAMPLE_SIZE, len(data)), random_state=42)

# Event count per city (rows with a missing city are ignored by groupby)
event_counts = plot_df.groupby('city').size().rename('count')

# Coordinates of the first sampled event in each city that has both coordinates;
# computed once here instead of rescanning plot_df twice per city inside the loop
first_coords = (
    plot_df.dropna(subset=['latitude', 'longitude'])
    .groupby('city')[['latitude', 'longitude']]
    .first()
)

# One row per mappable city, busiest first
location_counts = (
    first_coords.join(event_counts)
    .reset_index()
    .sort_values('count', ascending=False)
)

# Create a map centered on the mean latitude and longitude of the sample
map_center = [plot_df['latitude'].mean(), plot_df['longitude'].mean()]
mymap = folium.Map(location=map_center, zoom_start=4)

# Add one marker per city, sized by its event count
for city, lat, lon, n_events in zip(
    location_counts['city'],
    location_counts['latitude'],
    location_counts['longitude'],
    location_counts['count'],
):
    folium.CircleMarker(
        location=[lat, lon],
        radius=max(n_events / 100, MIN_MARKER_RADIUS),
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.5,
        popup=f"{city}: {n_events} events",
    ).add_to(mymap)

# Display the map within the Jupyter Notebook
mymap

In [None]:
# Event counts by location: keep the ten cities with the most events
city_event_counts = data['city'].value_counts()
location_counts = city_event_counts.nlargest(n=10)

In [None]:
# Bar chart of the ten busiest cities
fig, ax = plt.subplots(figsize=(10, 6))
location_counts.plot.bar(ax=ax)
ax.set(
    title='Top 10 Cities by Number of Events',
    xlabel='City',
    ylabel='Number of Events',
)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Event counts by year: parse the dates, then store the parsed date and its year on `data`
event_dates = pd.to_datetime(data['date'])
data['date'] = event_dates
data['year'] = event_dates.dt.year

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='year', data=data, ax=ax)
ax.set(
    title='Number of Events by Year',
    xlabel='Year',
    ylabel='Number of Events',
)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Correlation analysis: Pearson correlation between the two coordinate columns
coordinate_columns = ['latitude', 'longitude']
correlation_matrix = data.loc[:, coordinate_columns].corr(method='pearson')

In [None]:
# Annotated heatmap of the coordinate correlation matrix
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Outlier detection: side-by-side boxplots of the coordinate columns
fig, ax = plt.subplots(figsize=(10, 6))
coordinates = data[['latitude', 'longitude']]
sns.boxplot(data=coordinates, ax=ax)
ax.set_title('Boxplot of Latitude and Longitude')
plt.show()

In [None]:
# Missing values analysis: count the gaps per column and chart them
missing_values = data.isna().sum()

fig, ax = plt.subplots(figsize=(8, 5))
missing_values.plot.bar(ax=ax)
ax.set(
    title='Missing Values by Column',
    xlabel='Column',
    ylabel='Number of Missing Values',
)
plt.xticks(rotation=45)
plt.show()

In [None]:
def count_events_within_radius(latitude, longitude, radius_km, events=None):
    """Count the events lying within ``radius_km`` of a reference point.

    Distances are great-circle distances on a sphere, computed for all events at
    once with the haversine formula rather than one geopy call per row.

    Parameters
    ----------
    latitude, longitude : float
        Reference point in decimal degrees.
    radius_km : float
        Search radius in kilometres; events exactly on the boundary are counted.
    events : pandas.DataFrame, optional
        Table with 'latitude' and 'longitude' columns in decimal degrees.
        Defaults to the notebook-level ``data`` frame.

    Returns
    -------
    int
        Number of events within the radius. Rows with a missing coordinate
        are never counted.

    Raises
    ------
    ValueError
        If ``latitude`` is outside [-90, 90].
    """
    if events is None:
        events = data
    if not -90 <= latitude <= 90:
        raise ValueError("Latitude must be in the [-90; 90] range.")

    # Mean Earth radius in km; intended to match geopy's spherical model
    earth_radius_km = 6371.009

    lat0 = np.radians(latitude)
    lon0 = np.radians(longitude)
    lats = np.radians(events['latitude'].to_numpy(dtype=float))
    lons = np.radians(events['longitude'].to_numpy(dtype=float))

    # Haversine formula; clip guards against rounding pushing the term outside [0, 1]
    hav = (
        np.sin((lats - lat0) / 2) ** 2
        + np.cos(lat0) * np.cos(lats) * np.sin((lons - lon0) / 2) ** 2
    )
    distances_km = 2 * earth_radius_km * np.arcsin(np.sqrt(np.clip(hav, 0.0, 1.0)))

    # NaN distances (missing coordinates) compare False and so are excluded
    return int(np.count_nonzero(distances_km <= radius_km))

In [None]:
# Count the events around a reference point
point_latitude = 40.7128
point_longitude = -74.0060
radius_km = 50
total_events_nearby = count_events_within_radius(point_latitude, point_longitude, radius_km)
# Report the radius actually used, so the message stays correct if radius_km is changed
print(f"Total events within {radius_km} KM of the point:", total_events_nearby)

In [None]:
# Fresh base map for the cluster markers, centred on the mean event coordinates
map_center = data[['latitude', 'longitude']].mean().tolist()
mymap = folium.Map(zoom_start=4, location=map_center)

def clusters_by_event_type(event_type, color, eps_km=10, min_samples=10):
    """Cluster the events of one type with DBSCAN and draw each cluster on ``mymap``.

    Reads the notebook-level ``data`` frame and adds one circle marker per cluster
    to the notebook-level ``mymap``. Noise points (label -1) are not drawn.

    Parameters
    ----------
    event_type : object
        Value of the 'event-type' column selecting the events to cluster.
    color : str
        Colour used for the outline and fill of this event type's markers.
    eps_km : float, default 10
        DBSCAN neighbourhood radius, in kilometres.
    min_samples : int, default 10
        Minimum number of neighbouring events for a DBSCAN core point.

    Returns
    -------
    None
    """
    kms_per_radian = 6371.0088  # mean Earth radius in km, converts km to radians
    min_marker_radius = 3  # pixels; keeps small clusters visible on the map

    # Drop rows without coordinates (DBSCAN rejects NaN) and copy so that adding
    # the 'cluster' column below does not write into a slice of `data`
    plot_data = (
        data[data["event-type"] == event_type]
        .dropna(subset=['latitude', 'longitude'])
        .copy()
    )
    if plot_data.empty:
        # Nothing to cluster; fitting DBSCAN on zero samples would raise
        return

    # The haversine metric expects [latitude, longitude] pairs in radians
    coords = np.radians(plot_data[['latitude', 'longitude']].to_numpy(dtype=float))

    # eps is an angular distance, so it must be paired with the haversine metric;
    # a Euclidean metric on degree coordinates would not measure kilometres
    dbscan = DBSCAN(
        eps=eps_km / kms_per_radian,
        min_samples=min_samples,
        algorithm='ball_tree',
        metric='haversine',
    ).fit(coords)
    plot_data['cluster'] = dbscan.labels_

    # One marker per cluster, excluding noise points (cluster == -1)
    clustered = plot_data[plot_data['cluster'] != -1]
    for cluster_id, cluster_data in clustered.groupby('cluster'):
        n_events = len(cluster_data)
        cluster_center = [cluster_data['latitude'].mean(), cluster_data['longitude'].mean()]
        folium.CircleMarker(
            # Size by the cluster's own event count rather than a leftover global
            radius=max(n_events / 100, min_marker_radius),
            location=cluster_center,
            fill=True,
            color=color,
            fill_color=color,
            popup=f"{event_type} cluster {cluster_id}: {n_events} events",
        ).add_to(mymap)


# HTML legend: one coloured entry per event type, appended while each type is clustered
legend_string_template = Template('&nbsp; $cluster &nbsp; <i class="fa fa-circle fa-2x" style="color:$color"></i><br>')
# min-height (rather than a fixed height) lets the box grow when there are many event types
legend_string = '''
     <div style="position: fixed; 
         bottom: 50px; right: 50px; width: 200px; min-height: 200px; 
         border:2px solid grey; z-index:9999; font-size:14px;
    ">
    &nbsp; LEGEND <br>'''

# Skip missing event types: a NaN matches no rows, leaving nothing to cluster
event_types = data['event-type'].dropna().unique()
colors = ['red', 'blue', 'green', 'purple', 'orange', 'lightgreen']
if len(event_types) > len(colors):
    # zip() stops at the shorter sequence, so without extra colours every event
    # type beyond the sixth would silently vanish from both the map and the legend
    extra_colors = sns.color_palette('husl', len(event_types) - len(colors)).as_hex()
    colors = colors + list(extra_colors)

for event_type, color in zip(event_types, colors):
    clusters_by_event_type(event_type, color)
    legend_string += legend_string_template.substitute({'cluster': event_type, 'color': color})

legend_string += '</div>'

mymap.get_root().html.add_child(folium.Element(legend_string))

# Display the map with cluster markers
mymap