In [10]:
import numpy as np
from sklearn.cluster import DBSCAN
import pandas as pd
import geopandas as gpd
from shapely import wkt
import matplotlib as plt
import pyproj
import os
# Machine-specific locations. Each one can be overridden through an environment variable so
# the notebook can run on another machine without editing code; the defaults are the
# original hard-coded paths, so behaviour is unchanged when the variables are unset.
PROJECT_ROOT = os.environ.get("CRIME_PREDICTION_ROOT", "D:/Projects/crime-prediction")
PROJ_DATA_DIR = os.environ.get(
    "CRIME_PREDICTION_PROJ_DIR",
    "D:/ProgramData/anaconda3/envs/crime-prediction/Library/share/proj",
)

# Relative paths used further down (e.g. 'data/preprocessed/...') resolve against this directory.
os.chdir(PROJECT_ROOT)
# Tell pyproj explicitly where the PROJ data files live.
pyproj.datadir.set_data_dir(PROJ_DATA_DIR)

In [11]:

    
def cluster_lat_long(df, radius_km, min_samples):
    """Attach a representative coordinate to every point using haversine DBSCAN.

    Points are clustered with ``DBSCAN`` on the haversine (great-circle)
    metric. Every cluster is represented by its first core sample in row
    order, and that sample's coordinates are copied onto each cluster member.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Latitude`` and ``Longitude`` columns, in degrees.
    radius_km : float
        Neighbourhood radius in kilometres; converted to radians for ``eps``.
    min_samples : int
        Passed through to ``DBSCAN(min_samples=...)``.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` with its index reset (the old index is kept as a
        column) plus ``ClusterLatitude`` and ``ClusterLongitude``. Rows that
        DBSCAN labels as noise (possible when ``min_samples > 1``) belong to
        no cluster and get ``NaN`` in both columns. An empty input is returned
        with the two (empty) columns added and without running DBSCAN.
    """
    df = df.reset_index()

    if len(df) == 0:
        # DBSCAN cannot be fitted on zero samples; keep the output schema consistent instead.
        df['ClusterLatitude'] = df['Latitude']
        df['ClusterLongitude'] = df['Longitude']
        return df

    # The haversine metric expects [latitude, longitude] in radians.
    coords_rad = np.radians(df[['Latitude', 'Longitude']].values)
    kms_per_radian = 6371.0088  # Earth's mean radius in km, i.e. kilometres per radian of arc
    epsilon = radius_km / kms_per_radian
    db = DBSCAN(eps=epsilon, min_samples=min_samples, algorithm="ball_tree", metric="haversine").fit(coords_rad)

    labels = np.asarray(db.labels_)
    # Sorted so that "first occurrence" below means "first in row order".
    core_positions = np.sort(np.asarray(db.core_sample_indices_, dtype=np.intp))

    # For each cluster label, the row position of its first core sample.
    cluster_ids, first_core = np.unique(labels[core_positions], return_index=True)
    representative_of = dict(zip(cluster_ids.tolist(), core_positions[first_core].tolist()))

    # Noise (label -1) has no core sample and therefore no representative: mark it with -1.
    representative_pos = np.array(
        [representative_of.get(label, -1) for label in labels.tolist()],
        dtype=np.intp,
    )
    has_representative = representative_pos >= 0

    for source_col, target_col in (('Latitude', 'ClusterLatitude'), ('Longitude', 'ClusterLongitude')):
        source = df[source_col].to_numpy()
        # Keep the source float width; promote non-float input so NaN is representable.
        target = np.full(len(df), np.nan, dtype=np.result_type(source.dtype, np.float32))
        target[has_representative] = source[representative_pos[has_representative]]
        df[target_col] = target
    return df

In [12]:
# Relative path (resolved against the current working directory) of the CSV that
# parse_preprocessed_data_1() reads.
pre_processed_file_path = 'data/preprocessed/preprocessed-1.csv'

In [13]:
def parse_preprocessed_data_1():
    """Load the first-stage preprocessed CSV as a GeoDataFrame.

    Reads the file named by the module-level ``pre_processed_file_path``,
    parsing ``Offense Date`` as datetimes and applying explicit dtypes to the
    precinct, category and coordinate columns. The WKT strings in the
    ``geometry`` column are then decoded and used as the active geometry.

    Returns:
        geopandas.GeoDataFrame: the CSV rows with CRS ``EPSG:4326``.
    """
    column_dtypes = {
        'Precinct': 'category',
        'Offense Category': 'category',
        'Latitude': np.float32,
        'Longitude': np.float32,
    }
    records = pd.read_csv(
        pre_processed_file_path,
        parse_dates=['Offense Date'],
        dtype=column_dtypes,
    )
    # The CSV stores geometries as WKT text; decode each one into a geometry object.
    shapes = records["geometry"].apply(wkt.loads)
    return gpd.GeoDataFrame(records, geometry=shapes, crs='EPSG:4326')

In [14]:
# Load the preprocessed offences; a later cell adds the 'Offense Date Rounded Hour' column to this frame.
gdf = parse_preprocessed_data_1()

In [15]:
# Bucket each offence by hour. Note that ``dt.round`` rounds to the nearest hour; it does not floor.
gdf['Offense Date Rounded Hour'] = gdf['Offense Date'].dt.round('h')

# Keep one row per distinct (latitude, longitude, hour) combination, then group by hour.
location_hour_columns = ['Latitude', 'Longitude', 'Offense Date Rounded Hour']
unique_combinations = gdf[location_hour_columns].drop_duplicates()
grouped_df = unique_combinations.groupby('Offense Date Rounded Hour')

# Cluster each hour's locations independently; ``apply`` forwards the keyword arguments
# to cluster_lat_long.
# NOTE(review): a 50 km radius with min_samples=1 is very coarse - confirm it is intended.
# NOTE(review): the printed output contains rows with Latitude near -122 and Longitude near 47,
# i.e. apparently swapped coordinates; verify the upstream preprocessing.
cf = grouped_df[location_hour_columns].apply(cluster_lat_long, radius_km=50, min_samples=1)
print(cf)

                               index    Latitude   Longitude  \
Offense Date Rounded Hour                                      
1975-12-16 15:00:00       0   283435   47.538105 -122.267265   
1979-02-09 14:00:00       0  1404346 -122.286972   47.608807   
1981-02-15 05:00:00       0  1226176   47.508259 -122.383598   
1981-08-22 20:00:00       0  1389347 -122.274963   47.545860   
1988-09-29 02:00:00       0   715777   47.612679 -122.339386   
...                              ...         ...         ...   
2025-07-09 20:00:00       1   186880   47.661385 -122.333130   
                          2   452330 -122.344925   47.619583   
2025-07-09 22:00:00       0   155748   47.580246 -122.385208   
                          1   933375   47.521038 -122.369347   
2025-07-09 23:00:00       0  1450112 -122.302727   47.721024   

                            Offense Date Rounded Hour  ClusterLatitude  \
Offense Date Rounded Hour                                                
1975-12-16 15:00:00

In [16]:
# Persist the clustered rows. index=False drops the (hour, row-number) index produced by
# groupby.apply; the hour is still written because it is also a regular column of cf.
cf.to_csv('data/preprocessed/preprocessed-2.csv', index=False)

In [17]:
# Number of distinct locations in each rounded hour, largest first.
# The count column is called 'Latitude' only because the size was taken on that column.
hourly_sizes = grouped_df['Latitude'].size()
group_counts = hourly_sizes.to_frame().sort_values(by='Latitude', ascending=False)

# Show only the hours that contain more than one distinct location.
has_multiple_locations = group_counts['Latitude'] > 1
print(group_counts[has_multiple_locations])

                           Latitude
Offense Date Rounded Hour          
2020-03-08 00:00:00             295
2020-05-01 00:00:00             247
2020-03-15 00:00:00             220
2020-04-01 00:00:00             162
2020-05-10 00:00:00             142
...                             ...
2015-11-10 05:00:00               2
2023-02-05 19:00:00               2
2023-02-05 23:00:00               2
2023-02-06 02:00:00               2
2023-02-05 06:00:00               2

[147811 rows x 1 columns]
