In [1]:
import sys
import os
import pandas as pd
import geopandas as gpd
import numpy as np
from datetime import datetime
from shapely.geometry import Point
import random
import pyproj

# Point pyproj at the PROJ data directory of the local conda environment.
# The location is machine-specific, so the override is applied only where the
# directory actually exists; elsewhere pyproj keeps its own default lookup.
PROJ_DATA_DIR = "D:/ProgramData/anaconda3/envs/crime-prediction/Library/share/proj"
if os.path.isdir(PROJ_DATA_DIR):
    pyproj.datadir.set_data_dir(PROJ_DATA_DIR)

  _set_context_ca_bundle_path(ca_bundle_path)


In [2]:
# Location of the raw crime CSV export that parse_raw_data() loads.
raw_file_path = "data/SPD_Crime_Data__2008-Present_20250710.csv"

In [3]:
# Load the beat polygons and merge them into one outline; that outline is used
# to determine whether crime locations are within Seattle city limits.
beats_file = "data/beats-data.geojson"
beats = gpd.read_file(beats_file)
city_limits = gpd.GeoDataFrame(
    {"City": "Seattle"},
    index=[0],
    geometry=[beats.geometry.union_all()],
    crs="EPSG:4326",
)

  _init_proj_data()


In [4]:
# read raw data of crimes from seattle crime data
def parse_float(value):
    """Convert a raw coordinate field into a float.

    Args:
        value: The field as read from the CSV (normally a string).

    Returns:
        float: The parsed number, or the sentinel ``-1.0`` when the field is
        ``'REDACTED'``, blank, or ``None``.

    Raises:
        ValueError: If the field is any other non-numeric text.
    """
    # Surrounding whitespace is irrelevant for both the marker and the number.
    text = value.strip() if isinstance(value, str) else value
    # A blank field would make float() raise, so it shares the sentinel that
    # is already used for redacted coordinates.
    if text is None or text in ('', 'REDACTED'):
        return -1.0
    return float(text)

def parse_beat(value):
    """Normalise a raw beat field.

    The codes ``'-'``, ``'99'`` and ``'OOJ'`` are mapped to ``None``
    (presumably they mark records without a usable beat -- confirm against
    the data dictionary); every other value is returned unchanged.
    """
    unusable_codes = ('-', '99', 'OOJ')
    return None if value in unusable_codes else value

def parse_raw_data():
    """Load the raw crime CSV and blank out coordinates outside the city.

    Reads the file named by the module-level ``raw_file_path`` and uses the
    module-level ``city_limits`` GeoDataFrame for the boundary test.

    Steps:
      1. Read the selected columns, parsing the two date columns and running
         ``parse_float`` / ``parse_beat`` on the coordinate and beat fields.
      2. Build point geometries from Longitude/Latitude and left-join them
         against ``city_limits`` with the ``within`` predicate.
      3. Set Latitude and Longitude to missing for rows with no match (the
         ``geometry`` column is left as built in step 2).
      4. Replace offense dates whose year is before 2008 with the report
         date/time of the same row.

    Prints the row count, the number of rows with a missing latitude, and the
    offense-date range.

    Returns:
        geopandas.GeoDataFrame: The cleaned records (EPSG:4326), without the
        helper columns added by the spatial join.
    """
    raw_df = pd.read_csv(
        raw_file_path,
        usecols=[
            'Offense Date',
            'Report DateTime',
            'Precinct',
            'Offense Category',
            'Latitude',
            'Longitude',
            'Beat',
            'Sector',
            'Neighborhood',
            'Reporting Area',
        ],
        parse_dates=['Offense Date', 'Report DateTime'],
        converters={
            'Latitude': parse_float,
            'Longitude': parse_float,
            'Beat': parse_beat,
        },
        date_format="%m/%d/%Y %I:%M:%S %p",
        dtype={
            'Precinct': 'category',
            'Offense Category': 'category',
        },
    )

    # Find locations that are outside the Seattle city boundary for cleanup.
    raw_gdf = gpd.GeoDataFrame(
        raw_df,
        geometry=gpd.points_from_xy(raw_df["Longitude"], raw_df["Latitude"]),
        crs="EPSG:4326",
    )
    raw_gdf = gpd.sjoin(raw_gdf, city_limits, how="left", predicate="within")

    # A left join leaves 'City' empty for points that are not within the
    # boundary. Their coordinates are cleared here so they can later be
    # re-sampled from the beat.
    outside_city = raw_gdf['City'].isna()
    raw_gdf.loc[outside_city, ['Latitude', 'Longitude']] = [None, None]
    raw_gdf = raw_gdf.drop(columns=['index_right', 'City'])

    # Replace offense dates older than 2008 with the report date.
    before_2008 = raw_gdf["Offense Date"].dt.year < 2008
    raw_gdf.loc[before_2008, 'Offense Date'] = raw_gdf["Report DateTime"]

    missing_location_count = int(raw_gdf['Latitude'].isna().sum())
    print(f"Raw data count: {len(raw_gdf):,}")
    print(f"Missing Lat Long data count: {missing_location_count:,}")
    print(f"Date Range: {raw_gdf['Offense Date'].min()} to {raw_gdf['Offense Date'].max()}")
    return raw_gdf

In [5]:
# Load and clean the raw crime records; parse_raw_data() also prints summary counts.
raw_gdf = parse_raw_data()

Raw data count: 1,466,080
Missing Lat Long data count: 388,166
Date Range: 1975-12-16 15:00:00 to 2025-07-09 22:52:00


In [6]:
def sample_point_in_beat(beat_polygon, max_attempts=10_000):
    """Draw a random point that lies inside ``beat_polygon``.

    Uses rejection sampling: candidates are drawn uniformly from the
    polygon's bounding box until one is contained in the polygon.

    Args:
        beat_polygon: Geometry exposing ``bounds``, ``contains``, ``is_empty``
            and ``representative_point`` (e.g. a shapely polygon).
        max_attempts: Upper bound on rejection-sampling draws. It keeps the
            loop from spinning forever on geometry that a uniform draw can
            practically never hit (for example zero area).

    Returns:
        A point inside the polygon, or ``None`` when ``beat_polygon`` is
        ``None`` or empty. If no candidate is accepted within
        ``max_attempts`` draws, the polygon's ``representative_point()`` is
        returned instead.
    """
    if beat_polygon is None or beat_polygon.is_empty:
        return None

    min_x, min_y, max_x, max_y = beat_polygon.bounds
    for _ in range(max_attempts):
        candidate = Point(random.uniform(min_x, max_x), random.uniform(min_y, max_y))
        if beat_polygon.contains(candidate):
            return candidate

    # Deterministic fallback so callers still receive a usable location.
    return beat_polygon.representative_point()
            
def fill_missing_coordinates(df, beats_gdf):
    """Fill missing Latitude/Longitude in ``df`` with random points in each row's beat.

    For every row where Latitude or Longitude is missing, the row's ``Beat``
    is looked up in ``beats_gdf`` (columns ``name`` and ``geometry``) and a
    point is drawn inside that polygon with ``sample_point_in_beat``. Both
    coordinates of the row are then taken from the same sampled point, with
    x as Longitude and y as Latitude, matching how the point geometries are
    built from ``(Longitude, Latitude)`` elsewhere in this notebook.

    Rows whose beat is missing or not present in ``beats_gdf`` are left with
    missing coordinates.

    Args:
        df: Frame with ``Beat``, ``Latitude`` and ``Longitude`` columns.
            Modified in place.
        beats_gdf: Frame providing one polygon per beat name.

    Returns:
        None. Only the Latitude/Longitude columns are written; any
        ``geometry`` column of ``df`` is not refreshed.
    """
    missing = df['Latitude'].isna() | df['Longitude'].isna()
    if not missing.any():
        return

    polygon_by_beat = dict(zip(beats_gdf['name'], beats_gdf['geometry']))

    def _sample_for(beat):
        # No beat, or a beat without a polygon, means nothing to sample from.
        polygon = polygon_by_beat.get(beat) if pd.notna(beat) else None
        return sample_point_in_beat(polygon) if polygon is not None else None

    points = [_sample_for(beat) for beat in df.loc[missing, 'Beat']]

    # Plain lists are assigned positionally, so both columns are filled from
    # the same sampled point regardless of how the index is labelled.
    nan = float('nan')
    df.loc[missing, 'Latitude'] = [p.y if p else nan for p in points]
    df.loc[missing, 'Longitude'] = [p.x if p else nan for p in points]

    
    
    

In [7]:
# Sample replacement coordinates from each row's beat, report how many rows
# still have no latitude, then keep only the rows that do have one.
fill_missing_coordinates(raw_gdf, beats)
print(f"Data with missing location count: {int(raw_gdf['Latitude'].isna().sum()):,}")
raw_gdf = raw_gdf[raw_gdf['Latitude'].notna()]

Data with missing location count: 14,268


In [9]:
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing the preprocessed records.
os.makedirs('data/preprocessed', exist_ok=True)
raw_gdf.to_csv('data/preprocessed/preprocessed-1.csv', index=False)