In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely import wkt
import json
import datetime
import pyproj
import os
# Machine-specific locations. Each one can be overridden with an environment
# variable so the notebook runs on another machine without editing this cell;
# when the variable is unset the original hard-coded path is used.
PROJECT_ROOT = os.environ.get("CRIME_PREDICTION_ROOT", "D:/Projects/crime-prediction")
PROJ_DATA_DIR = os.environ.get(
    "CRIME_PREDICTION_PROJ_DATA",
    "D:/ProgramData/anaconda3/envs/crime-prediction/Library/share/proj",
)

# The data paths used by this notebook are relative, so the working directory
# is switched to the project root first.
os.chdir(PROJECT_ROOT)
# Tell pyproj which directory holds its PROJ data files.
pyproj.datadir.set_data_dir(PROJ_DATA_DIR)

In [9]:
# Input files, given relative to the current working directory.
# CSV read by parse_preprocessed_data():
pre_processed_file_path: str = "data/preprocessed/preprocessed-1.csv"
# JSON document loaded into weather_data:
weather_data_file_path: str = "data/preprocessed/weather_data.json"

In [10]:
# Load the weather lookup into memory.
# The encoding is passed explicitly: without it open() decodes with the
# platform's locale encoding (often not UTF-8 on Windows), whereas JSON text
# is expected to be UTF-8.
with open(weather_data_file_path, 'r', encoding='utf-8') as f:
    weather_data = json.load(f)

In [11]:
def parse_preprocessed_data(path=None):
    """Load the preprocessed offense CSV as a GeoDataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. When omitted, the module-level
        ``pre_processed_file_path`` is used, so existing zero-argument
        calls behave as before.

    Returns
    -------
    geopandas.GeoDataFrame
        The CSV rows with 'Offense Date' parsed as datetimes, 'Precinct' and
        'Offense Category' stored as categoricals, 'Latitude' and 'Longitude'
        as float32, and the WKT text in the 'geometry' column parsed into
        geometries tagged with CRS EPSG:4326.
    """
    if path is None:
        path = pre_processed_file_path

    df = pd.read_csv(
        path,
        parse_dates=['Offense Date'],
        dtype={
            'Precinct': 'category',
            'Offense Category': 'category',
            'Latitude': np.float32,
            'Longitude': np.float32,
        },
    )

    # Parse the whole WKT column with one GeoSeries.from_wkt call instead of
    # applying shapely's wkt.loads to each row from Python.
    geometry = gpd.GeoSeries.from_wkt(df['geometry'])
    return gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')

In [12]:
# Read the preprocessed records and report how many rows were loaded.
gdf = parse_preprocessed_data()
print(gdf.shape[0])

1280004


In [13]:
def getWeatherForDate(date, weather=None):
    """Return the weather recorded for the day and hour in which ``date`` falls.

    Parameters
    ----------
    date : pandas.Timestamp
        Timestamp to look up. Any timezone information is discarded; the
        wall-clock day and hour are used.
    weather : mapping, optional
        Lookup such that ``weather[day_key][hour]`` is a record with
        'feelslike', 'windspeed', 'precip' and 'visibility' entries.
        ``day_key`` is the ISO-8601 string of midnight of the day, e.g.
        '2020-01-31T00:00:00'. Defaults to the module-level ``weather_data``.

    Returns
    -------
    tuple
        ``(feelslike, windspeed, precip, visibility)`` for that day and hour.

    Raises
    ------
    KeyError, IndexError
        If the lookup has no entry for the day or hour (the exact type
        depends on the containers used in ``weather``).
    """
    if weather is None:
        weather = weather_data

    # Truncate to midnight rather than rounding: rounding to the nearest day
    # moves every timestamp after noon onto the *following* day's key, while
    # the hour below still comes from the original day.
    day_str = date.replace(tzinfo=None).normalize().isoformat()

    # One lookup of the hourly record, then pick the four metrics from it.
    hourly = weather[day_str][date.hour]
    return (
        hourly['feelslike'],
        hourly['windspeed'],
        hourly['precip'],
        hourly['visibility'],
    )

In [14]:
# Attach the weather for each offense's date and hour as four new columns.
# - getWeatherForDate returns (feelslike, windspeed, precip, visibility) for a timestamp
# - Only the 'Offense Date' column is needed, so it is iterated directly rather than
#   using gdf.apply(axis=1), which constructs a full row Series for every row
#   before calling the function
# - The tuples are collected into one frame built on gdf's index and assigned to
#   the new columns in order
weather_columns = ['Temperature', 'Wind Speed', 'Precipitation', 'Visibility']
gdf[weather_columns] = pd.DataFrame(
    [getWeatherForDate(offense_date) for offense_date in gdf['Offense Date']],
    index=gdf.index,
    columns=weather_columns,
)

In [15]:
# Write the frame, now including the weather columns, to CSV without the row index.
gdf.to_csv(path_or_buf='data/preprocessed/preprocessed-2.csv', index=False)