In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely import wkt
import json
import datetime
import pyproj
import os
# Machine-specific locations. Each can be overridden through an environment
# variable so the notebook runs on other machines without editing this cell;
# the defaults are the original hard-coded paths.
# All relative data paths used later are resolved against this working directory.
os.chdir(os.environ.get("CRIME_PREDICTION_ROOT", "D:/Projects/crime-prediction"))
# Point pyproj at the PROJ data directory explicitly.
pyproj.datadir.set_data_dir(
    os.environ.get(
        "CRIME_PREDICTION_PROJ_DIR",
        "D:/ProgramData/anaconda3/envs/crime-prediction/Library/share/proj",
    )
)

In [2]:
# Input files, relative to the working directory set in the first cell.
# Crime records produced by the previous preprocessing step.
pre_processed_file_path = "data/preprocessed/preprocessed-3.csv"
# Layer holding the race and age columns that get joined onto the crime records.
race_and_age_data_file_path = "data/preprocessed/race_and_age_data.geojson"

In [3]:
def parse_preprocessed_data(path=None):
    """Load the preprocessed crime CSV into a GeoDataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. Defaults to the module-level
        ``pre_processed_file_path`` when omitted.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per CSV record, with an extra ``Id`` column copied from the
        row index and the ``geometry`` column parsed from WKT, tagged with
        CRS EPSG:4326.
    """
    if path is None:
        # Resolve at call time so a later change to the global is respected.
        path = pre_processed_file_path

    df = pd.read_csv(
        path,
        parse_dates=['Offense Date'],
        dtype={
            'Precinct': 'category',
            'Offense Category': 'category',
            'Latitude': np.float32,
            'Longitude': np.float32,
        },
    )
    # Keep the original row position as an explicit key column.
    df["Id"] = df.index

    # Vectorised WKT parsing; avoids a Python-level call per row.
    geometry = gpd.GeoSeries.from_wkt(df["geometry"])
    return gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')

In [4]:
# Load the crime records and report how many rows were read.
crime_gdf = parse_preprocessed_data()
print(crime_gdf.shape[0])

1280004


In [5]:
# Read the race/age GeoJSON layer into a GeoDataFrame.
race_and_age_gdf = gpd.read_file(filename=race_and_age_data_file_path)

In [6]:
# Show both schemas before joining; 'Tract Name' appears in each frame.
print(race_and_age_gdf.columns, crime_gdf.columns, sep="\n")

Index(['Tract Name', 'White_Total', 'Black_Total', 'Color_Total',
       '18_Under_Total', '65_Over_Total', 'geometry'],
      dtype='object')
Index(['Report Number', 'Offense Date', 'Latitude', 'Longitude', 'Beat',
       'Precinct', 'Sector', 'Neighborhood', 'Reporting Area',
       'Offense Category', 'geometry', 'centroid', 'Temperature', 'Wind Speed',
       'Precipitation', 'Visibility', 'Tract Name', 'Tract Population',
       'Tract Population Below Poverty', 'Id'],
      dtype='object')


In [7]:
# Attach the race/age columns to every crime record via a spatial left join:
# a record picks up the attributes of the polygon it lies within, and records
# without a match are kept with missing values.
#
# A record can match more than one polygon, which yields several rows with the
# same "Id". Keep exactly one *whole* row per Id. The previous
# groupby("Id").first() picked the first non-null value column by column, so
# attributes from different matched polygons could be mixed in a single row.
crime_gdf = (
    gpd.sjoin(crime_gdf, race_and_age_gdf, how="left", predicate="within")
    # Both inputs carry 'Tract Name'; keep the crime-side value only.
    .drop(columns=['index_right', 'Tract Name_right'])
    .rename(columns={'Tract Name_left': 'Tract Name'})
    # Stable sort keeps the join's row order within each Id, matching the
    # ordering groupby produced before.
    .sort_values("Id", kind="stable")
    .drop_duplicates(subset="Id", keep="first")
    # As before, "Id" ends up as the index rather than a column.
    .set_index("Id")
)
print(crime_gdf.columns)

Index(['Report Number', 'Offense Date', 'Latitude', 'Longitude', 'Beat',
       'Precinct', 'Sector', 'Neighborhood', 'Reporting Area',
       'Offense Category', 'geometry', 'centroid', 'Temperature', 'Wind Speed',
       'Precipitation', 'Visibility', 'Tract Name', 'Tract Population',
       'Tract Population Below Poverty', 'White_Total', 'Black_Total',
       'Color_Total', '18_Under_Total', '65_Over_Total'],
      dtype='object')


In [8]:
# Persist the enriched records for the next stage; the index is not written.
crime_gdf.to_csv(
    path_or_buf='data/preprocessed/preprocessed-4.csv',
    index=False,
)