## predict crime by location and time

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import colors

from shapely.geometry import box
from shapely.plotting import plot_polygon, plot_points

import geopandas as gpd

from shapely import STRtree,buffer

import warnings
warnings.simplefilter(action='ignore', category=Warning)

from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import OneHotEncoder

import pointpats
from shapely import Point

#seting up palette 
palette = ["#FAC8BE", "#80E1C6", "#FFB3E1", "#6CC3FC", "#FFD168", "#C894E1"]
sns.set(rc={"axes.facecolor":"#e6e6e6","figure.facecolor":"#f5f5f5"})
cmap = colors.ListedColormap( ["#FAC8BE", "#80E1C6", "#FFB3E1", "#6CC3FC", "#FFD168", "#C894E1"])

In [2]:
# Load the dataset
df = pd.read_csv('./data/crime_dc.csv')

# Display the first 5 rows of the dataset
df.head()

Unnamed: 0,neighborhood_cluster,offense_group,census_tract,longitude,end_date,offense_text,shift,district,yblock,ward,...,xblock,block,start_date,ccn,offense,anc,report_date,method,location,latitude
0,cluster 38,violent,7404.0,-76.976591,,HOMICIDE,midnight,7.0,131428.0,8.0,...,402032.0,1800 - 1824 block of frederick douglass place se,2023-02-23 16:24:00,23029603,HOMICIDE,8C,2023-02-24 00:00:00,gun,"38.8506559459,-76.9765909045",38.850656
1,cluster 20,violent,9504.0,-76.993575,2023-06-16 18:53:00,ASSAULT W/DANGEROUS WEAPON,day,4.0,140787.56,5.0,...,400557.06,900 - 998 block of michigan avenue ne,2023-06-16 18:29:00,23096959,ASSAULT W/DANGEROUS WEAPON,5B,2023-06-17 11:16:47,others,"38.934972488,-76.9935749549",38.934972
2,cluster 25,property,8410.0,-76.990893,2023-04-13 13:00:00,THEFT/OTHER,midnight,1.0,136927.0,6.0,...,400790.0,1100 - 1199 block of h street ne,2023-04-12 19:30:00,23058339,THEFT/OTHER,6A,2023-04-15 03:15:41,others,"38.900195113,-76.9908927027",38.900195
3,cluster 1,property,3801.0,-77.040824,2023-08-20 22:02:00,THEFT/OTHER,evening,3.0,138822.23,1.0,...,396459.65,1731 - 1785 block of florida avenue nw,2023-08-20 21:10:00,23137253,THEFT/OTHER,1C,2023-08-26 21:56:04,others,"38.9172612211,-77.0408237172",38.917261
4,cluster 6,property,10700.0,-77.040859,2023-05-20 15:54:00,THEFT/OTHER,evening,2.0,137483.0,2.0,...,396456.0,1130 - 1199 block of connecticut avenue nw,2023-05-20 15:40:00,23079620,THEFT/OTHER,2C,2023-05-20 19:48:30,others,"38.9051969682,-77.0408588935",38.905197


In [3]:
df['start_date'] = pd.to_datetime(df['start_date'])
df['end_date'] = pd.to_datetime(df['end_date'])

df=df[df['start_date'].dt.year>2022]

In [4]:
df.index=pd.RangeIndex(start=0,stop=df.shape[0])
df=df.reset_index()
df

Unnamed: 0,index,neighborhood_cluster,offense_group,census_tract,longitude,end_date,offense_text,shift,district,yblock,...,xblock,block,start_date,ccn,offense,anc,report_date,method,location,latitude
0,0,cluster 38,violent,7404.0,-76.976591,NaT,HOMICIDE,midnight,7.0,131428.00,...,402032.00,1800 - 1824 block of frederick douglass place se,2023-02-23 16:24:00,23029603,HOMICIDE,8C,2023-02-24 00:00:00,gun,"38.8506559459,-76.9765909045",38.850656
1,1,cluster 20,violent,9504.0,-76.993575,2023-06-16 18:53:00,ASSAULT W/DANGEROUS WEAPON,day,4.0,140787.56,...,400557.06,900 - 998 block of michigan avenue ne,2023-06-16 18:29:00,23096959,ASSAULT W/DANGEROUS WEAPON,5B,2023-06-17 11:16:47,others,"38.934972488,-76.9935749549",38.934972
2,2,cluster 25,property,8410.0,-76.990893,2023-04-13 13:00:00,THEFT/OTHER,midnight,1.0,136927.00,...,400790.00,1100 - 1199 block of h street ne,2023-04-12 19:30:00,23058339,THEFT/OTHER,6A,2023-04-15 03:15:41,others,"38.900195113,-76.9908927027",38.900195
3,3,cluster 1,property,3801.0,-77.040824,2023-08-20 22:02:00,THEFT/OTHER,evening,3.0,138822.23,...,396459.65,1731 - 1785 block of florida avenue nw,2023-08-20 21:10:00,23137253,THEFT/OTHER,1C,2023-08-26 21:56:04,others,"38.9172612211,-77.0408237172",38.917261
4,4,cluster 6,property,10700.0,-77.040859,2023-05-20 15:54:00,THEFT/OTHER,evening,2.0,137483.00,...,396456.00,1130 - 1199 block of connecticut avenue nw,2023-05-20 15:40:00,23079620,THEFT/OTHER,2C,2023-05-20 19:48:30,others,"38.9051969682,-77.0408588935",38.905197
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25850,25850,cluster 23,property,8802.0,-76.988471,2023-05-13 17:21:00,THEFT/OTHER,day,5.0,137671.00,...,401000.00,1100 - 1199 block of penn street ne,2023-05-13 17:20:00,23422572,THEFT/OTHER,5D,2023-06-15 10:02:38,others,"38.9068971135,-76.9884706924",38.906897
25851,25851,cluster 24,property,9000.0,-76.960097,2023-08-09 06:45:00,THEFT F/AUTO,day,5.0,140142.00,...,403459.94,2855 - 3200 block of bladensburg road ne,2023-08-08 21:30:00,23130097,THEFT F/AUTO,5C,2023-08-09 11:53:00,others,"38.9291504511,-76.960096837",38.929150
25852,25852,cluster 8,property,4702.0,-77.011543,2023-08-16 09:15:00,THEFT/OTHER,day,1.0,136737.04,...,398998.67,50 - 79 block of massachusetts avenue nw,2023-08-16 09:00:00,23135742,THEFT/OTHER,6E,2023-08-18 08:03:43,others,"38.8984836697,-77.01154328",38.898484
25853,25853,cluster 15,property,1304.0,-77.058081,2023-09-19 18:18:00,THEFT/OTHER,evening,2.0,140772.69,...,394964.33,3319 - 3499 block of connecticut avenue nw,2023-09-19 18:01:00,23154929,THEFT/OTHER,3C,2023-09-19 19:12:25,others,"38.9348242806,-77.0580805327",38.934824


In [5]:
df['day_of_week'] = df['start_date'].dt.dayofweek
df['hour'] = df['start_date'].dt.hour

In [6]:
metro_lines = gpd.read_file('./data/dc-maps/maps/metro-lines.geojson')
atm = gpd.read_file('./data/dc-maps/maps/atm-banking.geojson')
banks = gpd.read_file('./data/dc-maps/maps/bank-locations.geojson')
gas = gpd.read_file('./data/dc-maps/maps/gas-stations.geojson')
hospitals = gpd.read_file('./data/dc-maps/maps/hospitals.geojson')
# benches = gpd.read_file('./data/dc-maps/maps/benches.geojson') # toomany
dc_boundary = gpd.read_file('./data/dc-maps/maps/dc-boundary.geojson')
district = gpd.read_file('./data/dc-maps/maps/district-mask.geojson')
libraries = gpd.read_file('./data/dc-maps/maps/libraries.geojson')
metro_stations=gpd.read_file('./data/dc-maps/maps/metro-station-entrances-district.geojson')
parks = gpd.read_file('./data/dc-maps/maps/national-parks.geojson')
museums = gpd.read_file('./data/dc-maps/maps/museums-in-dc.geojson')
police_stations = gpd.read_file('./data/dc-maps/maps/police-stations.geojson')
shuttle_bus = gpd.read_file('./data/dc-maps/maps/shuttle-bus-stops.geojson')
shopping_centers = gpd.read_file('./data/dc-maps/maps/shopping-centers.geojson')
camera = gpd.read_file('./data/dc-maps/maps/camera-enforcement-locations.geojson')
grocery=gpd.read_file('./data/dc-maps/maps/grocery-store-locations.geojson')
metro_bus = gpd.read_file('./data/dc-maps/maps/metro-bus-stops.geojson')
post_office = gpd.read_file('./data/dc-maps/maps/post-offices.geojson')
schools = gpd.read_file('./data/dc-maps/maps/public-schools.geojson')

In [7]:
def fishnet(geometry, step=0.01):
    bounds = geometry.bounds
    xmin,ymin,xmax,ymax=bounds
    result = []
    for i in np.arange(xmin, xmax, step):
        for j in np.arange(ymin, ymax, step):
            b = box(i, j, i+step, j+step)
            g = geometry.intersection(b)
            if g.is_empty:
                continue
            result.append(g)
    return result

# make crime_data a geopanda frame
df['geometry'] = gpd.points_from_xy(df['longitude'], df['latitude'])
crime_data = gpd.GeoDataFrame(df, crs="EPSG:4326")


# polygon=dc_boundary['geometry'][0]
# res=fishnet(polygon, 0.005)
# nets=gpd.GeoDataFrame(geometry=res)
# nets['index']=range(len(nets))

In [8]:
def count_close_building(crime_data, buildings, new_col_name='counts', max_distance=0.002):
    data=crime_data
    temp=data[['index','geometry']]
    buildings_location=buildings[['geometry']]

    # Find events within the buffered_museums
    # 1 degree is approximately 111 km;
    temp_buffered = gpd.sjoin_nearest( temp, buildings_location, how='left',max_distance=max_distance,distance_col='distance')
    temp_buffered=temp_buffered[~temp_buffered['index_right'].isnull()]

    counts=pd.DataFrame(temp_buffered.groupby('index').size().rename(new_col_name))

    temp_merged=temp.merge(counts,left_on='index', right_on='index', how='left').fillna(0)

    return temp_merged

In [9]:
# temp=count_close_building(crime_data,museums,'museums_count')
# crime_data.loc[:,'museums_count']=temp.loc[:,'museums_count']

# col_name='gas'
# temp=count_close_building(crime_data,gas,col_name)
# crime_data.loc[:,col_name]=temp.loc[:,col_name]

# col_name='metro_bus'
# temp=count_close_building(crime_data,metro_bus,col_name)
# crime_data.loc[:,col_name]=temp.loc[:,col_name]

# col_name='atm'
# temp=count_close_building(crime_data,atm,col_name)
# crime_data.loc[:,col_name]=temp.loc[:,col_name]

# col_name='banks'
# temp=count_close_building(crime_data,banks,col_name)
# crime_data.loc[:,col_name]=temp.loc[:,col_name]

# col_name='grocery'
# temp=count_close_building(crime_data,grocery,col_name)
# crime_data.loc[:,col_name]=temp.loc[:,col_name]

# col_name='metro_stations'
# temp=count_close_building(crime_data,metro_stations,col_name)
# crime_data.loc[:,col_name]=temp.loc[:,col_name]

# col_name='post_office'
# temp=count_close_building(crime_data,post_office,col_name)
# crime_data.loc[:,col_name]=temp.loc[:,col_name]

# col_name='schools'
# temp=count_close_building(crime_data,schools,col_name)
# crime_data.loc[:,col_name]=temp.loc[:,col_name]

# col_name='libraries'
# temp=count_close_building(crime_data,libraries,col_name)
# crime_data.loc[:,col_name]=temp.loc[:,col_name]

# col_name='shuttle_bus'
# temp=count_close_building(crime_data,shuttle_bus,col_name)
# crime_data.loc[:,col_name]=temp.loc[:,col_name]



In [10]:
crime_data

Unnamed: 0,index,neighborhood_cluster,offense_group,census_tract,longitude,end_date,offense_text,shift,district,yblock,...,gas,metro_bus,atm,banks,grocery,metro_stations,post_office,schools,libraries,shuttle_bus
0,0,cluster 38,violent,7404.0,-76.976591,NaT,HOMICIDE,midnight,7.0,131428.00,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,cluster 20,violent,9504.0,-76.993575,2023-06-16 18:53:00,ASSAULT W/DANGEROUS WEAPON,day,4.0,140787.56,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,2,cluster 25,property,8410.0,-76.990893,2023-04-13 13:00:00,THEFT/OTHER,midnight,1.0,136927.00,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,cluster 1,property,3801.0,-77.040824,2023-08-20 22:02:00,THEFT/OTHER,evening,3.0,138822.23,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,cluster 6,property,10700.0,-77.040859,2023-05-20 15:54:00,THEFT/OTHER,evening,2.0,137483.00,...,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25850,25850,cluster 23,property,8802.0,-76.988471,2023-05-13 17:21:00,THEFT/OTHER,day,5.0,137671.00,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25851,25851,cluster 24,property,9000.0,-76.960097,2023-08-09 06:45:00,THEFT F/AUTO,day,5.0,140142.00,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25852,25852,cluster 8,property,4702.0,-77.011543,2023-08-16 09:15:00,THEFT/OTHER,day,1.0,136737.04,...,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
25853,25853,cluster 15,property,1304.0,-77.058081,2023-09-19 18:18:00,THEFT/OTHER,evening,2.0,140772.69,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0


In [55]:
tree=STRtree(crime_data['geometry'])

In [None]:
# need to generate some non-crime events

In [25]:
# nets['net_index']=nets['index']
# nets=nets.drop('index',axis=1)

In [30]:
# crime_data_nets=gpd.sjoin(crime_data,nets,how='left',op='within',rsuffix='nets')

In [31]:
# crime_data_nets=crime_data_nets.drop('index_nets',axis=1)


In [34]:
# crime_data_nets.columns

Index(['index', 'neighborhood_cluster', 'offense_group', 'census_tract',
       'longitude', 'end_date', 'offense_text', 'shift', 'district', 'yblock',
       'ward', 'year', 'offensekey', 'bid', 'sector', 'psa', 'ucr_rank',
       'block_group', 'voting_precinct', 'xblock', 'block', 'start_date',
       'ccn', 'offense', 'anc', 'report_date', 'method', 'location',
       'latitude', 'day_of_week', 'hour', 'geometry', 'museums_count', 'gas',
       'metro_bus', 'atm', 'banks', 'grocery', 'metro_stations', 'post_office',
       'schools', 'libraries', 'shuttle_bus', 'net_index'],
      dtype='object')

In [35]:
crime_data_nets[['ward','day_of_week', 'hour', 'museums_count', 'gas',
       'metro_bus', 'atm', 'banks', 'grocery', 'metro_stations', 'post_office',
       'schools', 'libraries', 'shuttle_bus','index']]

Unnamed: 0,ward,day_of_week,hour,museums_count,gas,metro_bus,atm,banks,grocery,metro_stations,post_office,schools,libraries,shuttle_bus,index
0,8.0,3,16,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,5.0,4,18,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1
2,6.0,2,19,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,1.0,6,21,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3
4,2.0,5,15,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25850,5.0,5,17,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25850
25851,5.0,1,21,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,25851
25852,6.0,2,9,0.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,25852
25853,3.0,1,18,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,25853


In [92]:
# let give a day_of_week and hour, query the dataset to see if it's corresponding to crime
random_location=pointpats.random.poisson(polygon, size=1)
events=tree.query_nearest(Point(random_location),max_distance=0.013,all_matches=True).tolist()

In [94]:
crime_data.iloc[events][['start_date','day_of_week','hour','offense_text']]

Unnamed: 0,start_date,day_of_week,hour,offense_text
2688,2023-01-26 01:27:00,3,1,MOTOR VEHICLE THEFT
9662,2023-03-22 22:25:00,2,22,ROBBERY
3846,2023-02-02 17:16:00,3,17,THEFT F/AUTO
7522,2023-02-02 17:52:00,3,17,MOTOR VEHICLE THEFT


In [68]:
# PCA on the lat and lon to get 1d
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
crime_data_nets['PCA_location']=pca.fit_transform(crime_data_nets[['longitude','latitude']])

In [74]:
temp=crime_data_nets.groupby(['PCA_location','day_of_week','hour'])['index'].count()

In [77]:
np.max(temp.values)

7

In [89]:
crime_data_nets

Unnamed: 0,index,neighborhood_cluster,offense_group,census_tract,longitude,end_date,offense_text,shift,district,yblock,...,atm,banks,grocery,metro_stations,post_office,schools,libraries,shuttle_bus,net_index,PCA_location
0,0,cluster 38,violent,7404.0,-76.976591,NaT,HOMICIDE,midnight,7.0,131428.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,644,-0.058103
1,1,cluster 20,violent,9504.0,-76.993575,2023-06-16 18:53:00,ASSAULT W/DANGEROUS WEAPON,day,4.0,140787.56,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,585,0.006194
2,2,cluster 25,property,8410.0,-76.990893,2023-04-13 13:00:00,THEFT/OTHER,midnight,1.0,136927.00,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,578,-0.016873
3,3,cluster 1,property,3801.0,-77.040824,2023-08-20 22:02:00,THEFT/OTHER,evening,3.0,138822.23,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,231,0.033277
4,4,cluster 6,property,10700.0,-77.040859,2023-05-20 15:54:00,THEFT/OTHER,evening,2.0,137483.00,...,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,228,0.026046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25850,25850,cluster 23,property,8802.0,-76.988471,2023-05-13 17:21:00,THEFT/OTHER,day,5.0,137671.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,607,-0.014775
25851,25851,cluster 24,property,9000.0,-76.960097,2023-08-09 06:45:00,THEFT F/AUTO,day,5.0,140142.00,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,722,-0.024049
25852,25852,cluster 8,property,4702.0,-77.011543,2023-08-16 09:15:00,THEFT/OTHER,day,1.0,136737.04,...,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,455,-0.001408
25853,25853,cluster 15,property,1304.0,-77.058081,2023-09-19 18:18:00,THEFT/OTHER,evening,2.0,140772.69,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,151,0.057628
