In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn import decomposition
from scipy import stats
from sklearn import cluster
from vincenty import vincenty
from h3 import h3
from folium import Map, Marker, GeoJson
from folium.plugins import MarkerCluster
import folium
import branca.colormap as cm
from geojson.feature import *
import json
from IPython.display import Image, display
import calendar
from tqdm import tqdm
from datetime import datetime

In [2]:
# Load the complaint CSV (an NYPD year-to-date export, judging by the file name)
# into a DataFrame.
# NOTE(review): TRAIN_PATH is an absolute path on one user's machine; the notebook
# will not run elsewhere until it is pointed at a local copy of the file.
TRAIN_PATH = 'C:/Users/umang/Downloads/NYPD_Complaint_Data_Current__Year_To_Date_.csv'
df = pd.read_csv(TRAIN_PATH)

In [7]:
def spatid(df, time_bin, spatial_resolution, quantile=0.997):
    """Flag (date, hexagon, time-bin) cells with unusually many incidents.

    Each record is assigned to an H3 hexagon and to a time-of-day bin.
    Records are counted per (date, hexagon, bin); a cell is reported as an
    alarm when its count reaches the given quantile of all counts observed
    for the same hexagon.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``CMPLNT_FR_DT``, ``CMPLNT_FR_TM``
        (``HH:MM:SS`` strings), ``Latitude`` and ``Longitude``. The frame is
        not modified.
    time_bin : int or float
        Width of a time-of-day bin in minutes; must be positive. Bins are
        numbered from 1.
    spatial_resolution : int
        Resolution passed to ``h3.geo_to_h3``.
    quantile : float, default 0.997
        Per-hexagon quantile of the incident counts used as alarm threshold.

    Returns
    -------
    pandas.DataFrame
        Columns ``CMPLNT_FR_DT``, ``hex_id``, ``bin``, ``Incident_count`` and
        ``Threshold``, ordered by descending ``Incident_count``, restricted to
        rows where ``Incident_count >= Threshold``.

    Raises
    ------
    ValueError
        If ``time_bin`` is not positive.

    Notes
    -----
    Because the comparison is ``>=``, a hexagon whose counts are all equal
    (including a hexagon observed in a single cell) has every one of its rows
    flagged.
    """
    if time_bin <= 0:
        raise ValueError("time_bin must be a positive number of minutes")

    # Drop only rows missing a field this function actually reads, and work
    # on a copy so the caller's frame gains no helper columns.
    required = ['CMPLNT_FR_DT', 'CMPLNT_FR_TM', 'Latitude', 'Longitude']
    events = df.dropna(subset=required).copy()

    # Hexagon id for every record from its latitude/longitude.
    events['hex_id'] = [
        h3.geo_to_h3(lat, lng, spatial_resolution)
        for lat, lng in zip(events['Latitude'], events['Longitude'])
    ]

    # Minutes since midnight, then the 1-based time-of-day bin.
    clock = pd.to_datetime(events['CMPLNT_FR_TM'], format='%H:%M:%S')
    minutes = clock.dt.hour * 60 + clock.dt.minute
    events['bin'] = (minutes // time_bin + 1).astype(int)

    # Incident count per (date, hexagon, bin), largest first.
    counts = (
        events.groupby(['CMPLNT_FR_DT', 'hex_id', 'bin'])
        .size()
        .sort_values(ascending=False)
        .reset_index(name='Incident_count')
    )

    # Per-hexagon alarm threshold: the requested quantile of that hexagon's
    # counts (0.997 by default, i.e. the 99.7th percentile).
    thresholds = (
        counts.groupby('hex_id')['Incident_count']
        .quantile(quantile)
        .rename('Threshold')
        .reset_index()
    )

    # Attach each hexagon's threshold to its cells and keep the alarms.
    joined = counts.merge(thresholds, how='left', on='hex_id')
    return joined[joined['Incident_count'] >= joined['Threshold']]

In [8]:
# Run the alarm detection with 60-minute time bins and H3 resolution 7.
df_test = spatid(df, time_bin=60, spatial_resolution=7)

In [9]:
# Total number of incidents across the alarm rows in df_test.
df_test['Incident_count'].sum()

10229

In [13]:
# Preview the first 20 alarm rows.
df_test.head(20)

Unnamed: 0,CMPLNT_FR_DT,hex_id,bin,Incident_count,Threshold
0,01/01/2020,872a10088ffffff,1,40,5.0
1,06/01/2020,872a1072cffffff,2,35,11.797
2,06/01/2020,872a100d6ffffff,22,33,7.0
3,06/01/2020,872a100d2ffffff,22,32,10.0
4,06/01/2020,872a1072cffffff,1,32,11.797
5,05/30/2020,872a1072cffffff,23,32,11.797
6,06/01/2020,872a100d2ffffff,23,31,10.0
7,05/29/2020,872a100daffffff,22,31,6.518
8,06/01/2020,872a100d6ffffff,21,30,7.0
9,05/29/2020,872a100daffffff,21,30,6.518
