In [2]:
import os, sys
import shapefile as shp
import csv
import numpy as np
import math
import pandas as pd
from osgeo import ogr
import time
import urllib.request
import geopandas as gpd
import matplotlib.pyplot as plt
import shapely.speedups
GRID_AREA = 5
UNIT = 'miles'   #'km' or 'miles'
from sklearn import metrics
from xgboost import XGBClassifier
import pickle
# settings for number of hotspots need to be predicted
HOTSPOT_COUNT = 25

### Create Grid

In [4]:
def main(outputGridfn):
    shapefile = "./county_boundary/county_boundary.shp"
    driver = ogr.GetDriverByName("ESRI Shapefile")    #eg: GeoJSON, ESRI
    dataSource = driver.Open(shapefile, 0)
    layer = dataSource.GetLayer()
    xmin,xmax,ymin,ymax = layer.GetExtent()
    if UNIT == 'km':
        gridWidth = gridHeight = math.sqrt(GRID_AREA)*0.009
    if UNIT == 'miles':
        gridWidth = gridHeight = math.sqrt(GRID_AREA)*0.0145

    # get rows
    rows = math.ceil((ymax-ymin)/gridHeight)
    # get columns
    cols = math.ceil((xmax-xmin)/gridWidth)

    # start grid cell envelope
    ringXleftOrigin = xmin
    ringXrightOrigin = xmin + gridWidth
    ringYtopOrigin = ymax
    ringYbottomOrigin = ymax-gridHeight

    # create output file
    outDriver = ogr.GetDriverByName('ESRI Shapefile')
    if os.path.exists(outputGridfn):
        os.remove(outputGridfn)
    outDataSource = outDriver.CreateDataSource(outputGridfn)
    outLayer = outDataSource.CreateLayer(outputGridfn,geom_type=ogr.wkbMultiPolygon)
    featureDefn = outLayer.GetLayerDefn()

    # create grid cells
    countcols = 0
    while countcols < cols:
        countcols += 1
        print('.........')
        # reset envelope for rows
        ringYtop = ringYtopOrigin
        ringYbottom =ringYbottomOrigin
        countrows = 0

        while countrows < rows:
            countrows += 1
            ring = ogr.Geometry(ogr.wkbLinearRing)
            ring.AddPoint(ringXleftOrigin, ringYtop)
            ring.AddPoint(ringXrightOrigin, ringYtop)
            ring.AddPoint(ringXrightOrigin, ringYbottom)
            ring.AddPoint(ringXleftOrigin, ringYbottom)
#             ring.AddPoint(ringXleftOrigin, ringYtop)
            poly = ogr.Geometry(ogr.wkbPolygon)
            poly.AddGeometry(ring)

            # add new geom to layer
            outFeature = ogr.Feature(featureDefn)
            outFeature.SetGeometry(poly)
            outLayer.CreateFeature(outFeature)
            outFeature.Destroy

            # new envelope for next poly
            ringYtop = ringYtop - gridHeight
            ringYbottom = ringYbottom - gridHeight

        # new envelope for next poly
        ringXleftOrigin = ringXleftOrigin + gridWidth
        ringXrightOrigin = ringXrightOrigin + gridWidth

    # Close DataSources
    outDataSource.Destroy()

if __name__ == "__main__":

    #
    # example run : $ python grid.py <full-path><output-shapefile-name>.shp xmin xmax ymin ymax gridHeight gridWidth
    #

    main("./Square_grid/new_grid.shp")
    with open("./county_boundary/county_boundary.prj") as f:
        with open("./Square_grid/new_grid.prj", "w") as f1:
            for line in f:
                f1.write(line)

    f.close()
    f1.close()

.........
.........
.........
.........
.........
.........
.........
.........
.........
.........
.........
.........
.........
.........
.........
.........


### Clipping out the required map from Grid

In [21]:
shapely.speedups.enable()
# File paths
border_fp = "./county_boundary/county_boundary.shp"
grid_fp = "./Square_Grid/new_grid.shp"
# Read files
gridd = gpd.read_file(grid_fp)
denv = gpd.read_file(border_fp)
print("merging started....")
result = gpd.overlay(gridd,denv, how='intersection')
print("merging ended....")

outfp = "./clipped_grid/overlay_analysis.shp"
# Use Shapefile driver
result.to_file(outfp, driver="Shapefile")

with open("./county_boundary/county_boundary.prj") as f:
    with open("./clipped_grid/overlay_analysis.prj", "w") as f1:
        for line in f:
            f1.write(line)

merging started....
merging ended....


### Final Output as CSV of Grid Details

In [5]:
shapefile = "./clipped_grid/overlay_analysis.shp"
driver = ogr.GetDriverByName("ESRI Shapefile")
dataSource = driver.Open(shapefile, 1)
layer = dataSource.GetLayer()
new_field1 = ogr.FieldDefn("AREA", ogr.OFTReal)
new_field2 = ogr.FieldDefn("xmin", ogr.OFTReal)
new_field3 = ogr.FieldDefn("xmax", ogr.OFTReal)
new_field4 = ogr.FieldDefn("ymin", ogr.OFTReal)
new_field5 = ogr.FieldDefn("ymax", ogr.OFTReal)
layer.CreateField(new_field1)
layer.CreateField(new_field2)
layer.CreateField(new_field3)
layer.CreateField(new_field4)
layer.CreateField(new_field5)

for feature in layer:
    geom = feature.GetGeometryRef()
    # print(geom)
    if UNIT == 'km':
        area = (geom.GetArea()/(math.pow( (math.sqrt(GRID_AREA)*0.009), 2)))*GRID_AREA # real area adjusted to scale
    if UNIT == 'miles':
        area = (geom.GetArea()/(math.pow( (math.sqrt(GRID_AREA)*0.0145), 2)))*GRID_AREA # real area adjusted to scale
    xmin, xmax, ymin, ymax = geom.GetEnvelope()
    feature.SetField("AREA", area)
    feature.SetField("xmin", xmin)
    feature.SetField("xmax", xmax)
    feature.SetField("ymin", ymin)
    feature.SetField("ymax", ymax)
    layer.SetFeature(feature)
# dataSource = None
print('done area adding')
shpfile=r'./clipped_grid/overlay_analysis.shp' #sys.argv[1]
csvfile=r'grid_details.csv' #sys.argv[2]

#Open files
csvfile=open(csvfile,'wt')
ds=ogr.Open(shpfile)
lyr=ds.GetLayer()
#Get field names
dfn=lyr.GetLayerDefn()
nfields=dfn.GetFieldCount()
fields=[]
for i in range(nfields):
    fields.append(dfn.GetFieldDefn(i).GetName())
csvwriter = csv.DictWriter(csvfile, fields)
try:csvwriter.writeheader() #python 2.7+
except:csvfile.write(','.join(fields)+'\n')

# Write attributes and kml out to csv
for feat in lyr:
    attributes=feat.items()
    geom=feat.GetGeometryRef()
    csvwriter.writerow(attributes)

#clean up
del csvwriter,lyr,ds
csvfile.close()


done area adding


### Crime_CSV to SHP

In [23]:
out_file = './crime_points/crime_points.shp'

#Set up blank lists for data
c,x,y,=[],[],[]


crime_df1 = pd.read_csv("./denver_cleaned.csv")
crime_df1['case_id']=crime_df1.index
crime_df = crime_df1[['case_id','Longitude','Latitude']]

for i, row in crime_df.iterrows():
    c.append(row[0])
    x.append(float(row[1]))
    y.append(float(row[2]))
# print(x)

#Set up shapefile writer and create empty fields
w = shp.Writer('./crime_points/crime_points',shp.POINT,1)
# w.point
w.autoBalance = 1 #ensures gemoetry and attributes match
w.field('CaseID','N')
w.field('X','F',11,8)
w.field('Y','F',11,8)

#loop through the data and write the shapefile
for j,k in enumerate(x):
    w.point(k,y[j]) #write the geometry
    w.record(c[j],k,y[j]) #write the attributes
    print("...")

with open("./county_boundary/county_boundary.prj") as f:
    with open("./crime_points/crime_points.prj", "w") as f1:
        for line in f:
            f1.write(line)

### Merge Crime points with Clipped Grid

In [6]:
shapely.speedups.enable()
# Read files
final_cols=['FID', 'geometry']
final = gpd.GeoDataFrame.from_file('./clipped_grid/overlay_analysis.shp')
final=final[final_cols]
point = gpd.GeoDataFrame.from_file('./crime_points/crime_points.shp')
# print(final.head(5))
# print(point.head(5))
print("merging started....")
result = gpd.sjoin(final,point, op='intersects',how='right')
print("merging ended....")
# print(result.head(5))
outfp = "./points_grid/result_grid.shp"
# # Use Shapefile driver
result.to_file(outfp, driver="Shapefile")

with open("./county_boundary/county_boundary.prj") as f:
    with open("./points_grid/result_grid.prj", "w") as f1:
        for line in f:
            f1.write(line)

merging started....


  outputs = ufunc(*inputs)


merging ended....


### Final Output

In [27]:
shpfile=r'./points_grid/result_grid.shp' #sys.argv[1]
csvfile=r'test.csv' #sys.argv[2]


#Open files
# FID ==> GRID_ID
# id ==> CASE_ID
csvfile=open(csvfile,'wt')
ds = ogr.Open(shpfile)
lyr = ds.GetLayer()

#Get field names
dfn = lyr.GetLayerDefn()
nfields = dfn.GetFieldCount()
fields=[]
for i in range(nfields):
    fields.append(dfn.GetFieldDefn(i).GetName())
csvwriter = csv.DictWriter(csvfile, fields)
try:csvwriter.writeheader()
except:csvfile.write(','.join(fields)+'\n')

# Write attributes out to csv
for feat in lyr:
    attributes=feat.items()
    geom=feat.GetGeometryRef()
    csvwriter.writerow(attributes)

#clean up
del csvwriter,lyr,ds
csvfile.close()

df = pd.read_csv("test.csv")
df = df.where((pd.notnull(df)), None)
df1=df[['FID','CaseID']]
df1.columns=['GridId','CaseId']

df1.to_csv(r'./output_final.csv',header=True,index=None)

### Time Window Mapping

In [30]:
# time stamping the data with different time window, e.g., weekly, fortnight, monthly, quarterly, half-yearly
def calculate_timewindowOverAllYears(day, month, year, min_year, min_month, time_window):
    '''
    function:
    input:
    output:
    '''
    day = int(day)
    month = int(month)
    year = int(year)
    min_year = int(min_year)
    min_month = int(min_month)

    #time_window: daily
    if time_window == 'day':
        a = date(int(year),int(month),int(day))
        b = date(int(min_year),1,1)
        tw = (a-b).days + 1

    #time_window: weekly
    if time_window == 'week':
        if day<=7:
            tw = 1 + 4*((month-1) + 12*(year-min_year))
        elif day<=14:
            tw = 2 + 4*((month-1) + 12*(year-min_year))
        elif day<=21:
            tw = 3 + 4*((month-1) + 12*(year-min_year))
        else:
            tw = 4 + 4*((month-1) + 12*(year-min_year))

    # time_window: fortnight
    if time_window == 'fortnight':
        if day<=15:
            tw = 1 + 2*((month-1) + 12*(year-min_year))
        else:
            tw = 2 + 2*((month-1) + 12*(year-min_year))

    #time_window: monthly
    if time_window == 'month':
        tw = (month - min_month) + 12*(year - min_year)

    #time_window: quarterly
    if time_window == 'quarter':
        if month<=3:
            tw = 1 + 4*(year - min_year)
        elif month<=6:
            tw = 2 + 4*(year - min_year)
        elif month<=9:
            tw = 3 + 4*(year - min_year)
        else:
            tw = 4 + 4*(year - min_year)

    #time_window: half-yearly
    if time_window == 'half-year':
        if month<=6:
            tw = 1 + 2*(year - min_year)
        else:
            tw = 2 + 2*(year - min_year)

    return int(tw)


s_t=time.time()
cases = pd.read_csv('./denver_cleaned.csv')
cases['CaseId'] = cases.index

time_window = 'month'
min_year = int(min(cases['Year']))
min_month = int(min(cases[cases.Year == min_year].Month))

cases['MonthId'] = cases.apply(lambda row : calculate_timewindowOverAllYears(row['Day'], row['Month'], row['Year'], min_year, min_month, time_window),axis=1)

# reads the gridid mapped cases in case_grid_df dataframe
# conn = pyodbc.connect('DRIVER={' + DB_DRIVER + '};'
#                       'SERVER=' + LOCAL_SERVER + ';'
#                       'DATABASE=' + FEATURE_PREPROCESSING_DB + ';'
#                       'UID=' + LOCAL_SERVER_UID + ';'
#                       'PWD=' + LOCAL_SERVER_PWD + ';'
#                       )
# case_grid_df = pd.read_sql('SELECT * FROM CasesGridTwMapped', con = conn)
case_grid_df=pd.read_csv('./output_final.csv')
caseid_gridid_mapping = case_grid_df[['CaseId', 'GridId']]
casees_complete =pd.merge(cases, caseid_gridid_mapping, on='CaseId')

caseid_monthid_gridid = casees_complete[['CaseId', 'MonthId', 'GridId']]
caseid_monthid_gridid = caseid_monthid_gridid.where((pd.notnull(caseid_monthid_gridid)), None)

caseid_monthid_gridid.to_csv(r'./grid_tw_mapped.csv',header=True,index=None)
e_t = time.time()
exe_time = e_t - s_t
print('execution time: '+str(exe_time))

# create the tw_mapping frame and save that TwMapping table in front_db
tw_mapping = casees_complete[['Year', 'Month', 'MonthId']].drop_duplicates()
# add an extra month for the (max_month+1) used by prediction module
row_monthid_max = tw_mapping.loc[tw_mapping['MonthId'].idxmax()]
max_monthid = int(row_monthid_max[2])
max_month = int(row_monthid_max[1])
max_year = int(row_monthid_max[0])
if max_month == 12:
    future_month = 1
    future_year = max_year + 1
else:
    future_month = max_month + 1
    future_year = max_year
future_monthid = max_monthid + 1

tw_mapping = tw_mapping.append({'Year':future_year, 'Month':future_month, 'MonthId':future_monthid}, ignore_index=True)

tw_mapping.to_csv(r'./front_db_tw.csv',index=None,header=True)

execution time: 9.359074354171753


### Aggregating GridId CaseId and MonthId

In [8]:
# create crime dataframe aggregated over specified time_window, e.g., weekly, fortnight, monthly, quarterly etc.
def aggregate_crime_data_grid_timewindow(crime_points, grids):
    '''
    function:
    input:
    output:
    '''
    crime_points_trimmed = crime_points[['GridId', 'MonthId']]
    X =  crime_points_trimmed.groupby(['GridId','MonthId'])['GridId'].count().to_frame()

    X.columns = ['CrimeCount']
    X = X.reset_index()

    # preparing the time_window aggregated crime data and grid data for joining
    tws = pd.DataFrame(data = X['MonthId'].unique(),columns=['MonthId'])

    grids['key'] = 0
    tws['key'] = 0
    grids = grids.merge(tws, how='outer')

    data = X.set_index(['GridId','MonthId'])
    grids_temp = grids.set_index(['GridId','MonthId'])

    # joining the crime data and grid data to get the grid area in crime data. this area is used for calculating the normalized crime
    EntireFrame = data.join(grids_temp,how='outer')
    EntireFrame = EntireFrame.fillna(0)
    EntireFrame = EntireFrame[['CrimeCount','Area']]
    EntireFrame.reset_index(inplace=True)
    EntireFrame = EntireFrame[EntireFrame['Area']>0]
    EntireFrame['NormCrimeCount'] = EntireFrame['CrimeCount'] * (GRID_AREA /EntireFrame['Area'])
    return EntireFrame


s_t = time.time()
grid_info = pd.read_csv('./grid_details.csv')
grid_info = grid_info.rename(columns={'FID' : 'GridId', 'AREA' : 'Area'})
cases_grid_tw_mapped = pd.read_csv('./grid_tw_mapped.csv')
cases_grid_tw_mapped.replace(to_replace='', value=np.nan)
cases_grid_tw_mapped= cases_grid_tw_mapped[np.isfinite(cases_grid_tw_mapped['GridId'])]


# aggregates the cases based on spece and time
entire_frame = aggregate_crime_data_grid_timewindow(cases_grid_tw_mapped, grid_info)

s_t=time.time()

entire_frame.to_csv(r'./grid_tw_crime_aggregated.csv',index=None,header=True)
print('cases space-time aggregation done and stored in feature_preprocessing_db GridTwCrimeAggregated csv')
e_t = time.time()
exe_time = e_t - s_t
print('execution time: '+str(exe_time))


cases space-time aggregation done and stored in feature_preprocessing_db GridTwCrimeAggregated csv
execution time: 0.03390836715698242


### Building the model

#### Feature generators

In [9]:
# monthly mean number of crimes
def MeanNumberOfCrimes(frame):
    """
    function:
    input:
    output:
    """
    return frame[['id', 'Count']].groupby(['id']).mean()


# monthly normalized number of crimes
def NormalizedNumberOfCrimes(frame):
    """
    function:
    input:
    output:
    """
    df = frame[['id', 'Count']].groupby(['id']).sum()
    return df / df['Count'].max()


# rank based on monthly normalized number of crimes
def RankGrids(frame):
    """
    function:
    input:
    output:
    """
    df = NormalizedNumberOfCrimes(frame)
    df['rank'] = df.rank(ascending=False)
    return df['rank']


# hotspot frequency of the grid
def HotspotFrequency(frame, numberOfHotspots):
    """
    function:
    input:
    output:
    """
    df = frame.groupby('id')['HotSpot'].sum().to_frame()
    return df


# hotspot history
def HotspotHistory(frame, numberOfHotspots):
    """
    function:
    input:
    output:
    """
    df = HotspotFrequency(frame, numberOfHotspots)
    df['HotSpot'] = np.where(df['HotSpot'] > 0, 1, 0)
    return df


# total crimes in the neighbors in last 1 month
def CrimesNeighborSum_1(row, frame):
    """
    function:
    input:
    output:
    """
    # getting the neighbor ids for a grid
    neighboringIds = row['NEIGHBORS'].split(",")
    # getting the rows corresponding to the neighbors in 1 month prior
    neighborRows = frame[
        (frame['twOverAllYears'] == row['twOverAllYears'] - 1) & (frame['id'].isin(neighboringIds))]
    return sum(neighborRows['Count'])


# total crimes in the neighbors in last 2 month
def CrimesNeighborSum_2(row, frame):
    """
    function:
    input:
    output:
    """
    # getting the neighbor ids for a grid
    neighboringIds = row['NEIGHBORS'].split(",")
    # getting the rows corresponding to the neighbors in 1 month prior
    neighborRows1 = frame[
        (frame['twOverAllYears'] == row['twOverAllYears'] - 1) & (frame['id'].isin(neighboringIds))]
    # getting the rows corresponding to the neighbors in 2 month prior
    neighborRows2 = frame[
        (frame['twOverAllYears'] == row['twOverAllYears'] - 2) & (frame['id'].isin(neighboringIds))]
    return sum(neighborRows1['Count']) + sum(neighborRows2['Count'])


# total crimes in the neighbors in last 3 month
def CrimesNeighborSum_3(row, frame):
    """
    function:
    input:
    output:
    """
    # getting the neighbor ids for a grid
    neighboringIds = row['NEIGHBORS'].split(",")
    # getting the rows corresponding to the neighbors in 1 month prior
    neighborRows1 = frame[
        (frame['twOverAllYears'] == row['twOverAllYears'] - 1) & (frame['id'].isin(neighboringIds))]
    # getting the rows corresponding to the neighbors in 2 month prior
    neighborRows2 = frame[
        (frame['twOverAllYears'] == row['twOverAllYears'] - 2) & (frame['id'].isin(neighboringIds))]
    # getting the rows corresponding to the neighbors in 3 month prior
    neighborRows3 = frame[
        (frame['twOverAllYears'] == row['twOverAllYears'] - 3) & (frame['id'].isin(neighboringIds))]
    return sum(neighborRows1['Count']) + sum(neighborRows2['Count']) + sum(neighborRows3['Count'])


# feature engineering for the crime dataset
def feature_engineering(EntireFrame, number_of_hotspots):
    """
    function:
    input:
    output:
    """
    # getting the hotspot crime dataframe
    HotSpotFrame = AddHotSpotColumnToData(EntireFrame, number_of_hotspots)

    # getting the max and min time_window in the dataset
    twMax = int(EntireFrame['twOverAllYears'].max())
    twMin = int(EntireFrame['twOverAllYears'].min()) + 3

    # seting the month window for calculating the statistical features for the crime data points
    windows = [1, 2, 3]
    EntireFeatureFrame = pd.DataFrame()
    for tw in range(twMin, twMax + 1):
        features = pd.DataFrame(data=EntireFrame['id'].unique(), columns=['id'])
        for i in windows:
            frame = EntireFrame[
                (EntireFrame['twOverAllYears'] >= tw - i) & (EntireFrame['twOverAllYears'] < tw)]
            hsframe = HotSpotFrame[
                (HotSpotFrame['twOverAllYears'] >= tw - i) & (HotSpotFrame['twOverAllYears'] < tw)]

            features = features.merge(MeanNumberOfCrimes(frame).reset_index(), left_on='id', right_on='id')
            features = features.rename(columns={'Count': 'mean' + str(i)})
            features = features.merge(NormalizedNumberOfCrimes(frame).reset_index(), left_on='id', right_on='id')
            features = features.rename(columns={'Count': 'norm' + str(i)})
            features = features.merge(RankGrids(frame).reset_index(), left_on='id', right_on='id')
            features = features.rename(columns={'rank': 'rank' + str(i)})
            features = features.merge(HotspotFrequency(hsframe, number_of_hotspots).reset_index(), left_on='id',
                                      right_on='id')
            features = features.rename(columns={'HotSpot': 'HotSpot' + str(i)})
            features = features.merge(HotspotHistory(hsframe, number_of_hotspots).reset_index(), left_on='id',
                                      right_on='id')
            features = features.rename(columns={'HotSpot': 'HotSpotFreq' + str(i)})

        features['twOverAllYears'] = tw
        EntireFeatureFrame = EntireFeatureFrame.append(features)

    return EntireFeatureFrame


# add hotspot column to dataframe
def AddHotSpotColumnToData(frame, numberOfHotspots):
    """
    function:
    input:
    output:
    """
    frame['HotSpot'] = 0
    # calculating the hotspot threshold value for specific month
    thresholdValues = frame.groupby('twOverAllYears').apply(
        lambda grp: grp.sort_values(by='Normalized Crimes', ascending=False).iloc[numberOfHotspots - 1][
            'Normalized Crimes'])
    frame['thresh'] = thresholdValues[frame['twOverAllYears']].values
    # determing whether a grid is hotspot or not based on the monthly threshold value
    frame['HotSpot'] = np.where(frame['Normalized Crimes'] >= frame['thresh'], 1, 0)
    return frame[['id', 'twOverAllYears', 'HotSpot']]


# bucketing the crime data based on crime count
def AddBucketColumnToData(frame, time_window):
    """
    function:
    input:
    output:
    """
    # time_window: daily
    if time_window == 'day':
        # buckets for 2 level classification for daily
        frame['CrimeBucket_1'] = pd.cut(frame['Count'], [-1, 0, 10000], labels=['zero', 'non-zero'])
        frame['CrimeBucket_2'] = pd.cut(frame['Count'], [0, 1, 2, 3, 4, 7, 10000], labels=['1', '2', '3', '4', '5', '6'])

    # time_window: weekly
    if time_window == 'week':
        # buckets for 2 level classification for weekly
        frame['CrimeBucket_1'] = pd.cut(frame['Count'], [-1, 0, 10000], labels=['zero', 'non-zero'])
        frame['CrimeBucket_2'] = pd.cut(frame['Count'], [0, 3, 5, 9, 13, 33, 10000], labels=['1', '2', '3', '4', '5', '6'])

    # time_window: monthlyly
    if time_window == 'month':
        # buckets for 2 level classification for monthly
        frame['CrimeBucket_1'] = pd.cut(frame['Count'], [-1, 0, 10000], labels=['zero', 'non-zero'])
        # frame['CrimeBucket_2'] = pd.cut(frame['Count'], [0, 7, 15, 28, 41, 103, 10000], labels=['1', '2', '3', '4', '5', '6'])
        frame['CrimeBucket_2'] = pd.cut(frame['Count'], [0, 3, 11, 24, 54, 141, 10000], labels=['1', '2', '3', '4', '5', '6'])

    return frame[['id', 'twOverAllYears', 'CrimeBucket_1', 'CrimeBucket_2']]


def CrimeInfoPreprocessing(grid_month_crime_info):
    grid_month_crime_info.columns = ['id', 'twOverAllYears', 'Count', 'area', 'Normalized Crimes']
    grid_month_crime_info['id'] = pd.to_numeric(grid_month_crime_info['id'])
    grid_month_crime_info['twOverAllYears'] = pd.to_numeric(grid_month_crime_info['twOverAllYears'])
    grid_month_crime_info['Count'] = pd.to_numeric(grid_month_crime_info['Count'])
    grid_month_crime_info['area'] = pd.to_numeric(grid_month_crime_info['area'])
    grid_month_crime_info['Normalized Crimes'] = pd.to_numeric(grid_month_crime_info['Normalized Crimes'])

    return grid_month_crime_info


#### Model Training helpers

In [11]:

#############################################################################
# First level (zero, non-zero) crime volume bucket prediction model building
#############################################################################

def TrainFirstLevelModel(feature_frame):
    feature_frame['Volume_Label'] = feature_frame['CrimeBucket_1']
    feature_frame = feature_frame.drop(['CrimeBucket_1', 'CrimeBucket_2'], axis=1)

    # put a split timewindow in the total dataset for future train/test split
    total_tw = feature_frame.twOverAllYears.unique()
    split_tw = math.floor(len(total_tw) * 0.8)

    train = feature_frame.loc[feature_frame['twOverAllYears'] <= split_tw]
    test = feature_frame.loc[feature_frame['twOverAllYears'] > split_tw]

    X_train = train.loc[:, train.columns != 'Volume_Label']
    Y_train = train['Volume_Label']
    X_test = test.loc[:, test.columns != 'Volume_Label']
    Y_test = test['Volume_Label']

    # linearizing the data
    Y_train = Y_train.ravel()

    # fit a XGBoost model to the data
    model = XGBClassifier()
    model.fit(X_train, Y_train)
    # print(model)

    # Test the model on test split
    expected = Y_test
    predicted = model.predict(X_test)

    confusion_matrix = metrics.confusion_matrix(expected, predicted)
    classification_report = metrics.classification_report(expected, predicted)
    accuracy = metrics.accuracy_score(expected, predicted)
    return {"model": model,
            "confusion_matrix": confusion_matrix,
            "classification_report": classification_report,
            "accuracy": accuracy}


#############################################################
# Second level crime volume bucket prediction model building
#############################################################


def TrainSecondLevelModel(feature_frame):
    feature_frame['Volume_Label'] = feature_frame['CrimeBucket_2']
    feature_frame = feature_frame.drop(['CrimeBucket_1', 'CrimeBucket_2'], axis=1)
    feature_frame = feature_frame[feature_frame['Volume_Label'].notnull()]

    # Splitting the featurised crime data into training and test
    # put a split timewindow in the total dataset for future train/test split
    total_tw = feature_frame.twOverAllYears.unique()
    split_tw = math.floor(len(total_tw) * 0.8)

    train = feature_frame.loc[feature_frame['twOverAllYears'] <= split_tw]
    test = feature_frame.loc[feature_frame['twOverAllYears'] > split_tw]

    X_train = train.loc[:, train.columns != 'Volume_Label']
    Y_train = train['Volume_Label']
    X_test = test.loc[:, test.columns != 'Volume_Label']
    Y_test = test['Volume_Label']

    # linearizing the data
    Y_train = Y_train.ravel()

    # fit a XGBoost model to the data
    model = XGBClassifier()
    model.fit(X_train, Y_train)

    # Test the model on test split
    expected = Y_test
    predicted = model.predict(X_test)

    confusion_matrix = metrics.confusion_matrix(expected, predicted)
    classification_report = metrics.classification_report(expected, predicted)
    accuracy = metrics.accuracy_score(expected, predicted)
    return {"model": model,
            "confusion_matrix": confusion_matrix,
            "classification_report": classification_report,
            "accuracy": accuracy}

#### Building the model...

In [12]:
print("running crime volume prediction model building service ...")

s_t = time.time()

grid_month_crime_info = pd.read_csv('./grid_tw_crime_aggregated.csv')
# conn.close()

grid_month_crime_info = CrimeInfoPreprocessing(grid_month_crime_info)

time_window = 'month'

# doing the feature engineering and getting the fully featured crime data
EntireFeatureFrame = feature_engineering(grid_month_crime_info, HOTSPOT_COUNT)
print(EntireFeatureFrame.head(5))
# add bucket label to the crime data
crime_bucket_frame = AddBucketColumnToData(grid_month_crime_info, time_window)

# getting the hotspot crime dataframe
hotspot_frame = AddHotSpotColumnToData(grid_month_crime_info, HOTSPOT_COUNT)

# joining the feature_frame with crime_bucket_frame
feature_frame = EntireFeatureFrame.copy()
feature_frame = feature_frame.merge(crime_bucket_frame, left_on=['id', 'twOverAllYears'], right_on=['id', 'twOverAllYears'])

# train first level model
first_level_model = TrainFirstLevelModel(feature_frame)

# save the model to disk
first_level_model_save_file = './model/volume_prediction_model_first_level.p'
pickle.dump(first_level_model["model"], open(first_level_model_save_file, 'wb'))

print('first level model saved to:', first_level_model_save_file)
print('confusion matrix')
print(first_level_model["confusion_matrix"])
print('\n')
print('classification report')
print(first_level_model["classification_report"])
print('accuracy: ' + str(first_level_model["accuracy"]))

# train first level model
second_level_model = TrainSecondLevelModel(feature_frame)

# save the model to disk
second_level_model_save_file = './model/volume_prediction_model_second_level.p'
pickle.dump(second_level_model["model"], open(second_level_model_save_file, 'wb'))

print('second level model saved to:', second_level_model_save_file)
print('confusion matrix')
print(second_level_model["confusion_matrix"])
print('\n')
print('classification report')
print(second_level_model["classification_report"])
print('accuracy: ' + str(second_level_model["accuracy"]))

print('model building done')
e_t = time.time()
exe_time = e_t - s_t
print('execution time: '+str(exe_time))

running crime volume prediction model building service ...
     id  mean1     norm1  rank1  HotSpot1  HotSpotFreq1  mean2     norm2  \
0   7.0    2.0  0.005495   52.5         0             0    3.5  0.009915   
1   8.0    5.0  0.027473   40.5         1             1   12.0  0.067989   
2   9.0    3.0  0.008242   50.5         0             0    4.0  0.011331   
3  13.0    4.0  0.010989   48.0         0             0    2.0  0.005666   
4  14.0   18.0  0.049451   32.5         0             0   18.5  0.052408   

   rank2  HotSpot2  HotSpotFreq2      mean3     norm3  rank3  HotSpot3  \
0   50.5         1             1   3.333333  0.009174   49.0         1   
1   29.0         2             1  14.333333  0.078899   28.0         3   
2   47.5         0             0   4.666667  0.012844   47.0         0   
3   52.5         0             0   2.666667  0.007339   52.0         0   
4   34.0         0             0  16.666667  0.045872   36.0         0   

   HotSpotFreq3  twOverAllYears  
0    

  return self._values.ravel(order=order)


first level model saved to: ./model/volume_prediction_model_first_level.p
confusion matrix
[[1015    6]
 [   5  234]]


classification report
              precision    recall  f1-score   support

    non-zero       1.00      0.99      0.99      1021
        zero       0.97      0.98      0.98       239

    accuracy                           0.99      1260
   macro avg       0.99      0.99      0.99      1260
weighted avg       0.99      0.99      0.99      1260

accuracy: 0.9912698412698413
second level model saved to: ./model/volume_prediction_model_second_level.p
confusion matrix
[[ 23  20   2   1   0   0]
 [ 16  58  24   2   1   0]
 [  0  15 105  29   3   0]
 [  0   1  53 142  47   1]
 [  0   0   0  14 292   9]
 [  0   0   0   0  11 152]]


classification report
              precision    recall  f1-score   support

           1       0.59      0.50      0.54        46
           2       0.62      0.57      0.59       101
           3       0.57      0.69      0.62       152
     

In [17]:
print("running crime volume prediction service ...")

s_t = time.time()

grid_month_crime_info = pd.read_csv('./grid_tw_crime_aggregated.csv')

twmapping_info = pd.read_csv('./front_db_tw.csv')

grid_month_crime_info = CrimeInfoPreprocessing(grid_month_crime_info)
max_month = grid_month_crime_info['twOverAllYears'].max()
pred_month = max_month + 1

# Create rows for next month 'id' and 'area' columns remain same
append_rows = grid_month_crime_info[grid_month_crime_info['twOverAllYears'] == max_month]
append_rows['twOverAllYears'].values[:] = pred_month
append_rows['Count'].values[:] = 0
append_rows['Normalized Crimes'].values[:] = 0

grid_month_crime_info = grid_month_crime_info.append(append_rows)

time_window = 'month'

# doing the feature engineering and getting the fully featured crime data
EntireFeatureFrame = feature_engineering(grid_month_crime_info, HOTSPOT_COUNT+1)

# add bucket label to the crime data
crime_bucket_frame = AddBucketColumnToData(grid_month_crime_info, time_window)

# getting the hotspot crime dataframe
hotspot_frame = AddHotSpotColumnToData(grid_month_crime_info, HOTSPOT_COUNT)

# joining the feature_frame with crime_bucket_frame
feature_frame = EntireFeatureFrame.copy()
feature_frame = feature_frame.merge(crime_bucket_frame, left_on=['id', 'twOverAllYears'], right_on=['id', 'twOverAllYears'])

# print(feature_frame.describe())
future_feature_frame = feature_frame[feature_frame['twOverAllYears'] == pred_month]
future_feature_frame = future_feature_frame.drop(['CrimeBucket_1', 'CrimeBucket_2'], axis=1)

# load the first level model from disk
first_level_model_save_file = "./model/volume_prediction_model_first_level.p"
first_level_model = pickle.load(open(first_level_model_save_file, 'rb'))

# generate first level predictions
first_level_predictions = first_level_model.predict(future_feature_frame)
first_level_predictions_proba = first_level_model.predict_proba(future_feature_frame)

# load the second level model from disk
second_level_model_save_file = "./model/volume_prediction_model_second_level.p"
second_level_model = pickle.load(open(second_level_model_save_file, 'rb'))

# generate second level predictions
second_level_predictions = second_level_model.predict(future_feature_frame)
second_level_predictions_proba = second_level_model.predict_proba(future_feature_frame)
print("2nd level predictions")
print(second_level_predictions)
# print(second_level_predictions_proba)
# merge predictions
first_level_predictions = [0 if x == 'zero' else 1 for x in first_level_predictions]
second_level_predictions = [int(x) for x in second_level_predictions]
preds = [first_level_predictions, second_level_predictions]
predictions = [x * y for x, y in zip(*preds)]
flipped_first_level_predictions = [(x - 1) * -1 for x in first_level_predictions]
future_feature_frame['VolumeBucketPrediction'] = predictions
future_feature_frame['VolumeBucketPredictionProba'] = np.amax(first_level_predictions_proba,axis=-1) * flipped_first_level_predictions + np.amax(second_level_predictions_proba, axis=-1) * first_level_predictions

mapping_year, mapping_month, _ = tuple(list(twmapping_info[twmapping_info['MonthId'] == pred_month].values[0]))

future_feature_frame['Month'] = mapping_month
future_feature_frame['Year'] = mapping_year


# generate hotspot labels
hotspot_thresh = np.sort(np.array(list(future_feature_frame['VolumeBucketPrediction'] + future_feature_frame['VolumeBucketPredictionProba'])))[-HOTSPOT_COUNT]
future_feature_frame['HotspotPrediction'] = ((future_feature_frame['VolumeBucketPrediction'] + future_feature_frame['VolumeBucketPredictionProba']) > hotspot_thresh)
future_feature_frame['HotspotPrediction'] = future_feature_frame['HotspotPrediction'].astype(int)

print("predictions generated for month", mapping_month, mapping_year, "...")

# drop useless colums
future_feature_frame = future_feature_frame[['id', 'Month', 'Year', 'VolumeBucketPrediction', 'VolumeBucketPredictionProba', 'HotspotPrediction']]
future_feature_frame.columns = ['GridId', 'Month', 'Year', 'VolumeBucketPrediction', 'VolumeBucketPredictionProba', 'HotspotPrediction']

future_feature_frame = future_feature_frame[future_feature_frame.HotspotPrediction == 1]
future_feature_frame.to_csv(r'./predictions_final.csv',index=None,header=True)
print('prediction generation done')
e_t = time.time()
exe_time = e_t - s_t
print('execution time: '+str(exe_time))

running crime volume prediction service ...
2nd level predictions
['2' '3' '3' '2' '2' '3' '5' '5' '5' '3' '3' '3' '1' '3' '6' '6' '6' '5'
 '3' '3' '6' '6' '6' '5' '3' '5' '6' '5' '5' '1' '2' '5' '5' '5' '5' '5'
 '5' '5' '5' '5' '2' '3' '5' '6' '5' '5' '5' '5' '5' '5' '3' '4' '2' '3'
 '5' '3' '4' '5' '1' '1' '4' '4' '2' '4' '4' '1' '1' '1' '2' '3' '1' '1'
 '3' '1' '3' '3' '1' '1' '1' '1' '1' '1' '1' '1']
predictions generated for month 7 2018 ...
prediction generation done
execution time: 5.2932984828948975


### Google TTS Alert system prototype

In [1]:
# Import the required module for text
# to speech conversion
from gtts import gTTS
import playsound

# This module is imported so that we can
# play the converted audio
import os

lat = '123'
lon = '342'
# The text that you want to convert to audio
mytext = 'Attention All units tighten security at latitude'+lat+'and longitude'+lon

# Language in which you want to convert
language = 'en'

# Passing the text and language to the engine,
# here we have marked slow=False. Which tells
# the module that the converted audio should
# have a high speed
myobj = gTTS(text=mytext, lang=language, slow=False)

# Saving the converted audio in a mp3 file named
# welcome
myobj.save("warning.mp3")
playsound.playsound('./warning.mp3', True)