In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read in SF crime data
crime = pd.read_csv('~/Desktop/project1030/SF/SF.csv')
print(crime.shape)

(878049, 9)


In [3]:
# Create date variable from datetime
crime['datetime'] = pd.to_datetime(crime['Dates'])
dates_only = crime['datetime'].map(pd.Timestamp.date)
print(dates_only[0:5])
crime['date'] = dates_only

0    2015-05-13
1    2015-05-13
2    2015-05-13
3    2015-05-13
4    2015-05-13
Name: datetime, dtype: object


In [4]:
# Extract years, months, and times
years_only = crime['datetime'].map(lambda t: t.year)
crime['year'] = years_only
print(crime['year'].head())

time_only = crime['datetime'].map(lambda t: t.time)
crime['time'] = time_only
print(crime['time'].head())

months_only = crime['datetime'].map(lambda t: t.month)
crime['month'] = months_only
print(crime['month'].head())

0    2015
1    2015
2    2015
3    2015
4    2015
Name: year, dtype: int64
0    <built-in method time of Timestamp object at 0...
1    <built-in method time of Timestamp object at 0...
2    <built-in method time of Timestamp object at 0...
3    <built-in method time of Timestamp object at 0...
4    <built-in method time of Timestamp object at 0...
Name: time, dtype: object
0    5
1    5
2    5
3    5
4    5
Name: month, dtype: int64


In [5]:
# To simplify initially, keep only crime reports from 2003 to 2005
#crime_new = crime[crime['year'].isin([2003, 2004, 2005])]

# Confusing, but I kept this because I didn't want to change future references to "crime_new"
crime_new = crime

In [6]:
# Sort by date
crime_new.sort_values('datetime', inplace=True)

In [7]:
print(crime_new['datetime'].head(1))
print(crime_new['datetime'].tail(1))

878048   2003-01-06 00:01:00
Name: datetime, dtype: datetime64[ns]
0   2015-05-13 23:53:00
Name: datetime, dtype: datetime64[ns]


In [8]:
import geopandas as gpd
import os

# Read in csv file containing SF census tracts and corresponding MULTIPOLYGON objects
data_path = '/Users/Sam/Desktop/project1030/SF'
census_tr = pd.read_csv(os.path.join(data_path, 'Census_2010_Tracts.csv'))

In [9]:
from geopandas import GeoDataFrame
import shapely.wkt

geometry = census_tr['the_geom'].map(shapely.wkt.loads)
print(census_tr.head())

crs = {'init': 'epsg:4326'}

# Restrict to mainland SF 
census_tr = census_tr[census_tr['INTPTLON10'] > -122.6]

sf_census_tracts = GeoDataFrame(census_tr, crs=crs, geometry=geometry)

   STATEFP10  TRACTCE10                                           the_geom  \
0          6      16500  MULTIPOLYGON (((-122.446471 37.775802, -122.44...   
1          6      16400  MULTIPOLYGON (((-122.44033999999999 37.7765799...   
2          6      16300  MULTIPOLYGON (((-122.429152 37.778006999999995...   
3          6      16100  MULTIPOLYGON (((-122.428909 37.778039, -122.42...   
4          6      16000  MULTIPOLYGON (((-122.420425 37.780583, -122.42...   

   COUNTYFP10     GEOID10  NAME10        NAMELSAD10 MTFCC10 FUNCSTAT10  \
0          75  6075016500   165.0  Census Tract 165   G5020          S   
1          75  6075016400   164.0  Census Tract 164   G5020          S   
2          75  6075016300   163.0  Census Tract 163   G5020          S   
3          75  6075016100   161.0  Census Tract 161   G5020          S   
4          75  6075016000   160.0  Census Tract 160   G5020          S   

   ALAND10  AWATER10  INTPTLAT10  INTPTLON10  
0   370459         0   37.774196 -122.4

In [10]:
from shapely.geometry import Point

# First convert all crime occurrences to Point objects
crime_locs = [Point(xy) for xy in zip(crime_new['X'], crime_new['Y'])]
crime_locs_df = GeoDataFrame(crime_locs, crs=crs, geometry=crime_locs)

# Add locations to original dataframe
crime_new['locs'] = crime_locs

# Execute spatial join of crimes with census tract boundaries 
crime_census = gpd.sjoin(crime_locs_df, sf_census_tracts, how="inner", op='within')

In [11]:
# Extract X and Y coordinates
x_coords = crime_census['geometry'].apply(lambda p: p.x)
y_coords = crime_census['geometry'].apply(lambda p: p.y)

# Create new truncated df containing only Point object and census tract name
crime_census_trunc = crime_census[['NAME10', 'the_geom']]
crime_census_trunc['X'] = x_coords
crime_census_trunc['Y'] = y_coords

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [12]:
print(crime_census_trunc.shape)
print(crime_new.shape)

# Remove duplicates from spatially merged dataset (crime_census_trunc)
crime_census_trunc.drop_duplicates(inplace=True)
print(crime_census_trunc.shape)

(877826, 4)
(878049, 15)
(34222, 4)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [13]:
# Now, merge on census tract information to crime_new dataset, joining on the coordinate variables
crime_merged = pd.merge(crime_new, crime_census_trunc, how='inner', on=['X', 'Y'])
crime_merged.rename(columns= {'NAME10': 'census_tr', 'the_geom': 'census_tr_poly'}, inplace=True)

In [14]:
# Function to calculate difference between two datetimes: datetime_2 (later) and datetime_init (earlier)
def calc_time_delta(datetime_2, datetime_init):
    delta = datetime_2 - datetime_init
    print(type(delta))
    mins = delta.astype('timedelta64[m]')
    return(mins / np.timedelta64(1, 'm'))

In [15]:
# Crime dataset with unneccesary features removed where we will engineer new features
crime_engin_temp = crime_merged[['Category', 'DayOfWeek', 'datetime', 'date', 'year', \
                            'time', 'month', 'census_tr']]

# Add eviction-specific columns (all zeros for crime observations)
crime_engin_temp['non_payment'] = 0
crime_engin_temp['illegal_use'] = 0
crime_engin_temp['eviction'] = 0

print(crime_engin_temp.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


                 Category DayOfWeek            datetime        date  year  \
0  FORGERY/COUNTERFEITING    Monday 2003-01-06 00:01:00  2003-01-06  2003   
1           LARCENY/THEFT    Monday 2003-01-06 00:01:00  2003-01-06  2003   
2            NON-CRIMINAL    Sunday 2005-07-17 12:00:00  2005-07-17  2005   
3               VANDALISM    Monday 2003-01-06 00:01:00  2003-01-06  2003   
4           LARCENY/THEFT  Saturday 2003-01-25 12:00:00  2003-01-25  2003   

                                                time  month  census_tr  \
0  <built-in method time of Timestamp object at 0...      1     9809.0   
1  <built-in method time of Timestamp object at 0...      1      311.0   
2  <built-in method time of Timestamp object at 0...      7      311.0   
3  <built-in method time of Timestamp object at 0...      1      615.0   
4  <built-in method time of Timestamp object at 0...      1      615.0   

   non_payment  illegal_use  eviction  
0            0            0         0  
1           

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [16]:
# Read in processed eviction data
evict = pd.read_csv('~/Desktop/project1030/SF/evict_processed.csv')
print(evict.head())

evict.drop(columns=['datetime'], inplace=True)

# Convert date to datetime
def date_to_datetime(date):
    #return str(date)
    return pd.to_datetime(str(date))

#date_to_datetime(evict['date'].values[0])
evict['datetime'] = evict['date'].apply(lambda x: date_to_datetime(x))

         date  census_tr  non_payment  illegal_use  eviction    datetime  \
0  2003-01-02     158.02            0            0         1  2003-01-02   
1  2003-01-02     228.01            0            0         1  2003-01-02   
2  2003-01-02     302.02            0            0         1  2003-01-02   
3  2003-01-02     232.00            0            0         1  2003-01-02   
4  2003-01-03     204.01            0            0         1  2003-01-03   

                                                time  Category  year  month  \
0  <built-in method time of Timestamp object at 0...  EVICTION  2003      1   
1  <built-in method time of Timestamp object at 0...  EVICTION  2003      1   
2  <built-in method time of Timestamp object at 0...  EVICTION  2003      1   
3  <built-in method time of Timestamp object at 0...  EVICTION  2003      1   
4  <built-in method time of Timestamp object at 0...  EVICTION  2003      1   

   DayOfWeek  
0        NaN  
1        NaN  
2        NaN  
3       

In [17]:
crime_engin = crime_engin_temp.append(evict).sort_values(by='datetime')

In [18]:
# Two functions that simply define property and violent crimes based on Category variable
def is_property_crime(crime_type):
    return (crime_type in ['BURGLARY', 'LARCENY/THEFT', 'VEHICLE THEFT', 'RECOVERED VEHICLE', \
                      'ARSON', 'VANDALISM', 'STOLEN PROPERTY', 'EMBEZZLEMENT'])

def is_violent_crime(crime_type):
    return (crime_type in ['ASSAULT', 'SEX OFFENSES FORCIBLE', 'KIDNAPPING'])

In [19]:
import warnings
warnings.filterwarnings('ignore')

# Create categorical variables for crimes by mapping above functions over Category column
crime_engin['property_crime'] = crime_engin['Category'].apply(is_property_crime).astype(int)
crime_engin['violent_crime'] = crime_engin['Category'].apply(is_violent_crime).astype(int)

crime_engin['robbery'] = (crime_engin['Category'] == 'ROBBERY').astype(int)

crime_engin['other_crime'] = crime_engin['property_crime'] + crime_engin['violent_crime'] + crime_engin['robbery'] + 1
crime_engin['other_crime'].replace(2, 0, inplace=True)

crime_engin['any_crime'] = crime_engin['property_crime'] + crime_engin['violent_crime'] + \
                            crime_engin['robbery'] + crime_engin['other_crime']

# The total for any row should be 1 (has to fall into one of the categories!)
assert(crime_engin['any_crime'].unique() == 1)

In [20]:
# Create time index = minutes elapsed since beginning of data
crime_engin['min_elapsed'] = np.apply_along_axis(func1d = calc_time_delta, axis = 0, \
                             arr=crime_engin['datetime'].values, datetime_init=crime_engin['datetime'].values[0])

<class 'numpy.ndarray'>


In [21]:
# Combine slice and sum functions
def return_sum_over_slice(df, start_time, time_window, census_tract):
    
    if (start_time-time_window) >= 0:
        sliced_df = df[(df['min_elapsed'] < start_time) & (df['min_elapsed'] >= (start_time-time_window))]
        
    else:
        sliced_df = df[(df['min_elapsed'] < start_time)]
        
    # Restrict only to crimes in that census tract
    result = sliced_df[sliced_df['census_tr'] == census_tract]
    
    if (result.empty==False):
        return (result['property_crime'].sum(), result['violent_crime'].sum(), result['robbery'].sum(), \
                result['other_crime'].sum(), result['any_crime'].sum(), result['non_payment'].sum(), \
                result['illegal_use'].sum(), result['eviction'].sum())
    else:
        return (0, 0, 0, 0, 0, 0, 0, 0)
    
# Function to convert counts to binary (only using for future crimes as of now)
def convert_to_binary(val):
    return min(val, 1)

In [114]:
from datetime import datetime
from dateutil.parser import parse

# To calculate features, the user input date must be two weeks after the start date of the 
# crime data (Jan 6 2003) and an hour before the end of the data (May 13 2015)

# datetime_start: 2003-01-06 00:01:00
# datetime_end: 2015-05-13 23:53:00

min_date = datetime.strptime('2003-01-20 00:01:00', '%Y-%m-%d %H:%M:%S')
max_date = datetime.strptime('2015-05-30 22:52:00', '%Y-%m-%d %H:%M:%S') 

Idea: Given a datetime within our range (> 2 weeks after data start and reasonably far before the end of the dataset), calculate prior crime/eviction info for each census tract, and use previously trained classifier to predict probabilities of a crime occuring in the next hour for each tract. 

In [115]:
from datetime import datetime, date, time, timedelta

# Parameters for start date
year = int(input("Enter a year from 2006 to 2015: "))
month = int(input("Enter a month from 1 to 12: "))
day = int(input("Enter a day from 1 to 31: "))
hour = int(input("Enter an hour number from 0 to 23: "))
minute = int(input("Enter a minute number from 0 to 59: "))

# Number of periods in the future to forecast
forecast_periods = int(input("How many periods in the future would you like to forecast?"))

# Arbitary datetime to start off with. 
d = date(year, month, day)
t = time(hour, minute)

start_date = datetime.combine(d, t)

if (start_date < min_date) or (start_date + timedelta(hours=forecast_periods) > max_date):
    print("Date or number of periods out of bounds, please try again.")

print('The start date is: ', start_date)

Enter a year from 2006 to 2015: 2006
Enter a month from 1 to 12: 1
Enter a day from 1 to 31: 1
Enter an hour number from 0 to 23: 0
Enter a minute number from 0 to 59: 0
How many periods in the future would you like to forecast?72
The start date is:  2006-01-01 00:00:00


In [116]:
# Create dataframe
sim = pd.DataFrame(columns=['date'])
sim['date'] = pd.date_range(start=start_date, periods=forecast_periods, freq='H')
print(sim)

                  date
0  2006-01-01 00:00:00
1  2006-01-01 01:00:00
2  2006-01-01 02:00:00
3  2006-01-01 03:00:00
4  2006-01-01 04:00:00
5  2006-01-01 05:00:00
6  2006-01-01 06:00:00
7  2006-01-01 07:00:00
8  2006-01-01 08:00:00
9  2006-01-01 09:00:00
10 2006-01-01 10:00:00
11 2006-01-01 11:00:00
12 2006-01-01 12:00:00
13 2006-01-01 13:00:00
14 2006-01-01 14:00:00
15 2006-01-01 15:00:00
16 2006-01-01 16:00:00
17 2006-01-01 17:00:00
18 2006-01-01 18:00:00
19 2006-01-01 19:00:00
20 2006-01-01 20:00:00
21 2006-01-01 21:00:00
22 2006-01-01 22:00:00
23 2006-01-01 23:00:00
24 2006-01-02 00:00:00
25 2006-01-02 01:00:00
26 2006-01-02 02:00:00
27 2006-01-02 03:00:00
28 2006-01-02 04:00:00
29 2006-01-02 05:00:00
..                 ...
42 2006-01-02 18:00:00
43 2006-01-02 19:00:00
44 2006-01-02 20:00:00
45 2006-01-02 21:00:00
46 2006-01-02 22:00:00
47 2006-01-02 23:00:00
48 2006-01-03 00:00:00
49 2006-01-03 01:00:00
50 2006-01-03 02:00:00
51 2006-01-03 03:00:00
52 2006-01-03 04:00:00
53 2006-01-

In [117]:
# Extract day of week, month, year, and hour from each date
sim['month'] = sim['date'].map(lambda t: t.month)
sim['year'] = sim['date'].map(lambda t: t.year)
sim['dow'] = sim['date'].map(lambda t: t.weekday())
sim['hour'] = sim['date'].map(lambda t: t.hour)

In [118]:
# One hot encode year, month, and hour
sim['dow_1'] = (sim['dow'] == 0).astype(int)
sim['dow_2'] = (sim['dow'] == 1).astype(int)
sim['dow_3'] = (sim['dow'] == 2).astype(int)
sim['dow_4'] = (sim['dow'] == 3).astype(int)
sim['dow_5'] = (sim['dow'] == 4).astype(int)
sim['dow_6'] = (sim['dow'] == 5).astype(int)
sim['dow_7'] = (sim['dow'] == 6).astype(int)

# Month
sim['month_1'] = (sim['month'] == 1).astype(int)
sim['month_2'] = (sim['month'] == 2).astype(int)
sim['month_3'] = (sim['month'] == 3).astype(int)
sim['month_4'] = (sim['month'] == 4).astype(int)
sim['month_5'] = (sim['month'] == 5).astype(int)
sim['month_6'] = (sim['month'] == 6).astype(int)
sim['month_7'] = (sim['month'] == 7).astype(int)
sim['month_8'] = (sim['month'] == 8).astype(int)
sim['month_9'] = (sim['month'] == 9).astype(int)
sim['month_10'] = (sim['month'] == 10).astype(int)
sim['month_11'] = (sim['month'] == 11).astype(int)
sim['month_12'] = (sim['month'] == 12).astype(int)

# Year
sim['year_2003'] = (sim['year'] == 2003).astype(int)
sim['year_2004'] = (sim['year'] == 2004).astype(int)
sim['year_2005'] = (sim['year'] == 2005).astype(int)
sim['year_2006'] = (sim['year'] == 2006).astype(int)
sim['year_2007'] = (sim['year'] == 2007).astype(int)
sim['year_2008'] = (sim['year'] == 2008).astype(int) 
sim['year_2009'] = (sim['year'] == 2009).astype(int)
sim['year_2010'] = (sim['year'] == 2010).astype(int)
sim['year_2011'] = (sim['year'] == 2011).astype(int)
sim['year_2012'] = (sim['year'] == 2012).astype(int)
sim['year_2013'] = (sim['year'] == 2013).astype(int)
sim['year_2014'] = (sim['year'] == 2014).astype(int)
sim['year_2015'] = (sim['year'] == 2015).astype(int)

sim['hour_1'] = (sim['hour'] == 1).astype(int)
sim['hour_2'] = (sim['hour'] == 2).astype(int)
sim['hour_3'] = (sim['hour'] == 3).astype(int)
sim['hour_4'] = (sim['hour'] == 4).astype(int) 
sim['hour_5'] = (sim['hour'] == 5).astype(int) 
sim['hour_6'] = (sim['hour'] == 6).astype(int) 
sim['hour_7'] = (sim['hour'] == 7).astype(int) 
sim['hour_8'] = (sim['hour'] == 8).astype(int) 
sim['hour_9'] = (sim['hour'] == 9).astype(int) 
sim['hour_10'] = (sim['hour'] == 10).astype(int) 
sim['hour_11'] = (sim['hour'] == 11).astype(int)
sim['hour_12'] = (sim['hour'] == 12).astype(int) 
sim['hour_13'] = (sim['hour'] == 13).astype(int) 
sim['hour_14'] = (sim['hour'] == 14).astype(int) 
sim['hour_15'] = (sim['hour'] == 15).astype(int) 
sim['hour_16'] = (sim['hour'] == 16).astype(int)
sim['hour_17'] = (sim['hour'] == 17).astype(int) 
sim['hour_18'] = (sim['hour'] == 18).astype(int) 
sim['hour_19'] = (sim['hour'] == 19).astype(int) 
sim['hour_20'] = (sim['hour'] == 20).astype(int) 
sim['hour_21'] = (sim['hour'] == 21).astype(int) 
sim['hour_22'] = (sim['hour'] == 22).astype(int) 
sim['hour_23'] = (sim['hour'] == 23).astype(int) 
sim['hour_24'] = (sim['hour'] == 24).astype(int) 

In [119]:
# Create artificial key for m:m merge
sim['key'] = 1

In [120]:
# Now create df of census tracts and artifical key
census_tracts_list = crime_engin.census_tr.unique()
print(len(census_tracts_list))
print(census_tracts_list[0:10])

census_df = pd.DataFrame(columns=['census_tract', 'key'])
census_df['census_tract'] = census_tracts_list
census_df['key'] = 1

195
[ 232.    158.02  228.01  302.02  204.01  477.02  156.    311.    218.
  122.02]


In [121]:
# Merge two dfs to get Cartesian product
merged = pd.merge(census_df, sim, on='key')
merged.drop(columns=['key'], inplace=True)
print(merged.head())
print(merged.shape)

   census_tract                date  month  year  dow  hour  dow_1  dow_2  \
0         232.0 2006-01-01 00:00:00      1  2006    6     0      0      0   
1         232.0 2006-01-01 01:00:00      1  2006    6     1      0      0   
2         232.0 2006-01-01 02:00:00      1  2006    6     2      0      0   
3         232.0 2006-01-01 03:00:00      1  2006    6     3      0      0   
4         232.0 2006-01-01 04:00:00      1  2006    6     4      0      0   

   dow_3  dow_4   ...     hour_15  hour_16  hour_17  hour_18  hour_19  \
0      0      0   ...           0        0        0        0        0   
1      0      0   ...           0        0        0        0        0   
2      0      0   ...           0        0        0        0        0   
3      0      0   ...           0        0        0        0        0   
4      0      0   ...           0        0        0        0        0   

   hour_20  hour_21  hour_22  hour_23  hour_24  
0        0        0        0        0        0  


In [122]:
# Add minutes elapsed since beginning of data
begin_time = crime_engin['datetime'].values[0]

merged['min_elapsed'] = np.apply_along_axis(func1d = calc_time_delta, axis = 0, \
                             arr=merged['date'].values, datetime_init=crime_engin['datetime'].values[0])
print(merged[['census_tract', 'min_elapsed', 'hour']].head())

<class 'numpy.ndarray'>
   census_tract  min_elapsed  hour
0         232.0    1576800.0     0
1         232.0    1576860.0     1
2         232.0    1576920.0     2
3         232.0    1576980.0     3
4         232.0    1577040.0     4


In [123]:
merged['start_and_census'] = list(zip(merged['min_elapsed'], merged['census_tract']))

In [124]:
import time
start = time.time()

df_1_hour = pd.DataFrame([x for x in map(lambda x: return_sum_over_slice(crime_engin, start_time=x[0], \
             time_window=60.0, census_tract = x[1])[0:5], merged['start_and_census'])])

df_2_hours = pd.DataFrame([x for x in map(lambda x: return_sum_over_slice(crime_engin, start_time=x[0], \
             time_window=120.0, census_tract = x[1])[0:5], merged['start_and_census'])])

df_3_hours = pd.DataFrame([x for x in map(lambda x: return_sum_over_slice(crime_engin, start_time=x[0], \
             time_window=180.0, census_tract = x[1])[0:5], merged['start_and_census'])])

df_4_hours = pd.DataFrame([x for x in map(lambda x: return_sum_over_slice(crime_engin, start_time=x[0], \
             time_window=240.0, census_tract = x[1])[0:5], merged['start_and_census'])])

df_5_hours = pd.DataFrame([x for x in map(lambda x: return_sum_over_slice(crime_engin, start_time=x[0], \
             time_window=300.0, census_tract = x[1])[0:5], merged['start_and_census'])])

df_1_day = pd.DataFrame([x for x in map(lambda x: return_sum_over_slice(crime_engin, start_time=x[0], \
             time_window=1440.0, census_tract = x[1]), merged['start_and_census'])])

df_2_days = pd.DataFrame([x for x in map(lambda x: return_sum_over_slice(crime_engin, start_time=x[0], \
             time_window=2880.0, census_tract = x[1]), merged['start_and_census'])])

df_3_days = pd.DataFrame([x for x in map(lambda x: return_sum_over_slice(crime_engin, start_time=x[0], \
             time_window=4320.0, census_tract = x[1]), merged['start_and_census'])])

df_7_days = pd.DataFrame([x for x in map(lambda x: return_sum_over_slice(crime_engin, start_time=x[0], \
             time_window=10080.0, census_tract = x[1]), merged['start_and_census'])])

df_14_days = pd.DataFrame([x for x in map(lambda x: return_sum_over_slice(crime_engin, start_time=x[0], \
             time_window=20160.0, census_tract = x[1]), merged['start_and_census'])])

# Give names to columns
df_1_hour.columns = ['property_last_1_hour', 'violent_last_1_hour', 'robbery_last_1_hour', 'other_last_1_hour', 'any_last_1_hour']
df_2_hours.columns = ['property_last_2_hours', 'violent_last_2_hours', 'robbery_last_2_hours', 'other_last_2_hours', 'any_last_2_hours']
df_3_hours.columns = ['property_last_3_hours', 'violent_last_3_hours', 'robbery_last_3_hours', 'other_last_3_hours', 'any_last_3_hours']
df_4_hours.columns = ['property_last_4_hours', 'violent_last_4_hours', 'robbery_last_4_hours', 'other_last_4_hours', 'any_last_4_hours']
df_5_hours.columns = ['property_last_5_hours', 'violent_last_5_hours', 'robbery_last_5_hours', 'other_last_5_hours', 'any_last_5_hours']
df_1_day.columns = ['property_last_1_day', 'violent_last_1_day', 'robbery_last_1_day', 'other_last_1_day', 'any_last_1_day', 'nonpay_last_1_day', 'illegal_last_1_day', 'evict_last_1_day']
df_2_days.columns = ['property_last_2_days', 'violent_last_2_days', 'robbery_last_2_days', 'other_last_2_days', 'any_last_2_days', 'nonpay_last_2_days', 'illegal_last_2_days', 'evict_last_2_days']
df_3_days.columns = ['property_last_3_days', 'violent_last_3_days', 'robbery_last_3_days', 'other_last_3_days', 'any_last_3_days', 'nonpay_last_3_days', 'illegal_last_3_days', 'evict_last_3_days']
df_7_days.columns = ['property_last_7_days', 'violent_last_7_days', 'robbery_last_7_days', 'other_last_7_days', 'any_last_7_days', 'nonpay_last_7_days', 'illegal_last_7_days', 'evict_last_7_days']
df_14_days.columns = ['property_last_14_days', 'violent_last_14_days', 'robbery_last_14_days', 'other_last_14_days', 'any_last_14_days', 'nonpay_last_14_days', 'illegal_last_14_days', 'evict_last_14_days']

end = time.time()
print("Time to run:", end-start, "seconds")

Time to run: 654.5396540164948 seconds


In [125]:
print(df_1_hour.shape)

(14040, 5)


In [126]:
# Future crimes
merged['future_property_1_hour'] = [convert_to_binary(x) for x in map(lambda x: return_sum_over_slice(df=crime_engin, \
                                        start_time=(x[0]+60.0), time_window=59.99, \
                                        census_tract = x[1])[0], merged['start_and_census'])]

merged['future_violent_1_hour'] = [convert_to_binary(x) for x in map(lambda x: return_sum_over_slice(df=crime_engin, \
                                        start_time=(x[0]+60.0), time_window=59.99, \
                                        census_tract = x[1])[1], merged['start_and_census'])]

merged['future_robbery_1_hour'] = [convert_to_binary(x) for x in map(lambda x: return_sum_over_slice(df=crime_engin, \
                                        start_time=(x[0]+60.0), time_window=59.99, \
                                        census_tract = x[1])[2], merged['start_and_census'])]

merged['future_other_1_hour'] = [convert_to_binary(x) for x in map(lambda x: return_sum_over_slice(df=crime_engin, \
                                        start_time=(x[0]+60.0), time_window=59.99, census_tract = x[1])[3], merged['start_and_census'])]

merged['future_any_1_hour'] = [convert_to_binary(x) for x in map(lambda x: return_sum_over_slice(df=crime_engin, \
                                        start_time=(x[0]+60.0), time_window=59.99, \
                                        census_tract = x[1])[4], merged['start_and_census'])]

In [127]:
# Merge on new features 
merged.reset_index(inplace=True, drop=True)
sim_concat = pd.concat([merged, df_1_hour, df_2_hours, df_3_hours, df_4_hours, df_5_hours, \
                          df_1_day, df_2_days, df_3_days, df_7_days, df_14_days], axis=1)

In [128]:
print(sim_concat.shape)
print(list(sim_concat))

(14040, 134)
['census_tract', 'date', 'month', 'year', 'dow', 'hour', 'dow_1', 'dow_2', 'dow_3', 'dow_4', 'dow_5', 'dow_6', 'dow_7', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'year_2003', 'year_2004', 'year_2005', 'year_2006', 'year_2007', 'year_2008', 'year_2009', 'year_2010', 'year_2011', 'year_2012', 'year_2013', 'year_2014', 'year_2015', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'hour_24', 'min_elapsed', 'start_and_census', 'future_property_1_hour', 'future_violent_1_hour', 'future_robbery_1_hour', 'future_other_1_hour', 'future_any_1_hour', 'property_last_1_hour', 'violent_last_1_hour', 'robbery_last_1_hour', 'other_last_1_hour', 'any_last_1_hour', 'property_last_2_hours', 'violent_last_2_ho

In [129]:
census_one_hot = pd.get_dummies(sim_concat['census_tract'], prefix='ct')
sim_2 = pd.concat([sim_concat, census_one_hot], axis=1)
print(sim_2.shape)

(14040, 329)


In [130]:
# Run this at the end to restrict to only variables that will be fed to network
sim_final = sim_2.drop(['min_elapsed', 'date', 'month', 'year', 'dow', 'hour', 'start_and_census'], axis=1)
print(sim_final.shape)
print(list(sim_final))

(14040, 322)
['census_tract', 'dow_1', 'dow_2', 'dow_3', 'dow_4', 'dow_5', 'dow_6', 'dow_7', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'year_2003', 'year_2004', 'year_2005', 'year_2006', 'year_2007', 'year_2008', 'year_2009', 'year_2010', 'year_2011', 'year_2012', 'year_2013', 'year_2014', 'year_2015', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'hour_24', 'future_property_1_hour', 'future_violent_1_hour', 'future_robbery_1_hour', 'future_other_1_hour', 'future_any_1_hour', 'property_last_1_hour', 'violent_last_1_hour', 'robbery_last_1_hour', 'other_last_1_hour', 'any_last_1_hour', 'property_last_2_hours', 'violent_last_2_hours', 'robbery_last_2_hours', 'other_last_2_hours', 'any_last_2_hours', 'pr

In [131]:
# Stack train/test data (remember we have future data and need to remove!)
df_1 = pd.read_csv('~/Desktop/project1030/SF/training data/training data.csv')
df_2 = pd.read_csv('~/Desktop/project1030/SF/training data/training data_50_100.csv')
df_3 = pd.read_csv('~/Desktop/project1030/SF/training data/training data_100_150.csv')
df_4 = pd.read_csv('~/Desktop/project1030/SF/training data/training data_150_200.csv')

crime_pred = (df_1.append(df_2, ignore_index=True).append(df_3, ignore_index=True).append(df_4, ignore_index=True))
print(crime_pred.shape)

(193159, 321)


In [132]:
# Read in training data (remember to remove future data we're not trying to predict!)
crime_pred.drop(['future_other_1_hour', 'future_property_1_hour', \
                 'future_violent_1_hour', 'future_robbery_1_hour',], axis=1, inplace=True)

Y = np.array(crime_pred['future_any_1_hour'])
X = crime_pred.drop(['future_any_1_hour'], axis=1).as_matrix()

In [133]:
# Fit gradient boosting classifier on training data
from sklearn.ensemble import GradientBoostingClassifier

grad_clf = GradientBoostingClassifier()
grad_clf.fit(X, Y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [134]:
pred_probs = pd.DataFrame(columns=['census_tract', 'predicted_prob_crime_1_hr'])
tracts = sim_final['census_tract']
pred_probs['census_tract'] = tracts

In [135]:
# Drop census tract from sim_final: now data is in format that classifier has been trained on
sim_final.drop(columns=['census_tract'], inplace=True)

# Try predicting general (any) crimes first
sim_final.drop(['future_other_1_hour', 'future_property_1_hour', \
                 'future_violent_1_hour', 'future_robbery_1_hour',], axis=1, inplace=True)

Y_sim = np.array(sim_final['future_any_1_hour'])
X_sim = sim_final.drop(['future_any_1_hour'], axis=1).as_matrix()

In [136]:
# Predict probabilities for each census tract
pred_probs['predicted_prob_crime_1_hr'] = pd.Series(grad_clf.predict_proba(X_sim)[:, 1])

# Draw random uniform from [0, 1] for simulation
pred_probs['rand_unif'] = np.random.uniform(size=(195*forecast_periods))

# Simulate crimes based on output probs
pred_probs['sim_crime'] = (pred_probs['rand_unif'] <= pred_probs['predicted_prob_crime_1_hr']).astype(int)
pred_probs['actual_crime'] = Y_sim

In [145]:
print(pred_probs.head(25))
print(pred_probs.shape)

pred_probs.to_csv('~/Desktop/project1030/SF/predicted_probs_72_hrs.csv')

    census_tract  predicted_prob_crime_1_hr  rand_unif  sim_crime  \
0          232.0                   0.107517   0.038855          1   
1          232.0                   0.107517   0.160850          0   
2          232.0                   0.107517   0.979303          0   
3          232.0                   0.107517   0.187824          0   
4          232.0                   0.107517   0.648799          0   
5          232.0                   0.107517   0.862449          0   
6          232.0                   0.107517   0.687698          0   
7          232.0                   0.107517   0.432777          0   
8          232.0                   0.165833   0.490224          0   
9          232.0                   0.143261   0.644683          0   
10         232.0                   0.132317   0.066694          1   
11         232.0                   0.158021   0.401664          0   
12         232.0                   0.140339   0.808819          0   
13         232.0                  

In [148]:
pred_probs_1_hr = pred_probs.groupby('census_tract').first()
pred_probs_1_hr.to_csv('~/Desktop/project1030/SF/predicted_probs_1_hr.csv')

In [138]:
# See how often we got it right (accuracy)
print(np.equal(pred_probs['sim_crime'].values, pred_probs['actual_crime'].values).astype(int).mean())

0.911894586895


In [139]:
grouped_pred_probs = pred_probs.groupby('census_tract').sum()
grouped_pred_probs['census_tract'] = grouped_pred_probs.index
print(grouped_pred_probs.head(25))

              predicted_prob_crime_1_hr  rand_unif  sim_crime  actual_crime  \
census_tract                                                                  
101.00                         8.596483  33.936317         10             1   
102.00                         3.054320  36.634810          4             1   
103.00                         2.805899  33.537632          4             0   
104.00                         3.016532  34.575684          2             0   
105.00                        10.079442  37.790199          8             3   
106.00                         8.938286  36.702734         11             1   
107.00                         4.619296  35.004563          3             4   
108.00                         2.699205  39.333484          1             0   
109.00                         3.870238  39.478119          5             0   
110.00                         3.160576  40.573331          2             1   
111.00                         7.141903  34.723267  

In [140]:
sim_output = grouped_pred_probs[['census_tract', 'sim_crime', 'actual_crime']]
sim_output['diff'] = abs(sim_output['sim_crime'] - sim_output['actual_crime'])
print(sim_output.head(20))

              census_tract  sim_crime  actual_crime  diff
census_tract                                             
101.00              101.00         10             1     9
102.00              102.00          4             1     3
103.00              103.00          4             0     4
104.00              104.00          2             0     2
105.00              105.00          8             3     5
106.00              106.00         11             1    10
107.00              107.00          3             4     1
108.00              108.00          1             0     1
109.00              109.00          5             0     5
110.00              110.00          2             1     1
111.00              111.00          5             0     5
112.00              112.00          2             1     1
113.00              113.00          3             1     2
117.00              117.00         28             5    23
118.00              118.00          2             0     2
119.01        

In [141]:
print('Average difference between # of predicted and actual crimes:')
print(sim_output['diff'].values.mean())

Average difference between # of predicted and actual crimes:
4.26153846154


In [142]:
# Merge on census tract boundaries for plotting
census_tr_for_merge = census_tr[['NAME10', 'the_geom']].rename(columns={'NAME10': 'census_tract'})

sim_output_w_tr = pd.merge(sim_output, census_tr_for_merge, on='census_tract')
print(sim_output_w_tr.head())
print(sim_output_w_tr.shape)

   census_tract  sim_crime  actual_crime  diff  \
0         101.0         10             1     9   
1         102.0          4             1     3   
2         103.0          4             0     4   
3         104.0          2             0     2   
4         105.0          8             3     5   

                                            the_geom  
0  MULTIPOLYGON (((-122.421076 37.812889, -122.42...  
1  MULTIPOLYGON (((-122.418445 37.80458, -122.418...  
2  MULTIPOLYGON (((-122.418445 37.80458, -122.418...  
3  MULTIPOLYGON (((-122.402421 37.799382, -122.40...  
4  MULTIPOLYGON (((-122.40068099999999 37.796777,...  
(195, 5)


In [143]:
# Output for plotting 
sim_output_w_tr.to_csv('~/Desktop/project1030/SF/simulations/sim_72hr.csv', index=False)