In [61]:
# Imports 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Display full column widths to read schema descriptions
pd.options.display.max_colwidth = 200 

from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

In [2]:
# Read in data and convert to pandas dataframes 
df = pd.read_csv('sfpd-dispatch/sfpd_dispatch_data_subset.csv')
schema_df = pd.read_csv('sfpd-dispatch/sfpd_dispatch_schema.csv')
# Preview first 5 samples 
df.head()

Unnamed: 0,call_number,unit_id,incident_number,call_type,call_date,watch_date,received_timestamp,entry_timestamp,dispatch_timestamp,response_timestamp,...,number_of_alarms,unit_type,unit_sequence_in_call_dispatch,fire_prevention_district,supervisor_district,neighborhood_district,location,row_id,latitude,longitude
0,180243072,84,18010216,Medical Incident,2018-01-24,2018-01-24,2018-01-24 17:36:16.000000 UTC,2018-01-24 17:38:21.000000 UTC,2018-01-24 17:39:41.000000 UTC,2018-01-24 17:39:45.000000 UTC,...,1,MEDIC,1,7,1,,"(37.77444199483868, -122.5046792231959)",180243072-84,37.774442,-122.504679
1,180240538,61,18010011,Medical Incident,2018-01-24,2018-01-23,2018-01-24 07:05:05.000000 UTC,2018-01-24 07:05:05.000000 UTC,2018-01-24 07:05:31.000000 UTC,2018-01-24 07:05:45.000000 UTC,...,1,MEDIC,1,2,6,,"(37.774094856688166, -122.42000143696421)",180240538-61,37.774095,-122.420001
2,180240176,E22,18009959,Medical Incident,2018-01-24,2018-01-23,2018-01-24 02:04:21.000000 UTC,2018-01-24 02:05:37.000000 UTC,2018-01-24 02:06:04.000000 UTC,2018-01-24 02:07:26.000000 UTC,...,1,ENGINE,1,8,7,,"(37.75521795168784, -122.47554039050351)",180240176-E22,37.755218,-122.47554
3,180243588,E03,18010271,Alarms,2018-01-24,2018-01-24,2018-01-24 20:04:15.000000 UTC,2018-01-24 20:05:12.000000 UTC,2018-01-24 20:05:24.000000 UTC,2018-01-24 20:05:36.000000 UTC,...,1,ENGINE,1,4,2,,"(37.79031930341935, -122.4231629067995)",180243588-E03,37.790319,-122.423163
4,180243590,B03,18010272,Alarms,2018-01-24,2018-01-24,2018-01-24 20:03:08.000000 UTC,2018-01-24 20:05:36.000000 UTC,2018-01-24 20:05:57.000000 UTC,2018-01-24 20:06:56.000000 UTC,...,1,CHIEF,3,3,6,,"(37.77732776352611, -122.39308855968541)",180243590-B03,37.777328,-122.393089


In [11]:
# Make a copy of the dataframe (remove neighborhood district column while we're at it)
dfc = df.drop(['neighborhood_district'], axis = 1) 
time_columns = ['received_timestamp', 'entry_timestamp', 'dispatch_timestamp', 'response_timestamp', 'on_scene_timestamp', 
'transport_timestamp', 'hospital_timestamp', 'available_timestamp'] 
# Here we apply a lambda expression to each pandas column to remove the 'UTC', and then we use the to_datetime function in 
# pandas to convert the types to python Timestamps instead of strings. This allows us to perform new operations. 
for col in time_columns:
    dfc[col] = pd.to_datetime(dfc[col].astype(str).apply(lambda x: x[:-3]), format='%Y-%m-%d %H:%M:%S.%f')
dfc.head()

Unnamed: 0,call_number,unit_id,incident_number,call_type,call_date,watch_date,received_timestamp,entry_timestamp,dispatch_timestamp,response_timestamp,...,call_type_group,number_of_alarms,unit_type,unit_sequence_in_call_dispatch,fire_prevention_district,supervisor_district,location,row_id,latitude,longitude
0,180243072,84,18010216,Medical Incident,2018-01-24,2018-01-24,2018-01-24 17:36:16,2018-01-24 17:38:21,2018-01-24 17:39:41,2018-01-24 17:39:45,...,Non Life-threatening,1,MEDIC,1,7,1,"(37.77444199483868, -122.5046792231959)",180243072-84,37.774442,-122.504679
1,180240538,61,18010011,Medical Incident,2018-01-24,2018-01-23,2018-01-24 07:05:05,2018-01-24 07:05:05,2018-01-24 07:05:31,2018-01-24 07:05:45,...,Non Life-threatening,1,MEDIC,1,2,6,"(37.774094856688166, -122.42000143696421)",180240538-61,37.774095,-122.420001
2,180240176,E22,18009959,Medical Incident,2018-01-24,2018-01-23,2018-01-24 02:04:21,2018-01-24 02:05:37,2018-01-24 02:06:04,2018-01-24 02:07:26,...,Potentially Life-Threatening,1,ENGINE,1,8,7,"(37.75521795168784, -122.47554039050351)",180240176-E22,37.755218,-122.47554
3,180243588,E03,18010271,Alarms,2018-01-24,2018-01-24,2018-01-24 20:04:15,2018-01-24 20:05:12,2018-01-24 20:05:24,2018-01-24 20:05:36,...,Alarm,1,ENGINE,1,4,2,"(37.79031930341935, -122.4231629067995)",180243588-E03,37.790319,-122.423163
4,180243590,B03,18010272,Alarms,2018-01-24,2018-01-24,2018-01-24 20:03:08,2018-01-24 20:05:36,2018-01-24 20:05:57,2018-01-24 20:06:56,...,Alarm,1,CHIEF,3,3,6,"(37.77732776352611, -122.39308855968541)",180243590-B03,37.777328,-122.393089


In [53]:
df2 = dfc[['call_type', 'received_timestamp', 'city', 'zipcode_of_incident']]

In [54]:
type(dfc['received_timestamp'][0])

pandas._libs.tslib.Timestamp

In [55]:
# df2['zipcode_of_incident'] = df2['zipcode_of_incident'].astype(str)
df2['hour'] = dfc['received_timestamp'].apply(lambda x: x.hour)
df2['day'] = dfc['received_timestamp'].apply(lambda x: x.dayofweek)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [56]:
def assign_half(x): 
    if x < 12: 
        return 0 # night or early morning 
    return 1 

df2['day_half'] = df2['hour'].apply(assign_half)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [57]:
type(df2['zipcode_of_incident'][0])

numpy.int64

In [58]:
df2

Unnamed: 0,call_type,received_timestamp,city,zipcode_of_incident,hour,day,day_half
0,Medical Incident,2018-01-24 17:36:16,San Francisco,94121,17,2,1
1,Medical Incident,2018-01-24 07:05:05,San Francisco,94103,7,2,0
2,Medical Incident,2018-01-24 02:04:21,San Francisco,94122,2,2,0
3,Alarms,2018-01-24 20:04:15,San Francisco,94109,20,2,1
4,Alarms,2018-01-24 20:03:08,San Francisco,94107,20,2,1
5,Alarms,2018-01-24 21:18:56,San Francisco,94110,21,2,1
6,Structure Fire,2018-01-24 12:24:27,San Francisco,94102,12,2,1
7,Traffic Collision,2018-01-24 19:51:29,San Francisco,94103,19,2,1
8,Alarms,2018-01-24 22:26:22,San Francisco,94133,22,2,1
9,Medical Incident,2018-01-24 10:35:00,San Francisco,94134,10,2,0


In [62]:
label = LabelEncoder()
df2['zipcode_of_incident'] = label.fit_transform(df2['zipcode_of_incident'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [63]:
df2

Unnamed: 0,call_type,received_timestamp,city,zipcode_of_incident,hour,day,day_half
0,Medical Incident,2018-01-24 17:36:16,San Francisco,15,17,2,1
1,Medical Incident,2018-01-24 07:05:05,San Francisco,1,7,2,0
2,Medical Incident,2018-01-24 02:04:21,San Francisco,16,2,2,0
3,Alarms,2018-01-24 20:04:15,San Francisco,6,20,2,1
4,Alarms,2018-01-24 20:03:08,San Francisco,4,20,2,1
5,Alarms,2018-01-24 21:18:56,San Francisco,7,21,2,1
6,Structure Fire,2018-01-24 12:24:27,San Francisco,0,12,2,1
7,Traffic Collision,2018-01-24 19:51:29,San Francisco,1,19,2,1
8,Alarms,2018-01-24 22:26:22,San Francisco,24,22,2,1
9,Medical Incident,2018-01-24 10:35:00,San Francisco,25,10,2,0
