In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt

In [2]:
#load in data from CSV file
df_all = pd.read_csv("./data/sfpd_dispatch_data_subset.csv")
num_rows = df_all.shape[0]
#convert string timestamp to pandas Timeframe
df_all["received_timestamp"] = pd.to_datetime(df_all["received_timestamp"])

In [3]:
#remove columns with null values
counter_nan = df_all.isnull().sum()
counter_no_nan = counter_nan[counter_nan==0]

In [4]:
df_cleaned = df_all[counter_no_nan.keys()]
kept_keys = counter_no_nan.keys().values
kept_keys

array(['call_number', 'unit_id', 'incident_number', 'call_type',
       'call_date', 'watch_date', 'received_timestamp', 'entry_timestamp',
       'dispatch_timestamp', 'call_final_disposition',
       'available_timestamp', 'address', 'zipcode_of_incident',
       'battalion', 'station_area', 'box', 'original_priority',
       'priority', 'final_priority', 'als_unit', 'number_of_alarms',
       'unit_type', 'unit_sequence_in_call_dispatch',
       'fire_prevention_district', 'supervisor_district', 'location',
       'row_id', 'latitude', 'longitude', 'x', 'y', 'z'], dtype=object)

In [57]:
kept_keys = ['received_timestamp','zipcode_of_incident', 'unit_type']
df_cleaned = df_cleaned[kept_keys]

KeyError: "['received_timestamp' 'unit_type'] not in index"

In [7]:
#extract hour, day, dotw from timestamp
columns_to_extract = ["received_timestamp"]
full_df = pd.DataFrame()
for c in columns_to_extract:
    new_data = []
    for timestamp in df_cleaned[c]:
        keys = ["date_"+c[:3], "hour_"+c[:3], "dow_"+c[:3]]
        new_data.append({keys[0]: timestamp.day, keys[1]: timestamp.hour, keys[2]: timestamp.dayofweek});
    new_df = pd.DataFrame(new_data)
    full_df = new_df.join(full_df, rsuffix='_next')
    df_cleaned = df_cleaned.drop(c, axis=1)
df_cleaned = df_cleaned.join(full_df, rsuffix="_next")

In [9]:
call_type_mapping = {
    'Alarms': 0,
    'Citizen Assist / Service Call': 1,
    'Electrical Hazard': 2,
    'Elevator / Escalator Rescue': 3,
    'Fuel Spill': 4,
    'Gas Leak (Natural and LP Gases)': 5,
    'HazMat': 6,
    'Medical Incident': 7,
    'Odor (Strange / Unknown)': 8,
    'Other': 9,
    'Outside Fire': 10,
    'Smoke Investigation (Outside)': 11,
    'Structure Fire': 12,
    'Traffic Collision': 13,
    'Train / Rail Incident': 14,
    'Vehicle Fire': 15,
    'Water Rescue': 16    
}
unit_type_mapping = {
    'CHIEF': 0,
    'ENGINE': 1,
    'INVESTIGATION': 2,
    'MEDIC': 3,
    'PRIVATE': 4,
    'RESCUE CAPTAIN': 5,
    'RESCUE SQUAD': 6,
    'SUPPORT': 7,
    'TRUCK': 8
}
fdisp = {
    'Against Medical Advice': 0,
    'Cancelled': 1,
    'Code 2 Transport': 2,
    'Code 3 Transport': 3,
    'Fire': 4,
    'Gone on Arrival': 5,
    'Medical Examiner': 6,
    'No Merit': 7,
    'Other': 8,
    'Patient Declined Transport': 9,
    'SFPD': 10,
    'Unable to Locate': 11
}
boolean_map = {
    'False': 0,
    'True': 1
}
battalion_mapping = {
    'B01': 0,
    'B02': 1,
    'B03': 2,
    'B04': 3,
    'B05': 4,
    'B06': 5,
    'B07': 6,
    'B08': 7,
    'B09': 8,
    'B10': 9,
    'B99': 10
}

In [10]:
df_cleaned = df_cleaned.replace({'unit_type': unit_type_mapping})
y_data = df_cleaned['unit_type']
df_cleaned = df_cleaned.drop('unit_type', axis=1)

In [None]:
# df_onehot = df_cleaned
# onehots = ['unit_id', 'call_type', 'call_final_disposition', 'battalion', 'station_area']
# for i in onehots:
#     onehot = pd.get_dummies(df_onehot[i])
#     df_onehot = df_onehot.drop(i, axis=1)
#     df_onehot = df_onehot.join(onehot, rsuffix='_right')

In [15]:
#map dispatch frequencies to zip codes
df_wy.loc[df_wy['zipcode_of_incident'] == 94122]
zipcodes = ['94121','94103','94122','94109','94107','94110','94102','94133','94134','94111','94114','94131','94117','94112','94118','94158','94105','94115','94108','94124','94104','94116','94123','94127','94132','94130','94129']
for zcode in zipcodes:
    vals = df_wy.loc[df_wy['zipcode_of_incident'] == int(zcode)]
    frequencies = np.bincount(vals['unit_type'].values)
    for idx,ut in enumerate(frequencies):
        zipcode_mapping[zcode]["unit_"+str(idx)+"_freq"] = ut

In [58]:
#map zip code to demographic data (data found at http://www.city-data.com/)
zipcode_mapping = {
    '94121': {'pop': 43332, 'houses': 19311, 'renters': 10397, 'col': 163, 'land': 3.1, 'dens': 14061},
    '94103': {'pop': 26116, 'houses': 15685, 'renters': 11657, 'col': 167.8, 'land': 1.4, 'dens': 19259},
    '94122': {'pop': 62918, 'houses': 24615, 'renters': 12231, 'col': 156.3, 'land': 2.4, 'dens': 26616},
    '94109': {'pop': 56822, 'houses': 37688, 'renters': 29105, 'col': 161.6, 'land': 1.2, 'dens': 47834},
    '94107': {'pop': 29805, 'houses': 15569, 'renters': 8463, 'col': 152.7, 'land': 1.8, 'dens': 16634},
    '94110': {'pop': 74565, 'houses': 30262, 'renters': 18788, 'col': 155.3, 'land': 2.3, 'dens': 32066},
    '94102': {'pop': 29739, 'houses': 19561, 'renters': 16005, 'col': 181.8, 'land': 0.7, 'dens': 44330},
    '94133': {'pop': 27885, 'houses': 14941, 'renters': 11182, 'col': 178.0, 'land': 0.8, 'dens': 36931},
    '94134': {'pop': 42274, 'houses': 12699, 'renters': 4962, 'col': 155.2, 'land': 2.4, 'dens': 17616},
    '94111': {'pop': 3481, 'houses': 2626, 'renters': 1593, 'col': 158.4, 'land': 0.3, 'dens': 10116},
    '94114': {'pop': 33984, 'houses': 18228, 'renters': 9073, 'col': 156.5, 'land': 1.4, 'dens': 23843},
    '94131': {'pop': 28756, 'houses': 13952, 'renters': 6050, 'col': 157.2, 'land': 2.1, 'dens': 13828},
    '94117': {'pop': 43944, 'houses': 20017, 'renters': 13447, 'col': 156.4, 'land': 1.7, 'dens': 26041},
    '94112': {'pop': 85205, 'houses': 24570, 'renters': 8694, 'col': 154.3, 'land': 3.4, 'dens': 25329},
    '94118': {'pop': 42259, 'houses': 19115, 'renters': 12358, 'col': 164.4, 'land': 1.9, 'dens': 21685},
    '94158': {'pop': 6080, 'houses': 3629, 'renters': 2530, 'col': 139.0, 'land': 0.7, 'dens': 9247},
    '94105': {'pop': 6890, 'houses': 5258, 'renters': 2108, 'col': 151.8, 'land': 0.4, 'dens': 18652},
    '94115': {'pop': 35178, 'houses': 19430, 'renters': 13333, 'col': 160.7, 'land': 1.1, 'dens': 31529},
    '94108': {'pop': 14914, 'houses': 8997, 'renters': 7446, 'col': 175.6, 'land': 0.3, 'dens': 55226},
    '94124': {'pop': 35954, 'houses': 11430, 'renters': 5230, 'col': 155.9, 'land': 4.9, 'dens': 7298},
    '94104': {'pop': 496, 'houses': 360, 'renters': 255, 'col': 197.8, 'land': 0.1, 'dens': 6392},
    '94116': {'pop': 46594, 'houses': 16851, 'renters': 5736, 'col': 156.9, 'land': 2.6, 'dens': 18021},
    '94123': {'pop': 25600, 'houses': 15429, 'renters': 10665, 'col': 162.7, 'land': 1.0, 'dens': 25083},
    '94127': {'pop': 10960, 'houses': 8077, 'renters': 1491, 'col': 156.2, 'land': 1.8, 'dens': 11839},
    '94132': {'pop': 31737, 'houses': 11028, 'renters': 5998, 'col': 162.6, 'land': 3.1, 'dens': 10205},
    '94130': {'pop': 3134, 'houses': 778, 'renters': 647, 'col': 144.2, 'land': 0.9, 'dens': 3405},
    '94129': {'pop': 3815, 'houses': 1364, 'renters': 1288, 'col': 149.4, 'land': 2.3, 'dens': 1655},
    '0': {'pop': 0, 'houses': 0, 'renters': 0, 'col': 0, 'land': 0, 'dens': 0}
}

In [17]:
#map lat/long to region data
full_df = pd.DataFrame()
new_data = []
for zipcode in df_cleaned["zipcode_of_incident"]:
    if str(zipcode) in zipcode_mapping:
        new_data.append(zipcode_mapping[str(zipcode)])
    else:
        new_data.append(zipcode_mapping['0'])

In [49]:
# df_new.to_json(orient='records', path_or_buf='../routes/data/zipcode_search.json')
df_new_y.to_csv(path_or_buf='../routes/data/zipcode_search.csv')

In [51]:
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
x_data = df_cleaned.loc[:,:].values
sscaler = StandardScaler()
x_std= sscaler.fit_transform(x_data)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data)



In [52]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(40, 50, 50, 20))
clf.fit(x_train,y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(40, 50, 50, 20), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [55]:
predictions = clf.predict(x_test)

In [56]:
from sklearn.metrics import accuracy_score
accuracy_score(predictions, y_test)

0.0228

In [50]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=50)
clf.fit(x_train, y_train)

NameError: name 'x_train' is not defined

In [None]:
predictions = clf.predict(x_test)

In [None]:
accuracy_score(predictions, y_test)

In [None]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
clf.fit(x_train,y_train)

In [None]:
predictions = clf.predict(x_test)
accuracy_score(predictions, y_test)