# Assessing Fire Risk in NYC

## Import Libraries

In [1]:
# Data analysis and visualization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt

# Interactive maps
import folium
from folium.plugins import HeatMap

# Machine Learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline

## Load and describe data
Note: Data was filtered on the NYC OpenData site to only include incident classification groups that were fire-related (Structural and NonStructural Fires) prior to export.

In [2]:
# Load the data into Python

# Fire Incident Dispatch
alarms_df = pd.read_csv('../data/raw/In-Service_Alarm_Box_Locations.csv')
dispatch_df = pd.read_csv('../data/raw/Fire_Incident_Dispatch_Data.csv')

## ELT

In [3]:
alarms_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13008 entries, 0 to 13007
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   BOROBOX           13008 non-null  object 
 1   BOX_TYPE          13008 non-null  object 
 2   LOCATION          13008 non-null  object 
 3   ZIP               12981 non-null  float64
 4   BOROUGH           13004 non-null  object 
 5   COMMUNITYDISTICT  12866 non-null  object 
 6   CITYCOUNCIL       13004 non-null  float64
 7   LATITUDE          13008 non-null  float64
 8   LONGITUDE         13008 non-null  float64
 9   Location Point    13008 non-null  object 
dtypes: float64(4), object(6)
memory usage: 1016.4+ KB


In [4]:
alarms_df = alarms_df[[
    'LOCATION',
]]
alarms_df.head()

Unnamed: 0,LOCATION
0,3 AVE & 65 ST
1,WOODSIDE AVE & 69 ST
2,MYRTLE AVE & PALMETTO ST
3,NEW YORK AVE & LEFFERTS AVE
4,RIVER & NORTH 3 STS


In [5]:
dispatch_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134205 entries, 0 to 134204
Data columns (total 29 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   STARFIRE_INCIDENT_ID            134205 non-null  int64  
 1   INCIDENT_DATETIME               134205 non-null  object 
 2   ALARM_BOX_BOROUGH               134205 non-null  object 
 3   ALARM_BOX_NUMBER                134205 non-null  int64  
 4   ALARM_BOX_LOCATION              134195 non-null  object 
 5   INCIDENT_BOROUGH                134205 non-null  object 
 6   ZIPCODE                         128109 non-null  float64
 7   POLICEPRECINCT                  128114 non-null  float64
 8   CITYCOUNCILDISTRICT             128110 non-null  float64
 9   COMMUNITYDISTRICT               128112 non-null  float64
 10  COMMUNITYSCHOOLDISTRICT         128101 non-null  float64
 11  CONGRESSIONALDISTRICT           128110 non-null  float64
 12  ALARM_SOURCE_DES

In [6]:
dispatch_df.rename(columns={
    'ALARM_BOX_LOCATION': 'LOCATION',
    'INCIDENT_DATETIME':'DATE',
    'ALARM_BOX_NUMBER': 'ALARM_ID',
    'ALARM_BOX_BOROUGH': 'BOROUGH',
    'INCIDENT_CLASSIFICATION': 'TYPE'
}, inplace=True)
dispatch_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134205 entries, 0 to 134204
Data columns (total 29 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   STARFIRE_INCIDENT_ID            134205 non-null  int64  
 1   DATE                            134205 non-null  object 
 2   BOROUGH                         134205 non-null  object 
 3   ALARM_ID                        134205 non-null  int64  
 4   LOCATION                        134195 non-null  object 
 5   INCIDENT_BOROUGH                134205 non-null  object 
 6   ZIPCODE                         128109 non-null  float64
 7   POLICEPRECINCT                  128114 non-null  float64
 8   CITYCOUNCILDISTRICT             128110 non-null  float64
 9   COMMUNITYDISTRICT               128112 non-null  float64
 10  COMMUNITYSCHOOLDISTRICT         128101 non-null  float64
 11  CONGRESSIONALDISTRICT           128110 non-null  float64
 12  ALARM_SOURCE_DES

In [7]:
dispatch_df = dispatch_df[[
    'LOCATION',
    'DATE',
    'ALARM_ID',
    'BOROUGH',
    'TYPE'
]]
dispatch_df.head()

Unnamed: 0,LOCATION,DATE,ALARM_ID,BOROUGH,TYPE
0,BAY 28 ST & SUNNYSIDE ST,01/01/2018 12:02:05 AM,1151,QUEENS,Private Dwelling Fire
1,75 AVE & 178 ST,01/01/2018 12:02:54 AM,9707,QUEENS,Private Dwelling Fire
2,CHURCH AVE & STORY ST,01/01/2018 12:03:53 AM,3776,BROOKLYN,Demolition Debris or Rubbish Fire
3,LINDEN BLVD & 121 ST,01/01/2018 12:13:46 AM,5210,QUEENS,Demolition Debris or Rubbish Fire
4,CROTONA AVE & 182 ST,01/01/2018 12:14:20 AM,3173,BRONX,Multiple Dwelling 'A' - Food on the stove fire


In [8]:
# Classify fire types and label-encode TYPE values
#  1 = Commercial
#  2 = Industrial
#  3 = Non-Structural
#  4 = Public
#  5 = Residential
#  6 = Utilities
#  7 = Misc/Other Spaces

def classify(x):
    if x == "Other Commercial Building Fire" or x == "Store Fire":
        return 1
    elif x == "Construction or Demolition Building Fire" or x == "Factory Fire":
        return 2
    elif x == "Demolition Debris or Rubbish Fire" or x == "Automobile Fire" or x == "Transit System - NonStructural" or x == "Brush Fire" or x == "Other Transportation Fire" or x == "Abandoned Derelict Vehicle Fire" or x == "Undefined Nonstructural Fire":
        return 3
    elif x == "School Fire" or x == "Hospital Fire" or x == "Other Public Building Fire" or x == "Church Fire" or x == "Transit System - Structural" or x == "Theater or TV Studio Fire":
        return 4
    elif x == "Multiple Dwelling 'A' - Food on the stove fire" or x == "Private Dwelling Fire" or x == "Multiple Dwelling 'A' - Other fire" or x == "Multiple Dwelling 'B' Fire" or x == "Multiple Dwelling 'A' - Compactor fire" or x == "Untenanted Building Fire":
        return 5
    elif x == "Manhole Fire - Seeping Smoke" or x == "Manhole Fire - Other" or x == "Manhole Fire - Blown Cover" or x == "Manhole Fire - Extended to Building":
        return 6
    else: return 7

dispatch_df['TYPE'] = dispatch_df.TYPE.apply(classify)
dispatch_df.head()

Unnamed: 0,LOCATION,DATE,ALARM_ID,BOROUGH,TYPE
0,BAY 28 ST & SUNNYSIDE ST,01/01/2018 12:02:05 AM,1151,QUEENS,5
1,75 AVE & 178 ST,01/01/2018 12:02:54 AM,9707,QUEENS,5
2,CHURCH AVE & STORY ST,01/01/2018 12:03:53 AM,3776,BROOKLYN,3
3,LINDEN BLVD & 121 ST,01/01/2018 12:13:46 AM,5210,QUEENS,3
4,CROTONA AVE & 182 ST,01/01/2018 12:14:20 AM,3173,BRONX,5


In [9]:
incidents_df = alarms_df.merge(dispatch_df, how='outer', on='LOCATION')
incidents_df.head()

Unnamed: 0,LOCATION,DATE,ALARM_ID,BOROUGH,TYPE
0,3 AVE & 65 ST,02/06/2018 03:40:32 AM,2653.0,BROOKLYN,3.0
1,3 AVE & 65 ST,07/13/2018 01:55:49 PM,2653.0,BROOKLYN,5.0
2,3 AVE & 65 ST,08/06/2018 06:32:28 AM,2653.0,BROOKLYN,3.0
3,3 AVE & 65 ST,09/13/2018 06:27:32 PM,2653.0,BROOKLYN,5.0
4,3 AVE & 65 ST,09/16/2018 09:51:51 AM,2653.0,BROOKLYN,5.0


In [10]:
# Verify nulls exist
nulls = pd.isnull(incidents_df['DATE'])
incidents_df[nulls]

Unnamed: 0,LOCATION,DATE,ALARM_ID,BOROUGH,TYPE
105,NEW YORK AVE & LEFFERTS AVE,,,,
108,VINCENT AVE & COVERLY ST,,,,
160,METROPOLITAN AVE & PLEASANTVIEW ST,,,,
161,NARROWS RD N 1000' W OF FINGERBOARD RD,,,,
162,E 167 ST & STEBBINS ST/POLITE AVE,,,,
...,...,...,...,...,...
67451,BAYCHESTER AVE & E 233 ST,,,,
67460,21 AVE & 45 ST,,,,
67479,COLUMBUS AVE & W 70 ST,,,,
67552,GCP & 165 ST,,,,


In [11]:
incidents_df.TYPE.value_counts()

5.0    75378
3.0    33547
6.0    17471
1.0    12224
4.0     3890
2.0      260
7.0       19
Name: TYPE, dtype: int64

In [12]:
incidents_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 148248 entries, 0 to 148247
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   LOCATION  148238 non-null  object 
 1   DATE      142789 non-null  object 
 2   ALARM_ID  142789 non-null  float64
 3   BOROUGH   142789 non-null  object 
 4   TYPE      142789 non-null  float64
dtypes: float64(2), object(3)
memory usage: 6.8+ MB


In [13]:
incidents_df['DATE'] = pd.to_datetime(incidents_df['DATE']).apply(lambda x: x.date())
incidents_df.head()

Unnamed: 0,LOCATION,DATE,ALARM_ID,BOROUGH,TYPE
0,3 AVE & 65 ST,2018-02-06,2653.0,BROOKLYN,3.0
1,3 AVE & 65 ST,2018-07-13,2653.0,BROOKLYN,5.0
2,3 AVE & 65 ST,2018-08-06,2653.0,BROOKLYN,3.0
3,3 AVE & 65 ST,2018-09-13,2653.0,BROOKLYN,5.0
4,3 AVE & 65 ST,2018-09-16,2653.0,BROOKLYN,5.0


In [14]:
# Classify fire types and label-encode BOROUGH values
#  1 = Manhattan
#  2 = Bronx
#  3 = Brooklyn
#  4 = Queens
#  5 = Staten Island

def classifyBorough(x):
    if x == 'MANHATTAN':
        return 1
    elif x == 'BRONX':
        return 2
    elif x == 'BROOKLYN':
        return 3
    elif x == 'QUEENS':
        return 4
    elif x == 'STATEN ISLAND':
        return 5
    else: return 0

incidents_df['BOROUGH'] = dispatch_df.BOROUGH.apply(classifyBorough)
incidents_df.head()

Unnamed: 0,LOCATION,DATE,ALARM_ID,BOROUGH,TYPE
0,3 AVE & 65 ST,2018-02-06,2653.0,4.0,3.0
1,3 AVE & 65 ST,2018-07-13,2653.0,4.0,5.0
2,3 AVE & 65 ST,2018-08-06,2653.0,3.0,3.0
3,3 AVE & 65 ST,2018-09-13,2653.0,4.0,5.0
4,3 AVE & 65 ST,2018-09-16,2653.0,2.0,5.0


## Data Wrangling

In [15]:
incidents_df.TYPE.fillna(0, inplace=True)
incidents_df.DATE.fillna(0, inplace=True)
incidents_df.ALARM_ID.fillna(0, inplace=True)
incidents_df.BOROUGH.fillna(0, inplace=True)
incidents_df[nulls]

Unnamed: 0,LOCATION,DATE,ALARM_ID,BOROUGH,TYPE
105,NEW YORK AVE & LEFFERTS AVE,0,0.0,1.0,0.0
108,VINCENT AVE & COVERLY ST,0,0.0,3.0,0.0
160,METROPOLITAN AVE & PLEASANTVIEW ST,0,0.0,1.0,0.0
161,NARROWS RD N 1000' W OF FINGERBOARD RD,0,0.0,2.0,0.0
162,E 167 ST & STEBBINS ST/POLITE AVE,0,0.0,3.0,0.0
...,...,...,...,...,...
67451,BAYCHESTER AVE & E 233 ST,0,0.0,3.0,0.0
67460,21 AVE & 45 ST,0,0.0,1.0,0.0
67479,COLUMBUS AVE & W 70 ST,0,0.0,1.0,0.0
67552,GCP & 165 ST,0,0.0,2.0,0.0


In [16]:
incidents_df['FIRE'] = np.where(incidents_df['TYPE'] == 0, 0, 1)
incidents_df.head()

Unnamed: 0,LOCATION,DATE,ALARM_ID,BOROUGH,TYPE,FIRE
0,3 AVE & 65 ST,2018-02-06,2653.0,4.0,3.0,1
1,3 AVE & 65 ST,2018-07-13,2653.0,4.0,5.0,1
2,3 AVE & 65 ST,2018-08-06,2653.0,3.0,3.0,1
3,3 AVE & 65 ST,2018-09-13,2653.0,4.0,5.0,1
4,3 AVE & 65 ST,2018-09-16,2653.0,2.0,5.0,1


In [17]:
# Copy into new dataframe for backup purposes
df = incidents_df
df.head()

Unnamed: 0,LOCATION,DATE,ALARM_ID,BOROUGH,TYPE,FIRE
0,3 AVE & 65 ST,2018-02-06,2653.0,4.0,3.0,1
1,3 AVE & 65 ST,2018-07-13,2653.0,4.0,5.0,1
2,3 AVE & 65 ST,2018-08-06,2653.0,3.0,3.0,1
3,3 AVE & 65 ST,2018-09-13,2653.0,4.0,5.0,1
4,3 AVE & 65 ST,2018-09-16,2653.0,2.0,5.0,1


In [19]:
# Calculate all locations with at least one fire incident recorded
df.FIRE.value_counts()

1    142789
0      5459
Name: FIRE, dtype: int64

In [20]:
target = df['FIRE']
target_names = ['negative', 'positive']

In [21]:
data = df.drop('FIRE', axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,LOCATION,DATE,ALARM_ID,BOROUGH,TYPE,ID
0,3 AVE & 65 ST,2018-02-06,2653.0,4.0,3.0,2653
1,3 AVE & 65 ST,2018-07-13,2653.0,4.0,5.0,2653
2,3 AVE & 65 ST,2018-08-06,2653.0,3.0,3.0,2653
3,3 AVE & 65 ST,2018-09-13,2653.0,4.0,5.0,2653
4,3 AVE & 65 ST,2018-09-16,2653.0,2.0,5.0,2653
