# Assessing fire risk factors in NYC

## Background
Using data provided by NYC OpenData, this notebook walks through the steps of analyzing fire-related incidents and some possible contributing factors in New York City.

## Import Libraries

In [1]:
# Data analysis and visualization
import numpy as np
import pandas as pd
import datetime as dt
from functools import reduce


## Load and describe data

Note: Before export, the data was filtered on the NYC OpenData site to include only the fire-related incident classification groups (Structural and NonStructural Fires).

In [2]:
# Read the three raw CSV exports into DataFrames, in the order
# NYPD complaints, housing maintenance code violations, orders to repair/vacate.
nypd_df, codev_df, vacate_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../static/data/raw/NYPD_Complaint_18-21.csv',
        '../static/data/raw/Housing_Maintenance_Code_Violations_18-21.csv',
        '../static/data/raw/Order_to_Repair_Vacate_18-21.csv',
    )
)

## ELT

### NYPD Complaints

In [3]:
# Display the NYPD complaints frame (pandas elides the middle rows of long frames in the rendered output).
nypd_df

Unnamed: 0,CMPLNT_FR_DT,BORO_NM,CMPLNT_NUM
0,01/01/2018 12:00:00 AM,BRONX,8337
1,01/01/2018 12:00:00 AM,BROOKLYN,11193
2,01/01/2018 12:00:00 AM,MANHATTAN,9614
3,01/01/2018 12:00:00 AM,QUEENS,7450
4,01/01/2018 12:00:00 AM,STATEN ISLAND,1677
...,...,...,...
211,12/01/2020 12:00:00 AM,BROOKLYN,8631
212,12/01/2020 12:00:00 AM,MANHATTAN,7200
213,12/01/2020 12:00:00 AM,QUEENS,6697
214,12/01/2020 12:00:00 AM,STATEN ISLAND,1216


In [4]:
# Rename columns for easier usage. DATE and BOROUGH are the keys the later merge joins on.
# Reassigning the result (instead of inplace=True) leaves the frame that was loaded
# untouched and keeps this cell safe to re-run.
nypd_df = nypd_df.rename(columns={
    'CMPLNT_FR_DT': 'DATE',
    'BORO_NM': 'BOROUGH',
    'CMPLNT_NUM': 'NYPD_COMPLAINT_COUNT',
})
nypd_df.head()

Unnamed: 0,DATE,BOROUGH,NYPD_COMPLAINT_COUNT
0,01/01/2018 12:00:00 AM,BRONX,8337
1,01/01/2018 12:00:00 AM,BROOKLYN,11193
2,01/01/2018 12:00:00 AM,MANHATTAN,9614
3,01/01/2018 12:00:00 AM,QUEENS,7450
4,01/01/2018 12:00:00 AM,STATEN ISLAND,1677


In [5]:
# Inspect dtypes and non-null counts before cleaning.
# NOTE(review): the recorded output shows BOROUGH with 180 non-null values out of 216 rows;
# none of the visible cells drop or fill the rows without a borough — confirm that is intended.
nypd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216 entries, 0 to 215
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   DATE                  216 non-null    object
 1   BOROUGH               180 non-null    object
 2   NYPD_COMPLAINT_COUNT  216 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 5.2+ KB


In [6]:
# Parse the DATE column (text such as '01/01/2018 12:00:00 AM') into datetime64.
# pd.to_datetime converts the whole column in one vectorised call instead of running
# strptime once per row, and it accepts a column that is already datetime, so the cell
# can be re-run. .assign returns a new frame rather than writing into the existing one.
nypd_df = nypd_df.assign(
    DATE=pd.to_datetime(nypd_df['DATE'], format='%m/%d/%Y %I:%M:%S %p')
)
nypd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216 entries, 0 to 215
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   DATE                  216 non-null    datetime64[ns]
 1   BOROUGH               180 non-null    object        
 2   NYPD_COMPLAINT_COUNT  216 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 5.2+ KB


### Housing Maintenance Code Violations

In [7]:
# Display the housing maintenance code violations frame.
codev_df

Unnamed: 0,NOVIssuedDate,ViolationID,Borough
0,01/01/2018 12:00:00 AM,13430,BRONX
1,01/01/2018 12:00:00 AM,17188,BROOKLYN
2,01/01/2018 12:00:00 AM,8279,MANHATTAN
3,01/01/2018 12:00:00 AM,4191,QUEENS
4,01/01/2018 12:00:00 AM,545,STATEN ISLAND
...,...,...,...
200,05/01/2021 12:00:00 AM,8576,BRONX
201,05/01/2021 12:00:00 AM,13015,BROOKLYN
202,05/01/2021 12:00:00 AM,6341,MANHATTAN
203,05/01/2021 12:00:00 AM,2930,QUEENS


In [8]:
# Rename columns so DATE and BOROUGH match the keys the later merge joins on.
# Reassigning the result (instead of inplace=True) leaves the frame that was loaded
# untouched and keeps this cell safe to re-run.
codev_df = codev_df.rename(columns={
    'NOVIssuedDate': 'DATE',
    'Borough': 'BOROUGH',
    'ViolationID': 'CODE_VIOLATION_COUNT',
})
codev_df.head()

Unnamed: 0,DATE,CODE_VIOLATION_COUNT,BOROUGH
0,01/01/2018 12:00:00 AM,13430,BRONX
1,01/01/2018 12:00:00 AM,17188,BROOKLYN
2,01/01/2018 12:00:00 AM,8279,MANHATTAN
3,01/01/2018 12:00:00 AM,4191,QUEENS
4,01/01/2018 12:00:00 AM,545,STATEN ISLAND


In [9]:
# Inspect dtypes and non-null counts; DATE is still stored as text (object) at this point.
codev_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   DATE                  205 non-null    object
 1   CODE_VIOLATION_COUNT  205 non-null    int64 
 2   BOROUGH               205 non-null    object
dtypes: int64(1), object(2)
memory usage: 4.9+ KB


In [10]:
# Parse the DATE column (text such as '01/01/2018 12:00:00 AM') into datetime64.
# pd.to_datetime converts the whole column in one vectorised call instead of running
# strptime once per row, and it accepts a column that is already datetime, so the cell
# can be re-run. .assign returns a new frame rather than writing into the existing one.
codev_df = codev_df.assign(
    DATE=pd.to_datetime(codev_df['DATE'], format='%m/%d/%Y %I:%M:%S %p')
)
codev_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   DATE                  205 non-null    datetime64[ns]
 1   CODE_VIOLATION_COUNT  205 non-null    int64         
 2   BOROUGH               205 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 4.9+ KB


### Orders to Vacate

In [11]:
# Display the orders to repair/vacate frame; BOROUGH holds two-letter abbreviations here.
vacate_df

Unnamed: 0,VACATE EFFECTIVE DATE,BOROUGH,VACATE ORDER NUMBER
0,01/01/2018 12:00:00 AM,BK,30
1,01/01/2018 12:00:00 AM,BX,25
2,01/01/2018 12:00:00 AM,MN,13
3,01/01/2018 12:00:00 AM,QN,14
4,01/01/2018 12:00:00 AM,SI,7
...,...,...,...
190,05/01/2021 12:00:00 AM,BK,8
191,05/01/2021 12:00:00 AM,BX,6
192,05/01/2021 12:00:00 AM,MN,2
193,05/01/2021 12:00:00 AM,QN,6


In [12]:
# Rename columns. The abbreviated BOROUGH column becomes BORO so that the name BOROUGH
# is free for the full borough names added in a later cell.
# Reassigning the result (instead of inplace=True) leaves the frame that was loaded
# untouched and keeps this cell safe to re-run.
vacate_df = vacate_df.rename(columns={
    'VACATE EFFECTIVE DATE': 'DATE',
    'BOROUGH': 'BORO',
    'VACATE ORDER NUMBER': 'VACATE_ORDER_COUNT',
})
vacate_df.head()

Unnamed: 0,DATE,BORO,VACATE_ORDER_COUNT
0,01/01/2018 12:00:00 AM,BK,30
1,01/01/2018 12:00:00 AM,BX,25
2,01/01/2018 12:00:00 AM,MN,13
3,01/01/2018 12:00:00 AM,QN,14
4,01/01/2018 12:00:00 AM,SI,7


In [13]:
# Replace borough abbreviations with full names, based on the file schema.

def f(x):
    """Return the full borough name for the abbreviation stored in ``x['BORO']``.

    ``x`` is anything indexable by the key ``'BORO'`` (here, one DataFrame row).
    Abbreviations not listed in the schema map to an empty string.
    """
    borough_names = {
        'MN': 'MANHATTAN',
        'BX': 'BRONX',
        'BK': 'BROOKLYN',
        'QN': 'QUEENS',
        'SI': 'STATEN ISLAND',
    }
    return borough_names.get(x['BORO'], '')

# Call f on every row (axis='columns' passes one row at a time) and store the
# returned full borough name in a new BOROUGH column.
vacate_df['BOROUGH'] = vacate_df.apply(f, axis='columns')
vacate_df.head(5)

Unnamed: 0,DATE,BORO,VACATE_ORDER_COUNT,BOROUGH
0,01/01/2018 12:00:00 AM,BK,30,BROOKLYN
1,01/01/2018 12:00:00 AM,BX,25,BRONX
2,01/01/2018 12:00:00 AM,MN,13,MANHATTAN
3,01/01/2018 12:00:00 AM,QN,14,QUEENS
4,01/01/2018 12:00:00 AM,SI,7,STATEN ISLAND


In [14]:
# Keep only the merge keys and the count; the abbreviated BORO column is dropped.
# .copy() makes the result an independent frame, so assigning to one of its columns
# afterwards does not trigger pandas' SettingWithCopyWarning.
vacate_df = vacate_df[['DATE', 'BOROUGH', 'VACATE_ORDER_COUNT']].copy()
vacate_df.head()

Unnamed: 0,DATE,BOROUGH,VACATE_ORDER_COUNT
0,01/01/2018 12:00:00 AM,BROOKLYN,30
1,01/01/2018 12:00:00 AM,BRONX,25
2,01/01/2018 12:00:00 AM,MANHATTAN,13
3,01/01/2018 12:00:00 AM,QUEENS,14
4,01/01/2018 12:00:00 AM,STATEN ISLAND,7


In [15]:
# Inspect dtypes and non-null counts; DATE is still stored as text (object) at this point.
vacate_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   DATE                195 non-null    object
 1   BOROUGH             195 non-null    object
 2   VACATE_ORDER_COUNT  195 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 4.7+ KB


In [16]:
# Parse the DATE column (text such as '01/01/2018 12:00:00 AM') into datetime64.
# .assign builds a new frame instead of writing into vacate_df, which at this point is a
# column subset of an earlier frame; writing into it directly is what raised the
# SettingWithCopyWarning. pd.to_datetime also converts the column in one vectorised call
# rather than running strptime once per row.
vacate_df = vacate_df.assign(
    DATE=pd.to_datetime(vacate_df['DATE'], format='%m/%d/%Y %I:%M:%S %p')
)
vacate_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   DATE                195 non-null    datetime64[ns]
 1   BOROUGH             195 non-null    object        
 2   VACATE_ORDER_COUNT  195 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 4.7+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [17]:
# Combine the three datasets on their shared DATE and BOROUGH keys.
# The outer join keeps every (DATE, BOROUGH) pair found in any of the frames; a count
# that is absent from one frame comes through as NaN, so the count columns become floats.
df = [nypd_df, codev_df, vacate_df]

factors = reduce(
    lambda merged, frame: merged.merge(frame, how='outer', on=['DATE', 'BOROUGH']),
    df,
)

factors

Unnamed: 0,DATE,BOROUGH,NYPD_COMPLAINT_COUNT,CODE_VIOLATION_COUNT,VACATE_ORDER_COUNT
0,2018-01-01,BRONX,8337.0,13430.0,25.0
1,2018-01-01,BROOKLYN,11193.0,17188.0,30.0
2,2018-01-01,MANHATTAN,9614.0,8279.0,13.0
3,2018-01-01,QUEENS,7450.0,4191.0,14.0
4,2018-01-01,STATEN ISLAND,1677.0,545.0,7.0
...,...,...,...,...,...
236,2021-05-01,BRONX,,8576.0,6.0
237,2021-05-01,BROOKLYN,,13015.0,8.0
238,2021-05-01,MANHATTAN,,6341.0,2.0
239,2021-05-01,QUEENS,,2930.0,6.0


In [19]:
# Write the merged table to the processed-data folder; index=False leaves the row index out of the CSV.
factors.to_csv('../static/data/processed/factors.csv', index=False)