# Assessing fire risk by location in NYC

## Background
Using data provided by NYC OpenData, this notebook walks through the steps of analyzing fire risk in New York City.

## Import Libraries

In [None]:
# Data analysis and visualization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt

# Interactive maps
import folium
from folium.plugins import HeatMap

# Machine Learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline

## Load and describe data

Note: Data was filtered on the NYC OpenData site to only include incident classification groups that were fire-related (Structural and NonStructural Fires) prior to export.

In [None]:
# Connection to Azure SQL database (disabled; kept for reference only)
# SECURITY: the database username and password used to be written here in
# plain text. They are now read from environment variables; the password that
# was exposed should be rotated, since it remains in version-control history.
# import os
# import pandas as pd, pyodbc
# server = 'finalprojectdata.database.windows.net'
# database = 'v2-project-data'
# username = os.environ['AZURE_SQL_USER']
# password = os.environ['AZURE_SQL_PASSWORD']
# driver= '{ODBC Driver 17 for SQL Server}'
# # con_string = 'DRIVER='+driver+';SERVER='+server+';PORT=1433;DATABASE='+database+';UID='+username+';PWD='+ password
# # con_string = 'DRIVER={SQL Server};SERVER='+ <server> +';DATABASE=' + <database>
# # cnxn = pyodbc.connect(con_string)
# cnxn = pyodbc.connect(
#     'DRIVER={ODBC Driver 17 for SQL Server};'
#     'SERVER=finalprojectdata.database.windows.net;'
#     'PORT=1433;'
#     'DATABASE=v2-project-data;'
#     'UID=' + username + ';'
#     'PWD=' + password + ';'
# )
# query = """
# SELECT TOP 3 * FROM cleaned_fire_dispatch_data
# """
# result_port_map = pd.read_sql(query, cnxn)
# result_port_map

In [None]:
# Read each raw CSV export into its own DataFrame

# Fire Incident Dispatch: alarm box locations, then dispatch records
alarms_df, dispatch_df = map(pd.read_csv, (
    '../data/raw/In-Service_Alarm_Box_Locations.csv',
    '../data/raw/Fire_Incident_Dispatch_Data.csv',
))

# NYPD Complaints
nypd_df = pd.read_csv(
    '../data/raw/NYPD_Complaint_18-21.csv'
)

# Dept of Buildings/Environ Control Board Violations, one file per year
DOB18_df, DOB19_df, DOB20_df, DOB21_df = map(pd.read_csv, (
    '../data/raw/DOB_ECB_Violations_18.csv',
    '../data/raw/DOB_ECB_Violations_19.csv',
    '../data/raw/DOB_ECB_Violations_20.csv',
    '../data/raw/DOB_ECB_Violations_21.csv',
))

# Housing Maintenance Code Violations
codev_df = pd.read_csv(
    '../data/raw/Housing_Maintenance_Code_Violations_18-21.csv'
)

# Orders to repair/vacate
vacate_df = pd.read_csv(
    '../data/raw/Order_to_Repair_Vacate_18-21.csv'
)

## Data Wrangling

### Fire Incident Dispatches

In [None]:
# Combine alarm box rows with dispatch rows wherever the alarm box LOCATION
# text equals the dispatch ALARM_BOX_LOCATION text. No `how` is given, so the
# pandas default (inner join) applies and unmatched rows are dropped.
fires_df = alarms_df.merge(
    dispatch_df,
    left_on='LOCATION',
    right_on='ALARM_BOX_LOCATION',
)
fires_df.head()

In [None]:
# Trim the merged frame to the columns listed below.
fire_columns = [
    'STARFIRE_INCIDENT_ID',
    'INCIDENT_DATETIME',
    'ALARM_BOX_BOROUGH',
    'BOROBOX',
    'ALARM_BOX_LOCATION',
    'LATITUDE',
    'LONGITUDE',
    'INCIDENT_BOROUGH',
    'ZIPCODE',
    'INCIDENT_CLASSIFICATION',
    'INCIDENT_CLASSIFICATION_GROUP',
    'DISPATCH_RESPONSE_SECONDS_QY',
    'INCIDENT_RESPONSE_SECONDS_QY',
    'INCIDENT_TRAVEL_TM_SECONDS_QY',
    'ENGINES_ASSIGNED_QUANTITY',
    'LADDERS_ASSIGNED_QUANTITY',
    'OTHER_UNITS_ASSIGNED_QUANTITY',
]

# .copy() makes fires_df an independent frame. Later cells assign columns on it
# (INCIDENT_DATETIME, YEAR); without the copy those assignments target a
# selection of the merged frame and pandas emits SettingWithCopyWarning.
fires_df = fires_df[fire_columns].copy()
fires_df.head()

In [None]:
# Write the trimmed fire dispatch frame to disk (row index omitted) for visualization use
fires_df.to_csv(
    path_or_buf='../data/processed/cleaned_fire_dispatch_data.csv',
    index=False,
)

In [None]:
# Print the column names, dtypes and non-null counts of the fire dispatch frame
fires_df.info()

In [None]:
# Convert INCIDENT_DATETIME from text (month/day/year, 12-hour clock with AM/PM,
# per the format string) to a datetime column.
# pd.to_datetime is vectorized, turns missing values into NaT instead of
# raising, and leaves an already-converted column untouched, so this cell can
# safely be re-run. The previous row-wise strptime call failed in both cases.
fires_df['INCIDENT_DATETIME'] = pd.to_datetime(
    fires_df['INCIDENT_DATETIME'],
    format='%m/%d/%Y %I:%M:%S %p',
)
fires_df.info()

In [None]:
# Derive the calendar year of every incident from INCIDENT_DATETIME
fires_df['YEAR'] = fires_df['INCIDENT_DATETIME'].dt.year

# Relocate YEAR to the first column: pop() removes it from the frame and
# returns it, insert() then puts it back at position 0
year = fires_df.pop('YEAR')
fires_df.insert(0, 'YEAR', year)
fires_df.head()

In [None]:
# Training subset: incidents whose YEAR is 2018 or 2019
# (the 2020-2021 rows go to the cross-validation frame in the next cell)
fires1 = fires_df.loc[fires_df['YEAR'].isin([2018, 2019])]
fires1

In [None]:
# Cross-validation subset: incidents whose YEAR is 2020 or 2021
fires2 = fires_df.loc[fires_df['YEAR'].isin([2020, 2021])]
fires2

### NYPD Complaints

In [None]:
# Display the NYPD complaints frame as loaded from the CSV
nypd_df

In [None]:
# Rename columns for easier usage (the frame is modified in place)
nypd_df.rename(
    {
        'CMPLNT_FR_DT': 'COMPLAINT_DATE',
        'BORO_NM': 'BOROUGH',
        'CMPLNT_NUM': 'NUMBER_OF_COMPLAINTS',
    },
    axis='columns',
    inplace=True,
)
nypd_df.head()

In [None]:
# Write the renamed NYPD complaints frame to disk (row index omitted) for visualization use
nypd_df.to_csv(
    path_or_buf='../data/processed/cleaned_nypd_complaint_data.csv',
    index=False,
)

In [None]:
# Print the column names, dtypes and non-null counts of the NYPD complaints frame
nypd_df.info()

In [None]:
# Convert COMPLAINT_DATE column to datetime.
# The source column must come from nypd_df: fires_df has no COMPLAINT_DATE
# column, so reading it from there raises KeyError.
# NOTE(review): the format string is the one used for the fire dispatch data;
# confirm it matches how dates appear in the NYPD export.
nypd_df['COMPLAINT_DATE'] = pd.to_datetime(
    nypd_df['COMPLAINT_DATE'],
    format='%m/%d/%Y %I:%M:%S %p',
)
nypd_df.info()

In [None]:
# Add a column that splits off the year.
# It is read from nypd_df (fires_df has no COMPLAINT_DATE column) and relies on
# COMPLAINT_DATE having been converted to datetime by the conversion cell above.
nypd_df['YEAR'] = nypd_df['COMPLAINT_DATE'].dt.year

# Move that column to the beginning of the frame: pop() removes and returns it,
# insert() places it at position 0
year = nypd_df.pop('YEAR')
nypd_df.insert(0, 'YEAR', year)
nypd_df.head()

In [None]:
# Split into training(18-19) and cross-validation(20-21) dataframes
# The year mask is built from nypd_df, so it must select rows of nypd_df;
# applying it to fires_df would pick fire rows by an unrelated index.
nypd1 = nypd_df.loc[nypd_df['YEAR'].isin([2018, 2019])]
nypd1

In [None]:
# Cross-validation subset: complaints whose YEAR is 2020 or 2021
nypd2 = nypd_df.loc[nypd_df['YEAR'].isin([2020, 2021])]
nypd2

In [None]:
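# TODO: leftover placeholder. The NYPD training/cross-validation split is already done in the two cells above (nypd1, nypd2); this empty cell can be removed.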

### Dept of Buildings/Environmental Control Board Violations

In [None]:
# Preview the first rows of the 2018 DOB/ECB violations frame
DOB18_df.head()

In [None]:
# Preview the first rows of the 2019 DOB/ECB violations frame
DOB19_df.head()

In [None]:
# Preview the first rows of the 2020 DOB/ECB violations frame
DOB20_df.head()

In [None]:
# Preview the first rows of the 2021 DOB/ECB violations frame
DOB21_df.head()

In [None]:
# Tag every yearly frame with its year as a new first column named YEAR.
# NOTE(review): the year is stored as a string here, whereas the fire and NYPD
# frames get an integer YEAR from .dt.year; confirm this difference is intended.
for frame, year_label in (
    (DOB18_df, '2018'),
    (DOB19_df, '2019'),
    (DOB20_df, '2020'),
    (DOB21_df, '2021'),
):
    frame.insert(0, 'YEAR', year_label)

In [None]:
# Check that the YEAR column now appears first in the 2021 frame
DOB21_df.head()

In [None]:
# Replace Borough Number with names based on file schema 
# 1 = Manhattan
# 2 = Bronx
# 3 = Brooklyn
# 4 = Queens
# 5 = Staten Island

# Borough code -> borough name, following the file schema listed above
_BOROUGH_BY_CODE = {
    1: 'MANHATTAN',
    2: 'BRONX',
    3: 'BROOKLYN',
    4: 'QUEENS',
    5: 'STATEN ISLAND',
}


def f(x):
    """Return the borough name for the numeric code stored under ``x['BORO']``.

    ``x`` is anything indexable by the key ``'BORO'`` (a DataFrame row when
    used with ``DataFrame.apply(f, axis=1)``). Any value other than the
    codes 1-5 yields an empty string.
    """
    return _BOROUGH_BY_CODE.get(x['BORO'], '')

# Add a BOROUGH name column to each yearly frame by applying f to every row
for frame in (DOB18_df, DOB19_df, DOB20_df, DOB21_df):
    frame['BOROUGH'] = frame.apply(f, axis=1)


In [None]:
# Spot-check the new BOROUGH column on the 2021 frame; swap in one of the
# commented lines below to inspect another year instead
# DOB18_df.head()
# DOB19_df.head()
# DOB20_df.head()
DOB21_df.head()

In [None]:
# Combine the 2018 and 2019 dataframes for training.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. As with append's defaults, the original row labels of
# both frames are kept (no ignore_index).
dobv_df = pd.concat([DOB18_df, DOB19_df])
dobv_df

In [None]:
# Combine the 2020 and 2021 dataframes for cross-validation of predictive algorithm.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. As with append's defaults, the original row labels of
# both frames are kept (no ignore_index).
dobv2_df = pd.concat([DOB20_df, DOB21_df])
dobv2_df

In [None]:
# Give the training frame's columns friendlier names (modified in place)
dobv_df.rename(
    {
        'YEAR': 'YEAR_OF_COMPLAINT',
        'DOB_VIOLATION_NUMBER': 'NUMBER_OF_VIOLATIONS',
    },
    axis='columns',
    inplace=True,
)
dobv_df.head()

In [None]:
# Give the cross-validation frame's columns friendlier names (modified in place)
dobv2_df.rename(
    {
        'YEAR': 'YEAR_OF_COMPLAINT',
        'DOB_VIOLATION_NUMBER': 'NUMBER_OF_VIOLATIONS',
    },
    axis='columns',
    inplace=True,
)
dobv2_df.head()

In [None]:
# Reduce the training frame to the four columns listed here
dobv_columns = [
    'YEAR_OF_COMPLAINT',
    'BOROUGH',
    'VIOLATION_TYPE',
    'NUMBER_OF_VIOLATIONS',
]
dobv_df = dobv_df[dobv_columns]
dobv_df.head()

In [None]:
# Reduce the cross-validation frame to the four columns listed here
dobv2_columns = [
    'YEAR_OF_COMPLAINT',
    'BOROUGH',
    'VIOLATION_TYPE',
    'NUMBER_OF_VIOLATIONS',
]
dobv2_df = dobv2_df[dobv2_columns]
dobv2_df.head()

### Housing Maintenance Code Violations

In [None]:
# Display the Housing Maintenance Code Violations frame as loaded from the CSV
codev_df

In [None]:
# Print the column names, dtypes and non-null counts of the code violations frame
codev_df.info()

In [None]:
# Clean-up date column

# Convert NOVIssuedDate column to datetime (currently disabled, so this cell only previews the frame)
# NOTE(review): the format string below is the one used for the fire dispatch data; confirm it matches NOVIssuedDate before enabling
# codev_df['NOVIssuedDate'] = codev_df['NOVIssuedDate'].apply(lambda x: dt.datetime.strptime(x,'%m/%d/%Y %I:%M:%S %p'))
codev_df.head()


### Orders to Repair/Vacate

In [None]:
# Display the Orders to Repair/Vacate frame as loaded from the CSV
vacate_df

## Mapping alarm boxes

## Data Wrangling