# Assessing fire risk factors in NYC

## Background
Using data provided by NYC OpenData, this notebook walks through the steps of analyzing fire-related incidents and some possible contributing factors in New York City.

## Import Libraries

In [1]:
# Data analysis and visualization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt

# Interactive maps
import folium
from folium.plugins import HeatMap

# Machine Learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline

## Load and describe data

Note: Data was filtered on the NYC OpenData site to only include incident classification groups that were fire-related (Structural and NonStructural Fires) prior to export.

In [2]:
# Connection to azure database 
# import pandas as pd, pyodbc
# server = 'finalprojectdata.database.windows.net'
# database = 'v2-project-data'
# username = 'finalproject1_pmprybylski'
# password = 'firedispatch1!'
# driver= '{ODBC Driver 17 for SQL Server}'
# # con_string = 'DRIVER='+driver+';SERVER='+server+';PORT=1433;DATABASE='+database+';UID='+username+';PWD='+ password
# # con_string = 'DRIVER={SQL Server};SERVER='+ <server> +';DATABASE=' + <database>
# # cnxn = pyodbc.connect(con_string)
# cnxn = pyodbc.connect(
#     'DRIVER={ODBC Driver 17 for SQL Server};'
#     'SERVER=finalprojectdata.database.windows.net;'
#     'PORT=1433;'
#     'DATABASE=v2-project-data;'
#     'UID=finalproject1_pmprybylski;'
#     'PWD=firedispatch1!;'
# )
# query = """
# SELECT TOP 3 * FROM cleaned_fire_dispatch_data
# """
# result_port_map = pd.read_sql(query, cnxn)
# result_port_map

In [3]:
# Load the data into Python

# Fire Incident Dispatch
alarms_df = pd.read_csv('../data/raw/In-Service_Alarm_Box_Locations.csv')
dispatch_df = pd.read_csv('../data/raw/Fire_Incident_Dispatch_Data.csv')
fire_counts_df = pd.read_csv('../data/raw/Fire_Counts.csv')

# NYPD Complaints
nypd_df = pd.read_csv('../data/raw/NYPD_Complaint_18-21.csv')

# Dept of Buildings/Environ Control Board Violations
DOB18_df = pd.read_csv('../data/raw/DOB_ECB_Violations_18.csv')
DOB19_df = pd.read_csv('../data/raw/DOB_ECB_Violations_19.csv')
DOB20_df = pd.read_csv('../data/raw/DOB_ECB_Violations_20.csv')
DOB21_df = pd.read_csv('../data/raw/DOB_ECB_Violations_21.csv')

# Housing Maintenance Code Violations
codev_df = pd.read_csv('../data/raw/Housing_Maintenance_Code_Violations_18-21.csv')

# Orders to repair/vacate
vacate_df = pd.read_csv('../data/raw/Order_to_Repair_Vacate_18-21.csv')

## ELT

### Fire Incident Dispatches

In [4]:
# Join Fire Dispatch files
fires_df = pd.merge(left=alarms_df, right=dispatch_df, left_on='LOCATION', right_on='ALARM_BOX_LOCATION')
fires_df.head()

Unnamed: 0,BOROBOX,BOX_TYPE,LOCATION,ZIP,BOROUGH,COMMUNITYDISTICT,CITYCOUNCIL,LATITUDE,LONGITUDE,Location Point,...,FIRST_ACTIVATION_DATETIME,FIRST_ON_SCENE_DATETIME,INCIDENT_CLOSE_DATETIME,VALID_DISPATCH_RSPNS_TIME_INDC,VALID_INCIDENT_RSPNS_TIME_INDC,INCIDENT_RESPONSE_SECONDS_QY,INCIDENT_TRAVEL_TM_SECONDS_QY,ENGINES_ASSIGNED_QUANTITY,LADDERS_ASSIGNED_QUANTITY,OTHER_UNITS_ASSIGNED_QUANTITY
0,B2653,ERS,3 AVE & 65 ST,11220.0,Brooklyn,BK07,38.0,40.63932,-74.023549,POINT (-74.02354939 40.63932033),...,02/06/2018 03:43:00 AM,02/06/2018 03:46:35 AM,02/06/2018 04:48:21 AM,N,Y,363,357,2,2,0
1,B2653,ERS,3 AVE & 65 ST,11220.0,Brooklyn,BK07,38.0,40.63932,-74.023549,POINT (-74.02354939 40.63932033),...,07/13/2018 01:56:41 PM,07/13/2018 01:59:36 PM,07/13/2018 02:09:18 PM,N,Y,227,194,3,2,1
2,B2653,ERS,3 AVE & 65 ST,11220.0,Brooklyn,BK07,38.0,40.63932,-74.023549,POINT (-74.02354939 40.63932033),...,08/06/2018 06:33:22 AM,08/06/2018 06:35:39 AM,08/06/2018 06:54:41 AM,N,Y,191,155,3,2,1
3,B2653,ERS,3 AVE & 65 ST,11220.0,Brooklyn,BK07,38.0,40.63932,-74.023549,POINT (-74.02354939 40.63932033),...,09/13/2018 06:28:56 PM,09/13/2018 06:30:12 PM,09/13/2018 07:41:08 PM,N,Y,160,95,4,2,1
4,B2653,ERS,3 AVE & 65 ST,11220.0,Brooklyn,BK07,38.0,40.63932,-74.023549,POINT (-74.02354939 40.63932033),...,09/16/2018 09:52:58 AM,09/16/2018 09:54:42 AM,09/16/2018 10:16:05 AM,N,Y,171,121,3,2,1


In [5]:
# Remove unnecessary columns
fires_df = fires_df[['STARFIRE_INCIDENT_ID',
               'INCIDENT_DATETIME',
               'ALARM_BOX_BOROUGH',
               'BOROBOX',
               'ALARM_BOX_LOCATION',
               'LATITUDE',
               'LONGITUDE',
               'INCIDENT_BOROUGH',
               'ZIPCODE',
               'INCIDENT_CLASSIFICATION',
               'INCIDENT_CLASSIFICATION_GROUP',
               'DISPATCH_RESPONSE_SECONDS_QY',
               'INCIDENT_RESPONSE_SECONDS_QY',
               'INCIDENT_TRAVEL_TM_SECONDS_QY',
               'ENGINES_ASSIGNED_QUANTITY',
               'LADDERS_ASSIGNED_QUANTITY',
               'OTHER_UNITS_ASSIGNED_QUANTITY',]]
fires_df.head()

Unnamed: 0,STARFIRE_INCIDENT_ID,INCIDENT_DATETIME,ALARM_BOX_BOROUGH,BOROBOX,ALARM_BOX_LOCATION,LATITUDE,LONGITUDE,INCIDENT_BOROUGH,ZIPCODE,INCIDENT_CLASSIFICATION,INCIDENT_CLASSIFICATION_GROUP,DISPATCH_RESPONSE_SECONDS_QY,INCIDENT_RESPONSE_SECONDS_QY,INCIDENT_TRAVEL_TM_SECONDS_QY,ENGINES_ASSIGNED_QUANTITY,LADDERS_ASSIGNED_QUANTITY,OTHER_UNITS_ASSIGNED_QUANTITY
0,1803726530140110,02/06/2018 03:40:32 AM,BROOKLYN,B2653,3 AVE & 65 ST,40.63932,-74.023549,BROOKLYN,11220.0,Automobile Fire,NonStructural Fires,6,363,357,2,2,0
1,1819426530140570,07/13/2018 01:55:49 PM,BROOKLYN,B2653,3 AVE & 65 ST,40.63932,-74.023549,BROOKLYN,11220.0,Multiple Dwelling 'A' - Other fire,Structural Fires,33,227,194,3,2,1
2,1821826530140150,08/06/2018 06:32:28 AM,BROOKLYN,B2653,3 AVE & 65 ST,40.63932,-74.023549,BROOKLYN,11220.0,Demolition Debris or Rubbish Fire,NonStructural Fires,36,191,155,3,2,1
3,1825626530241090,09/13/2018 06:27:32 PM,BROOKLYN,B2653,3 AVE & 65 ST,40.63932,-74.023549,BROOKLYN,11220.0,Multiple Dwelling 'A' - Other fire,Structural Fires,65,160,95,4,2,1
4,1825926530240410,09/16/2018 09:51:51 AM,BROOKLYN,B2653,3 AVE & 65 ST,40.63932,-74.023549,BROOKLYN,11220.0,Multiple Dwelling 'A' - Food on the stove fire,Structural Fires,50,171,121,3,2,1


In [6]:
# Export cleaned data to csv for visualization use
fires_df.to_csv('../data/processed/cleaned_fire_dispatch_data.csv', index=False)

In [7]:
fire_counts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   INCIDENT_DATETIME     205 non-null    object
 1   INCIDENT_BOROUGH      205 non-null    object
 2   STARFIRE_INCIDENT_ID  205 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 4.9+ KB


In [8]:
# Convert INCIDENT_DATETIME column to datetime
fire_counts_df['INCIDENT_DATETIME'] = fire_counts_df['INCIDENT_DATETIME'].apply(lambda x: dt.datetime.strptime(x,'%m/%d/%Y %I:%M:%S %p'))
fire_counts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   INCIDENT_DATETIME     205 non-null    datetime64[ns]
 1   INCIDENT_BOROUGH      205 non-null    object        
 2   STARFIRE_INCIDENT_ID  205 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 4.9+ KB


In [9]:
# Add a column that splits off the year
fire_counts_df['YEAR'] = fire_counts_df['INCIDENT_DATETIME'].dt.year

# Move that column to the beginning of the frame
year = fire_counts_df['YEAR']
fire_counts_df.drop(labels=['YEAR'], axis=1, inplace=True)
fire_counts_df.insert(0,'YEAR', year)
fire_counts_df.head()

Unnamed: 0,YEAR,INCIDENT_DATETIME,INCIDENT_BOROUGH,STARFIRE_INCIDENT_ID
0,2018,2018-01-01,BRONX,12016
1,2018,2018-01-01,BROOKLYN,17129
2,2018,2018-01-01,MANHATTAN,15650
3,2018,2018-01-01,QUEENS,11670
4,2018,2018-01-01,RICHMOND / STATEN ISLAND,2836


In [10]:
# Split into training(18-19) and cross-validation(20-21) dataframes
fires1 = fire_counts_df.loc[(fire_counts_df.YEAR == 2018)|(fire_counts_df.YEAR == 2019)]
fires1

Unnamed: 0,YEAR,INCIDENT_DATETIME,INCIDENT_BOROUGH,STARFIRE_INCIDENT_ID
0,2018,2018-01-01,BRONX,12016
1,2018,2018-01-01,BROOKLYN,17129
2,2018,2018-01-01,MANHATTAN,15650
3,2018,2018-01-01,QUEENS,11670
4,2018,2018-01-01,RICHMOND / STATEN ISLAND,2836
...,...,...,...,...
115,2019,2019-12-01,BRONX,10181
116,2019,2019-12-01,BROOKLYN,13585
117,2019,2019-12-01,MANHATTAN,12750
118,2019,2019-12-01,QUEENS,9230


In [11]:
fires2 = fire_counts_df.loc[(fire_counts_df.YEAR == 2020)|(fire_counts_df.YEAR == 2021)]
fires2

Unnamed: 0,YEAR,INCIDENT_DATETIME,INCIDENT_BOROUGH,STARFIRE_INCIDENT_ID
120,2020,2020-01-01,BRONX,9560
121,2020,2020-01-01,BROOKLYN,12903
122,2020,2020-01-01,MANHATTAN,12192
123,2020,2020-01-01,QUEENS,8964
124,2020,2020-01-01,RICHMOND / STATEN ISLAND,2098
...,...,...,...,...
200,2021,2021-05-01,BRONX,1856
201,2021,2021-05-01,BROOKLYN,2220
202,2021,2021-05-01,MANHATTAN,2064
203,2021,2021-05-01,QUEENS,1510


### NYPD Complaints

In [12]:
nypd_df

Unnamed: 0,CMPLNT_FR_DT,BORO_NM,CMPLNT_NUM
0,01/01/2018 12:00:00 AM,BRONX,8337
1,01/01/2018 12:00:00 AM,BROOKLYN,11193
2,01/01/2018 12:00:00 AM,MANHATTAN,9614
3,01/01/2018 12:00:00 AM,QUEENS,7450
4,01/01/2018 12:00:00 AM,STATEN ISLAND,1677
...,...,...,...
211,12/01/2020 12:00:00 AM,BROOKLYN,8631
212,12/01/2020 12:00:00 AM,MANHATTAN,7200
213,12/01/2020 12:00:00 AM,QUEENS,6697
214,12/01/2020 12:00:00 AM,STATEN ISLAND,1216


In [13]:
# Rename columns for easier useage
nypd_df.rename(columns={
    'CMPLNT_FR_DT':'COMPLAINT_DATE',
    'BORO_NM':'BOROUGH',
    'CMPLNT_NUM':'NUMBER_OF_COMPLAINTS'
}, inplace=True)
nypd_df.head()

Unnamed: 0,COMPLAINT_DATE,BOROUGH,NUMBER_OF_COMPLAINTS
0,01/01/2018 12:00:00 AM,BRONX,8337
1,01/01/2018 12:00:00 AM,BROOKLYN,11193
2,01/01/2018 12:00:00 AM,MANHATTAN,9614
3,01/01/2018 12:00:00 AM,QUEENS,7450
4,01/01/2018 12:00:00 AM,STATEN ISLAND,1677


In [14]:
# Export cleaned data to csv for visualization use
nypd_df.to_csv('../data/processed/cleaned_nypd_complaint_data.csv', index=False)

In [15]:
nypd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216 entries, 0 to 215
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   COMPLAINT_DATE        216 non-null    object
 1   BOROUGH               180 non-null    object
 2   NUMBER_OF_COMPLAINTS  216 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 5.2+ KB


In [16]:
# Convert COMPLAINT_DATE column to datetime
nypd_df['COMPLAINT_DATE'] = nypd_df['COMPLAINT_DATE'].apply(lambda x: dt.datetime.strptime(x,'%m/%d/%Y %I:%M:%S %p'))
nypd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216 entries, 0 to 215
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   COMPLAINT_DATE        216 non-null    datetime64[ns]
 1   BOROUGH               180 non-null    object        
 2   NUMBER_OF_COMPLAINTS  216 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 5.2+ KB


In [17]:
# Add a column that splits off the year
nypd_df['YEAR'] = nypd_df['COMPLAINT_DATE'].dt.year

# Move that column to the beginning of the frame
year = nypd_df['YEAR']
nypd_df.drop(labels=['YEAR'], axis=1, inplace=True)
nypd_df.insert(0,'YEAR', year)
nypd_df.head()

Unnamed: 0,YEAR,COMPLAINT_DATE,BOROUGH,NUMBER_OF_COMPLAINTS
0,2018,2018-01-01,BRONX,8337
1,2018,2018-01-01,BROOKLYN,11193
2,2018,2018-01-01,MANHATTAN,9614
3,2018,2018-01-01,QUEENS,7450
4,2018,2018-01-01,STATEN ISLAND,1677


In [18]:
# Split into training(18-19) and cross-validation(20-21) dataframes
nypd1 = nypd_df.loc[(nypd_df.YEAR == 2018)|(nypd_df.YEAR == 2019)]
nypd1

Unnamed: 0,YEAR,COMPLAINT_DATE,BOROUGH,NUMBER_OF_COMPLAINTS
0,2018,2018-01-01,BRONX,8337
1,2018,2018-01-01,BROOKLYN,11193
2,2018,2018-01-01,MANHATTAN,9614
3,2018,2018-01-01,QUEENS,7450
4,2018,2018-01-01,STATEN ISLAND,1677
...,...,...,...,...
139,2019,2019-12-01,BROOKLYN,10331
140,2019,2019-12-01,MANHATTAN,9292
141,2019,2019-12-01,QUEENS,7481
142,2019,2019-12-01,STATEN ISLAND,1477


In [19]:
nypd2 = nypd_df.loc[(nypd_df.YEAR == 2020) | (nypd_df.YEAR == 2021)]
nypd2

Unnamed: 0,YEAR,COMPLAINT_DATE,BOROUGH,NUMBER_OF_COMPLAINTS
144,2020,2020-01-01,BRONX,8352
145,2020,2020-01-01,BROOKLYN,10810
146,2020,2020-01-01,MANHATTAN,9812
147,2020,2020-01-01,QUEENS,7821
148,2020,2020-01-01,STATEN ISLAND,1608
...,...,...,...,...
211,2020,2020-12-01,BROOKLYN,8631
212,2020,2020-12-01,MANHATTAN,7200
213,2020,2020-12-01,QUEENS,6697
214,2020,2020-12-01,STATEN ISLAND,1216


### Dept of Buildings/Environmental Control Board Violations

In [20]:
DOB18_df.head()

Unnamed: 0,BORO,VIOLATION_TYPE,DOB_VIOLATION_NUMBER
0,1,Boilers,342
1,1,Construction,14250
2,1,Cranes and Derricks,164
3,1,Elevators,2271
4,1,Local Law,1619


In [21]:
DOB19_df.head()

Unnamed: 0,BORO,VIOLATION_TYPE,DOB_VIOLATION_NUMBER
0,1,Boilers,78
1,1,Construction,14808
2,1,Cranes and Derricks,216
3,1,Elevators,1636
4,1,Local Law,68


In [22]:
DOB20_df.head()

Unnamed: 0,BORO,VIOLATION_TYPE,DOB_VIOLATION_NUMBER
0,1,Boilers,16
1,1,Construction,10053
2,1,Cranes and Derricks,127
3,1,Elevators,271
4,1,Local Law,570


In [23]:
DOB21_df.head()

Unnamed: 0,BORO,VIOLATION_TYPE,DOB_VIOLATION_NUMBER
0,4,Quality of Life,21
1,2,Site Safety,22
2,3,Zoning,32
3,5,Unknown,91
4,3,Boilers,5


In [24]:
# Add columns to designate years for each df
DOB18_df.insert(0, 'YEAR', '2018')
DOB19_df.insert(0, 'YEAR', '2019')
DOB20_df.insert(0, 'YEAR', '2020')
DOB21_df.insert(0, 'YEAR', '2021')

In [25]:
DOB21_df.head()

Unnamed: 0,YEAR,BORO,VIOLATION_TYPE,DOB_VIOLATION_NUMBER
0,2021,4,Quality of Life,21
1,2021,2,Site Safety,22
2,2021,3,Zoning,32
3,2021,5,Unknown,91
4,2021,3,Boilers,5


In [26]:
# Replace Borough Number with names based on file schema 
# 1 = Manhattan
# 2 = Bronx
# 3 = Brooklyn
# 4 = Queens
# 5 = Staten Island

def f(x):
    if x['BORO'] == 1: return 'MANHATTAN'
    elif x['BORO'] == 2: return 'BRONX'
    elif x['BORO'] == 3: return 'BROOKLYN'
    elif x['BORO'] == 4: return 'QUEENS'
    elif x['BORO'] == 5: return 'STATEN ISLAND'
    else: return ''

DOB18_df['BOROUGH'] = DOB18_df.apply(f, axis=1)
DOB19_df['BOROUGH'] = DOB19_df.apply(f, axis=1) 
DOB20_df['BOROUGH'] = DOB20_df.apply(f, axis=1)
DOB21_df['BOROUGH'] = DOB21_df.apply(f, axis=1)


In [27]:
# DOB18_df.head()
# DOB19_df.head()
# DOB20_df.head()
DOB21_df.head()

Unnamed: 0,YEAR,BORO,VIOLATION_TYPE,DOB_VIOLATION_NUMBER,BOROUGH
0,2021,4,Quality of Life,21,QUEENS
1,2021,2,Site Safety,22,BRONX
2,2021,3,Zoning,32,BROOKLYN
3,2021,5,Unknown,91,STATEN ISLAND
4,2021,3,Boilers,5,BROOKLYN


In [28]:
# Combine the 2018 and 2019 dataframes for training
dobv1= DOB18_df.append(DOB19_df)
dobv1

Unnamed: 0,YEAR,BORO,VIOLATION_TYPE,DOB_VIOLATION_NUMBER,BOROUGH
0,2018,1,Boilers,342,MANHATTAN
1,2018,1,Construction,14250,MANHATTAN
2,2018,1,Cranes and Derricks,164,MANHATTAN
3,2018,1,Elevators,2271,MANHATTAN
4,2018,1,Local Law,1619,MANHATTAN
...,...,...,...,...,...
51,2019,5,Quality of Life,6,STATEN ISLAND
52,2019,5,Signs,26,STATEN ISLAND
53,2019,5,Site Safety,11,STATEN ISLAND
54,2019,5,Unknown,134,STATEN ISLAND


In [29]:
# Combine the 2020 and 2021 dataframes for cross-validation of predictive algorithm
dobv2 = DOB20_df.append(DOB21_df)
dobv2

Unnamed: 0,YEAR,BORO,VIOLATION_TYPE,DOB_VIOLATION_NUMBER,BOROUGH
0,2020,1,Boilers,16,MANHATTAN
1,2020,1,Construction,10053,MANHATTAN
2,2020,1,Cranes and Derricks,127,MANHATTAN
3,2020,1,Elevators,271,MANHATTAN
4,2020,1,Local Law,570,MANHATTAN
...,...,...,...,...,...
43,2021,1,Boilers,2,MANHATTAN
44,2021,4,Boilers,0,QUEENS
45,2021,5,Site Safety,9,STATEN ISLAND
46,2021,4,Zoning,28,QUEENS


In [30]:
# Merge data for clean data file
dobv_df = dobv1.append(dobv2)

In [31]:
# Export cleaned data to csv for visualization use
dobv_df.to_csv('../data/processed/cleaned_dob_violations_data.csv', index=False)

In [38]:
# Rename columns for easier usage
dobv1.rename(columns={
    'DOB_VIOLATION_NUMBER': 'DOB/ECB_VIOLATION_COUNT'
    }, inplace=True)
dobv1.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,YEAR,BOROUGH,ECB_VIOLATION_COUNT
0,2018,MANHATTAN,342
1,2018,MANHATTAN,14250
2,2018,MANHATTAN,164
3,2018,MANHATTAN,2271
4,2018,MANHATTAN,1619


In [35]:
dobv2.rename(columns={
    'DOB_VIOLATION_NUMBER': 'DOB/ECB_VIOLATION_COUNT'
    }, inplace=True)
dobv2.head()

Unnamed: 0,YEAR,BORO,VIOLATION_TYPE,ECB_VIOLATION_COUNT,BOROUGH
0,2020,1,Boilers,16,MANHATTAN
1,2020,1,Construction,10053,MANHATTAN
2,2020,1,Cranes and Derricks,127,MANHATTAN
3,2020,1,Elevators,271,MANHATTAN
4,2020,1,Local Law,570,MANHATTAN


In [36]:
# Drop unnecessary columns
dobv1 = dobv1[['YEAR',
               'BOROUGH',
               'ECB_VIOLATION_COUNT'
]]
dobv1.head()

Unnamed: 0,YEAR,BOROUGH,ECB_VIOLATION_COUNT
0,2018,MANHATTAN,342
1,2018,MANHATTAN,14250
2,2018,MANHATTAN,164
3,2018,MANHATTAN,2271
4,2018,MANHATTAN,1619


In [37]:
dobv2 = dobv2[['YEAR',
                'BOROUGH',
                'ECB_VIOLATION_COUNT'
]]
dobv2.head()

Unnamed: 0,YEAR,BOROUGH,ECB_VIOLATION_COUNT
0,2020,MANHATTAN,16
1,2020,MANHATTAN,10053
2,2020,MANHATTAN,127
3,2020,MANHATTAN,271
4,2020,MANHATTAN,570


In [43]:
# use groupby to "roll-up" count data
dobv1 = dobv1.groupby(['YEAR', 'BOROUGH']).agg({'ECB_VIOLATION_COUNT':'sum'}).reset_index()
dobv1

Unnamed: 0,YEAR,BOROUGH,ECB_VIOLATION_COUNT
0,2018,BRONX,12792
1,2018,BROOKLYN,29570
2,2018,MANHATTAN,20536
3,2018,QUEENS,21974
4,2018,STATEN ISLAND,2699
5,2019,BRONX,11786
6,2019,BROOKLYN,30200
7,2019,MANHATTAN,19796
8,2019,QUEENS,17904
9,2019,STATEN ISLAND,2506


In [44]:
dobv2 = dobv2.groupby(['YEAR', 'BOROUGH']).agg({'ECB_VIOLATION_COUNT':'sum'}).reset_index()
dobv2

Unnamed: 0,YEAR,BOROUGH,ECB_VIOLATION_COUNT
0,2020,BRONX,7248
1,2020,BROOKLYN,18660
2,2020,MANHATTAN,12626
3,2020,QUEENS,10372
4,2020,STATEN ISLAND,1317
5,2021,BRONX,2530
6,2021,BROOKLYN,6409
7,2021,MANHATTAN,4581
8,2021,QUEENS,4174
9,2021,STATEN ISLAND,769


### Housing Maintenance Code Violations

In [45]:
codev_df

Unnamed: 0,NOVIssuedDate,ViolationID,Borough
0,01/01/2018 12:00:00 AM,13430,BRONX
1,01/01/2018 12:00:00 AM,17188,BROOKLYN
2,01/01/2018 12:00:00 AM,8279,MANHATTAN
3,01/01/2018 12:00:00 AM,4191,QUEENS
4,01/01/2018 12:00:00 AM,545,STATEN ISLAND
...,...,...,...
200,05/01/2021 12:00:00 AM,8576,BRONX
201,05/01/2021 12:00:00 AM,13015,BROOKLYN
202,05/01/2021 12:00:00 AM,6341,MANHATTAN
203,05/01/2021 12:00:00 AM,2930,QUEENS


In [46]:
# Rename columns
codev_df.rename(columns={
    'NOVIssuedDate': 'VIOLATION_ISSUE_DATE',
    'ViolationID': 'NUMBER_OF_VIOLATIONS',
    'Borough': 'BOROUGH'
}, inplace=True)
codev_df.head()

Unnamed: 0,VIOLATION_ISSUE_DATE,NUMBER_OF_VIOLATIONS,BOROUGH
0,01/01/2018 12:00:00 AM,13430,BRONX
1,01/01/2018 12:00:00 AM,17188,BROOKLYN
2,01/01/2018 12:00:00 AM,8279,MANHATTAN
3,01/01/2018 12:00:00 AM,4191,QUEENS
4,01/01/2018 12:00:00 AM,545,STATEN ISLAND


In [47]:
# Export cleaned data to csv for visualization use
codev_df.to_csv('../data/processed/cleaned_housing_code_violation_data.csv', index=False)

In [48]:
codev_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   VIOLATION_ISSUE_DATE  205 non-null    object
 1   NUMBER_OF_VIOLATIONS  205 non-null    int64 
 2   BOROUGH               205 non-null    object
dtypes: int64(1), object(2)
memory usage: 4.9+ KB


In [49]:
# Convert VIOLATION_ISSUE_DATE column to datetime
codev_df['VIOLATION_ISSUE_DATE'] = codev_df['VIOLATION_ISSUE_DATE'].apply(lambda x: dt.datetime.strptime(x,'%m/%d/%Y %I:%M:%S %p'))
codev_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   VIOLATION_ISSUE_DATE  205 non-null    datetime64[ns]
 1   NUMBER_OF_VIOLATIONS  205 non-null    int64         
 2   BOROUGH               205 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 4.9+ KB


In [50]:
# Add a column that splits off the year
codev_df['YEAR'] = codev_df['VIOLATION_ISSUE_DATE'].dt.year

# Move that column to the beginning of the frame
year = codev_df['YEAR']
codev_df.drop(labels=['YEAR'], axis=1, inplace=True)
codev_df.insert(0,'YEAR', year)
codev_df.head()

Unnamed: 0,YEAR,VIOLATION_ISSUE_DATE,NUMBER_OF_VIOLATIONS,BOROUGH
0,2018,2018-01-01,13430,BRONX
1,2018,2018-01-01,17188,BROOKLYN
2,2018,2018-01-01,8279,MANHATTAN
3,2018,2018-01-01,4191,QUEENS
4,2018,2018-01-01,545,STATEN ISLAND


In [51]:
# Split into training(18-19) and cross-validation(20-21) dataframes
codev1 = codev_df.loc[(codev_df.YEAR == 2018)|(codev_df.YEAR == 2019)]
codev1

Unnamed: 0,YEAR,VIOLATION_ISSUE_DATE,NUMBER_OF_VIOLATIONS,BOROUGH
0,2018,2018-01-01,13430,BRONX
1,2018,2018-01-01,17188,BROOKLYN
2,2018,2018-01-01,8279,MANHATTAN
3,2018,2018-01-01,4191,QUEENS
4,2018,2018-01-01,545,STATEN ISLAND
...,...,...,...,...
115,2019,2019-12-01,14352,BRONX
116,2019,2019-12-01,18868,BROOKLYN
117,2019,2019-12-01,10057,MANHATTAN
118,2019,2019-12-01,4658,QUEENS


In [52]:
codev2 = codev_df.loc[(codev_df.YEAR == 2020) | (codev_df.YEAR == 2021)]
codev2

Unnamed: 0,YEAR,VIOLATION_ISSUE_DATE,NUMBER_OF_VIOLATIONS,BOROUGH
120,2020,2020-01-01,13876,BRONX
121,2020,2020-01-01,20237,BROOKLYN
122,2020,2020-01-01,10527,MANHATTAN
123,2020,2020-01-01,4635,QUEENS
124,2020,2020-01-01,521,STATEN ISLAND
...,...,...,...,...
200,2021,2021-05-01,8576,BRONX
201,2021,2021-05-01,13015,BROOKLYN
202,2021,2021-05-01,6341,MANHATTAN
203,2021,2021-05-01,2930,QUEENS


### Orders to Vacate

In [53]:
vacate_df

Unnamed: 0,VACATE EFFECTIVE DATE,BOROUGH,VACATE ORDER NUMBER
0,01/01/2018 12:00:00 AM,BK,30
1,01/01/2018 12:00:00 AM,BX,25
2,01/01/2018 12:00:00 AM,MN,13
3,01/01/2018 12:00:00 AM,QN,14
4,01/01/2018 12:00:00 AM,SI,7
...,...,...,...
190,05/01/2021 12:00:00 AM,BK,8
191,05/01/2021 12:00:00 AM,BX,6
192,05/01/2021 12:00:00 AM,MN,2
193,05/01/2021 12:00:00 AM,QN,6


In [54]:
# Rename columns
vacate_df.rename(columns={
    'VACATE EFFECTIVE DATE': 'VACATE_DATE',
    'BOROUGH': 'BORO',
    'VACATE ORDER NUMBER': 'NUMBER_OF_VACATE_ORDERS'
}, inplace=True)
vacate_df.head()

Unnamed: 0,VACATE_DATE,BORO,NUMBER_OF_VACATE_ORDERS
0,01/01/2018 12:00:00 AM,BK,30
1,01/01/2018 12:00:00 AM,BX,25
2,01/01/2018 12:00:00 AM,MN,13
3,01/01/2018 12:00:00 AM,QN,14
4,01/01/2018 12:00:00 AM,SI,7


In [55]:
# Replace Borough abbreviations with names based on file schema 
# MN = Manhattan
# BX = Bronx
# BK = Brooklyn
# QN = Queens
# SI = Staten Island

def f(x):
    if x['BORO'] == 'MN': return 'MANHATTAN'
    elif x['BORO'] == 'BX': return 'BRONX'
    elif x['BORO'] == 'BK': return 'BROOKLYN'
    elif x['BORO'] == 'QN': return 'QUEENS'
    elif x['BORO'] == 'SI': return 'STATEN ISLAND'
    else: return ''

vacate_df['BOROUGH'] = vacate_df.apply(f, axis=1)
vacate_df.head()

Unnamed: 0,VACATE_DATE,BORO,NUMBER_OF_VACATE_ORDERS,BOROUGH
0,01/01/2018 12:00:00 AM,BK,30,BROOKLYN
1,01/01/2018 12:00:00 AM,BX,25,BRONX
2,01/01/2018 12:00:00 AM,MN,13,MANHATTAN
3,01/01/2018 12:00:00 AM,QN,14,QUEENS
4,01/01/2018 12:00:00 AM,SI,7,STATEN ISLAND


In [56]:
# Drop unneeded columns
vacate_df = vacate_df[['VACATE_DATE',
                       'BOROUGH',
                       'NUMBER_OF_VACATE_ORDERS']]
vacate_df.head()

Unnamed: 0,VACATE_DATE,BOROUGH,NUMBER_OF_VACATE_ORDERS
0,01/01/2018 12:00:00 AM,BROOKLYN,30
1,01/01/2018 12:00:00 AM,BRONX,25
2,01/01/2018 12:00:00 AM,MANHATTAN,13
3,01/01/2018 12:00:00 AM,QUEENS,14
4,01/01/2018 12:00:00 AM,STATEN ISLAND,7


In [57]:
# Export cleaned data to csv for visualization use
vacate_df.to_csv('../data/processed/cleaned_order_to_vacate_data.csv', index=False)

In [58]:
vacate_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 3 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   VACATE_DATE              195 non-null    object
 1   BOROUGH                  195 non-null    object
 2   NUMBER_OF_VACATE_ORDERS  195 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 4.7+ KB


In [59]:
# Convert VACATE_DATE column to datetime
vacate_df['VACATE_DATE'] = vacate_df['VACATE_DATE'].apply(lambda x: dt.datetime.strptime(x,'%m/%d/%Y %I:%M:%S %p'))
vacate_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 3 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   VACATE_DATE              195 non-null    datetime64[ns]
 1   BOROUGH                  195 non-null    object        
 2   NUMBER_OF_VACATE_ORDERS  195 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 4.7+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [60]:
# Add a column that splits off the year
vacate_df['YEAR'] = vacate_df['VACATE_DATE'].dt.year

# Move that column to the beginning of the frame
year = vacate_df['YEAR']
vacate_df.drop(labels=['YEAR'], axis=1, inplace=True)
vacate_df.insert(0,'YEAR', year)
vacate_df.head()

Unnamed: 0,YEAR,VACATE_DATE,BOROUGH,NUMBER_OF_VACATE_ORDERS
0,2018,2018-01-01,BROOKLYN,30
1,2018,2018-01-01,BRONX,25
2,2018,2018-01-01,MANHATTAN,13
3,2018,2018-01-01,QUEENS,14
4,2018,2018-01-01,STATEN ISLAND,7


In [61]:
# Split into training(18-19) and cross-validation(20-21) dataframes
vacate1 = vacate_df.loc[(vacate_df.YEAR == 2018)|(vacate_df.YEAR == 2019)]
vacate1

Unnamed: 0,YEAR,VACATE_DATE,BOROUGH,NUMBER_OF_VACATE_ORDERS
0,2018,2018-01-01,BROOKLYN,30
1,2018,2018-01-01,BRONX,25
2,2018,2018-01-01,MANHATTAN,13
3,2018,2018-01-01,QUEENS,14
4,2018,2018-01-01,STATEN ISLAND,7
...,...,...,...,...
110,2019,2019-11-01,STATEN ISLAND,1
111,2019,2019-12-01,BROOKLYN,13
112,2019,2019-12-01,BRONX,10
113,2019,2019-12-01,MANHATTAN,3


In [62]:
vacate2 = vacate_df.loc[(vacate_df.YEAR == 2020) | (vacate_df.YEAR == 2021)]
vacate2

Unnamed: 0,YEAR,VACATE_DATE,BOROUGH,NUMBER_OF_VACATE_ORDERS
115,2020,2020-01-01,BROOKLYN,18
116,2020,2020-01-01,BRONX,16
117,2020,2020-01-01,MANHATTAN,8
118,2020,2020-01-01,QUEENS,8
119,2020,2020-02-01,BROOKLYN,22
...,...,...,...,...
190,2021,2021-05-01,BROOKLYN,8
191,2021,2021-05-01,BRONX,6
192,2021,2021-05-01,MANHATTAN,2
193,2021,2021-05-01,QUEENS,6


## Standardizing Datasets

In [63]:
# Create dataframe to load all variables into
data1 = pd.DataFrame()
data2 = pd.DataFrame()

In [64]:
# Fires 1 dataset
fires1.rename(columns={
    'INCIDENT_DATETIME':'DATE',
    'INCIDENT_BOROUGH':'BOROUGH',
    'STARFIRE_INCIDENT_ID':'FIRE_COUNT'
},inplace=True)
fires1['BOROUGH'] = fires1['BOROUGH'].replace(['RICHMOND / STATEN ISLAND'],'STATEN ISLAND')
fires1['YEAR'] = fires1['YEAR'].apply(str)
fires1.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,YEAR,DATE,BOROUGH,FIRE_COUNT
0,2018,2018-01-01,BRONX,12016
1,2018,2018-01-01,BROOKLYN,17129
2,2018,2018-01-01,MANHATTAN,15650
3,2018,2018-01-01,QUEENS,11670
4,2018,2018-01-01,STATEN ISLAND,2836


In [65]:
# Fires 2 Dataset
fires2.rename(columns={
    'INCIDENT_DATETIME':'DATE',
    'INCIDENT_BOROUGH':'BOROUGH',
    'STARFIRE_INCIDENT_ID':'FIRE_COUNT'
},inplace=True)
fires2['BOROUGH'] = fires2['BOROUGH'].replace(['RICHMOND / STATEN ISLAND'],'STATEN ISLAND')
fires2['YEAR'] = fires2['YEAR'].apply(str)
fires2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,YEAR,DATE,BOROUGH,FIRE_COUNT
120,2020,2020-01-01,BRONX,9560
121,2020,2020-01-01,BROOKLYN,12903
122,2020,2020-01-01,MANHATTAN,12192
123,2020,2020-01-01,QUEENS,8964
124,2020,2020-01-01,STATEN ISLAND,2098


In [67]:
# DOB/ECB Violations 1 dataset
dobv1.rename(columns={
    'YEAR_OF_COMPLAINT':'YEAR',
    'ECB_VIOLATION_COUNT':'DOB/ECB_VIOLATION_COUNT'
},inplace=True)
dobv1['DATE'] = dobv1['YEAR'].map(lambda x: '2018-01-01' if '2018'in x else '2019-01-01' if '2019' else '' )
dobv1['DATE'] = dobv1['DATE'].apply(lambda x: dt.datetime.strptime(x,'%Y-%m-%d'))
dobv1.head()

Unnamed: 0,YEAR,BOROUGH,DOB/ECB_VIOLATION_COUNT,DATE
0,2018,BRONX,12792,2018-01-01
1,2018,BROOKLYN,29570,2018-01-01
2,2018,MANHATTAN,20536,2018-01-01
3,2018,QUEENS,21974,2018-01-01
4,2018,STATEN ISLAND,2699,2018-01-01


In [68]:
# DOB/ECB Violations 2 dataset
dobv2.rename(columns={
    'YEAR_OF_COMPLAINT':'YEAR',
    'ECB_VIOLATION_COUNT':'DOB/ECB_VIOLATION_COUNT'
},inplace=True)
dobv2['DATE'] = dobv2['YEAR'].map(lambda x: '2020-01-01' if '2020'in x else '2021-01-01' if '2021' else '' )
dobv2['DATE'] = dobv2['DATE'].apply(lambda x: dt.datetime.strptime(x,'%Y-%m-%d'))
dobv2.head()

Unnamed: 0,YEAR,BOROUGH,DOB/ECB_VIOLATION_COUNT,DATE
0,2020,BRONX,7248,2020-01-01
1,2020,BROOKLYN,18660,2020-01-01
2,2020,MANHATTAN,12626,2020-01-01
3,2020,QUEENS,10372,2020-01-01
4,2020,STATEN ISLAND,1317,2020-01-01


In [69]:
# Housing Code Violations 1 dataset
codev1.rename(columns={
    'VIOLATION_ISSUE_DATE':'DATE',
    'NUMBER_OF_VIOLATIONS':'HOUSING_CODE_VIOLATION_COUNT'
},inplace=True)
codev1['YEAR'] = codev1['YEAR'].apply(str)
codev1.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,YEAR,DATE,HOUSING_CODE_VIOLATION_COUNT,BOROUGH
0,2018,2018-01-01,13430,BRONX
1,2018,2018-01-01,17188,BROOKLYN
2,2018,2018-01-01,8279,MANHATTAN
3,2018,2018-01-01,4191,QUEENS
4,2018,2018-01-01,545,STATEN ISLAND


In [70]:
# Housing Code Violations 2 dataset
codev2.rename(columns={
    'VIOLATION_ISSUE_DATE':'DATE',
    'NUMBER_OF_VIOLATIONS':'HOUSING_CODE_VIOLATION_COUNT'
},inplace=True)
codev2['YEAR'] = codev2['YEAR'].apply(str)
codev2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,YEAR,DATE,HOUSING_CODE_VIOLATION_COUNT,BOROUGH
120,2020,2020-01-01,13876,BRONX
121,2020,2020-01-01,20237,BROOKLYN
122,2020,2020-01-01,10527,MANHATTAN
123,2020,2020-01-01,4635,QUEENS
124,2020,2020-01-01,521,STATEN ISLAND


In [71]:
# NYPD Complaints 1 dataset
nypd1.rename(columns={
    'COMPLAINT_DATE':'DATE',
    'NUMBER_OF_COMPLAINTS':'NYPD_COMPLAINT_COUNT',
},inplace=True)
nypd1['YEAR'] = nypd1['YEAR'].apply(str)
nypd1.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,YEAR,DATE,BOROUGH,NYPD_COMPLAINT_COUNT
0,2018,2018-01-01,BRONX,8337
1,2018,2018-01-01,BROOKLYN,11193
2,2018,2018-01-01,MANHATTAN,9614
3,2018,2018-01-01,QUEENS,7450
4,2018,2018-01-01,STATEN ISLAND,1677


In [72]:
# NYPD Complaints 2 dataset
nypd2.rename(columns={
    'COMPLAINT_DATE':'DATE',
    'NUMBER_OF_COMPLAINTS':'NYPD_COMPLAINT_COUNT',
},inplace=True)
nypd2['YEAR'] = nypd2['YEAR'].apply(str)
nypd2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,YEAR,DATE,BOROUGH,NYPD_COMPLAINT_COUNT
144,2020,2020-01-01,BRONX,8352
145,2020,2020-01-01,BROOKLYN,10810
146,2020,2020-01-01,MANHATTAN,9812
147,2020,2020-01-01,QUEENS,7821
148,2020,2020-01-01,STATEN ISLAND,1608


In [73]:
# Orders to Vacate 1 Dataset
vacate1.rename(columns={
    'VACATE_DATE': 'DATE',
    'NUMBER_OF_VACATE_ORDERS':'VACATE_ORDER_COUNT'
},inplace=True)
vacate1['YEAR'] = vacate1['YEAR'].apply(str)
vacate1.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,YEAR,DATE,BOROUGH,VACATE_ORDER_COUNT
0,2018,2018-01-01,BROOKLYN,30
1,2018,2018-01-01,BRONX,25
2,2018,2018-01-01,MANHATTAN,13
3,2018,2018-01-01,QUEENS,14
4,2018,2018-01-01,STATEN ISLAND,7


In [74]:
# Orders to Vacate 2 Dataset
vacate2.rename(columns={
    'VACATE_DATE': 'DATE',
    'NUMBER_OF_VACATE_ORDERS':'VACATE_ORDER_COUNT'
},inplace=True)
vacate2['YEAR'] = vacate2['YEAR'].apply(str)
vacate2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,YEAR,DATE,BOROUGH,VACATE_ORDER_COUNT
115,2020,2020-01-01,BROOKLYN,18
116,2020,2020-01-01,BRONX,16
117,2020,2020-01-01,MANHATTAN,8
118,2020,2020-01-01,QUEENS,8
119,2020,2020-02-01,BROOKLYN,22


In [78]:
# Merge all dataframes 
data1 = pd.merge(fires1, dobv1, on=['YEAR','DATE', 'BOROUGH'], how='outer')
data1 = pd.merge(data1, nypd1, on=['YEAR','DATE', 'BOROUGH'], how='outer')
data1 = pd.merge(data1, codev1, on=['YEAR','DATE', 'BOROUGH'], how='outer')
data1 = pd.merge(data1, vacate1, on=['YEAR','DATE', 'BOROUGH'], how='outer')
data1.head()


Unnamed: 0,YEAR,DATE,BOROUGH,FIRE_COUNT,DOB/ECB_VIOLATION_COUNT,NYPD_COMPLAINT_COUNT,HOUSING_CODE_VIOLATION_COUNT,VACATE_ORDER_COUNT
0,2018,2018-01-01,BRONX,12016.0,12792.0,8337,13430.0,25.0
1,2018,2018-01-01,BROOKLYN,17129.0,29570.0,11193,17188.0,30.0
2,2018,2018-01-01,MANHATTAN,15650.0,20536.0,9614,8279.0,13.0
3,2018,2018-01-01,QUEENS,11670.0,21974.0,7450,4191.0,14.0
4,2018,2018-01-01,STATEN ISLAND,2836.0,2699.0,1677,545.0,7.0


In [79]:
# Merge all dataframes 
data2 = pd.merge(fires2, dobv2, on=['YEAR','DATE', 'BOROUGH'], how='outer')
data2 = pd.merge(data2, nypd2, on=['YEAR','DATE', 'BOROUGH'], how='outer')
data2 = pd.merge(data2, codev2, on=['YEAR','DATE', 'BOROUGH'], how='outer')
data2 = pd.merge(data2, vacate2, on=['YEAR','DATE', 'BOROUGH'], how='outer')
data2.head()

Unnamed: 0,YEAR,DATE,BOROUGH,FIRE_COUNT,DOB/ECB_VIOLATION_COUNT,NYPD_COMPLAINT_COUNT,HOUSING_CODE_VIOLATION_COUNT,VACATE_ORDER_COUNT
0,2020,2020-01-01,BRONX,9560.0,7248.0,8352.0,13876.0,16.0
1,2020,2020-01-01,BROOKLYN,12903.0,18660.0,10810.0,20237.0,18.0
2,2020,2020-01-01,MANHATTAN,12192.0,12626.0,9812.0,10527.0,8.0
3,2020,2020-01-01,QUEENS,8964.0,10372.0,7821.0,4635.0,8.0
4,2020,2020-01-01,STATEN ISLAND,2098.0,1317.0,1608.0,521.0,


Unnamed: 0,YEAR,DATE,BOROUGH,FIRE_COUNT,DOB/ECB_VIOLATION_COUNT,NYPD_COMPLAINT_COUNT,HOUSING_CODE_VIOLATION_COUNT,VACATE_ORDER_COUNT
0,2018,2018-01-01,BRONX,12016.0,12792.0,8337.0,13430.0,25.0
1,2018,2018-01-01,BROOKLYN,17129.0,29570.0,11193.0,17188.0,30.0
2,2018,2018-01-01,MANHATTAN,15650.0,20536.0,9614.0,8279.0,13.0
3,2018,2018-01-01,QUEENS,11670.0,21974.0,7450.0,4191.0,14.0
4,2018,2018-01-01,STATEN ISLAND,2836.0,2699.0,1677.0,545.0,7.0
...,...,...,...,...,...,...,...,...
236,2020,2020-08-01,,,,57.0,,
237,2020,2020-09-01,,,,61.0,,
238,2020,2020-10-01,,,,41.0,,
239,2020,2020-11-01,,,,30.0,,
