# Natural Gas Consumption
https://data.cityofnewyork.us/Environment/Natural-Gas-Consumption-by-ZIP-Code-2010/uedp-fegm

Natural gas consumption is reported by ZIP code, whereas the number of reported gas leaks and all socioeconomic data are reported by census tract (geoid). It is possible to list all census tracts that fall within each ZIP code area, but merging the two datasets on geoid would be wrong, because some tracts belong to 2-3 ZIP code areas.

Another way to merge this data is by converting everything to neighborhoods (development name)


In [104]:
import pandas as pd
import numpy as np
import os

### Attempt 1: merge the natural gas consumption table with the rest of the data (gas leaks, socioeconomic data)

In [105]:
# Census Bureau relationship file linking ZIP Code Tabulation Areas (ZCTA5) to census tracts.
ZCTA_TRACT_REL_URL = "https://www2.census.gov/geo/docs/maps-data/data/rel/zcta_tract_rel_10.txt"
data = pd.read_csv(ZCTA_TRACT_REL_URL)

In [106]:
# Preview the first rows of the ZCTA-to-tract relationship table.
data.head()

Unnamed: 0,ZCTA5,STATE,COUNTY,TRACT,GEOID,POPPT,HUPT,AREAPT,AREALANDPT,ZPOP,...,TRAREA,TRAREALAND,ZPOPPCT,ZHUPCT,ZAREAPCT,ZAREALANDPCT,TRPOPPCT,TRHUPCT,TRAREAPCT,TRAREALANDPCT
0,601,72,1,956300,72001956300,4271,1706,44663250,44572589,18570,...,44924558,44833897,23.0,22.03,26.67,26.74,98.5,98.33,99.42,99.42
1,601,72,1,956400,72001956400,2384,1037,32830481,32492074,18570,...,37782601,37191697,12.84,13.39,19.61,19.5,79.6,80.14,86.89,87.36
2,601,72,1,956500,72001956500,3126,1240,44969548,44809680,18570,...,44969548,44809680,16.83,16.01,26.85,26.89,100.0,100.0,100.0,100.0
3,601,72,1,956600,72001956600,2329,972,1981101,1981101,18570,...,1981101,1981101,12.54,12.55,1.18,1.19,100.0,100.0,100.0,100.0
4,601,72,1,956700,72001956700,2053,948,1380041,1380041,18570,...,1380041,1380041,11.06,12.24,0.82,0.83,100.0,100.0,100.0,100.0


In [107]:
# Normalise the header to lower case so columns can be referenced as 'geoid', 'zcta5', ...
data.columns = [name.lower() for name in data.columns]

In [108]:
# Number of distinct census tracts in the relationship table
# (dropna=False keeps the count identical to len(unique())).
data['geoid'].nunique(dropna=False)

73596

In [109]:
# Keep only the ZCTA and tract identifiers. .copy() yields an independent frame,
# so the column assignments below do not raise SettingWithCopyWarning.
columns = ['zcta5', 'geoid']
data = data[columns].copy()

# The identifiers were parsed as integers, which drops leading zeros
# (e.g. ZCTA 601). Re-pad them to their fixed widths: 5 characters for ZIP
# codes and 11 characters for tract GEOIDs.
data['zip'] = data['zcta5'].astype('str').str.zfill(5)
data['geoid'] = data['geoid'].astype('str').str.zfill(11)
data.head()

Unnamed: 0,zcta5,geoid,zip
0,601,72001956300,601
1,601,72001956400,601
2,601,72001956500,601
3,601,72001956600,601
4,601,72001956700,601


In [110]:
# Drop the raw 'zcta5' column; only the tract id and the padded ZIP code are needed.
lookup_columns = ['geoid', 'zip']
data = data[lookup_columns]

In [111]:
# Merged gas-leak / socioeconomic table, one row per tract (geoid) and incident year.
# NOTE(review): machine-specific absolute path; this cell only runs on the author's machine.
MERGED_GAS_DATA_CSV = 'C:\\Users\\mskac\\machineLearning\\GasLeakConEd\\data\\processed\\important_(used_in_app)\\Merged_asc_fdny_data.csv'
gas_data = pd.read_csv(MERGED_GAS_DATA_CSV)

In [112]:
# Preview the gas-leak / socioeconomic table that will be joined on 'geoid'.
gas_data.head()

Unnamed: 0.1,Unnamed: 0,%housh_grandp_resp_for_grandch,avg_houshold_size,avg_year_built,gas_leaks,gas_leaks_per_person,geoid,incident_year,lonely_housholder%,lonely_housholder_over65%,mean_houshold_income,median_age,nonfamily_housholds%,not_us_citizen%,total_housing_units,unemployed%,vacant_housing_units%
0,0,9.576837,3.6,1953.839431,2,0.000412,36005000200,2013,18.040089,11.210097,72652.0,34.5,19.005197,11.198024,1476,8.5,8.7
1,1,4.16429,3.04,1972.667364,5,0.000936,36005000400,2013,13.519681,7.187678,80733.0,35.2,18.881917,7.821856,1912,5.7,8.3
2,2,1.748072,2.55,1969.786833,11,0.002109,36005001600,2013,33.470437,13.984576,44609.0,39.2,34.03599,15.819751,2043,6.6,4.8
3,3,0.0,2.6,1966.332772,3,0.001171,36005001900,2013,35.499398,2.406739,42285.0,30.5,44.645006,24.863388,891,18.5,6.7
4,4,6.241611,2.84,1953.933724,42,0.004967,36005002000,2013,28.624161,12.95302,34044.0,35.9,29.798658,9.42526,3161,13.5,5.7


In [113]:
# Convert every column to a numeric dtype in one pass. gas_data stores 'geoid'
# as int64, so the merge key has to be numeric here as well. Using apply()
# builds a new frame instead of assigning column-by-column on a sliced frame,
# which avoids SettingWithCopyWarning.
# NOTE(review): this also turns the zero-padded 'zip' strings back into
# integers, so leading zeros (e.g. '00601') are lost again. Confirm this is intended.
data = data.apply(pd.to_numeric)

In [114]:
# Attach a ZIP code to each gas-leak record through the shared 'geoid' key.
# A tract listed under several ZIP codes yields one output row per ZIP code.
merged = pd.merge(gas_data, data, how='inner', on='geoid')
merged.head()

Unnamed: 0.1,Unnamed: 0,%housh_grandp_resp_for_grandch,avg_houshold_size,avg_year_built,gas_leaks,gas_leaks_per_person,geoid,incident_year,lonely_housholder%,lonely_housholder_over65%,mean_houshold_income,median_age,nonfamily_housholds%,not_us_citizen%,total_housing_units,unemployed%,vacant_housing_units%,zip
0,0,9.576837,3.6,1953.839431,2,0.000412,36005000200,2013,18.040089,11.210097,72652.0,34.5,19.005197,11.198024,1476,8.5,8.7,10473
1,2019,5.946338,3.69,1956.357627,4,0.000785,36005000200,2014,14.57578,8.846991,75015.0,36.6,15.445975,10.951914,1475,7.7,6.5,10473
2,5497,6.526468,3.91,1957.30496,4,0.00074,36005000200,2015,16.316171,7.541697,77789.0,37.3,18.564177,11.937812,1492,7.7,7.6,10473
3,6942,4.883546,3.94,1958.66218,6,0.001143,36005000200,2016,16.078137,5.785124,80621.0,38.6,19.233659,12.873738,1486,6.8,10.4,10473
4,7944,3.028009,3.82,1964.495222,11,0.002175,36005000200,2017,17.335352,5.677517,73687.0,39.6,23.31567,17.694741,1465,12.4,9.8,10473


In [115]:
# Row count and dtypes of the merged frame (compare with gas_data.info()).
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15534 entries, 0 to 15533
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Unnamed: 0                      15534 non-null  int64  
 1   %housh_grandp_resp_for_grandch  15525 non-null  float64
 2   avg_houshold_size               15513 non-null  float64
 3   avg_year_built                  15534 non-null  float64
 4   gas_leaks                       15534 non-null  int64  
 5   gas_leaks_per_person            15534 non-null  float64
 6   geoid                           15534 non-null  int64  
 7   incident_year                   15534 non-null  int64  
 8   lonely_housholder%              15525 non-null  float64
 9   lonely_housholder_over65%       15525 non-null  float64
 10  mean_houshold_income            15484 non-null  float64
 11  median_age                      15504 non-null  float64
 12  nonfamily_housholds%            

In [116]:
# Row count and dtypes of the original gas-leak table, for comparison with `merged`.
gas_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11607 entries, 0 to 11606
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Unnamed: 0                      11607 non-null  int64  
 1   %housh_grandp_resp_for_grandch  11604 non-null  float64
 2   avg_houshold_size               11595 non-null  float64
 3   avg_year_built                  11607 non-null  float64
 4   gas_leaks                       11607 non-null  int64  
 5   gas_leaks_per_person            11607 non-null  float64
 6   geoid                           11607 non-null  int64  
 7   incident_year                   11607 non-null  int64  
 8   lonely_housholder%              11604 non-null  float64
 9   lonely_housholder_over65%       11604 non-null  float64
 10  mean_houshold_income            11577 non-null  float64
 11  median_age                      11590 non-null  float64
 12  nonfamily_housholds%            

We can see that the resulting table has many more records than the original gas leaks table, because some geoids were matched with several ZIP codes.

### Attempt 2

Now I will try to reprocess the FDNY files containing the number of gas leaks and retrieve the information by ZIP code.

In [125]:
# Directory the notebook is run from. "__file__" is a quoted string here, so
# abspath() simply resolves it against the current working directory.
dir_path = os.path.dirname(os.path.abspath("__file__"))

# NOTE(review): the incidents CSV (and the output file written below) use
# machine-specific absolute paths.
df_original1 = pd.read_csv(r'C:\Users\mskac\Desktop\Incidents_Responded_to_by_Fire_Companies.csv', low_memory=False)
# Build the relative path with os.path.join rather than a non-raw
# '\..\data\...' literal, whose backslashes are parsed as escape sequences.
df_locations1 = pd.read_csv(
    os.path.join(dir_path, '..', 'data', 'original', 'In_Service_Alarm_Box_Locations.csv'),
    low_memory=False,
)

# filter out only gas leaks data out of all FDNY calls (incidents file1)
# .copy() detaches the filtered rows from df_original1 so that renaming the
# columns (and later column assignments) act on an independent frame.
df_incidents1 = df_original1[df_original1.INCIDENT_TYPE_DESC == "412 - Gas leak (natural gas or LPG)"].copy()
df_incidents1.columns = map(str.lower, df_incidents1.columns)
df_incidents1.info()
df_incidents1.to_csv(r'C:\Users\mskac\machineLearning\GasLeakConEd\data\processed\Gas_Leak_Incidents_Responded_to_by_Fire_Companies.csv')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119324 entries, 3 to 2518753
Data columns (total 24 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   im_incident_key               119324 non-null  int64  
 1   fire_box                      119322 non-null  object 
 2   incident_type_desc            119324 non-null  object 
 3   incident_date_time            119324 non-null  object 
 4   arrival_date_time             118636 non-null  object 
 5   units_onscene                 118636 non-null  float64
 6   last_unit_cleared_date_time   119324 non-null  object 
 7   highest_level_desc            119322 non-null  object 
 8   total_incident_duration       119324 non-null  float64
 9   action_taken1_desc            119324 non-null  object 
 10  action_taken2_desc            115132 non-null  object 
 11  action_taken3_desc            113241 non-null  object 
 12  property_use_desc             119323 non-nu

In [126]:
# Incident counts per year: characters 6-10 of 'MM/DD/YYYY hh:mm:ss AM' are the year.
# NOTE(review): this result is neither assigned nor the cell's last expression,
# so it is never displayed.
incident_year = df_incidents1.incident_date_time.str[6:10]
df_incidents1.groupby(incident_year).count()

# Keep the columns of interest and discard rows with any missing value among them.
kept_columns = [
    'fire_box',
    'incident_type_desc',
    'incident_date_time',
    'units_onscene',
    'total_incident_duration',
    'property_use_desc',
    'borough_desc',
    'zip_code',
]
df_incidents1 = df_incidents1[kept_columns].dropna()
df_incidents1.head()

Unnamed: 0,fire_box,incident_type_desc,incident_date_time,units_onscene,total_incident_duration,property_use_desc,borough_desc,zip_code
3,7412,412 - Gas leak (natural gas or LPG),01/01/2013 12:02:32 AM,4.0,2259.0,429 - Multifamily dwelling,5 - Queens,11103
159,4504,412 - Gas leak (natural gas or LPG),01/01/2013 01:53:19 AM,5.0,1495.0,"962 - Residential street, road or residential ...",5 - Queens,11354
175,3064,412 - Gas leak (natural gas or LPG),01/01/2013 02:07:29 AM,5.0,872.0,419 - 1 or 2 family dwelling,2 - Bronx,10460
226,3652,412 - Gas leak (natural gas or LPG),01/01/2013 02:47:57 AM,6.0,1346.0,419 - 1 or 2 family dwelling,4 - Brooklyn,11235
260,6180,412 - Gas leak (natural gas or LPG),01/01/2013 03:10:53 AM,6.0,1498.0,"962 - Residential street, road or residential ...",5 - Queens,11361


In [127]:
# Narrow the incidents down to the location fields and the timestamp.
location_columns = ['zip_code', 'borough_desc', 'fire_box', 'incident_date_time']
df_incidents1 = df_incidents1[location_columns]

We will be merging by the 'fire_box' column.
Df_locations 'fire_box' column has the following format: a capital letter for the borough (B - Brooklyn, X - Bronx, M - Manhattan, Q - Queens, R - Staten Island) + the 4 digits of the fire_box ID. Example: 'B0012'
Df_incidents1 'fire_box' column only contains the (<= 4) digits of the fire_box ID. Example: 12
Df_incidents2 'fire_box_number' column only contains the (<= 4) digits of the fire_box ID. Example: 12

Therefore, we need to make their formats the same

In [128]:
# First character of 'borough_desc' (e.g. '5 - Queens') -> borough letter used
# as the fire box prefix. Any other code falls back to 'R' (Staten Island),
# matching the original else-branch.
BORO_LETTER_BY_CODE = {'1': 'M', '2': 'X', '4': 'B', '5': 'Q'}

boro_letter = df_incidents1['borough_desc'].str[0].map(BORO_LETTER_BY_CODE).fillna('R')

# Build the prefixed, zero-padded id (e.g. '12' -> 'B0012'). assign() returns a
# new frame, so nothing is written into a slice (no SettingWithCopyWarning) and
# no temporary 'boro_letter' column has to be added and dropped again.
df_incidents1 = df_incidents1.assign(
    fire_box=boro_letter + df_incidents1['fire_box'].str.zfill(4)
)

In [129]:
# Save the incidents with normalised fire box ids. The path is built with
# os.path.join instead of a non-raw '\..\data\...' literal, whose backslashes
# are parsed as escape sequences.
# NOTE: a later cell writes df_merged_2013_2018 to this same FDNY_zip.csv path.
df_incidents1.to_csv(os.path.join(dir_path, '..', 'data', 'processed', 'FDNY_zip.csv'))

In [130]:
# Fire box locations (longitude, latitude, fire_box id, address). The path is
# built with os.path.join instead of a non-raw backslash literal, whose
# backslashes are parsed as escape sequences.
df_locations2 = pd.read_csv(
    os.path.join(dir_path, '..', 'data', 'original', 'Fire_Boxes.csv'),
    low_memory=False,
)
df_locations2.head()

Unnamed: 0,longitude,latitude,fire_box,address
0,-73.93253,40.72795,B0014,Gardner Ave & Meeker Ave
1,-73.93146,40.72563,B0015,Gardner Ave & Cherry St
2,-73.93622,40.72748,B0018,Bridgewater St & Varick St
3,-73.95612,40.72601,B0020,Meserole Ave & Banker St
4,-73.95865,40.73626,B0023,Commercial St & Clay St


In [131]:
# Add coordinates and address to every incident whose fire box id appears in
# the locations table; incidents without a matching box are dropped (inner join).
df_merged = df_incidents1.merge(df_locations2, on='fire_box', how='inner')
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 114992 entries, 0 to 114991
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   zip_code            114992 non-null  object 
 1   borough_desc        114992 non-null  object 
 2   fire_box            114992 non-null  object 
 3   incident_date_time  114992 non-null  object 
 4   longitude           114992 non-null  float64
 5   latitude            114992 non-null  float64
 6   address             114992 non-null  object 
dtypes: float64(2), object(5)
memory usage: 7.0+ MB


In [134]:
# Second name for the same DataFrame object (no copy is made), so changes made
# through either name affect both.
df_merged_2013_2018 = df_merged

In [135]:
# Preview the incidents joined with their fire box coordinates and address.
df_merged_2013_2018.head()

Unnamed: 0,zip_code,borough_desc,fire_box,incident_date_time,longitude,latitude,address
0,11103,5 - Queens,Q7412,01/01/2013 12:02:32 AM,-73.91579,40.75754,Broadway && 44th St
1,11103,5 - Queens,Q7412,04/20/2013 11:43:06 AM,-73.91579,40.75754,Broadway && 44th St
2,11103,5 - Queens,Q7412,08/13/2013 06:03:48 AM,-73.91579,40.75754,Broadway && 44th St
3,11103,5 - Queens,Q7412,01/03/2014 12:04:59 PM,-73.91579,40.75754,Broadway && 44th St
4,11103,5 - Queens,Q7412,03/12/2014 04:19:44 PM,-73.91579,40.75754,Broadway && 44th St


In [136]:
# Save the geocoded incidents. This overwrites the FDNY_zip.csv written earlier
# from df_incidents1. The path is built with os.path.join instead of a non-raw
# '\..\data\...' literal, whose backslashes are parsed as escape sequences.
df_merged_2013_2018.to_csv(os.path.join(dir_path, '..', 'data', 'processed', 'FDNY_zip.csv'))


In [137]:
# Load fdny_zip_carto.csv from data/processed (presumably a Carto export of
# FDNY_zip.csv, judging by the file name; confirm).
# os.path.join avoids hand-written backslashes: the recorded OSError shows a
# single-backslash '\f' in this path being read as a form feed ('\x0c').
df_incidents_carto2013_2018 = pd.read_csv(
    os.path.join(dir_path, '..', 'data', 'processed', 'fdny_zip_carto.csv')
)

OSError: [Errno 22] Invalid argument: 'C:\\Users\\mskac\\machineLearning\\GasLeakConEd\\dataProcessing\\..\\data\\processed\x0cdny_zip_carto.csv'