### Data integration

This notebook outlines the steps taken to integrate the NTA-level spatial-join output (`NTA_spatial_join.csv`) with the American Community Survey (ACS) dataset.

### Reading NTA_spatial_join.csv derived from QGIS spatial join

`NTA_spatial_join.csv` adds the NTA geocode and name to the previous `Merged_Data.csv`.

In [None]:
import pandas as pd
from openpyxl import load_workbook

In [None]:
# Load the QGIS spatial-join output with pandas' default CSV parsing options.
Spatial_join = pd.read_csv("data/NTA_spatial_join.csv")

In [None]:
# Rename the spatial-join columns in one pass. `nta2020` becomes `GeoID`,
# which is the key used for the ACS merge later in this notebook.
# NOTE(review): two target names are irregular and are kept verbatim here:
# "Weather_Normalized_Source EUI_(kBtu/ft²)" contains a space and
# "Weather_Normalized_Site_Energy_Use_(kBtu) " ends with a space. Confirm
# whether anything downstream relies on them before cleaning them up.
Spatial_join = Spatial_join.rename(
    columns={
        "nta2020": "GeoID",
        "Largest Property Use Type - Gross Floor Area (ft²)": "LargestPropertyUseTypeGrossFloorArea",
        "ENERGY STAR Score": "ENERGY_STAR_SCORE",
        "Weather Normalized Source EUI (kBtu/ft²)": "Weather_Normalized_Source EUI_(kBtu/ft²)",
        "Weather Normalized Site Natural Gas Use (therms)": "Weather_Normalized_Site_Natural_Gas_Use_(therms)",
        "Weather Normalized Site Energy Use (kBtu)": "Weather_Normalized_Site_Energy_Use_(kBtu) ",
        "Building Age": "Building_Age",
    }
)

In [None]:
# Inspect column names, dtypes and non-null counts after the rename.
Spatial_join.info()

### Reading ACS data

This is public data, available for download from the NYC Planning website: <https://www.nyc.gov/site/planning/planning-level/nyc-population/american-community-survey.page>

#### ACS demographic data

In [None]:
# Load the ACS 2019 demographic workbook. No sheet_name is given, so pandas
# reads the first sheet only.
ACS_2019_demo = pd.read_excel("data/demo_2019.xlsx")

In [None]:
# Inspect column names, dtypes and non-null counts of the raw demographic table.
ACS_2019_demo.info()

In [None]:
# Preview the first rows of the raw demographic table.
ACS_2019_demo.head()

In [None]:
# NOTE(review): this repeats the .info() call made two cells earlier; the
# frame has not been modified in between, so this cell looks redundant.
ACS_2019_demo.info()

In [None]:
# Give the two ACS demographic variables used later descriptive names.
ACS_2019_demo = ACS_2019_demo.rename(
    columns={
        "Pop65pl1E": "Population_with_65_years_and_over",
        "Pop_1E": "Population",
    }
)

In [None]:
# Keep only the join key and the two renamed demographic indicators.
columns_to_keep = [
    'GeoID',
    'Population_with_65_years_and_over',
    'Population',
]
ACS_2019_demo = ACS_2019_demo[columns_to_keep]

In [None]:
# Preview the trimmed demographic table.
ACS_2019_demo.head()

#### ACS economic data

In [None]:
# Load the ACS 2019 economic workbook. No sheet_name is given, so pandas
# reads the first sheet only.
ACS_2019_eco = pd.read_excel("data/econ_2019.xlsx")

In [None]:
# Inspect column names, dtypes and non-null counts of the raw economic table.
ACS_2019_eco.info()

In [None]:
# Preview the first rows of the raw economic table.
ACS_2019_eco.head()

In [None]:
# Give the four ACS economic variables used later descriptive names.
ACS_2019_eco = ACS_2019_eco.rename(
    columns={
        "MnHHIncE": "Mean_household_income_(dollars)",
        "PvHInsE": "Civilian_noninstitutionalized_population_with_private_insurance",
        "LFE": "Population_in_labour_force",
        "PopPvU2E": "Population_with_determined_poverty_status",
    }
)

In [None]:
# Keep only the join key and the four renamed economic indicators.
columns_to_keep = [
    'GeoID',
    'Mean_household_income_(dollars)',
    'Civilian_noninstitutionalized_population_with_private_insurance',
    'Population_in_labour_force',
    'Population_with_determined_poverty_status',
]
ACS_2019_eco = ACS_2019_eco[columns_to_keep]

In [None]:
# Preview the trimmed economic table.
ACS_2019_eco.head()

In [None]:
# Combine the economic and demographic tables on GeoID. The inner join keeps
# only GeoIDs that appear in both tables.
merged_df = ACS_2019_eco.merge(ACS_2019_demo, how="inner", on="GeoID")

In [None]:
# Preview the economic + demographic merge.
merged_df.head()

#### ACS social data

In [None]:
# Load the ACS 2019 social workbook. No sheet_name is given, so pandas
# reads the first sheet only.
ACS_2019_soc = pd.read_excel("data/soc_2019.xlsx")

In [None]:
# Inspect column names, dtypes and non-null counts of the raw social table.
ACS_2019_soc.info()

In [None]:
# Preview the first rows of the raw social table.
ACS_2019_soc.head()

In [None]:
# Give the six ACS social variables used later descriptive names.
# NOTE(review): `Pop3plEnE` looks like the "population 3 years and over
# enrolled in school" estimate rather than school employees, and
# `MS_M15plE` / `MS_F15plE` look like the male / female 15-and-over
# marital-status universes rather than married counts. Confirm against the
# ACS data dictionary; the target names are left unchanged here.
ACS_2019_soc = ACS_2019_soc.rename(
    columns={
        "HH1E": "Total_households",
        "Pop3plEnE": "School_Employees",
        "EA_BchDHE": "Population_with_bachelor_degree_or_higher",
        "MS_M15plE": "Married_males",
        "MS_F15plE": "Married_females",
        "AvgHHSzE": "Average_household_size",
    }
)

In [None]:
# Keep only the join key and the six renamed social indicators.
columns_to_keep = [
    'GeoID',
    'Total_households',
    'School_Employees',
    'Population_with_bachelor_degree_or_higher',
    'Married_males',
    'Married_females',
    'Average_household_size',
]
ACS_2019_soc = ACS_2019_soc[columns_to_keep]

In [None]:
# Preview the trimmed social table.
ACS_2019_soc.head()

In [None]:
# Add the social indicators to the economic + demographic table. The inner
# join on GeoID keeps only GeoIDs present on both sides.
merged_df1 = merged_df.merge(ACS_2019_soc, how="inner", on="GeoID")

In [None]:
# Preview the table after adding the social indicators.
merged_df1.head()

#### ACS housing data

In [None]:
# Load the ACS 2019 housing workbook. No sheet_name is given, so pandas
# reads the first sheet only.
ACS_2019_hous = pd.read_excel("data/hous_2019.xlsx")

In [None]:
# Inspect column names, dtypes and non-null counts of the raw housing table.
ACS_2019_hous.info()

In [None]:
# Preview the first rows of the raw housing table.
ACS_2019_hous.head()

In [None]:
# Give the three ACS housing variables used later descriptive names.
ACS_2019_hous = ACS_2019_hous.rename(
    columns={
        "VacHUE": "Vacant_housing_units",
        "MdGRE": "Median_Gross_Rent_(dollars)",
        "Vhcl3plAvE": "Housing_units_with_3_or_more_vehicles_available",
    }
)

In [None]:
# Keep only the join key and the three renamed housing indicators.
columns_to_keep = [
    'GeoID',
    'Vacant_housing_units',
    'Median_Gross_Rent_(dollars)',
    'Housing_units_with_3_or_more_vehicles_available',
]
ACS_2019_hous = ACS_2019_hous[columns_to_keep]

In [None]:
# Preview the trimmed housing table.
ACS_2019_hous.head()

In [None]:
# Add the housing indicators to complete the combined ACS table. The inner
# join on GeoID keeps only GeoIDs present on both sides.
ACS_2019 = merged_df1.merge(ACS_2019_hous, how="inner", on="GeoID")

In [None]:
# Preview the combined ACS table.
ACS_2019.head()

In [None]:
# Check dtypes and non-null counts of the combined ACS table.
ACS_2019.info()

In [None]:
# Discard every row that has a missing value in any of the kept columns.
ACS_2019_cleaned = ACS_2019.dropna(axis=0, how="any")

In [None]:
# Check the row count and dtypes that remain after dropping incomplete rows.
ACS_2019_cleaned.info()

In [None]:
# Persist the cleaned ACS table; index=False omits the DataFrame index column.
ACS_2019_cleaned.to_csv('data/ACS_2019_cleaned.csv', index=False)

#### Merge with NTA_spatial_join.csv

In [None]:
# Attach the cleaned ACS indicators to every spatial-join row by GeoID.
# The inner join drops spatial-join rows whose GeoID has no match in the
# cleaned ACS table.
# NOTE(review): GeoID on the spatial-join side comes from `nta2020`; confirm
# the ACS workbooks use the same NTA vintage so that rows are not lost.
df = Spatial_join.merge(ACS_2019_cleaned, how="inner", on="GeoID")

In [None]:
# Remove six columns inherited from the spatial-join table that are not
# carried into the integrated output.
# `errors="ignore"` keeps this cell re-runnable: `df` is overwritten here, so
# a second execution would otherwise raise KeyError for columns that were
# already dropped on the first pass.
columns_to_drop = ['ZMCode', 'EDesigNum', 'FIRM07_FLA', 'PFIRM15_FL', 'Version', 'DCPEdited']
df = df.drop(columns=columns_to_drop, errors="ignore")

In [None]:
# Inspect the final integrated table: columns, dtypes and non-null counts.
df.info()

In [None]:
# Persist the integrated table; index=False omits the DataFrame index column.
df.to_csv('data/df.csv', index=False)