### Data Cleaning

This notebook outlines the steps taken to process and clean the LL84 dataset, which is one of three datasets needed for the model that predicts natural gas and electricity use.

In [1]:
import pandas as pd

### Reading LL84 data

This is public data available to download from here: https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/wcm8-aq5w.

In [2]:
# Load the raw LL84 disclosure data.
# low_memory=False makes pandas parse each column in a single pass, so every
# column gets one consistently inferred dtype instead of a mixed-type DtypeWarning.
LL84_2019 = pd.read_csv("data/LL84_2019.csv", low_memory=False)

  LL84_2019 = pd.read_csv("data/LL84_2019.csv")


In [3]:
# Print a summary of the loaded frame: row count, column count, dtypes and memory usage.
LL84_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29084 entries, 0 to 29083
Columns: 254 entries, Property Id to NTA
dtypes: float64(6), int64(5), object(243)
memory usage: 56.4+ MB


In [4]:
# Preview the first rows to check the column layout and the "Not Available" placeholders.
LL84_2019.head()

Unnamed: 0,Property Id,Property Name,Parent Property Id,Parent Property Name,Year Ending,"NYC Borough, Block and Lot (BBL)",NYC Building Identification Number (BIN),Address 1,Address 2,City,...,Last Modified Date - Water Meters,Generation Date,Release Date,Borough,Latitude,Longitude,Community Board,Council District,Census Tract,NTA
0,7365,1155,Not Applicable: Standalone Property,Not Applicable: Standalone Property,12/31/2019,1009970029,1022631,1155 Avenue of the Americas,Not Available,Manhattan,...,Not Available,05/28/2020 04:27:22 AM,05/28/2020 11:31:28 AM,MANHATTAN,40.756631,-73.982826,105.0,4.0,119.0,Midtown-Midtown South
1,8139,200,Not Applicable: Standalone Property,Not Applicable: Standalone Property,12/31/2019,1013150001,1037545,200 East 42nd St.,Not Available,Manhattan,...,03/03/2020 05:46 PM EST,05/28/2020 04:27:23 AM,05/28/2020 11:31:28 AM,MANHATTAN,40.750698,-73.974306,106.0,4.0,88.0,Turtle Bay-East Midtown
2,8604,114,Not Applicable: Standalone Property,Not Applicable: Standalone Property,12/31/2019,1009990019,1022667,114 West 47th st,Not Available,Manhattan,...,Not Available,05/28/2020 04:27:23 AM,05/28/2020 11:31:28 AM,MANHATTAN,40.75831,-73.982504,105.0,4.0,125.0,Midtown-Midtown South
3,8841,733,Not Applicable: Standalone Property,Not Applicable: Standalone Property,12/31/2019,1013190047,1037596,733 Third Avenue,Not Available,Manhattan,...,Not Available,05/28/2020 04:27:24 AM,05/28/2020 11:31:28 AM,MANHATTAN,40.753074,-73.972753,106.0,4.0,90.0,Turtle Bay-East Midtown
4,11809,Conde Nast Building,Not Applicable: Standalone Property,Not Applicable: Standalone Property,12/31/2019,1009950005,1085682,4 Times Square,Not Available,Manhattan,...,Not Available,05/28/2020 04:27:25 AM,05/28/2020 11:31:28 AM,MANHATTAN,40.756181,-73.986244,105.0,4.0,119.0,Midtown-Midtown South


In [5]:
# Give the borough/block/lot identifier column the short name "BBL".
bbl_column_names = {"NYC Borough, Block and Lot (BBL)": "BBL"}
LL84_2019 = LL84_2019.rename(columns=bbl_column_names)

In [6]:
# Normalize BBL to a plain string with the dashes removed.
# Cast to str *before* using the .str accessor: on a column that holds
# non-string values the accessor yields NaN for them (and raises AttributeError
# on a purely numeric column), which a trailing astype(str) would then turn
# into the literal text "nan".
# regex=False treats "-" as a literal on every pandas version.
LL84_2019["BBL"] = LL84_2019["BBL"].astype(str).str.replace("-", "", regex=False)

In [7]:
# Reference year used to derive building age.
current_year = 2019

# Building age = reference year minus the recorded construction year.
LL84_2019["Building Age"] = LL84_2019["Year Built"].rsub(current_year)

In [8]:
# Keep only the columns needed downstream, then discard incomplete records.
columns_to_retain = [
    "BBL",
    "Largest Property Use Type - Gross Floor Area (ft²)",
    "Building Age",
    "ENERGY STAR Score",
    "Weather Normalized Source EUI (kBtu/ft²)",
    "Weather Normalized Site Natural Gas Use (therms)",
    "Weather Normalized Site Energy Use (kBtu)",
    "Latitude",
    "Longitude",
    "NTA",
]
LL84_2019 = (
    LL84_2019.loc[:, columns_to_retain]
    .replace("Not Available", pd.NA)  # placeholder text -> missing value
    .dropna()  # drop every row that is missing any retained field
)

KeyError: "['Weather Normalized Site Energy Use (kBtu)Latitude'] not in index"

In [None]:
# Collapse exact duplicate records, keeping the first occurrence of each.
LL84_2019 = LL84_2019.drop_duplicates()

In [None]:
# Save the cleaned LL84 table (index column omitted).
# The file name is spelled "LL84_..." with upper-case Ls because the data
# integration step reads "LL84_2019_processed.csv"; the former "Ll84_..."
# spelling only resolved on case-insensitive file systems.
LL84_2019.to_csv("LL84_2019_processed.csv", index=False)

### Reading PLUTO data

This is public data exported from the MapPLUTO™ 19v2 shapefile release, available for download here: https://www.nyc.gov/site/planning/data-maps/open-data/bytes-archive.page.

In [None]:
# Load the PLUTO export from the local data folder.
PLUTO_2019 = pd.read_csv("data/PLUTO_2019.csv")

In [None]:
# Summarize the columns, dtypes and memory usage of the PLUTO frame.
PLUTO_2019.info()

In [None]:
# Preview the first rows of the PLUTO frame.
PLUTO_2019.head()

In [None]:
# Remove unwanted columns.
# Each column is listed exactly once, and the drop returns a new frame rather
# than mutating in place.
columns_to_drop = [
    "Borough", "Block", "Lot", "CD", "CT2010", "CB2010", "SchoolDist",
    "Council", "ZipCode", "FireComp", "PolicePrct", "HealthCent", "HealthArea",
    "Sanitboro", "SanitDistr", "SanitSub", "BldgClass",
    "Easements", "OwnerType", "LotArea", "ComArea", "ResArea", "OfficeArea", "RetailArea",
    "GarageArea", "StrgeArea", "FactryArea", "OtherArea", "AreaSource",
    "UnitsRes", "UnitsTotal", "LotFront", "LotDepth", "BldgFront", "BldgDepth", "Ext", "IrrLotCode",
    "BsmtCode", "AssessLand", "AssessTot", "ExemptTot", "YearBuilt", "YearAlter1",
    "YearAlter2", "HistDist", "BuiltFAR", "ResidFAR", "CommFAR", "FacilFAR", "CondoNo",
    "Tract2010", "APPBBL", "APPDate", "Address", "ZoneDist1", "ZoneDist2",
    "ZoneDist3", "ZoneDist4", "Overlay1", "Overlay2", "SPDist1", "SPDist2", "SPDist3",
    "LtdHeight", "SplitZone", "OwnerName", "Landmark",
]
PLUTO_2019 = PLUTO_2019.drop(columns=columns_to_drop)

In [None]:
# Remove rows whose lot type or proximity code is 0 (treated here as "unavailable").
has_lot_type = PLUTO_2019["LotType"].ne(0)
has_prox_code = PLUTO_2019["ProxCode"].ne(0)

# Take an explicit copy: the filtered frame has a column assigned to it
# afterwards (the BBL fix), and assigning into a boolean-mask slice triggers
# pandas' SettingWithCopyWarning.
PLUTO_2019 = PLUTO_2019.loc[has_lot_type & has_prox_code].copy()

In [None]:
# Fixes to exported BBLs: round and cast them to whole numbers
# (presumably the export stores them as floats — confirm against the raw file).
# Use an explicit 64-bit integer: ten-digit BBLs exceed the 32-bit range, and a
# bare `int` maps to a 32-bit dtype on some platforms (e.g. Windows with NumPy 1.x).
PLUTO_2019["BBL"] = PLUTO_2019["BBL"].round(0).astype("int64")

In [None]:
# Save file
# Write the cleaned PLUTO table to CSV without the index column.
PLUTO_2019.to_csv(r"PLUTO_2019_processed.csv", index=False)

### Data integration according to BBL number

In [None]:
# Load the two processed datasets.
# BBL is the join key, so read it as text in both files; this keeps the key
# from being type-inferred differently (int / float / object) in each file.
df_pluto = pd.read_csv(r"PLUTO_2019_processed.csv", dtype={"BBL": str})
df_ll84 = pd.read_csv(r"LL84_2019_processed.csv", dtype={"BBL": str})

In [None]:
# Make sure the join key is a string in both frames before merging.
for frame in (df_pluto, df_ll84):
    frame["BBL"] = frame["BBL"].astype(str)

In [None]:
# Inner-join the PLUTO and LL84 records that share a BBL.
merged_df = df_pluto.merge(df_ll84, how="inner", on="BBL")

In [None]:
# Save file
# Write the merged table to CSV without the index column.
merged_df.to_csv(r"Merged_Data.csv", index=False)