In [None]:
import pandas as pd

## 1.2 Clean data for Income-Restricted Housing DB

In [None]:
# Load the raw income-restricted housing data from ../data/raw/income_restricted.csv
# (path is relative to the notebook's working directory)
ir_df = pd.read_csv('../data/raw/income_restricted.csv')
# Show the first rows as the cell output
ir_df.head()

In [None]:
# Raw column name -> cleaned column name, for the columns we keep
column_renames = {
    "Project Name": "PROJ_NAME",
    "Zip Code": "ZIPCODE",
    "Section 8": "SECTION8",
    "TtlProjUnits": "RES_UNITS",
}
interested_columns = list(column_renames)

# Select, rename, and replace missing zip codes with the placeholder 0
# in a single chain so the cell can be re-run without side effects.
income_restricted = (
    ir_df[interested_columns]
    .rename(columns=column_renames)
    .assign(ZIPCODE=lambda frame: frame["ZIPCODE"].fillna(0))
)

income_restricted.head()

In [None]:
%pip install python-dotenv

In [None]:
# Derive street name from Project Name using Google's Places API
import requests
import os

# Load .env file before reading os.environ below
# (presumably this is where GOOGLE_API_KEY is defined -- confirm for your setup)
from dotenv import load_dotenv
load_dotenv()

# "Find Place from Text" endpoint, queried with a free-text input
URL = "https://maps.googleapis.com/maps/api/place/findplacefromtext/json"
# Indexing os.environ raises KeyError if GOOGLE_API_KEY is not set
API_KEY = os.environ["GOOGLE_API_KEY"]

def derive_street_name(row):
    """
    Look up a project's formatted address with the Find Place endpoint.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide "PROJ_NAME" and "ZIPCODE". A ZIPCODE of 0 is treated as
        "unknown" and only the project name is searched.

    Returns
    -------
    pd.Series
        ``[project name, formatted address]``. The address is ``""`` when the
        API returns no candidate or the first candidate has no
        "formatted_address" field.

    Raises
    ------
    requests.Timeout
        If the API does not answer within the timeout.
    requests.HTTPError
        If the HTTP response carries an error status code.
    RuntimeError
        If the API reports REQUEST_DENIED or OVER_QUERY_LIMIT. These mean the
        lookup could not be performed at all, so recording an empty address
        would silently corrupt the output.
    """
    project_name = row["PROJ_NAME"]
    zip_code = int(row["ZIPCODE"])

    # Named `query` rather than `input` to avoid shadowing the builtin.
    if zip_code == 0:
        query = project_name
    else:
        # Zero-pad to five digits so a zip stored as a number keeps its leading 0.
        query = f"{project_name} {zip_code:05d}"

    res = requests.get(
        URL,
        params={
            "input": query,
            "inputtype": "textquery",
            "key": API_KEY,
            "fields": "formatted_address"
        },
        # Without a timeout a stalled connection blocks the whole apply() forever.
        timeout=10,
    )
    res.raise_for_status()
    payload = res.json()

    # Key/quota problems come back as a normal response with an error status;
    # fail loudly instead of treating them as "no address found".
    status = payload.get("status")
    if status in ("REQUEST_DENIED", "OVER_QUERY_LIMIT"):
        raise RuntimeError(
            f"Places API lookup failed with status {status}: "
            f"{payload.get('error_message', '')}"
        )

    candidates = payload.get("candidates") or []
    address = candidates[0].get("formatted_address", "") if candidates else ""

    return pd.Series([project_name, address])
    
# Look up every project (derive_street_name issues one HTTP request per row)
# and label the two returned values as PROJ_NAME and ADDRESS.
proj_to_address = income_restricted.apply(derive_street_name, axis=1)
proj_to_address.columns = ["PROJ_NAME", "ADDRESS"]

# Persist the lookup so later cells can reload it from disk
proj_to_address.to_csv("../data/clean/proj_to_address.csv", index=False)

proj_to_address.head()

In [None]:
import re
import pandas as pd

proj_to_address = pd.read_csv("../data/clean/proj_to_address.csv")

# The lookup file has one row per input row, so a project name that occurs
# more than once would match itself several times and multiply rows in the
# join. Keep the first address per name so the join is strictly many-to-one.
address_lookup = (
    proj_to_address
    .drop_duplicates(subset="PROJ_NAME")
    .set_index("PROJ_NAME")
)

# Join income_restricted with the address lookup on PROJ_NAME
joined_data = income_restricted.join(
    address_lookup,
    on="PROJ_NAME",
    how="left",
    lsuffix="_income_restricted",
    rsuffix="_proj_to_address"
)

# A left join against a unique key must not add or remove rows.
assert len(joined_data) == len(income_restricted), "join changed the row count"

def is_number(s):
    """
    Report whether ``s`` can be converted with ``int()``.

    Returns True when ``int(s)`` succeeds and False otherwise. Values that
    ``int()`` rejects with TypeError (e.g. None) or OverflowError (e.g.
    infinity) also count as "not a number" instead of raising.
    """
    try:
        int(s)
    except (ValueError, TypeError, OverflowError):
        return False
    return True

def parse_address(formatted_address):
    """
    Parse the address into a series of components.

    Example street: 100 Shawmut Ave, Boston, MA 02118, United States
    -> street number 100, street name "Shawmut Ave", city "Boston"

    Parameters
    ----------
    formatted_address : str or missing value
        Comma-separated address. Only addresses with exactly four
        comma-separated parts (street, city, and two trailing parts that are
        ignored) are parsed.

    Returns
    -------
    pd.Series
        ``[street_number, street_name, city]``. All three are None when the
        input is not a string (NaN, None, ...) or does not have four parts.
        ``street_number`` is None when the street part does not start with an
        integer; the whole street part is then used as the street name.
    """
    # Missing values (NaN / None) and any other non-string cannot be split.
    if not isinstance(formatted_address, str):
        return pd.Series([None, None, None])

    # Strip each part: the text after a comma starts with a space, which would
    # otherwise end up in the city (" Boston").
    parts = [part.strip() for part in formatted_address.split(',')]

    # Drop the last two parts; exactly street and city must remain.
    address = parts[:-2]
    if len(address) != 2:
        return pd.Series([None, None, None])

    street, city = address

    # Split once into the leading token and the remainder of the street part.
    first_token, _, remainder = street.partition(' ')
    if is_number(first_token):
        street_number = int(first_token)
        street_name = remainder.strip()
    else:
        street_number = None
        street_name = street

    return pd.Series([street_number, street_name, city])

# Split every formatted address into street number, street name and city
address_parts = joined_data["ADDRESS"].apply(parse_address)
joined_data[["ST_NUM", "ST_NAME", "CITY"]] = address_parts

# Only the ADDRESS column is removed here; PROJ_NAME stays in the output
joined_data = joined_data.drop(columns="ADDRESS")

# Write the cleaned table without the index column
joined_data.to_csv("../data/clean/income_restricted_housing.csv", index=False)

joined_data.head()