In [1]:
import pandas as pd

## 1.2 Clean data for Income-Restricted Housing DB

In [2]:
# Read the raw income-restricted housing inventory from
# ../data/raw/income_restricted.csv and preview its first five rows.
ir_df = pd.read_csv(filepath_or_buffer='../data/raw/income_restricted.csv')
ir_df.head(n=5)

Unnamed: 0,Project Name,Neighborhood,Zip Code,TtlProjUnits,RentUnits,OwnUnits,TtlMarket,MarketRent,MarketOwn,Total Income-Restricted,Income-Restricted Rental,Income-Restricted Ownership,Tenure,Public/ Private,Includes Senior Units?,Section 8
0,Abbot Street/ Shawmut Ave,Roxbury,2119.0,16,16,0,0.0,0.0,0.0,16,16,0.0,Rental,Private,,
1,Academy Homes I,Roxbury,2119.0,202,202,0,52.0,52.0,0.0,150,150,0.0,Rental,Private,,
2,Academy Homes II,Roxbury,2119.0,236,236,0,0.0,0.0,0.0,236,236,0.0,Rental,Private,,Y
3,Adams Court Phase A,Mattapan,2126.0,50,50,0,0.0,0.0,0.0,50,50,0.0,Rental,Private,,
4,Adams Court Phase B,Mattapan,2126.0,45,45,0,0.0,0.0,0.0,45,45,0.0,Rental,Private,,


In [3]:
# Columns of the raw table that are carried into the cleaned dataset
interested_columns = ["Project Name", "Zip Code", "Section 8", "TtlProjUnits"]

# One chained expression: keep only those columns, switch to the upper-case
# output names, then replace missing zip codes with the sentinel value 0.
income_restricted = (
    ir_df[interested_columns]
    .rename(
        columns={
            "Project Name": "PROJ_NAME",
            "Zip Code": "ZIPCODE",
            "Section 8": "SECTION8",
            "TtlProjUnits": "RES_UNITS",
        }
    )
    .assign(ZIPCODE=lambda frame: frame["ZIPCODE"].fillna(0))
)

income_restricted.head(n=5)

Unnamed: 0,PROJ_NAME,ZIPCODE,SECTION8,RES_UNITS
0,Abbot Street/ Shawmut Ave,2119.0,,16
1,Academy Homes I,2119.0,,202
2,Academy Homes II,2119.0,Y,236
3,Adams Court Phase A,2126.0,,50
4,Adams Court Phase B,2126.0,,45


In [4]:
%pip install python-dotenv

You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [5]:
# Derive street name from Project Name using Google's Place API
import requests
import os

# Load .env file
# (python-dotenv; presumably this is what makes GOOGLE_API_KEY visible in
# os.environ below -- confirm that a .env file with that entry exists)
from dotenv import load_dotenv
load_dotenv()

# Google Places "Find Place from Text" endpoint, JSON output
URL = "https://maps.googleapis.com/maps/api/place/findplacefromtext/json"
# The key is read from the environment instead of being hard-coded; this
# raises KeyError when GOOGLE_API_KEY is not set. Never print or display the
# value: cell outputs are saved together with the notebook.
API_KEY = os.environ["GOOGLE_API_KEY"]

def derive_street_name(row, timeout=10):
    """
    Look up a formatted address for one project via the Places API.

    The search text is the project name, followed by the zero-padded
    five-digit zip code when one is known (a zip code of 0 or a missing
    value means "unknown" and only the project name is searched).

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys "PROJ_NAME" and "ZIPCODE".
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up. Without a
        timeout a stalled connection would block the whole ``apply`` forever.

    Returns
    -------
    pandas.Series
        ``[project_name, formatted_address]``; the address is an empty
        string when the response contains no usable candidate.

    Raises
    ------
    requests.RequestException
        On network failures, including the timeout being exceeded.
    """
    project_name = row["PROJ_NAME"]

    # A missing zip code is treated like the sentinel 0 instead of letting
    # int(NaN) raise.
    raw_zip = row["ZIPCODE"]
    zip_code = 0 if pd.isna(raw_zip) else int(raw_zip)

    # `query` rather than `input`, so the builtin is not shadowed.
    if zip_code == 0:
        query = project_name
    else:
        # Zip codes were parsed as numbers, which drops the leading zero.
        query = f"{project_name} {zip_code:05d}"

    res = requests.get(
        URL,
        params={
            "input": query,
            "inputtype": "textquery",
            "key": API_KEY,
            "fields": "formatted_address"
        },
        timeout=timeout,
    )

    # A payload without "candidates" is handled like an empty result list
    # instead of raising KeyError.
    candidates = res.json().get("candidates") or []
    if len(candidates) == 0:
        return pd.Series([project_name, ""])

    # Only the first candidate is used.
    return pd.Series([project_name, candidates[0].get("formatted_address", "")])
    
# Look up an address for every project row. Each call returns a two-element
# Series, so the result of apply() is a two-column frame that only needs its
# column labels set to PROJ_NAME and ADDRESS.
proj_to_address = income_restricted.apply(derive_street_name, axis=1)
proj_to_address.columns = ["PROJ_NAME", "ADDRESS"]

# Persist the lookup table (without the index) so later cells can reload it
proj_to_address.to_csv(path_or_buf="../data/clean/proj_to_address.csv", index=False)

proj_to_address.head(n=5)

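[output redacted: an API key was printed here; treat it as compromised, rotate it, and clear this cell's output before sharing the notebook]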


KeyboardInterrupt: 

: 

In [8]:
import re
import pandas as pd

# Reload the project -> address lookup from ../data/clean/proj_to_address.csv
proj_to_address = pd.read_csv(filepath_or_buffer="../data/clean/proj_to_address.csv")

# Attach the looked-up ADDRESS to every project: a left join keyed on
# PROJ_NAME, with the lookup table indexed by that same column.
# NOTE(review): if a PROJ_NAME occurs more than once in the lookup table, the
# join emits one output row per match -- confirm project names are unique.
joined_data = income_restricted.join(
    other=proj_to_address.set_index(keys="PROJ_NAME"),
    how="left",
    on="PROJ_NAME",
    rsuffix="_proj_to_address",
    lsuffix="_income_restricted",
)

def is_number(s):
    """
    Return True if ``int(s)`` succeeds, False otherwise.

    Values that ``int`` rejects because of their type (for example None)
    count as "not a number" instead of raising.
    """
    try:
        int(s)
    except (ValueError, TypeError):
        # ValueError: text that is not an integer literal ("Shawmut", "").
        # TypeError: objects int() cannot convert at all (None, lists, ...).
        return False
    return True

def parse_address(formatted_address):
    """
    Parse a formatted address into (street number, street name, city).

    Example: "100 Shawmut Ave, Boston, MA 02118, United States"
    yields [100, "Shawmut Ave", "Boston"].

    The last two comma-separated components (state/zip and country in the
    example) are discarded; exactly two components (street and city) must
    remain, otherwise the address is treated as unparseable.

    Parameters
    ----------
    formatted_address : str, or a missing value (NaN / None)

    Returns
    -------
    pandas.Series
        ``[street_number, street_name, city]``. All three are None when the
        input is missing, is not a string, or does not have the expected
        shape. ``street_number`` is None when the street component does not
        start with an integer.
    """
    # Missing addresses arrive as NaN (a float) or None; anything that is
    # not text cannot be parsed.
    if not isinstance(formatted_address, str):
        return pd.Series([None, None, None])

    components = formatted_address.split(',')[:-2]
    if len(components) != 2:
        return pd.Series([None, None, None])

    # Splitting on "," leaves the blank that followed each comma attached to
    # the next component (" Boston"), so strip surrounding whitespace.
    street, city = (component.strip() for component in components)

    leading_token, _, remainder = street.partition(' ')
    if is_number(leading_token):
        street_number = int(leading_token)
        street_name = remainder.strip()
    else:
        # No leading house number: the whole component is the street name.
        street_number = None
        street_name = street

    return pd.Series([street_number, street_name, city])

# Split every formatted address into street number, street name and city
joined_data[["ST_NUM", "ST_NAME", "CITY"]] = joined_data.loc[:, "ADDRESS"].apply(func=parse_address)

# The raw formatted address is no longer needed once it has been split
# (PROJ_NAME stays in the table).
joined_data = joined_data.drop("ADDRESS", axis=1)

# Write the cleaned table without the index column
joined_data.to_csv(path_or_buf="../data/clean/income_restricted_housing.csv", index=False)

joined_data.head(n=5)

Unnamed: 0,PROJ_NAME,ZIPCODE,SECTION8,RES_UNITS,ST_NUM,ST_NAME,CITY
0,Abbot Street/ Shawmut Ave,2119.0,,16,100.0,Shawmut Ave,Boston
1,Academy Homes I,2119.0,,202,1592.0,Columbus Ave,Roxbury
2,Academy Homes II,2119.0,Y,236,2926.0,Washington St,Roxbury
3,Adams Court Phase A,2126.0,,50,59.0,Msgr Patrick J Lydon Way,Boston
4,Adams Court Phase B,2126.0,,45,,,
