Notebook to retrieve affordable housing data from CoStar (sample)

<b>Author</b>: Phu Dang

<b>Date</b>: November 24, 2023

In [39]:
import pandas as pd 
import numpy as np
import re

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Lift the row cap so displayed DataFrames are never truncated.
pd.options.display.max_rows = None

In [3]:
# Test / Demo
# Build the raw frame from whatever text is currently on the clipboard.
# NOTE(review): '^' is presumably used as a separator that never occurs in the
#   pasted text, so every pasted line lands in a single column -- confirm.
# skip_blank_lines=False keeps blank lines as rows instead of dropping them.
# NOTE(review): this cell depends on clipboard contents, so it cannot be
#   reproduced from the notebook alone.

df = pd.read_clipboard(sep=r'^', skip_blank_lines=False)

In [26]:
# Work on an independent copy so the pasted frame `df` stays untouched.
data = df.copy(deep=True)

In [27]:
def viewAll(status=False):
    """Toggle how many DataFrame rows pandas displays.

    Parameters
    ----------
    status : bool, default False
        Truthy -> remove the row limit (show every row).
        Falsy  -> limit the display to 11 rows.

    Returns
    -------
    None
        Only the pandas 'display.max_rows' option is changed.
    """
    rowLimit = None if status else 11
    pd.set_option('display.max_rows', rowLimit)

In [28]:
# Preview the first 15 pasted rows. head() returns whatever rows exist when the
# frame is shorter than 15, where a positional list indexer would raise.
data.head(15)

Unnamed: 0,106-114 4th St
0,C
1,5
2,5800
3,Apartments
4,Del Mar
...,...
10,Market
11,0.80
12,4
13,140 4th St


In [29]:
# Switch back to the truncated display (at most 11 rows).
viewAll(status=False)

In [34]:
# Operation: move the pasted header into the first data row and name the
#   single column 'data'.
# Reason: pasting yields a one-column frame whose column label is actually the
#   first property's address.
# The guard makes the cell safe to re-run: once the column is called 'data'
#   nothing happens.

if data.columns[0] != 'data':
    firstPropertyAddress = data.columns[0]
    topRow = pd.DataFrame({'data': [firstPropertyAddress]})
    data = pd.concat(
        [topRow, data.rename(columns={firstPropertyAddress: 'data'})],
        axis=0,
    ).reset_index(drop=True)

In [85]:
# Scratch check of loop control flow: 'Hello' is skipped only on the first
# pass, while 'HA' is printed on every pass.
for i in range(5):
    if i != 0:
        print('Hello')

    print('HA')

HA
Hello
HA
Hello
HA
Hello
HA
Hello
HA


In [166]:
# Regular expressions, each applied to one raw value at a time.
addressPattern = r'^\d+.+[a-zA-Z]{1}$'      # starts with digits, ends with a letter
classPattern = r'^[A-Z]{1}$'                # exactly one capital letter
builtRenovPattern = r'(^[0-9]{4}$)|(^[0-9]{4}/{1}[0-9]{4}$)'   # 'YYYY' or 'YYYY/YYYY'

# Exact-match vocabularies. The lower-case lists are compared against
# lower-cased values by attributeCheck.
secondaryTypes = [
    'Manufactured Housing/Mobile Home Park',
    'Dormitory',
    'Apartments',
]
cities = [
    'del mar',
    'la jolla',
    'pacific beach',
    'san diego',
]
rentTypes = [
    'market/affordable',
    'market',
    'affordable',
]
affordableTypes = [
    'rent subsidized',
    'rent stabilized',
    'rent restricted',
    'rent controlled',
    'affordable units',
]

# NOTE(review): going by the name, these are addresses of properties that have
# no rent type in the source data -- confirm. Not referenced by the cells
# visible in this part of the notebook.
propWORentType = [
    '3357 Apache Ave',
    '2179 Avenida De La Playa',
    '941 Camino Del Mar',
    '1765 Chalcedony St',
    '2596 Chalcedony St',
    '3867-3869 Clairemont Dr',
    '4828 Clairemont Dr',
    '2135 Coast Blvd',
    '1671-1675 Diamond St',
    '826 Emerald St',
    '918 Felspar St',
    '2142 Felspar St',
    '1425 Grand Ave',
    '4274-4276 Gresham St',
    '2732-2736 Hornblend St',
    '4330 Kendall',
    '7696-7698 Kiwi St',
    '6850 Mission Gorge Rd',
    '377-385 Nautilus St',
    '1217 Oliver Ave',
    '6232-6236 Osler St',
    '343 Playa Del Sur',
    '3701 Promontory St',
    '4044 Promontory St',
    '725 Rockaway Ct',
    '961 Thomas Ave',
    '1052 Turquoise St',
    '702-708 Whiting Ct',
    '7634-7676 Clairemont',
    '3410 Clairemont Dr',
    '1033 Diamond',
    '1950 Emerald St',
    '539 Genter St',
    '5550 Kearny Mesa Rd',
    '4330 Kendall St',
    '1719 Oliver Ave',
    '922-926 Reed Ave',
    '10288 Wateridge Cir',
]

In [217]:
# First 15 rows after the header has been moved into the data.
data.head(15)

Unnamed: 0,data
0,106-114 4th St
1,C
2,5
3,5800
4,Apartments
...,...
10,4.0
11,Market
12,0.80
13,4


In [204]:
def _neighborValue(idx):
    """Return data['data'][idx], or '' when `idx` is not a row label of `data`."""
    values = data['data']
    return values[idx] if idx in values.index else ''


def _lastAddress():
    """Return the most recently stored address in finalDf, or None if it is empty."""
    addresses = finalDf['Address']
    return addresses.iloc[-1] if len(addresses) else None


def attributeCheck(datum, column, currIdx):
    """Decide whether `datum` is a plausible value for `column`.

    The pasted data holds one value per row and missing attributes are simply
    absent, so most checks combine the shape of the value itself with the
    value just before or after it in data['data'].

    Reads the module-level names `data`, `finalDf`, `addressPattern`,
    `classPattern`, `builtRenovPattern`, `secondaryTypes`, `cities`,
    `rentTypes` and `affordableTypes`.

    Parameters
    ----------
    datum : str
        The raw value found at row `currIdx`.
    column : str
        Name of the attribute being tested. Any name without a dedicated rule
        (in practice 'Parking Spaces') uses the fallback rule at the bottom.
    currIdx : int
        Row label of `datum` in data['data'].

    Returns
    -------
    bool
        True if `datum` is accepted for `column`, otherwise False.

    Notes
    -----
    A neighbour that does not exist (before the first or after the last row)
    is treated as the empty string, so the checks return False instead of
    raising. For the fallback rule, reaching the end of the data counts as a
    record boundary and returns True.
    """
    prev_d = _neighborValue(currIdx - 1)
    next_d = _neighborValue(currIdx + 1)

    if column == 'Address':
        return bool(re.findall(addressPattern, datum))

    if column == 'Building Name':
        # A name has at least two consecutive letters and directly follows
        # an address.
        return bool(re.findall(r'[A-Za-z]{2,}', datum)) and \
            bool(re.findall(addressPattern, prev_d))

    if column == 'Class':
        return bool(re.findall(classPattern, datum))

    if column == 'Units':
        # Hard-coded exception. NOTE(review): presumably this listing has no
        # unit count in the source -- confirm.
        return _lastAddress() != '2135 Coast Blvd'

    if column == 'RBA/GLA':
        # Hard-coded exception. NOTE(review): presumably this listing has no
        # RBA/GLA figure in the source -- confirm.
        return _lastAddress() != '10770-10771 Black Mountain Rd'

    if column == 'Secondary Type':
        return datum in secondaryTypes

    if column == 'Submarket':
        # NOTE(review): the intent of the 5-character test on the previous
        # value is not evident from this file -- confirm.
        return (prev_d in secondaryTypes) or (len(prev_d) == 5)

    if column == 'City':
        return datum.lower() in cities

    if column == 'Land(AC)':
        # A decimal value that directly follows the city.
        return ('.' in datum) and (prev_d.lower() in cities)

    if column == 'Built/Renov':
        return bool(re.findall(builtRenovPattern, datum))

    if column == 'Total Buildings':
        # Follows either the built/renovated year or the (decimal) land area.
        return bool(re.findall(builtRenovPattern, prev_d)) or ('.' in prev_d)

    if column == 'Stories':
        # Not a decimal itself, and followed by a rent type or a decimal value.
        following = next_d.lower()
        return ('.' not in datum) and \
            ((following in rentTypes) or ('.' in following))

    if column == 'Vacancy %':
        # Any decimal value is accepted. The original also looked at the next
        # value, but that never changed the result.
        return '.' in datum

    if column == 'Rent Type':
        return datum.lower() in rentTypes

    if column == 'Affordable Type':
        return datum.lower() in affordableTypes

    if column == 'Parking Spaces/Unit':
        # A decimal preceded by a rent/affordable type or by a one-character
        # value. NOTE(review): the one-character case presumably covers a
        # stories count -- confirm.
        preceding = prev_d.lower()
        return ('.' in datum) and \
            ((preceding in rentTypes + affordableTypes) or (len(preceding) == 1))

    # Fallback (last attribute of a record): accepted when the next value
    # starts a new record, i.e. is an address, or when there is no next value.
    if (currIdx + 1) not in data['data'].index:
        return True
    return bool(re.findall(addressPattern, next_d))

In [221]:
# Create final dataframe
# The column order matters: the parsing loop tests each raw value against
# these attributes in exactly this sequence.

colNames = [
    'Address',
    'Building Name',
    'Class',
    'Units',
    'RBA/GLA',
    'Secondary Type',
    'Submarket',
    'City',
    'Land(AC)',
    'Built/Renov',
    'Total Buildings',
    'Stories',
    'Vacancy %',
    'Rent Type',
    'Affordable Type',
    'Parking Spaces/Unit',
    'Parking Spaces',
]
finalDf = pd.DataFrame(columns=colNames)

In [223]:
# Begin iterating over data, identify null attributes, populate final dataframe
#
# Each pass of the outer loop walks the expected attributes in order. A raw
# value accepted by attributeCheck is stored and the cursor `i` advances; a
# rejected value leaves the attribute as NaN and is re-tested against the next
# attribute. A trace line is printed for every test.

prev = (None, None)   # not read by this loop; kept in case other cells rely on it
maxProperties = 5     # sample-run cap: stop once this many properties are parsed
rawValues = data['data']
totalValues = len(rawValues)

i = 0
while i < totalValues and finalDf.shape[0] < maxProperties:

    passStart = i
    for column in colNames:
        if i >= totalValues:
            # The data ended in the middle of a record; the attributes that
            # were not reached stay NaN.
            break

        d = rawValues[i]
        status = attributeCheck(d, column, currIdx=i)
        print(f"{d}, {column}, {status}")

        # Until the first address has been stored there is no row to write to.
        hasRecord = finalDf.shape[0] > 0
        if status:
            if column == 'Address':
                finalDf.loc[finalDf.shape[0], 'Address'] = d
            elif hasRecord:
                finalDf.loc[finalDf.shape[0]-1, column] = d
            i += 1
        elif hasRecord:
            # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
            finalDf.loc[finalDf.shape[0]-1, column] = np.nan

    if i == passStart:
        # No attribute accepted this value. Skip it, otherwise the same value
        # would be tested forever.
        print(f"{rawValues[i]}, [no matching column], skipped")
        i += 1

106-114 4th St, Address, True
C, Building Name, False
C, Class, True
5, Units, True
5,800, RBA/GLA, True
Apartments, Secondary Type, True
Del Mar, Submarket, True
Del Mar, City, True
0.18, Land(AC), True
1, Built/Renov, False
1, Total Buildings, True
2, Stories, True
4.0, Vacancy %, True
Market, Rent Type, True
0.80, Affordable Type, False
0.80, Parking Spaces/Unit, True
4, Parking Spaces, True
140 4th St, Address, True
C, Building Name, False
C, Class, True
6, Units, True
5,920, RBA/GLA, True
Apartments, Secondary Type, True
Del Mar, Submarket, True
Del Mar, City, True
0.21, Land(AC), True
1, Built/Renov, False
1, Total Buildings, True
2, Stories, True
Market, Vacancy %, False
Market, Rent Type, True
201 4th St, Affordable Type, False
201 4th St, Parking Spaces/Unit, False
201 4th St, Parking Spaces, False
201 4th St, Address, True
Los Arboles Apartments, Building Name, True
B, Class, True
97, Units, True
126,570, RBA/GLA, True
Apartments, Secondary Type, True
Del Mar, Submarket, True

In [224]:
# Show the records assembled by the parsing loop.
finalDf

Unnamed: 0,Address,Building Name,Class,Units,RBA/GLA,Secondary Type,Submarket,City,Land(AC),Built/Renov,Total Buildings,Stories,Vacancy %,Rent Type,Affordable Type,Parking Spaces/Unit,Parking Spaces
0,106-114 4th St,,C,5,5800,Apartments,Del Mar,Del Mar,0.18,,1,2,4.0,Market,,0.8,4.0
1,140 4th St,,C,6,5920,Apartments,Del Mar,Del Mar,0.21,,1,2,,Market,,,
2,201 4th St,Los Arboles Apartments,B,97,126570,Apartments,Del Mar,Del Mar,2.23,1970.0,2,3,2.2,Market,,1.36,132.0
3,129-131 10th St,,C,5,5544,Apartments,Del Mar,Del Mar,0.37,1953.0,2,1,4.0,Market,,1.0,5.0
4,320-322 11th St,,C,2,3902,Apartments,Del Mar,Del Mar,0.2,2017.0,2,3,4.0,Market,,2.0,2.0


In [121]:
# Sanity check: a year followed by a dangling slash must not match the
# built/renovated pattern (same expression as builtRenovPattern).
test = '1992/'
pattern = r'(^[0-9]{4}$)|(^[0-9]{4}/{1}[0-9]{4}$)'
result = re.findall(string=test, pattern=pattern)
result

[]

In [69]:
# Truthiness check on `result`: an empty list (no regex match) prints nothing.
if result:
    print("Hello")

In [35]:
# Inspect the single-column raw frame.
data

Unnamed: 0,data
0,106-114 4th St
1,C
2,5
3,5800
4,Apartments
...,...
1381,2
1382,3.9
1383,Market
1384,1.20


##### Unused code archive

In [None]:
# prev = (None, None)
# for i in range(len(data['data'])):
#     d = data['data'][i]

#     # Check if d is a property address
#     addressPattern = r'^\d+.+[a-zA-Z]{1}$'
#     result = re.findall(addressPattern, d)
#     if result:
#         if prev[1] == 'Address': 
#             pass
#         else:
#             finalDf.loc[finalDf.shape[0], 'Address'] = d
#             prev = (d, 'Address')
#             continue

#     # Check if d is a building name
#     classPattern = r'^[A-Z]{1}$'
#     next_d = data['data'][i+1]
#     if (prev[1] == 'Address') and (re.findall(classPattern, next_d)):
#         finalDf.loc[finalDf.shape[0]-1, 'Building Name'] = d
#         prev = (d, 'Building Name')
#         continue
#     elif (prev[1] == 'Address') and (re.findall(classPattern, d)):
#         finalDf.loc[finalDf.shape[0]-1, 'Building Name'] = np.NaN
#         finalDf.loc[finalDf.shape[0]-1, 'Class'] = d
#         prev = (d, 'Class')
#         continue

#     # Check if d is a class