# Step 1 - Clean data

In [None]:
import pandas as pd
import re

## 1.1 Clean data for Property Assessment DB

In [None]:
# Load the raw housing data from ../data/raw/all_housing.csv
pa_df = pd.read_csv('../data/raw/all_housing.csv')
# Preview the first rows of the loaded frame
pa_df.head()

In [None]:
# Restrict the frame to the columns of interest (all rows are kept)
interested_columns = [
    "OWN_OCC",
    "LU_DESC",
    "UNIT_NUM",
    "ST_NUM",
    "ST_NAME",
    "ZIPCODE",
    "CITY",
    "RES_UNITS",
    "OWNER",
]
property_assessment = pa_df.loc[:, interested_columns]
property_assessment

In [None]:
# ST_NUM may hold a range of street numbers such as "100 200"; such a row is
# expanded into one output row per street number in the range.
# expand_st_num appends its output to this list.
result_rows = []


def expand_st_num(row):
    """Append ``row`` to ``result_rows``, expanding a street-number range.

    The row is converted to a dict and its ``ST_NUM`` value is interpreted
    from the digit runs it contains:

    * missing value (``None`` / NaN): the row is appended unchanged;
    * no digits at all: the row is appended with ``ST_NUM`` set to ``None``;
    * one number, or a last number that is not greater than the first
      (e.g. ``"12 1/2"``): the row is appended once with ``ST_NUM`` set to
      the first number;
    * an ascending pair ``first .. last``: one copy of the row is appended
      for every integer from ``first`` to ``last`` inclusive.

    Args:
        row: A row object exposing ``to_dict()`` (a pandas Series when called
            through ``DataFrame.apply(..., axis=1)``) with an ``ST_NUM`` key.

    Returns:
        None. Results are collected in the module-level ``result_rows`` list.
    """
    row = row.to_dict()
    raw = row["ST_NUM"]

    # Missing CSV cells are loaded by pandas as NaN, not None, so an
    # ``is None`` check alone would let NaN reach the regex and raise.
    if pd.isna(raw):
        result_rows.append(row)
        return

    # A numeric cell (e.g. 100 or 100.0) is not accepted by ``re.findall``.
    # Drop a trailing ".0" first so that 100.0 is read as 100, not "100", "0".
    if isinstance(raw, float) and raw.is_integer():
        raw = int(raw)

    # Extract every run of digits from the value.
    st_nums = [int(n) for n in re.findall(r'\d+', str(raw))]

    if not st_nums:
        # Street number contains no digits: record it as missing.
        row["ST_NUM"] = None
        result_rows.append(row)
        return

    first, last = st_nums[0], st_nums[-1]

    if last <= first:
        # Single number, or not an ascending range. Keep exactly one row
        # instead of iterating an empty range, which would drop the record.
        row["ST_NUM"] = first
        result_rows.append(row)
        return

    for number in range(first, last + 1):
        new_row = row.copy()
        new_row["ST_NUM"] = number
        result_rows.append(new_row)

# Run expand_st_num on every row; it appends its output to result_rows, so the
# value returned by apply is not used
property_assessment.apply(expand_st_num, axis=1)
# Build the expanded frame from the collected row dicts
result_df = pd.DataFrame(result_rows)
result_df

In [None]:
# Write the expanded rows to ../data/clean/all_housing.csv without the index column
result_df.to_csv('../data/clean/all_housing.csv', index=False)