In [89]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

# Panel Data Version

### Median square footage per home per year in neighborhood

In [2]:
# Read the source GeoDataFrame that every later cell refers to as `tax`.
tax = gpd.read_feather(
    '../data/fair_market_acquisition.geofeather'
)

In [3]:
# Rewrite three name spellings to their alternative forms.
# NOTE(review): with no column restriction the mapping is applied to every
# column of `tax`; presumably only the neighborhood columns are meant — confirm.
tax = tax.replace(
    {
        'South Of Market': 'South of Market',
        'Ocean View': 'Oceanview',
        'Marina': 'Marina District',
    }
)

In [4]:
# Turn the literal string 'NaN' into a real missing value in both
# neighborhood-name columns.
neighborhood_cols = ['assessor_neighborhood', 'zillow_neighborhood_name']
tax[neighborhood_cols] = tax[neighborhood_cols].replace({'NaN': np.nan})

In [5]:
# Keep roll years before 2017; copy so later column assignments act on an
# independent frame rather than a slice.
tax = tax.loc[tax['closed_roll_year'] < 2017].copy()

In [6]:
# Keep only residential records that have a positive unit count and a positive area.
# `na=False` makes rows with a missing `use_definition` explicit non-matches
# (replacing the `| False` trick), and `.copy()` makes `residential` an
# independent frame so adding columns to it later does not raise
# SettingWithCopyWarning.
is_residential = tax['use_definition'].str.contains('Residential', na=False)
has_units = tax['number_of_units'] > 0
has_area = tax['property_area'] > 0
residential = tax[is_residential & has_units & has_area].copy()

In [7]:
# Area per unit, rounded to the nearest whole number.
residential['home_size'] = (
    residential['property_area']
    .div(residential['number_of_units'])
    .round()
)

In [8]:
# Median home size for each (assessor neighborhood, roll year) pair.
sqft_by_neighborhood1 = (
    residential
    .groupby(['assessor_neighborhood', 'closed_roll_year'])
    .agg({'home_size': 'median'})
)

In [9]:
# Median home size for each (Zillow neighborhood name, roll year) pair.
sqft_by_neighborhood2 = (
    residential
    .groupby(['zillow_neighborhood_name', 'closed_roll_year'])
    .agg({'home_size': 'median'})
)

In [10]:
# Move the group keys out of the index into ordinary columns so they can be
# used as merge keys.
# NOTE: re-running this cell resets the index again and adds an `index` column.
sqft_by_neighborhood1 = sqft_by_neighborhood1.reset_index()
sqft_by_neighborhood2 = sqft_by_neighborhood2.reset_index()

### Panel Data of Rents (Zillow ZHVI home values are used as the "rent" measure)

In [11]:
# Load the Zillow neighborhood-level ZHVI table.
home_prices = pd.read_csv(
    '../data/efz/Neighborhood_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv'
)

In [12]:
# Restrict to San Francisco rows.
home_prices = home_prices.loc[home_prices['City'] == 'San Francisco']

# Keep only the columns whose label contains '12-31'.
annual = [label for label in home_prices.columns if '12-31' in label]

# One row per region: its name (renamed to `neighborhood`) plus the '12-31' columns.
neighborhood_rents = (
    home_prices[['RegionName', *annual]]
    .rename(columns={'RegionName': 'neighborhood'})
)

In [13]:
# Take the leading year out of each date-like column label and shift it
# forward by one, so a '12-31' value is attributed to the following year.
covered_years = (
    neighborhood_rents.columns[1:]
    .str.split('-')
    .str.get(0)
    .astype(int)
    + 1
)

In [14]:
# Relabel the value columns with their covered year.
neighborhood_rents.columns = ['neighborhood', *covered_years.tolist()]

# Wide -> long: one row per (neighborhood, year) with the value in `rent`.
neighborhood_rents = pd.melt(
    neighborhood_rents,
    id_vars='neighborhood',
    var_name='year',
    value_name='rent',
)

# Keep years 2007 through 2016 inclusive.
in_window = (neighborhood_rents['year'] >= 2007) & (neighborhood_rents['year'] < 2017)
neighborhood_rents = neighborhood_rents[in_window]

In [15]:
# Number of distinct neighborhoods left in the long table.
neighborhood_rents['neighborhood'].nunique()

108

### Match Zillow neighborhoods with SF

In [16]:
# Inner-join median home size with the yearly values, matching on the
# assessor neighborhood name and the roll year.
sqft_rent_by_hood1 = pd.merge(
    left=sqft_by_neighborhood1,
    right=neighborhood_rents,
    how='inner',
    left_on=['assessor_neighborhood', 'closed_roll_year'],
    right_on=['neighborhood', 'year'],
)

In [17]:
# Same inner join, this time matching on the Zillow neighborhood name and
# the roll year.
sqft_rent_by_hood2 = pd.merge(
    left=sqft_by_neighborhood2,
    right=neighborhood_rents,
    how='inner',
    left_on=['zillow_neighborhood_name', 'closed_roll_year'],
    right_on=['neighborhood', 'year'],
)

In [18]:
# Row/column counts of the two matched tables, shown side by side.
tuple(matched.shape for matched in (sqft_rent_by_hood1, sqft_rent_by_hood2))

((608, 6), (299, 6))

In [19]:
# Add the value per unit of home size, rounded to a whole number, to both
# matched tables in place.
for hood_frame in (sqft_rent_by_hood1, sqft_rent_by_hood2):
    hood_frame['rent/sf'] = hood_frame['rent'].div(hood_frame['home_size']).round(0)

# Roll back into tax df

In [43]:
# Left-join the assessor-name matches onto every `tax` row.
merge1 = pd.merge(
    left=tax,
    right=sqft_rent_by_hood1,
    on=['assessor_neighborhood', 'closed_roll_year'],
    how='left',
)

In [44]:
# Left-join the Zillow-name matches onto every `tax` row.
merge2 = pd.merge(
    left=tax,
    right=sqft_rent_by_hood2,
    on=['zillow_neighborhood_name', 'closed_roll_year'],
    how='left',
)

In [53]:
# Prefer the assessor-name match and fall back to the Zillow-name match.
# The result is assigned positionally, which assumes both left merges return
# exactly one row per `tax` row in the original order (i.e. the right-hand
# keys are unique) — a length mismatch would raise here.
tax['sqft_price'] = (
    merge1['rent/sf']
    .fillna(merge2['rent/sf'])
    .to_numpy()
)

In [54]:
# Share of `tax` rows that received no price from either match.
tax['sqft_price'].isnull().mean()

0.05883426935879795

### Impute remaining missing values per year with KNN on the (x, y) coordinates. Do better matching later

In [90]:
# Regressor used for imputation: each prediction is based on the 2 nearest
# training points.
imputer = KNeighborsRegressor(
    n_neighbors=2,
)

In [67]:
# Work on a separate copy so `tax` keeps its un-imputed prices.
tax_imputed = tax.copy(deep=True)

In [82]:
# Replace the index with a fresh 0..n-1 range and discard the old labels.
tax_imputed = tax_imputed.reset_index(
    drop=True,
)

In [95]:
# Debug check: after dropna(), do the (x, y) columns of the priced rows still
# contain any NaN?
# NOTE(review): `year_df` and `missing` are first assigned inside the per-year
# imputation loop further down, so in the visible notebook this cell fails on a
# fresh kernel when run top to bottom.
year_df[~missing][['x', 'y']].dropna().isna().any()

x    False
y    False
dtype: bool

In [102]:
# Debug check: how often each x coordinate occurs in the training sample.
# NOTE(review): `train` is first assigned inside the per-year imputation loop
# further down, so in the visible notebook this cell fails on a fresh kernel
# when run top to bottom.
train['x'].value_counts()

x
-212141.396304    67
-212810.458565    61
-211814.866711    50
-210251.679990    41
-210499.633977    37
                  ..
-214266.719996     1
-219949.755300     1
-214507.627256     1
-211070.928773     1
-219195.053363     1
Name: count, Length: 8674, dtype: int64

In [109]:
# Upper bound on the number of priced rows used to fit the KNN model per year.
MAX_TRAIN_ROWS = 10000

# For each roll year, fit the KNN regressor on priced rows of that year and use
# it to fill in `sqft_price` for rows of the same year that lack one.
for year in tax_imputed['closed_roll_year'].unique():
    print(year)
    # Rows for this roll year, and which of them still lack a price.
    year_df = tax_imputed[tax_imputed['closed_roll_year'] == year]
    missing = year_df['sqft_price'].isna()

    # Training pool: priced rows with complete coordinates and geometry.
    priced = year_df[~missing][['x', 'y', 'geometry', 'sqft_price']].dropna()
    # Asking `sample` for more rows than exist raises ValueError, so cap the
    # request at the pool size; with >= MAX_TRAIN_ROWS rows the draw is the
    # same as a fixed-size sample with the same seed.
    train = priced.sample(n=min(len(priced), MAX_TRAIN_ROWS), random_state=0)

    # Rows to fill: missing price, geometry present, and usable coordinates
    # (the regressor cannot predict from NaN inputs).
    to_fill = year_df[missing & year_df['geometry'].notna()][['x', 'y']].dropna()
    if to_fill.empty:
        # Nothing to impute this year; predicting on zero rows would raise.
        continue

    # Fit on coordinates -> price, then write predictions back by index label.
    imputer.fit(X=train[['x', 'y']], y=train['sqft_price'])
    tax_imputed.loc[to_fill.index, 'sqft_price'] = imputer.predict(to_fill)

2009
2012
2014
2015
2010
2008
2007
2011
2013
2016


In [113]:
# Share of rows still without a price after imputation.
tax_imputed['sqft_price'].isnull().mean()

0.000318406138594761

In [112]:
# Persist the imputed frame to a feather file.
tax_imputed.to_feather(
    '../data/ef_df.geofeather'
)