In [None]:
import pandas as pd
import numpy as np
import altair as alt

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Lasso, Ridge

In [None]:
# Load the working DataFrame from 'df.pickle', resolved relative to the
# current working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
df = pd.read_pickle('df.pickle')
# Preview the first rows as the cell output.
df.head()

In [None]:
# Combine the per-structure-size permit counts into a single weighted figure.
# Weights are 1, 2, 3.5 and 10 for the four categories.
# NOTE(review): 3.5 and 10 look like approximate dwelling units per permit for
# the '3 and 4 Units' and '5 Units or More' buckets -- confirm.
df['Weighted Total'] = (
    df['1 Unit']
    .add(df['2 Units'].mul(2))
    .add(df['3 and 4 Units'].mul(3.5))
    .add(df['5 Units or More'].mul(10))
)

# Normalise both the weighted and the plain permit totals three ways:
#   * per 100,000 residents (the usual scale for population statistics such
#     as crime rates),
#   * per person of 1-year population growth,
#   * per person of 5-year population growth.
# The growth ratios are left unscaled because permits are expected to be
# directly proportional to population growth.
# NOTE(review): rows whose growth is zero yield +/-inf and rows with negative
# growth yield negative ratios -- confirm downstream cells handle that.
for _numerator, _per_100k, _per_growth_1y, _per_growth_5y in (
    ('Weighted Total', 'Adjusted Weighted Total',
     'Adjusted Weighted 1Y Total', 'Adjusted Weighted 5Y Total'),
    ('Total', 'Adjusted Total',
     'Adjusted 1Y Total', 'Adjusted 5Y Total'),
):
    df[_per_100k] = df[_numerator] / df['Population'] * 100_000
    df[_per_growth_1y] = df[_numerator] / df['Pop Growth 1 Year']
    df[_per_growth_5y] = df[_numerator] / df['Pop Growth 5 Year']

df.head()

In [None]:
# Column names of `df`, grouped by theme so later cells can select a whole
# family of columns with one lookup (e.g. columns['housing']).
columns = {
    # Identifier / bookkeeping columns
    'index': ['MSA', 'Date', 'Year', 'Month', 'filename'],

    # Building-permit counts
    'permits': [
        'Total',
        '1 Unit',
        '2 Units',
        '3 and 4 Units',
        '5 Units or More',
        'Num of Structures With 5 Units or More',
    ],

    # Price series and its components
    'price': [
        'Price',
        'Seasonal',
        'Trend',
        'Residual',
        'Price Change',
        'Trend Change',
    ],

    # Population levels and growth
    'population': [
        'Population 1', 'Population 5', 'Population Diff', 'Population',
        'Pop Growth 1 Year', 'Pop -1 Years', 'Pop Percent 1 Year',
        'Pop Growth 5 Year', 'Pop -5 Years', 'Pop Percent 5 Year',
    ],

    # Housing stock
    'housing': [
        # occupancy and vacancy
        'Total housing units', 'Occupied housing units', 'Vacant housing units',
        'Homeowner vacancy rate', 'Rental vacancy rate',
        # units in structure
        '1-unit, detached', '1-unit, attached', '2 units', '3 or 4 units',
        '5 to 9 units', '10 to 19 units', '20 or more units',
        # room counts
        '1 room', '2 rooms', '3 rooms', '4 rooms', '5 rooms', '6 rooms',
        '7 rooms', '8 rooms', '9 rooms or more', 'Median rooms',
        # bedroom counts
        'No bedroom', '1 bedroom', '2 bedrooms', '3 bedrooms', '4 bedrooms',
        '5 or more bedrooms',
    ],

    # Income
    'income': ['Income'],
}

In [None]:
# Columns carried into the housing table: identifiers, income, and every
# population and housing-stock column.
_housing_view_columns = (
    ['MSA', 'Month', 'Year', 'filename', 'Income']
    + columns['population']
    + columns['housing']
)

# Take an independent copy of those columns, then keep only the rows where
# every housing-stock column is present (dropna's default how='any').
housing_df = (
    df[_housing_view_columns]
    .copy()
    .dropna(subset=columns['housing'], axis='rows')
)
housing_df.head()

In [None]:
# Number of rows that survived the housing-column NaN filter.
housing_df.shape[0]