In [None]:
import pandas as pd
import numpy as np
import altair as alt

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, accuracy_score, f1_score

In [None]:
# Load the pre-built dataset from the working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load a
# df.pickle that was produced by a trusted pipeline.
DATA_PICKLE = 'df.pickle'
df = pd.read_pickle(DATA_PICKLE)
df.head(5)

In [None]:
# Show every column name on one line so nothing is elided by the frame repr.
print([*df.columns])

In [None]:
def _ratio_or_nan(numerator, denominator):
    """Element-wise ``numerator / denominator`` with NaN where the denominator is 0.

    Plain division yields +/-inf for a zero denominator (for example a period
    with no population change). inf is not removed by ``dropna`` and is
    rejected by scikit-learn estimators, so it is mapped to NaN here instead.
    """
    return numerator / denominator.replace(0, np.nan)


# Weight each permit class by a per-structure unit count: 3.5 is the midpoint
# of "3 and 4"; 10 is presumably a stand-in for "5 or more" -- TODO confirm.
df['Weighted Total'] = (
    df['1 Unit']
    + (2 * df['2 Units'])
    + (3.5 * df['3 and 4 Units'])
    + (10 * df['5 Units or More'])
)

# Over entire pop, so adjust by 100,000 (standard for population statistics like crime)
df['Adjusted Weighted Total'] = _ratio_or_nan(df['Weighted Total'], df['Population']) * 100_000
# But don't multiply these, since we expect permits to be directly proportional to population growth
df['Adjusted Weighted 1Y Total'] = _ratio_or_nan(df['Weighted Total'], df['Pop Growth 1 Year'])
df['Adjusted Weighted 5Y Total'] = _ratio_or_nan(df['Weighted Total'], df['Pop Growth 5 Year'])

# And do the same thing to our non-weighted total
df['Adjusted Total'] = _ratio_or_nan(df['Total'], df['Population']) * 100_000
df['Adjusted 1Y Total'] = _ratio_or_nan(df['Total'], df['Pop Growth 1 Year'])
df['Adjusted 5Y Total'] = _ratio_or_nan(df['Total'], df['Pop Growth 5 Year'])

df.head()

In [None]:
# Column names of `df`, grouped by theme so later cells can select a whole
# group at once (e.g. columns['housing']).
columns = dict(
    index=['MSA', 'Date', 'Year', 'Month', 'filename'],
    # Permit
    permits=[
        'Total',
        '1 Unit',
        '2 Units',
        '3 and 4 Units',
        '5 Units or More',
        'Num of Structures With 5 Units or More',
    ],
    # Price
    price=['Price', 'Seasonal', 'Trend', 'Residual', 'Price Change', 'Trend Change'],
    # Population
    population=[
        'Population 1', 'Population 5', 'Population Diff', 'Population',
        'Pop Growth 1 Year', 'Pop -1 Years', 'Pop Percent 1 Year',
        'Pop Growth 5 Year', 'Pop -5 Years', 'Pop Percent 5 Year',
    ],
    # Housing Stock
    housing=[
        'Total housing units', 'Occupied housing units', 'Vacant housing units',
        'Homeowner vacancy rate', 'Rental vacancy rate',
        '1-unit, detached', '1-unit, attached',
        '2 units', '3 or 4 units', '5 to 9 units', '10 to 19 units', '20 or more units',
        '1 room', '2 rooms', '3 rooms', '4 rooms', '5 rooms', '6 rooms', '7 rooms',
        '8 rooms', '9 rooms or more', 'Median rooms',
    ],
    # Income
    income=['Income'],
)

In [None]:
# Columns kept for the housing-stock models: a few fixed columns followed by
# every entry of columns['housing'].
# NOTE(review): 'Zillow' is not listed in `columns`; presumably it exists in df -- verify.
house_columns = [
    'Zillow', 'Month', 'Year', 'filename', 'Income', 'Population', 'Price',
    *columns['housing'],
]

In [None]:
# Independent copy of df restricted to house_columns, keeping only the rows
# in which every housing-stock column is populated.
housing_df = (
    df.loc[:, house_columns]
    .copy()
    .dropna(axis='rows', subset=columns['housing'])
)
housing_df.head()

In [None]:
# Feature columns shared by the models below, and the running list of
# R2 records they produce.
X_columns = ['Income', *columns['housing']]
scores = []

def test_model(Model, kwargs=None):
    """Fit ``Model`` separately on each ``filename`` group and score it with R2.

    For every group of ``housing_df`` (grouped by 'filename'), rows missing any
    of ``X_columns`` or 'Price' are dropped, the remainder is split 80/20 with
    a fixed seed, the estimator is fitted on the training part and R2 is
    computed on the held-out part. Each score is also printed.

    Parameters
    ----------
    Model : estimator class
        A scikit-learn style class exposing ``fit`` and ``predict``.
    kwargs : dict, optional
        Keyword arguments forwarded to ``Model(...)``. Previously this argument
        was accepted but silently ignored. Pass e.g. ``{'random_state': 0}`` to
        make stochastic estimators reproducible.

    Returns
    -------
    list of dict
        One ``{'file', 'model', 'score'}`` record per file.
    """
    model_kwargs = kwargs or {}
    required = X_columns + ['Price']
    r2_scores = []

    for file_name, file_df in housing_df.groupby('filename'):
        file_df = file_df.dropna(subset=required, axis="rows")
        train_df, test_df = train_test_split(file_df, train_size=0.8, random_state=42)

        # Use a 1-D target (Series) rather than a one-column DataFrame:
        # estimators expect a 1-D y (some warn on a column vector), and this
        # matches how test_scaled_model / test_pca_model pass `Price`.
        model = Model(**model_kwargs).fit(train_df[X_columns], train_df['Price'])
        y_pred = model.predict(test_df[X_columns])

        r2 = r2_score(test_df['Price'], y_pred)
        r2_scores.append({'file': file_name, 'model': Model.__name__, 'score': r2})

        print(f"{file_name} R2 {r2}")

    return r2_scores

# Baseline: LinearRegression with default settings.
scores.extend(test_model(LinearRegression))

In [None]:
# Append the per-file R2 records for Lasso with default settings.
scores.extend(test_model(Lasso))

In [None]:
# Append the per-file R2 records for Ridge with default settings.
scores.extend(test_model(Ridge))

In [None]:
# Append the per-file R2 records for DecisionTreeRegressor with default settings.
scores.extend(test_model(DecisionTreeRegressor))

In [None]:
# Append the per-file R2 records for GradientBoostingRegressor with default settings.
scores.extend(test_model(GradientBoostingRegressor))

In [None]:
def get_tier(x):
    """Return the price-tier label encoded in a file name.

    'Mid' if the name contains '0.33_0.67', otherwise 'Low' if it contains
    '0.0_0.33', otherwise 'Hi'.
    """
    # Order matters: the 'Mid' marker is checked before the 'Low' marker.
    tier_markers = (('0.33_0.67', 'Mid'), ('0.0_0.33', 'Low'))
    return next((label for marker, label in tier_markers if marker in x), 'Hi')

def get_type(x):
    """Return the home-type label encoded in a file name.

    * names containing 'bdrmcnt' -> '<count> Bedroom(s)', where <count> is the
      '_'-delimited token immediately after 'bdrmcnt';
    * names containing '_condo_tier' -> 'Condo';
    * names containing '_sfr_tier' -> 'Single Family';
    * anything else -> 'All'.

    Raises
    ------
    ValueError
        If the name contains 'bdrmcnt' but no count token follows it.
    """
    if 'bdrmcnt' in x:
        # Read the token right after the marker instead of assuming it is
        # always the 4th '_'-separated field; the positional lookup breaks
        # (wrong label or IndexError) as soon as the name carries a prefix
        # with underscores, e.g. a directory path.
        _, marker, tail = x.partition('bdrmcnt_')
        count = tail.split("_")[0] if marker else ''
        if not count:
            raise ValueError(f"no bedroom count follows 'bdrmcnt' in {x!r}")
        return f"{count} Bedroom(s)"
    if '_condo_tier' in x:
        return "Condo"
    if "_sfr_tier" in x:
        return "Single Family"
    return "All"

def add_file_data(score_list):
    """Build a DataFrame from score records and label each row by its file.

    Adds three columns derived from the 'file' value of every record:
    'tier' (via get_tier), 'type' (via get_type) and 'category', the string
    '<tier> Tier, <type>'. A copy of the resulting frame is returned.
    """
    annotated = pd.DataFrame(score_list)

    annotated['tier'] = annotated.file.map(get_tier)
    annotated['type'] = annotated.file.map(get_type)
    annotated['category'] = [
        f"{tier} Tier, {kind}"
        for tier, kind in zip(annotated['tier'], annotated['type'])
    ]

    return annotated.copy()

# Label the unscaled model scores with tier / type / category.
r2_df = add_file_data(score_list=scores)
r2_df.head(n=5)

In [None]:
# One facet per model; within a facet, one bar per file category showing R2.
r2_bars = alt.Chart(r2_df).mark_bar()
r2_bars.encode(
    x=alt.X('category', title=None),
    y=alt.Y('score', title='R2 Score'),
    color=alt.Color('category', legend=None),
    column=alt.Column('model', title=None),
)

In [None]:
# Running list of R2 records for the scaled-feature experiments.
scaled_scores = []
def test_scaled_model(Model, Scaler, kwargs=None):
    """Fit ``Model`` on scaled features for each ``filename`` group and score it with R2.

    For every group of ``housing_df`` (grouped by 'filename'), rows missing any
    of ``X_columns`` or 'Price' are dropped and the remainder is split 80/20
    with a fixed seed. The scaler is fitted on the training rows only and then
    applied to both parts, so no test-set statistics leak into training
    (previously the scaler was fitted on all rows before the split). Each score
    is also printed.

    Parameters
    ----------
    Model : estimator class
        A scikit-learn style class exposing ``fit`` and ``predict``.
    Scaler : transformer class
        A scikit-learn style class exposing ``fit`` and ``transform``;
        constructed with its defaults.
    kwargs : dict, optional
        Keyword arguments forwarded to ``Model(...)`` (not to the scaler).
        Previously this argument was accepted but silently ignored.

    Returns
    -------
    list of dict
        One ``{'file', 'model', 'scaler', 'score'}`` record per file.
    """
    model_kwargs = kwargs or {}
    required = X_columns + ['Price']
    r2_scores = []

    for file_name, file_df in housing_df.groupby('filename'):
        file_df = file_df.dropna(subset=required, axis="rows")

        # Split first; same seed and row count as before, so the same rows
        # end up in each part.
        X_train, X_test, y_train, y_test = train_test_split(
            file_df[X_columns], file_df.Price, train_size=0.8, random_state=42
        )

        # Learn the scaling parameters from the training rows only.
        scaler = Scaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        model = Model(**model_kwargs).fit(X_train, y_train)
        y_pred = model.predict(X_test)

        r2 = r2_score(y_test, y_pred)
        r2_scores.append({'file': file_name, 'model': Model.__name__, 'scaler': Scaler.__name__, 'score': r2})

        print(f"{file_name} R2 {r2}")

    return r2_scores

# LinearRegression on StandardScaler-transformed features.
scaled_scores.extend(test_scaled_model(LinearRegression, StandardScaler))

In [None]:
# Lasso and Ridge on StandardScaler-transformed features, in that order.
scaled_scores.extend(test_scaled_model(Lasso, StandardScaler))
scaled_scores.extend(test_scaled_model(Ridge, StandardScaler))

In [None]:
# Label the scaled-model scores with tier / type / category.
scaled_r2_df = add_file_data(score_list=scaled_scores)
scaled_r2_df.head(n=5)

In [None]:
# Combine scaled and unscaled scores for the models that were run both ways.
# The unscaled rows are rebuilt from `scores` instead of mutating and then
# overwriting r2_df in place: re-running the old cell relabelled the already
# merged scaled rows as 'None' and appended duplicate rows.
unscaled_r2_df = add_file_data(scores).assign(scaler='None')
scaled_models = scaled_r2_df.model.unique()
r2_df = pd.concat([scaled_r2_df, unscaled_r2_df[unscaled_r2_df.model.isin(scaled_models)]])

r2_df.head()

In [None]:
# One facet per model; points are R2 by file category, coloured by scaler.
r2_points = alt.Chart(r2_df).mark_point()
r2_points.encode(
    x=alt.X('category', title=None),
    y=alt.Y('score', title=None),
    color=alt.Color('scaler'),
    column=alt.Column('model', title=None),
)

In [None]:
# Sanity check: list the distinct scaler labels present in the combined frame.
r2_df['scaler'].unique()

In [None]:
def test_pca_model(Model, n, kwargs=None):
    """Fit ``Model`` on the first ``n`` principal components, per ``filename`` group.

    PCA is fitted once on ``X_columns`` over all rows of ``housing_df`` that
    have complete features; the projected rows are then grouped by 'filename',
    split 80/20 with a fixed seed, and scored with R2 on the held-out part.
    Each score is also printed.

    Parameters
    ----------
    Model : estimator class
        A scikit-learn style class exposing ``fit`` and ``predict``.
    n : int
        Value passed to ``PCA(n_components=...)``.
    kwargs : dict, optional
        Keyword arguments forwarded to ``Model(...)``. Previously this argument
        was accepted but silently ignored.

    Returns
    -------
    list of dict
        One ``{'file', 'model', 'components', 'score'}`` record per file.
    """
    model_kwargs = kwargs or {}

    # PCA rejects NaN, so rows with incomplete features are removed before
    # projecting rather than afterwards.
    feature_df = housing_df.dropna(subset=X_columns, axis="rows")

    # NOTE(review): the projection is learned from all rows of all files,
    # including rows that later land in a test split -- consider fitting PCA
    # on training rows only.
    pca_X = PCA(n_components=n).fit_transform(feature_df[X_columns])

    pca_df = pd.DataFrame(pca_X)
    pca_cols = list(pca_df.columns)
    # pca_df has a fresh 0..N-1 index while feature_df keeps the labels of the
    # rows that survived filtering. Assigning the Series directly would align
    # on those labels and attach the wrong filename/Price (or NaN) to each row,
    # so copy the values positionally instead.
    pca_df['filename'] = feature_df['filename'].to_numpy()
    pca_df['Price'] = feature_df['Price'].to_numpy()

    r2_scores = []
    for file_name, file_df in pca_df.groupby('filename'):
        file_df = file_df.dropna(subset=pca_cols + ['Price'], axis="rows")

        X_train, X_test, y_train, y_test = train_test_split(
            file_df[pca_cols], file_df.Price, train_size=0.8, random_state=42
        )

        model = Model(**model_kwargs).fit(X_train, y_train)
        y_pred = model.predict(X_test)

        r2 = r2_score(y_test, y_pred)
        r2_scores.append({'file': file_name, 'model': Model.__name__, 'components': n, 'score': r2})

        print(f"{file_name} R2 {r2}")

    return r2_scores

In [None]:
# Sweep the number of retained principal components for LinearRegression,
# collecting every per-file record in call order.
pca_scores = [
    record
    for n_components in (3, 5, 10, 20, 23)
    for record in test_pca_model(LinearRegression, n_components)
]

In [None]:
# Label the PCA sweep results, then draw one facet per file category with a
# bar per component count (treated as nominal) on a fixed 0..1 R2 axis.
pca_scores_df = add_file_data(score_list=pca_scores)

pca_bars = alt.Chart(pca_scores_df).mark_bar()
pca_bars.encode(
    x=alt.X('components:N', title='Components'),
    y=alt.Y('score', title='R2 Score', scale=alt.Scale(domain=[0, 1])),
    color=alt.Color('category', legend=None),
    column=alt.Column('category', title=None),
)