# House pricing model
In this notebook we build a linear regression model for house price (fitted on the base-10 logarithm of the price).

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import RegressionAnalysis
# Project-local helper object; the data loading, plotting, fitting and
# diagnostic calls below all go through this single instance.
ra = RegressionAnalysis.RegressionAnalysis()

In [None]:
# Load the full data set, then keep only the rows whose price is below
# 2,000,000; `dfA` stays untouched as the unfiltered reference.
dfA = ra.load_data()
df = dfA[dfA['price'] < 2 * 10**6]
df.shape

In [None]:
# Regression target: base-10 logarithm of the sale price.
# `y` is a Series, so it has a `name`, not `columns`; assigning `y.columns`
# would only attach an unused attribute and leave the Series called 'price'.
# `rename` gives it the intended label, which is what shows up as the column
# name once `y` is concatenated with the features.
y = np.log10(df['price']).rename('log_price')
y.shape

In [None]:
# Box-Cox transform of `sqft_living15`. With no lambda supplied, scipy fits
# one and returns it together with the transformed values.
bc_living15, bc_living15_lambda = stats.boxcox(df['sqft_living15'].to_numpy())
bc_living15.shape[0]

In [None]:
# Bedrooms per unit of living area, then Box-Cox transformed.
# NOTE(review): Box-Cox needs strictly positive input, so any row with
# 0 bedrooms would make this raise -- confirm load_data() excludes such rows.
bed_density = df['bedrooms'].div(df['sqft_living'])
bc_bed_density, bc_bed_density_lambda = stats.boxcox(bed_density)
bc_bed_density.shape[0]

In [None]:
# Bathrooms per bedroom, then Box-Cox transformed.
# NOTE(review): a row with 0 bedrooms yields inf/NaN here and a row with
# 0 bathrooms yields 0, neither of which Box-Cox handles meaningfully --
# confirm load_data() excludes such rows.
bath_bed_ratio = df['bathrooms'].div(df['bedrooms'])
bc_bath_bed_ratio, bc_bath_bed_ratio_lambda = stats.boxcox(bath_bed_ratio)
bc_bath_bed_ratio.shape[0]

In [None]:
# Bucket every zipcode into a price tier ('low' / 'med' / 'high') according to
# where its median price falls among the terciles of all zipcode medians, then
# one-hot encode the tier of each row.
zip_medians = df[['zipcode', 'price']].groupby(by='zipcode').agg('median')
q1, q2 = zip_medians['price'].quantile(q=[1/3, 2/3])


def get_zip_category(price):
    """Return the price tier for a zipcode's median price.

    Uses the module-level tercile cut points ``q1`` and ``q2``: a price below
    ``q1`` is ``'low'``, a price from ``q1`` up to (but excluding) ``q2`` is
    ``'med'``, and anything else is ``'high'``.
    """
    if price < q1:
        return 'low'
    if price < q2:
        return 'med'
    return 'high'


zip_medians['category'] = zip_medians['price'].apply(get_zip_category)
zip_category_map = zip_medians['category'].to_dict()
# Every zipcode in `df` is a key of the map because the map was built from `df`.
zip_categories = df['zipcode'].map(zip_category_map)
# dtype=float is explicit on purpose: pandas >= 2.0 produces bool dummies by
# default, and a frame mixing bool and float columns converts to an object
# array, which statsmodels' OLS refuses to fit.
# drop_first removes the first level in sorted order, so (when all three
# tiers occur) 'high' becomes the baseline and the columns are
# 'zipcode_low' and 'zipcode_med'.
zip_cat_dummies = pd.get_dummies(
    zip_categories,
    drop_first=True,
    prefix='zipcode',
    dtype=float,
)
zip_cat_dummies.shape

In [None]:
# Lot area relative to living area, then Box-Cox transformed (lambda fitted
# by scipy and returned next to the transformed values).
lot_ratio = df['sqft_lot'].div(df['sqft_living'])
bc_lot_ratio, bc_lot_ratio_lambda = stats.boxcox(lot_ratio)
bc_lot_ratio.shape[0]

In [None]:
# Assemble the design matrix: zipcode tier dummies plus the selected
# Box-Cox features. `assign` returns a new frame, so `zip_cat_dummies` is
# left unchanged (a plain `X = zip_cat_dummies` would alias it and the
# column assignments would silently add the features to it as well).
# The Box-Cox outputs are plain arrays computed from `df`, so they line up
# with the dummies row by row.
X = zip_cat_dummies.assign(
    bc_living15=bc_living15,
    bc_bath_bed_ratio=bc_bath_bed_ratio,
)
# Features currently left out of the model; add them to the `assign` call
# above to include them:
#   bc_bed_density=bc_bed_density
#   bc_lot_ratio=bc_lot_ratio
X.shape

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Pair plot over the training features with the target appended as the
# last column.
ra.make_pair_plot(pd.concat([X_train, y_train], axis='columns'))

In [None]:
# Correlation plot over the training features with the target appended as
# the last column.
ra.make_correlation_plot(pd.concat([X_train, y_train], axis='columns'))

In [None]:
# Fit the regression on the training split via the project helper. The
# returned object is the one `.score(...)` is called on further down.
model = ra.fit_model(X_train, y_train)

In [None]:
# Model predictions for the training rows. The target is log10(price), so
# these are presumably on the log10 scale as well (they are exponentiated
# with 10** before being compared to prices later on).
y_hat_train = ra.predict_target(X_train, model)

In [None]:
# Training residuals from observed vs. predicted target.
# NOTE(review): sign convention (observed - predicted or the reverse) is
# defined inside RegressionAnalysis -- confirm there.
residuals_train = ra.compute_residuals(y_train, y_hat_train)

In [None]:
# Diagnostic: training residuals plotted against the input features.
ra.plot_residuals_against_inputs(X_train, residuals_train)

In [None]:
# Diagnostic: training residuals plotted against the predicted target.
ra.plot_residuals_against_prediction(y_hat_train, residuals_train)

In [None]:
# Diagnostic: distribution of the training residuals.
ra.plot_residuals_distribution(residuals_train)

In [None]:
# Diagnostic: Q-Q plot of the training residuals (against a normal
# distribution, going by the helper's name).
ra.plot_residuals_normal_qq(residuals_train)

In [None]:
# Collinearity check on the training features.
# NOTE(review): "vif" presumably means variance inflation factor -- confirm
# in RegressionAnalysis.
ra.compute_vif(X_train)

In [None]:
# Score of the fitted model on the training split.
# NOTE(review): the metric depends on what ra.fit_model returns (R^2 if it
# is a scikit-learn regressor -- confirm). `model` is rebound to a
# statsmodels result in the last cell, so run this before that cell.
model.score(X_train, y_train)

In [None]:
# Same score on the held-out test split, for comparison with the training
# score. `model` is rebound to a statsmodels result in the last cell, so run
# this before that cell.
model.score(X_test, y_test)

In [None]:
# Undo the log10 transform on both the observed and the predicted target so
# the residuals are expressed on the original price scale.
price_res_train = ra.compute_residuals(
    np.power(10, y_train),
    np.power(10, y_hat_train),
)

In [None]:
# Price-scale residuals against price-scale predictions (10** undoes the
# log10 applied to the target).
ra.plot_residuals_against_prediction(10**y_hat_train, price_res_train)

In [None]:
# Refit the same regression with statsmodels to get its summary table.
# statsmodels does not add an intercept on its own, so a constant column is
# added explicitly and declared via hasconst.
# NOTE: this rebinds `model`, replacing the object returned by ra.fit_model.
X1_train = sm.add_constant(X_train)
model = sm.OLS(endog=y_train, exog=X1_train, hasconst=True).fit()
model.summary()