In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from math import sqrt
from scipy.stats import pearsonr, spearmanr

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PolynomialFeatures
from sklearn.linear_model import LassoLars, LinearRegression, TweedieRegressor

import acquire
import prepare
import explore
import model

%matplotlib inline
pd.options.display.max_columns = None

import warnings
warnings.filterwarnings("ignore")

seed = 42

In [2]:
df = acquire.get_zillow()

In [3]:
df = prepare.prep_zillow(df)

In [4]:
df

Unnamed: 0,bedrooms,bathrooms,sqft,tax_value,year_built,fips
0,3.0,2.0,1175.0,327859.0,1953.0,6037.0
1,3.0,2.0,1630.0,63948.0,1953.0,6037.0
2,2.0,2.0,1206.0,356218.0,1954.0,6037.0
3,3.0,2.0,1790.0,242773.0,1964.0,6037.0
4,3.0,2.0,1400.0,251246.0,1953.0,6037.0
...,...,...,...,...,...,...
56075,3.0,2.0,1400.0,318206.0,1951.0,6037.0
56076,4.0,2.0,1446.0,140804.0,1951.0,6037.0
56077,4.0,2.0,1584.0,412114.0,1955.0,6037.0
56078,4.0,2.0,1584.0,186627.0,1955.0,6037.0


In [5]:
cols = ['bedrooms', 'bathrooms', 'sqft', 'tax_value', 'year_built', 'fips']

In [6]:
df = prepare.remove_outliers(df, 1, cols)

In [7]:
df

Unnamed: 0,bedrooms,bathrooms,sqft,tax_value,year_built,fips
0,3.0,2.0,1175.0,327859.0,1953.0,6037.0
1,3.0,2.0,1630.0,63948.0,1953.0,6037.0
3,3.0,2.0,1790.0,242773.0,1964.0,6037.0
4,3.0,2.0,1400.0,251246.0,1953.0,6037.0
6,4.0,3.0,2603.0,185086.0,1976.0,6037.0
...,...,...,...,...,...,...
56074,3.0,2.0,1549.0,482243.0,1958.0,6037.0
56075,3.0,2.0,1400.0,318206.0,1951.0,6037.0
56076,4.0,2.0,1446.0,140804.0,1951.0,6037.0
56077,4.0,2.0,1584.0,412114.0,1955.0,6037.0


In [8]:
train, validate, test = prepare.subset_df(df)

(15218, 6) (5073, 6) (5073, 6)


In [76]:
X_train, y_train, X_validate, y_validate, X_test, y_test = model.xy_subsets(train, validate, test, 'tax_value')

In [77]:
X_train

Unnamed: 0,bedrooms,bathrooms,sqft,year_built,fips
8567,4.0,3.0,2627.0,1966.0,6059.0
30792,4.0,2.0,1835.0,1948.0,6037.0
55725,3.0,2.0,1664.0,1951.0,6037.0
10157,3.0,2.0,1376.0,1953.0,6037.0
47071,3.0,3.0,1494.0,1951.0,6037.0
...,...,...,...,...,...
47986,4.0,2.0,1405.0,1962.0,6037.0
12991,3.0,2.0,1563.0,1940.0,6037.0
2886,3.0,3.0,1863.0,1946.0,6037.0
35716,4.0,3.0,2433.0,1955.0,6037.0


In [78]:
cols = ['sqft', 'year_built']

X_train_scaled, X_validate_scaled, X_test_scaled = prepare.scale_data(X_train, X_validate, X_test, MinMaxScaler(), cols)

In [79]:
X_train_scaled.shape, X_validate_scaled.shape, X_test_scaled.shape

((15218, 5), (5073, 5), (5073, 5))

In [13]:
X_train_scaled

Unnamed: 0,bedrooms,bathrooms,sqft,year_built,fips
8567,4.0,3.0,0.873286,0.515625,6059.0
30792,4.0,2.0,0.498818,0.234375,6037.0
55725,3.0,2.0,0.417967,0.281250,6037.0
10157,3.0,2.0,0.281797,0.312500,6037.0
47071,3.0,3.0,0.337589,0.281250,6037.0
...,...,...,...,...,...
47986,4.0,2.0,0.295508,0.453125,6037.0
12991,3.0,2.0,0.370213,0.109375,6037.0
2886,3.0,3.0,0.512057,0.203125,6037.0
35716,4.0,3.0,0.781560,0.343750,6037.0


In [14]:
def rfe(n_features, X_train, y_train):
    
    lm = LinearRegression()
    rfe = RFE(lm, n_features_to_select=n_features)

    rfe.fit(X_train, y_train)

    ranks = rfe.ranking_
    columns = X_train.columns.tolist()
    
    feature_ranks = pd.DataFrame({'ranking': ranks,
                              'feature': columns})

    feature_ranks = feature_ranks.sort_values('ranking')

    return pd.DataFrame(feature_ranks).head(n_features)

In [15]:
def f_selector(k, X_train, y_train):
    
    f_selector = SelectKBest(f_regression, k=k)

    f_selector.fit(X_train, y_train)

    f_select_mask = f_selector.get_support()

    select_k_best_features = X_train.iloc[:,f_select_mask]
    
    #print(select_k_best_features.head(k))
    
    return pd.DataFrame(select_k_best_features)

In [16]:
def create_preds_df(y_train):
    
    preds_df = pd.DataFrame({'actual': y_train})

    preds_df['baseline_median'] = y_train.median()
    
    return preds_df

In [17]:
def lin_regression(X_train, y_train):

    lm = LinearRegression()

    lm.fit(X_train, y_train)
    
    preds_df['lm_preds'] = lm.predict(X_train)
    
    return preds_df

In [18]:
def lasso_lars(X_train, y_train, alpha=.1):
    
    lasso = LassoLars(alpha=alpha)

    lasso.fit(X_train, y_train)

    preds_df['lasso_preds'] = lasso.predict(X_train)
    
    return preds_df

In [19]:
def glm_model(X_train, y_train, power=0):
    
    tweedie = TweedieRegressor(power=power)

    tweedie.fit(X_train, y_train)

    preds_df['tweedie_preds'] = tweedie.predict(X_train)
    
    return preds_df

In [20]:
def poly_subset(X_train, y_train, degree=2):
    
    pf = PolynomialFeatures(degree=degree)

    pf.fit(X_train, y_train)

    X_polynomial = pf.transform(X_train)
    
    return X_polynomial

In [21]:
def poly_model(X_polynomial, y_train, m):
    
    model = m

    model.fit(X_polynomial, y_train)

    preds_df['poly_preds'] = model.predict(X_polynomial)
    
    return preds_df

In [22]:
def get_rmses(preds_df):
    
    lm_rmse = sqrt(mean_squared_error(preds_df['lm_preds'], preds_df['actual']))
    lasso_rmse = sqrt(mean_squared_error(preds_df['actual'], preds_df['lasso_preds']))
    tweedie_rmse = sqrt(mean_squared_error(preds_df['actual'], preds_df['tweedie_preds']))
    poly_rmse = sqrt(mean_squared_error(preds_df['actual'], preds_df['poly_preds']))
    
    print(f'Linear Regression RMSE is: {lm_rmse}')
    print(f'Lasso-Lars Regression RMSE is: {lasso_rmse}')
    print(f'GLM Regression RMSE is: {tweedie_rmse}')
    print(f'Polynomial Regression RMSE is: {poly_rmse}')
    
    results = pd.DataFrame({'model':['linear', 'lasso', 'tweedie_norm', 'linear_poly'],
                            'rmse':[lm_rmse, lasso_rmse, tweedie_norm, poly_rmse]})

    return results

In [23]:
def model_results():

    results = pd.DataFrame({'model':['linear', 'lasso', 'tweedie_norm', 'linear_poly'],
                            'rmse':[lm_rmse, lasso_rmse, tweedie_norm, poly_rmse]})

    return results

In [24]:
def hist_charts(df):

    for col in df:

        plt.hist(df[col], bins=25)
        plt.title(f'{col} distribution')
        plt.show()

In [25]:
X_train_scaled = pd.get_dummies(X_train_scaled, columns=['bedrooms', 'bathrooms'])

## linear regression model #1

In [28]:
preds_df = create_preds_df(y_train)

In [29]:
preds_df = lin_regression(X_train_scaled, y_train)

In [30]:
lm_rmse = sqrt(mean_squared_error(preds_df['lm_preds'], preds_df['actual']))

lm_rmse

172009.51215233456

In [31]:
preds_df

Unnamed: 0,actual,baseline_median,lm_preds
8567,725903.0,300238.0,469223.647989
30792,57938.0,300238.0,311659.602352
55725,422528.0,300238.0,304173.920233
10157,231444.0,300238.0,268842.056472
47071,434923.0,300238.0,303286.134904
...,...,...,...
47986,80762.0,300238.0,258764.574362
12991,201840.0,300238.0,291934.725208
2886,468235.0,300238.0,348586.642707
35716,287138.0,300238.0,404830.445538


In [32]:
sqrt(mean_squared_error(preds_df['actual'], preds_df['baseline_median']))

183442.57765522916

In [None]:
preds_df['baseline_avg'] = y_train.mean()

In [35]:
sqrt(mean_squared_error(preds_df['actual'], preds_df['baseline_avg']))

182205.25866689716

## lasso lars model #1

In [41]:
preds_df = lasso_lars(X_train_scaled, y_train, alpha=.33)

In [42]:
preds_df

Unnamed: 0,actual,baseline_median,lm_preds,baseline_avg,lasso_preds
8567,725903.0,300238.0,469223.647989,321508.237679,469033.450646
30792,57938.0,300238.0,311659.602352,321508.237679,311712.986911
55725,422528.0,300238.0,304173.920233,321508.237679,304127.474660
10157,231444.0,300238.0,268842.056472,321508.237679,268839.403249
47071,434923.0,300238.0,303286.134904,321508.237679,303132.021547
...,...,...,...,...,...
47986,80762.0,300238.0,258764.574362,321508.237679,258946.683601
12991,201840.0,300238.0,291934.725208,321508.237679,291836.343325
2886,468235.0,300238.0,348586.642707,321508.237679,348362.402474
35716,287138.0,300238.0,404830.445538,321508.237679,404730.311037


In [43]:
lasso_rmse = sqrt(mean_squared_error(preds_df['actual'], preds_df['lasso_preds']))
lasso_rmse

172009.548540561

## glm model #1

In [50]:
glm_model(X_train_scaled, y_train, 0)

Unnamed: 0,actual,baseline_median,lm_preds,baseline_avg,lasso_preds,tweedie_preds
8567,725903.0,300238.0,469223.647989,321508.237679,469033.450646,371810.587733
30792,57938.0,300238.0,311659.602352,321508.237679,311712.986911,305191.317845
55725,422528.0,300238.0,304173.920233,321508.237679,304127.474660,300075.594157
10157,231444.0,300238.0,268842.056472,321508.237679,268839.403249,298857.792789
47071,434923.0,300238.0,303286.134904,321508.237679,303132.021547,312816.141493
...,...,...,...,...,...,...
47986,80762.0,300238.0,258764.574362,321508.237679,258946.683601,303844.247916
12991,201840.0,300238.0,291934.725208,321508.237679,291836.343325,299147.931500
2886,468235.0,300238.0,348586.642707,321508.237679,348362.402474,314272.173025
35716,287138.0,300238.0,404830.445538,321508.237679,404730.311037,321706.956621


In [51]:
tweedie_norm = sqrt(mean_squared_error(preds_df['actual'], preds_df['tweedie_preds']))

tweedie_norm

178810.67431911838

In [56]:
X_polynomial = poly_subset(X_train_scaled, y_train, 2)

In [57]:
preds_df = poly_model(X_polynomial, y_train, LinearRegression())

In [58]:
poly_rmse = sqrt(mean_squared_error(preds_df['actual'], preds_df['poly_preds']))

poly_rmse

169594.13842512123

Polynomial with a degree of 2 and Linear Regression is best so far

In [59]:
results = pd.DataFrame({'model':['linear', 'lasso', 'tweedie_norm', 'linear_poly'],
              'rmse':[lm_rmse, lasso_rmse, tweedie_norm, poly_rmse]})

results

Unnamed: 0,model,rmse
0,linear,172009.512152
1,lasso,172009.548541
2,tweedie_norm,178810.674319
3,linear_poly,169594.138425


In [60]:
get_rmses(preds_df)

Linear Regression RMSE is: 172009.51215233456
Lasso-Lars Regression RMSE is: 172009.548540561
GLM Regression RMSE is: 178810.67431911838
Polynomial Regression RMSE is: 169594.13842512123


Unnamed: 0,model,rmse
0,linear,172009.512152
1,lasso,172009.548541
2,tweedie_norm,178810.674319
3,linear_poly,169594.138425


# Again, but with feature engineering

## selectkbest

In [61]:
cols = ['sqft', 'year_built']

X_train_scaled, X_validate_scaled, X_test_scaled = prepare.scale_data(X_train, X_validate, X_test, MinMaxScaler(), cols)

In [62]:
X_train_scaled = f_selector(4, X_train_scaled, y_train)

In [64]:
X_train_scaled = pd.get_dummies(X_train_scaled, columns=['bathrooms'])

In [65]:
X_train_scaled

Unnamed: 0,sqft,year_built,fips,bathrooms_1.5,bathrooms_2.0,bathrooms_2.5,bathrooms_3.0,bathrooms_3.5
8567,0.873286,0.515625,6059.0,0,0,0,1,0
30792,0.498818,0.234375,6037.0,0,1,0,0,0
55725,0.417967,0.281250,6037.0,0,1,0,0,0
10157,0.281797,0.312500,6037.0,0,1,0,0,0
47071,0.337589,0.281250,6037.0,0,0,0,1,0
...,...,...,...,...,...,...,...,...
47986,0.295508,0.453125,6037.0,0,1,0,0,0
12991,0.370213,0.109375,6037.0,0,1,0,0,0
2886,0.512057,0.203125,6037.0,0,0,0,1,0
35716,0.781560,0.343750,6037.0,0,0,0,1,0


In [66]:
preds_df = create_preds_df(y_train)

In [67]:
preds_df = lin_regression(X_train_scaled, y_train)

In [68]:
preds_df

Unnamed: 0,actual,baseline_median,lm_preds
8567,725903.0,300238.0,461515.980003
30792,57938.0,300238.0,321668.361453
55725,422528.0,300238.0,301391.467529
10157,231444.0,300238.0,267634.505863
47071,434923.0,300238.0,298459.167789
...,...,...,...
47986,80762.0,300238.0,269847.246804
12991,201840.0,300238.0,291061.828151
2886,468235.0,300238.0,342024.558352
35716,287138.0,300238.0,407164.405726


In [69]:
preds_df = lasso_lars(X_train_scaled, y_train)

In [70]:
preds_df = glm_model(X_train_scaled, y_train, 0)

In [71]:
X_polynomial = poly_subset(X_train_scaled, y_train, 3)

In [72]:
poly_model(X_train_scaled, y_train, LinearRegression())

Unnamed: 0,actual,baseline_median,lm_preds,lasso_preds,tweedie_preds,poly_preds
8567,725903.0,300238.0,461515.980003,461461.079063,373114.581507,461515.980003
30792,57938.0,300238.0,321668.361453,321658.772949,300283.707246,321668.361453
55725,422528.0,300238.0,301391.467529,301389.437755,299648.524469,301391.467529
10157,231444.0,300238.0,267634.505863,267639.189282,298455.955476,267634.505863
47071,434923.0,300238.0,298459.167789,298420.559640,320604.694100,298459.167789
...,...,...,...,...,...,...
47986,80762.0,300238.0,269847.246804,269869.391878,298946.152456,269847.246804
12991,201840.0,300238.0,291061.828151,291039.087058,298759.626533,291061.828151
2886,468235.0,300238.0,342024.558352,341972.544163,322034.628383,342024.558352
35716,287138.0,300238.0,407164.405726,407124.648278,324916.150480,407164.405726


In [73]:
sqrt(mean_squared_error(y_train, preds_df['poly_preds']))

171936.13045323553

In [74]:
get_rmses(preds_df)

Linear Regression RMSE is: 171936.13045323553
Lasso-Lars Regression RMSE is: 171936.13403601473
GLM Regression RMSE is: 178426.08163969836
Polynomial Regression RMSE is: 171936.13045323553


Unnamed: 0,model,rmse
0,linear,171936.130453
1,lasso,171936.134036
2,tweedie_norm,178810.674319
3,linear_poly,171936.130453


## rfe 

In [80]:
rfe(3, X_train_scaled, y_train)

Unnamed: 0,ranking,feature
0,1,bedrooms
1,1,bathrooms
2,1,sqft


In [81]:
preds_df = create_preds_df(y_train)

In [82]:
preds_df = lin_regression(X_train_scaled, y_train)

In [83]:
preds_df = lasso_lars(X_train_scaled, y_train)

In [86]:
preds_df = glm_model(X_train_scaled, y_train, 0)

In [87]:
preds_df

Unnamed: 0,actual,baseline_median,lm_preds,lasso_preds,tweedie_preds
8567,725903.0,300238.0,469223.647989,469166.012431,371810.587733
30792,57938.0,300238.0,311659.602352,311675.779491,305191.317845
55725,422528.0,300238.0,304173.920233,304159.845817,300075.594157
10157,231444.0,300238.0,268842.056472,268841.252465,298857.792789
47071,434923.0,300238.0,303286.134904,303239.433887,312816.141493
...,...,...,...,...,...
47986,80762.0,300238.0,258764.574362,258819.758980,303844.247916
12991,201840.0,300238.0,291934.725208,291904.912516,299147.931500
2886,468235.0,300238.0,348586.642707,348518.691121,314272.173025
35716,287138.0,300238.0,404830.445538,404800.101750,321706.956621


In [88]:
poly_model(X_train_scaled, y_train, LinearRegression())

Unnamed: 0,actual,baseline_median,lm_preds,lasso_preds,tweedie_preds,poly_preds
8567,725903.0,300238.0,469223.647989,469166.012431,371810.587733,469223.647989
30792,57938.0,300238.0,311659.602352,311675.779491,305191.317845,311659.602352
55725,422528.0,300238.0,304173.920233,304159.845817,300075.594157,304173.920233
10157,231444.0,300238.0,268842.056472,268841.252465,298857.792789,268842.056472
47071,434923.0,300238.0,303286.134904,303239.433887,312816.141493,303286.134904
...,...,...,...,...,...,...
47986,80762.0,300238.0,258764.574362,258819.758980,303844.247916,258764.574362
12991,201840.0,300238.0,291934.725208,291904.912516,299147.931500,291934.725208
2886,468235.0,300238.0,348586.642707,348518.691121,314272.173025,348586.642707
35716,287138.0,300238.0,404830.445538,404800.101750,321706.956621,404830.445538


In [89]:
get_rmses(preds_df)

Linear Regression RMSE is: 172009.51215233456
Lasso-Lars Regression RMSE is: 172009.5154937698
GLM Regression RMSE is: 178810.67431911838
Polynomial Regression RMSE is: 172009.51215233456


Unnamed: 0,model,rmse
0,linear,172009.512152
1,lasso,172009.515494
2,tweedie_norm,178810.674319
3,linear_poly,172009.512152


# Top 3

- polynomial, 2deg, LinReg is #1
- LinReg, no rfe/selectkbest, is #2
- LassoLars no rfe/selectkbest, alpha .1 is #3

In [90]:
X_validate_scaled = pd.get_dummies(X_validate_scaled, columns=['bedrooms', 'bathrooms', 'fips'])

In [91]:
X_validate_scaled

Unnamed: 0,sqft,year_built,bedrooms_3.0,bedrooms_4.0,bathrooms_1.5,bathrooms_2.0,bathrooms_2.5,bathrooms_3.0,bathrooms_3.5,fips_6037.0,fips_6059.0
54695,0.365957,0.437500,1,0,0,1,0,0,0,0,1
12442,0.772104,0.687500,1,0,0,0,1,0,0,0,1
34676,0.376832,0.625000,1,0,0,1,0,0,0,1,0
19391,0.449173,0.406250,0,1,0,1,0,0,0,0,1
30898,0.282742,0.343750,1,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
45330,0.595745,0.500000,1,0,0,1,0,0,0,1,0
54993,0.228369,0.703125,1,0,0,1,0,0,0,1,0
39225,0.182506,0.468750,1,0,0,1,0,0,0,0,1
51704,0.247754,0.437500,1,0,0,1,0,0,0,1,0


In [92]:
preds_df = create_preds_df(y_validate)

In [93]:
preds_df = lin_regression(X_validate_scaled, y_validate)

In [94]:
preds_df = lasso_lars(X_validate_scaled, y_validate)

In [95]:
preds_df = glm_model(X_validate_scaled, y_validate, 1)

In [99]:
X_polynomial = poly_subset(X_validate_scaled, y_validate, 2)

In [100]:
poly_model(X_validate_scaled, y_validate, LinearRegression())

Unnamed: 0,actual,baseline_median,lm_preds,lasso_preds,tweedie_preds,poly_preds
54695,330948.0,303453.0,322560.0,320815.692458,320402.967615,322560.0
12442,702386.0,303453.0,460800.0,460182.997232,482312.640085,460800.0
34676,705629.0,303453.0,291840.0,299773.548729,294821.549166,291840.0
19391,299869.0,303453.0,336896.0,323459.656884,319064.373655,336896.0
30898,384883.0,303453.0,275456.0,278725.171892,279628.723316,275456.0
...,...,...,...,...,...,...
45330,145963.0,303453.0,349184.0,355286.588006,349794.830337,349184.0
54993,224517.0,303453.0,252928.0,262181.195081,262650.655373,252928.0
39225,400708.0,303453.0,275456.0,274911.942640,278889.176294,275456.0
51704,345000.0,303453.0,263168.0,269233.515087,270860.490334,263168.0


In [101]:
get_rmses(preds_df)

Linear Regression RMSE is: 171167.11445062698
Lasso-Lars Regression RMSE is: 171031.44159846936
GLM Regression RMSE is: 171136.8660334656
Polynomial Regression RMSE is: 171167.11445062698


Unnamed: 0,model,rmse
0,linear,171167.114451
1,lasso,171031.441598
2,tweedie_norm,178810.674319
3,linear_poly,171167.114451


- Lasso-Lars with no polynomial regression or rfe/selectkbest #1

- Linear Regression with a polnomial degree of 2 no rfe/selectkbest #2

- Basic Linear Regression #3