In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from math import sqrt
from scipy.stats import pearsonr, spearmanr

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PolynomialFeatures
from sklearn.linear_model import LassoLars, LinearRegression, TweedieRegressor

import acquire
import prepare
import explore
import model

%matplotlib inline
pd.options.display.max_columns = None

import warnings
warnings.filterwarnings("ignore")

seed = 42

In [136]:
df = acquire.get_zillow()

In [137]:
df = prepare.prep_zillow(df)

In [138]:
df

Unnamed: 0,bedrooms,bathrooms,sqft,tax_value,year_built,fips
0,3.0,2.0,1175.0,327859.0,1953.0,6037.0
1,3.0,2.0,1630.0,63948.0,1953.0,6037.0
2,2.0,2.0,1206.0,356218.0,1954.0,6037.0
3,3.0,2.0,1790.0,242773.0,1964.0,6037.0
4,3.0,2.0,1400.0,251246.0,1953.0,6037.0
...,...,...,...,...,...,...
56075,3.0,2.0,1400.0,318206.0,1951.0,6037.0
56076,4.0,2.0,1446.0,140804.0,1951.0,6037.0
56077,4.0,2.0,1584.0,412114.0,1955.0,6037.0
56078,4.0,2.0,1584.0,186627.0,1955.0,6037.0


In [139]:
cols = ['bedrooms', 'bathrooms', 'sqft', 'tax_value', 'year_built', 'fips']

In [140]:
df = prepare.remove_outliers(df, 1, cols)

In [141]:
df

Unnamed: 0,bedrooms,bathrooms,sqft,tax_value,year_built,fips
0,3.0,2.0,1175.0,327859.0,1953.0,6037.0
1,3.0,2.0,1630.0,63948.0,1953.0,6037.0
3,3.0,2.0,1790.0,242773.0,1964.0,6037.0
4,3.0,2.0,1400.0,251246.0,1953.0,6037.0
6,4.0,3.0,2603.0,185086.0,1976.0,6037.0
...,...,...,...,...,...,...
56074,3.0,2.0,1549.0,482243.0,1958.0,6037.0
56075,3.0,2.0,1400.0,318206.0,1951.0,6037.0
56076,4.0,2.0,1446.0,140804.0,1951.0,6037.0
56077,4.0,2.0,1584.0,412114.0,1955.0,6037.0


In [142]:
train, validate, test = prepare.subset_df(df)

(15218, 6) (5073, 6) (5073, 6)


In [143]:
X_train, y_train, X_validate, y_validate, X_test, y_test = model.xy_subsets(train, validate, test, 'tax_value')

In [144]:
X_train

Unnamed: 0,bedrooms,bathrooms,sqft,year_built,fips
8567,4.0,3.0,2627.0,1966.0,6059.0
30792,4.0,2.0,1835.0,1948.0,6037.0
55725,3.0,2.0,1664.0,1951.0,6037.0
10157,3.0,2.0,1376.0,1953.0,6037.0
47071,3.0,3.0,1494.0,1951.0,6037.0
...,...,...,...,...,...
47986,4.0,2.0,1405.0,1962.0,6037.0
12991,3.0,2.0,1563.0,1940.0,6037.0
2886,3.0,3.0,1863.0,1946.0,6037.0
35716,4.0,3.0,2433.0,1955.0,6037.0


In [145]:
cols = ['sqft', 'year_built']

X_train_scaled, X_validate_scaled, X_test_scaled = prepare.scale_data(X_train, X_validate, X_test, MinMaxScaler(), cols)

In [146]:
X_train_scaled.shape, X_validate_scaled.shape, X_test_scaled.shape

((15218, 5), (5073, 5), (5073, 5))

In [103]:
X_train_scaled

Unnamed: 0,bedrooms,bathrooms,sqft,year_built,fips
8567,4.0,3.0,0.873286,0.515625,6059.0
30792,4.0,2.0,0.498818,0.234375,6037.0
55725,3.0,2.0,0.417967,0.281250,6037.0
10157,3.0,2.0,0.281797,0.312500,6037.0
47071,3.0,3.0,0.337589,0.281250,6037.0
...,...,...,...,...,...
47986,4.0,2.0,0.295508,0.453125,6037.0
12991,3.0,2.0,0.370213,0.109375,6037.0
2886,3.0,3.0,0.512057,0.203125,6037.0
35716,4.0,3.0,0.781560,0.343750,6037.0


In [18]:
def rfe(n_features, X_train, y_train):
    
    lm = LinearRegression()
    rfe = RFE(lm, n_features_to_select=n_features)

    rfe.fit(X_train, y_train)

    ranks = rfe.ranking_
    columns = X_train.columns.tolist()
    
    feature_ranks = pd.DataFrame({'ranking': ranks,
                              'feature': columns})

    feature_ranks = feature_ranks.sort_values('ranking')

    return pd.DataFrame(feature_ranks).head(n_features)

In [19]:
def f_selector(k, X_train, y_train):
    
    f_selector = SelectKBest(f_regression, k=k)

    f_selector.fit(X_train, y_train)

    f_select_mask = f_selector.get_support()

    select_k_best_features = X_train.iloc[:,f_select_mask]
    
    #print(select_k_best_features.head(k))
    
    return pd.DataFrame(select_k_best_features)

In [20]:
def create_preds_df(y_train):
    
    preds_df = pd.DataFrame({'actual': y_train})

    preds_df['baseline_median'] = y_train.median()
    
    return preds_df

In [21]:
def lin_regression(X_train, y_train):

    lm = LinearRegression()

    lm.fit(X_train, y_train)
    
    preds_df['lm_preds'] = lm.predict(X_train)
    
    return preds_df

In [22]:
def lasso_lars(X_train, y_train, alpha=.1):
    
    lasso = LassoLars(alpha=alpha)

    lasso.fit(X_train, y_train)

    preds_df['lasso_preds'] = lasso.predict(X_train)
    
    return preds_df

In [23]:
def glm_model(X_train, y_train, power=0):
    
    tweedie = TweedieRegressor(power=power)

    tweedie.fit(X_train, y_train)

    preds_df['tweedie_preds'] = tweedie.predict(X_train)
    
    return preds_df

In [24]:
def poly_subset(X_train, y_train, degree=2):
    
    pf = PolynomialFeatures(degree=degree)

    pf.fit(X_train, y_train)

    X_polynomial = pf.transform(X_train)
    
    return X_polynomial

In [25]:
def poly_model(X_polynomial, y_train, m):
    
    model = m

    model.fit(X_polynomial, y_train)

    preds_df['poly_preds'] = model.predict(X_polynomial)
    
    return preds_df

In [26]:
def get_rmses(preds_df):
    
    lm_rmse = sqrt(mean_squared_error(preds_df['lm_preds'], preds_df['actual']))
    lasso_rmse = sqrt(mean_squared_error(preds_df['actual'], preds_df['lasso_preds']))
    tweedie_rmse = sqrt(mean_squared_error(preds_df['actual'], preds_df['tweedie_preds']))
    poly_rmse = sqrt(mean_squared_error(preds_df['actual'], preds_df['poly_preds']))
    
    print(f'Linear Regression RMSE is: {lm_rmse}')
    print(f'Lasso-Lars Regression RMSE is: {lasso_rmse}')
    print(f'GLM Regression RMSE is: {tweedie_rmse}')
    print(f'Polynomial Regression RMSE is: {poly_rmse}')
    
    results = pd.DataFrame({'model':['linear', 'lasso', 'tweedie_norm', 'linear_poly'],
                            'rmse':[lm_rmse, lasso_rmse, tweedie_norm, poly_rmse]})

    return results

In [27]:
def model_results():

    results = pd.DataFrame({'model':['linear', 'lasso', 'tweedie_norm', 'linear_poly'],
                            'rmse':[lm_rmse, lasso_rmse, tweedie_norm, poly_rmse]})

    return results

In [28]:
def hist_charts(df):

    for col in df:

        plt.hist(df[col], bins=25)
        plt.title(f'{col} distribution')
        plt.show()

In [29]:
X_train_scaled = pd.get_dummies(X_train_scaled, columns=['bedrooms', 'bathrooms'])

## linear regression model #1

In [104]:
preds_df = create_preds_df(y_train)

In [105]:
preds_df = lin_regression(X_train_scaled, y_train)

In [106]:
lm_rmse = sqrt(mean_squared_error(preds_df['lm_preds'], preds_df['actual']))

lm_rmse

172009.51215233456

In [107]:
preds_df

Unnamed: 0,actual,baseline_median,lm_preds
8567,725903.0,300238.0,469223.647989
30792,57938.0,300238.0,311659.602352
55725,422528.0,300238.0,304173.920233
10157,231444.0,300238.0,268842.056472
47071,434923.0,300238.0,303286.134904
...,...,...,...
47986,80762.0,300238.0,258764.574362
12991,201840.0,300238.0,291934.725208
2886,468235.0,300238.0,348586.642707
35716,287138.0,300238.0,404830.445538


In [108]:
sqrt(mean_squared_error(preds_df['actual'], preds_df['baseline_median']))

183442.57765522916

In [111]:
sqrt(mean_squared_error(preds_df['actual'], preds_df['baseline_avg']))

182205.25866689716

In [110]:
preds_df['baseline_avg'] = y_train.mean()


## lasso lars model #1

In [112]:
preds_df = lasso_lars(X_train_scaled, y_train)

In [113]:
preds_df

Unnamed: 0,actual,baseline_median,lm_preds,baseline_avg,lasso_preds
8567,725903.0,300238.0,469223.647989,321508.237679,469166.012431
30792,57938.0,300238.0,311659.602352,321508.237679,311675.779491
55725,422528.0,300238.0,304173.920233,321508.237679,304159.845817
10157,231444.0,300238.0,268842.056472,321508.237679,268841.252465
47071,434923.0,300238.0,303286.134904,321508.237679,303239.433887
...,...,...,...,...,...
47986,80762.0,300238.0,258764.574362,321508.237679,258819.758980
12991,201840.0,300238.0,291934.725208,321508.237679,291904.912516
2886,468235.0,300238.0,348586.642707,321508.237679,348518.691121
35716,287138.0,300238.0,404830.445538,321508.237679,404800.101750


In [114]:
lasso_rmse = sqrt(mean_squared_error(preds_df['actual'], preds_df['lasso_preds']))
lasso_rmse

172009.5154937698

## glm model #1

In [116]:
glm_model(X_train_scaled, y_train, 2)

Unnamed: 0,actual,baseline_median,lm_preds,baseline_avg,lasso_preds,tweedie_preds
8567,725903.0,300238.0,469223.647989,321508.237679,469166.012431,338461.871121
30792,57938.0,300238.0,311659.602352,321508.237679,311675.779491,320568.619658
55725,422528.0,300238.0,304173.920233,321508.237679,304159.845817,314129.565546
10157,231444.0,300238.0,268842.056472,321508.237679,268841.252465,313054.015401
47071,434923.0,300238.0,303286.134904,321508.237679,303239.433887,326653.522147
...,...,...,...,...,...,...
47986,80762.0,300238.0,258764.574362,321508.237679,258819.758980,319559.049377
12991,201840.0,300238.0,291934.725208,321508.237679,291904.912516,313097.109275
2886,468235.0,300238.0,348586.642707,321508.237679,348518.691121,327949.366384
35716,287138.0,300238.0,404830.445538,321508.237679,404800.101750,337153.770125


In [117]:
tweedie_norm = sqrt(mean_squared_error(preds_df['actual'], preds_df['tweedie_preds']))

tweedie_norm

180268.50273177662

In [118]:
X_polynomial = poly_subset(X_train_scaled, y_train, 3)

In [119]:
preds_df = poly_model(X_polynomial, y_train, LinearRegression())

In [120]:
poly_rmse = sqrt(mean_squared_error(preds_df['actual'], preds_df['poly_preds']))

poly_rmse

169139.89541249932

Polynomial with a degree of 2 and Linear Regression is best so far

In [121]:
results = pd.DataFrame({'model':['linear', 'lasso', 'tweedie_norm', 'linear_poly'],
              'rmse':[lm_rmse, lasso_rmse, tweedie_norm, poly_rmse]})

results

Unnamed: 0,model,rmse
0,linear,172009.512152
1,lasso,172009.515494
2,tweedie_norm,180268.502732
3,linear_poly,169139.895412


In [122]:
get_rmses(preds_df)

Linear Regression RMSE is: 172009.51215233456
Lasso-Lars Regression RMSE is: 172009.5154937698
GLM Regression RMSE is: 180268.50273177662
Polynomial Regression RMSE is: 169139.89541249932


Unnamed: 0,model,rmse
0,linear,172009.512152
1,lasso,172009.515494
2,tweedie_norm,180268.502732
3,linear_poly,169139.895412


# Again, but with feature engineering

## selectkbest

In [123]:
cols = ['sqft', 'year_built']

X_train_scaled, X_validate_scaled, X_test_scaled = prepare.scale_data(X_train, X_validate, X_test, MinMaxScaler(), cols)

In [124]:
X_train_scaled = f_selector(4, X_train_scaled, y_train)

In [125]:
X_train_scaled = pd.get_dummies(X_train_scaled, columns=['bathrooms'])

In [126]:
X_train_scaled

Unnamed: 0,sqft,year_built,fips,bathrooms_1.5,bathrooms_2.0,bathrooms_2.5,bathrooms_3.0,bathrooms_3.5
8567,0.873286,0.515625,6059.0,0,0,0,1,0
30792,0.498818,0.234375,6037.0,0,1,0,0,0
55725,0.417967,0.281250,6037.0,0,1,0,0,0
10157,0.281797,0.312500,6037.0,0,1,0,0,0
47071,0.337589,0.281250,6037.0,0,0,0,1,0
...,...,...,...,...,...,...,...,...
47986,0.295508,0.453125,6037.0,0,1,0,0,0
12991,0.370213,0.109375,6037.0,0,1,0,0,0
2886,0.512057,0.203125,6037.0,0,0,0,1,0
35716,0.781560,0.343750,6037.0,0,0,0,1,0


In [127]:
preds_df = create_preds_df(y_train)

In [128]:
preds_df = lin_regression(X_train_scaled, y_train)

In [129]:
preds_df

Unnamed: 0,actual,baseline_median,lm_preds
8567,725903.0,300238.0,461515.980003
30792,57938.0,300238.0,321668.361453
55725,422528.0,300238.0,301391.467529
10157,231444.0,300238.0,267634.505863
47071,434923.0,300238.0,298459.167789
...,...,...,...
47986,80762.0,300238.0,269847.246804
12991,201840.0,300238.0,291061.828151
2886,468235.0,300238.0,342024.558352
35716,287138.0,300238.0,407164.405726


In [130]:
preds_df = lasso_lars(X_train_scaled, y_train)

In [131]:
preds_df = glm_model(X_train_scaled, y_train, 1)

In [132]:
poly_subset(X_train_scaled, y_train, 3)

array([[1.        , 0.87328605, 0.515625  , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.49881797, 0.234375  , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.4179669 , 0.28125   , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.51205674, 0.203125  , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.78156028, 0.34375   , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.16453901, 0.328125  , ..., 0.        , 0.        ,
        0.        ]])

In [133]:
poly_model(X_train_scaled, y_train, LinearRegression())

Unnamed: 0,actual,baseline_median,lm_preds,lasso_preds,tweedie_preds,poly_preds
8567,725903.0,300238.0,461515.980003,461461.079063,321508.237679,461515.980003
30792,57938.0,300238.0,321668.361453,321658.772949,321508.237679,321668.361453
55725,422528.0,300238.0,301391.467529,301389.437755,321508.237679,301391.467529
10157,231444.0,300238.0,267634.505863,267639.189282,321508.237679,267634.505863
47071,434923.0,300238.0,298459.167789,298420.559640,321508.237679,298459.167789
...,...,...,...,...,...,...
47986,80762.0,300238.0,269847.246804,269869.391878,321508.237679,269847.246804
12991,201840.0,300238.0,291061.828151,291039.087058,321508.237679,291061.828151
2886,468235.0,300238.0,342024.558352,341972.544163,321508.237679,342024.558352
35716,287138.0,300238.0,407164.405726,407124.648278,321508.237679,407164.405726


In [134]:
sqrt(mean_squared_error(y_train, preds_df['poly_preds']))

171936.13045323553

In [135]:
get_rmses(preds_df)

Linear Regression RMSE is: 171936.13045323553
Lasso-Lars Regression RMSE is: 171936.13403601473
GLM Regression RMSE is: 182205.25866689716
Polynomial Regression RMSE is: 171936.13045323553


Unnamed: 0,model,rmse
0,linear,171936.130453
1,lasso,171936.134036
2,tweedie_norm,180268.502732
3,linear_poly,171936.130453


## rfe 

In [147]:
rfe(3, X_train_scaled, y_train)

Unnamed: 0,ranking,feature
0,1,bedrooms
1,1,bathrooms
2,1,sqft


In [148]:
preds_df = create_preds_df(y_train)

In [149]:
preds_df = lin_regression(X_train_scaled, y_train)

In [150]:
preds_df = lasso_lars(X_train_scaled, y_train)

In [151]:
preds_df = glm_model(X_train_scaled, y_train, 1)

In [152]:
preds_df

Unnamed: 0,actual,baseline_median,lm_preds,lasso_preds,tweedie_preds
8567,725903.0,300238.0,469223.647989,469166.012431,321508.237679
30792,57938.0,300238.0,311659.602352,311675.779491,321508.237679
55725,422528.0,300238.0,304173.920233,304159.845817,321508.237679
10157,231444.0,300238.0,268842.056472,268841.252465,321508.237679
47071,434923.0,300238.0,303286.134904,303239.433887,321508.237679
...,...,...,...,...,...
47986,80762.0,300238.0,258764.574362,258819.758980,321508.237679
12991,201840.0,300238.0,291934.725208,291904.912516,321508.237679
2886,468235.0,300238.0,348586.642707,348518.691121,321508.237679
35716,287138.0,300238.0,404830.445538,404800.101750,321508.237679


In [153]:
poly_model(X_train_scaled, y_train, LinearRegression())

Unnamed: 0,actual,baseline_median,lm_preds,lasso_preds,tweedie_preds,poly_preds
8567,725903.0,300238.0,469223.647989,469166.012431,321508.237679,469223.647989
30792,57938.0,300238.0,311659.602352,311675.779491,321508.237679,311659.602352
55725,422528.0,300238.0,304173.920233,304159.845817,321508.237679,304173.920233
10157,231444.0,300238.0,268842.056472,268841.252465,321508.237679,268842.056472
47071,434923.0,300238.0,303286.134904,303239.433887,321508.237679,303286.134904
...,...,...,...,...,...,...
47986,80762.0,300238.0,258764.574362,258819.758980,321508.237679,258764.574362
12991,201840.0,300238.0,291934.725208,291904.912516,321508.237679,291934.725208
2886,468235.0,300238.0,348586.642707,348518.691121,321508.237679,348586.642707
35716,287138.0,300238.0,404830.445538,404800.101750,321508.237679,404830.445538


In [154]:
get_rmses(preds_df)

Linear Regression RMSE is: 172009.51215233456
Lasso-Lars Regression RMSE is: 172009.5154937698
GLM Regression RMSE is: 182205.25866689716
Polynomial Regression RMSE is: 172009.51215233456


Unnamed: 0,model,rmse
0,linear,172009.512152
1,lasso,172009.515494
2,tweedie_norm,180268.502732
3,linear_poly,172009.512152


# Top 3

- polynomial, 2deg, LinReg is #1
- LinReg, no rfe/selectkbest, is #2
- LassoLars no rfe/selectkbest, alpha .1 is #3

In [95]:
X_validate_scaled = pd.get_dummies(X_validate_scaled, columns=['bedrooms', 'bathrooms', 'fips'])

In [96]:
X_validate_scaled

Unnamed: 0,sqft,year_built,bedrooms_2.0,bedrooms_3.0,bedrooms_4.0,bedrooms_5.0,bathrooms_1.0,bathrooms_1.5,bathrooms_2.0,bathrooms_2.5,bathrooms_3.0,bathrooms_3.5,bathrooms_4.0,bathrooms_4.5,fips_6037.0,fips_6059.0
5483,0.369096,0.477477,0,0,1,0,0,0,1,0,0,0,0,0,1,0
4272,0.541108,0.576577,0,0,0,1,0,0,0,0,1,0,0,0,1,0
1071,0.713120,0.612613,0,0,0,1,0,0,0,0,1,0,0,0,1,0
19292,0.329155,0.504505,0,0,1,0,0,0,1,0,0,0,0,0,0,1
8910,0.137609,0.486486,1,0,0,0,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13874,0.638776,0.864865,0,0,1,0,0,0,0,0,0,0,1,0,1,0
9654,0.463557,0.378378,0,0,1,0,0,0,1,0,0,0,0,0,1,0
7807,0.365015,0.513514,0,0,1,0,0,0,1,0,0,0,0,0,0,1
27834,0.206997,0.144144,0,1,0,0,1,0,0,0,0,0,0,0,1,0


In [26]:
preds_df = create_preds_df(y_validate)

In [27]:
preds_df = lin_regression(X_validate_scaled, y_validate)

In [28]:
preds_df = lasso_lars(X_validate_scaled, y_validate)

In [29]:
preds_df = glm_model(X_validate_scaled, y_validate, 1)

In [30]:
X_polynomial = poly_subset(X_validate_scaled, y_validate, 3)

In [31]:
poly_model(X_validate_scaled, y_validate, LinearRegression())

Unnamed: 0,actual,baseline_median,lm_preds,lasso_preds,tweedie_preds,poly_preds
5483,350016.0,300000.0,315847.848183,315879.817479,345486.436585,315847.848183
4272,304817.0,300000.0,412819.810416,412855.956434,345486.436585,412819.810416
1071,747874.0,300000.0,510700.988441,510723.396052,345486.436585,510700.988441
19292,97324.0,300000.0,330122.432971,330138.574889,345486.436585,330122.432971
8910,162994.0,300000.0,257687.369571,257663.912346,345486.436585,257687.369571
...,...,...,...,...,...,...
13874,1040710.0,300000.0,538879.655238,538851.003568,345486.436585,538879.655238
9654,514602.0,300000.0,369295.493206,369323.189602,345486.436585,369295.493206
7807,203539.0,300000.0,350531.997103,350545.233735,345486.436585,350531.997103
27834,272110.0,300000.0,223912.139646,223945.643340,345486.436585,223912.139646


In [32]:
get_rmses(preds_df)

Linear Regression RMSE is: 197846.735294878
Lasso-Lars Regression RMSE is: 197846.73687149206
GLM Regression RMSE is: 227428.42957956833
Polynomial Regression RMSE is: 197846.735294878


- Polynomial with a Degree of 2, using LinearRegression has an rmse of 197846.7

- LinearRegression rmse is 197846.7 as well

- LassoLars with no rfe or selectkbest using an alpha of .1 has an rmse of 197846.7 (the same, rounded, but very slightly different