In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from math import sqrt
from scipy.stats import pearsonr, spearmanr

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PolynomialFeatures
from sklearn.linear_model import LassoLars, LinearRegression, TweedieRegressor

import acquire
import prepare
import explore
import model

%matplotlib inline
pd.options.display.max_columns = None

import warnings
warnings.filterwarnings("ignore")

seed = 42

In [2]:
df = acquire.get_zillow()

In [3]:
df = prepare.prep_zillow(df)

In [4]:
df

Unnamed: 0,bedrooms,bathrooms,sqft,tax_value,year_built,fips
0,3.0,2.0,1175.0,327859.0,1953.0,6037.0
1,3.0,2.0,1630.0,63948.0,1953.0,6037.0
2,2.0,2.0,1206.0,356218.0,1954.0,6037.0
3,3.0,2.0,1790.0,242773.0,1964.0,6037.0
4,3.0,2.0,1400.0,251246.0,1953.0,6037.0
...,...,...,...,...,...,...
56075,3.0,2.0,1400.0,318206.0,1951.0,6037.0
56076,4.0,2.0,1446.0,140804.0,1951.0,6037.0
56077,4.0,2.0,1584.0,412114.0,1955.0,6037.0
56078,4.0,2.0,1584.0,186627.0,1955.0,6037.0


In [5]:
cols = ['bedrooms', 'bathrooms', 'sqft', 'tax_value', 'year_built', 'fips']

In [6]:
df = prepare.remove_outliers(df, 2, cols)

In [7]:
df

Unnamed: 0,bedrooms,bathrooms,sqft,tax_value,year_built,fips
0,3.0,2.0,1175.0,327859.0,1953.0,6037.0
1,3.0,2.0,1630.0,63948.0,1953.0,6037.0
2,2.0,2.0,1206.0,356218.0,1954.0,6037.0
3,3.0,2.0,1790.0,242773.0,1964.0,6037.0
4,3.0,2.0,1400.0,251246.0,1953.0,6037.0
...,...,...,...,...,...,...
56075,3.0,2.0,1400.0,318206.0,1951.0,6037.0
56076,4.0,2.0,1446.0,140804.0,1951.0,6037.0
56077,4.0,2.0,1584.0,412114.0,1955.0,6037.0
56078,4.0,2.0,1584.0,186627.0,1955.0,6037.0


In [8]:
train, validate, test = prepare.subset_df(df)

(27650, 6) (9217, 6) (9217, 6)


In [9]:
X_train, y_train, X_validate, y_validate, X_test, y_test = model.xy_subsets(train, validate, test, 'tax_value')

In [10]:
X_train

Unnamed: 0,bedrooms,bathrooms,sqft,year_built,fips
39518,4.0,3.0,1888.0,1972.0,6059.0
36470,4.0,2.0,1779.0,1996.0,6037.0
14646,4.0,3.0,2759.0,1990.0,6037.0
6714,4.0,2.0,1446.0,1947.0,6037.0
44146,3.0,2.0,1667.0,1941.0,6059.0
...,...,...,...,...,...
14332,3.0,2.0,1509.0,1957.0,6037.0
54643,5.0,2.0,2268.0,1955.0,6059.0
46604,3.0,2.0,1832.0,1952.0,6037.0
927,3.0,1.0,1723.0,1950.0,6037.0


In [11]:
cols = ['sqft', 'year_built']

X_train_scaled, X_validate_scaled, X_test_scaled = prepare.scale_data(X_train, X_validate, X_test, MinMaxScaler(), cols)

In [12]:
X_train_scaled.shape, X_validate_scaled.shape, X_test_scaled.shape

((27650, 5), (9217, 5), (9217, 5))

In [66]:
X_train_scaled

Unnamed: 0,bedrooms,bathrooms,sqft,year_built,fips
39518,4.0,3.0,0.410496,0.603604,6059.0
36470,4.0,2.0,0.378717,0.819820,6037.0
14646,4.0,3.0,0.664431,0.765766,6037.0
6714,4.0,2.0,0.281633,0.378378,6037.0
44146,3.0,2.0,0.346064,0.324324,6059.0
...,...,...,...,...,...
14332,3.0,2.0,0.300000,0.468468,6037.0
54643,5.0,2.0,0.521283,0.450450,6059.0
46604,3.0,2.0,0.394169,0.423423,6037.0
927,3.0,1.0,0.362391,0.405405,6037.0


In [13]:
def rfe(n_features, X_train, y_train):
    
    lm = LinearRegression()
    rfe = RFE(lm, n_features_to_select=n_features)

    rfe.fit(X_train, y_train)

    ranks = rfe.ranking_
    columns = X_train.columns.tolist()
    
    feature_ranks = pd.DataFrame({'ranking': ranks,
                              'feature': columns})

    feature_ranks = feature_ranks.sort_values('ranking')

    return pd.DataFrame(feature_ranks).head(n_features)

In [14]:
def f_selector(k, X_train, y_train):
    
    f_selector = SelectKBest(f_regression, k=k)

    f_selector.fit(X_train, y_train)

    f_select_mask = f_selector.get_support()

    select_k_best_features = X_train.iloc[:,f_select_mask]
    
    #print(select_k_best_features.head(k))
    
    return pd.DataFrame(select_k_best_features)

In [15]:
def create_preds_df(y_train):
    
    preds_df = pd.DataFrame({'actual': y_train})

    preds_df['baseline_median'] = y_train.median()
    
    return preds_df

In [16]:
def lin_regression(X_train, y_train):

    lm = LinearRegression()

    lm.fit(X_train, y_train)
    
    preds_df['lm_preds'] = lm.predict(X_train)
    
    return preds_df

In [17]:
def lasso_lars(X_train, y_train, alpha=.1):
    
    lasso = LassoLars(alpha=alpha)

    lasso.fit(X_train, y_train)

    preds_df['lasso_preds'] = lasso.predict(X_train)
    
    return preds_df

In [18]:
def glm_model(X_train, y_train, power=0):
    
    tweedie = TweedieRegressor(power=power)

    tweedie.fit(X_train, y_train)

    preds_df['tweedie_preds'] = tweedie.predict(X_train)
    
    return preds_df

In [19]:
def poly_subset(X_train, y_train, degree=2):
    
    pf = PolynomialFeatures(degree=degree)

    pf.fit(X_train, y_train)

    X_polynomial = pf.transform(X_train)
    
    return X_polynomial

In [20]:
def poly_model(X_polynomial, y_train, m):
    
    model = m

    model.fit(X_polynomial, y_train)

    preds_df['poly_preds'] = model.predict(X_polynomial)
    
    return preds_df

In [21]:
def get_rmses(preds_df):
    
    lm_rmse = sqrt(mean_squared_error(preds_df['lm_preds'], preds_df['actual']))
    lasso_rmse = sqrt(mean_squared_error(preds_df['actual'], preds_df['lasso_preds']))
    tweedie_rmse = sqrt(mean_squared_error(preds_df['actual'], preds_df['tweedie_preds']))
    poly_rmse = sqrt(mean_squared_error(preds_df['actual'], preds_df['poly_preds']))
    
    print(f'Linear Regression RMSE is: {lm_rmse}')
    print(f'Lasso-Lars Regression RMSE is: {lasso_rmse}')
    print(f'GLM Regression RMSE is: {tweedie_rmse}')
    print(f'Polynomial Regression RMSE is: {poly_rmse}')
    
    results = pd.DataFrame({'model':['linear', 'lasso', 'tweedie_norm', 'linear_poly'],
                            'rmse':[lm_rmse, lasso_rmse, tweedie_norm, poly_rmse]})

    return results

In [22]:
def model_results():

    results = pd.DataFrame({'model':['linear', 'lasso', 'tweedie_norm', 'linear_poly'],
                            'rmse':[lm_rmse, lasso_rmse, tweedie_norm, poly_rmse]})

    return results

In [72]:
def hist_charts(df):

    for col in df:

        plt.hist(df[col], bins=25)
        plt.title(f'{col} distribution')
        plt.show()

In [24]:
X_train_scaled = pd.get_dummies(X_train_scaled, columns=['bedrooms', 'bathrooms'])

In [25]:
X_train_scaled


Unnamed: 0,sqft,year_built,fips,bedrooms_2.0,bedrooms_3.0,bedrooms_4.0,bedrooms_5.0,bathrooms_1.0,bathrooms_1.5,bathrooms_2.0,bathrooms_2.5,bathrooms_3.0,bathrooms_3.5,bathrooms_4.0,bathrooms_4.5
39518,0.410496,0.603604,6059.0,0,0,1,0,0,0,0,0,1,0,0,0
36470,0.378717,0.819820,6037.0,0,0,1,0,0,0,1,0,0,0,0,0
14646,0.664431,0.765766,6037.0,0,0,1,0,0,0,0,0,1,0,0,0
6714,0.281633,0.378378,6037.0,0,0,1,0,0,0,1,0,0,0,0,0
44146,0.346064,0.324324,6059.0,0,1,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14332,0.300000,0.468468,6037.0,0,1,0,0,0,0,1,0,0,0,0,0
54643,0.521283,0.450450,6059.0,0,0,0,1,0,0,1,0,0,0,0,0
46604,0.394169,0.423423,6037.0,0,1,0,0,0,0,1,0,0,0,0,0
927,0.362391,0.405405,6037.0,0,1,0,0,1,0,0,0,0,0,0,0


## linear regression model #1

In [54]:
preds_df = lin_regression(X_train_scaled, y_train)

In [55]:
lm_rmse = sqrt(mean_squared_error(preds_df['lm_preds'], preds_df['actual']))

lm_rmse

201151.25482385387

In [56]:
preds_df

Unnamed: 0,actual,baseline_median,lm_preds,lasso_preds,tweedie_preds,poly_preds
39518,367584.0,305203.5,382336.0,382371.938598,373393.240084,382336.0
36470,414224.0,305203.5,334848.0,335263.900366,325013.299560,334848.0
14646,565097.0,305203.5,529600.0,530739.880620,532005.673971,529600.0
6714,238731.0,305203.5,275200.0,274215.856123,282799.455621,275200.0
44146,256532.0,305203.5,339264.0,338117.607110,330480.117703,339264.0
...,...,...,...,...,...,...
14332,176100.0,305203.5,314048.0,313181.452828,310459.997083,314048.0
54643,77121.0,305203.5,374016.0,372960.605076,361461.451270,374016.0
46604,125361.0,305203.5,367744.0,367003.300448,353644.608524,367744.0
927,114100.0,305203.5,327232.0,331086.371150,296333.990329,327232.0


In [57]:
sqrt(mean_squared_error(preds_df['actual'], preds_df['baseline_median']))

236356.89463948304

## lasso lars model #1

In [58]:
preds_df = lasso_lars(X_train_scaled, y_train)

In [59]:
preds_df

Unnamed: 0,actual,baseline_median,lm_preds,lasso_preds,tweedie_preds,poly_preds
39518,367584.0,305203.5,382336.0,382371.938598,373393.240084,382336.0
36470,414224.0,305203.5,334848.0,335263.900366,325013.299560,334848.0
14646,565097.0,305203.5,529600.0,530739.880620,532005.673971,529600.0
6714,238731.0,305203.5,275200.0,274215.856123,282799.455621,275200.0
44146,256532.0,305203.5,339264.0,338117.607110,330480.117703,339264.0
...,...,...,...,...,...,...
14332,176100.0,305203.5,314048.0,313181.452828,310459.997083,314048.0
54643,77121.0,305203.5,374016.0,372960.605076,361461.451270,374016.0
46604,125361.0,305203.5,367744.0,367003.300448,353644.608524,367744.0
927,114100.0,305203.5,327232.0,331086.371150,296333.990329,327232.0


In [60]:
lasso_rmse = sqrt(mean_squared_error(preds_df['actual'], preds_df['lasso_preds']))
lasso_rmse

201145.61629488488

## glm model #1

In [62]:
preds_df = preds_df.drop(columns='tweedie_preds')

In [63]:
glm_model(X_train_scaled, y_train, 2)

Unnamed: 0,actual,baseline_median,lm_preds,lasso_preds,poly_preds,tweedie_preds
39518,367584.0,305203.5,382336.0,382371.938598,382336.0,382586.600292
36470,414224.0,305203.5,334848.0,335263.900366,334848.0,354963.621868
14646,565097.0,305203.5,529600.0,530739.880620,529600.0,388688.209291
6714,238731.0,305203.5,275200.0,274215.856123,275200.0,349663.390637
44146,256532.0,305203.5,339264.0,338117.607110,339264.0,334972.420109
...,...,...,...,...,...,...
14332,176100.0,305203.5,314048.0,313181.452828,314048.0,335389.463138
54643,77121.0,305203.5,374016.0,372960.605076,374016.0,349343.400717
46604,125361.0,305203.5,367744.0,367003.300448,367744.0,336523.027522
927,114100.0,305203.5,327232.0,331086.371150,327232.0,329723.547148


In [65]:
tweedie_norm = sqrt(mean_squared_error(preds_df['actual'], preds_df['tweedie_preds']))

tweedie_norm

224743.78428993176

In [66]:
X_polynomial = poly_subset(X_train_scaled, y_train)

In [67]:
poly_model(X_polynomial, y_train, LinearRegression())

Unnamed: 0,actual,baseline_median,lm_preds,lasso_preds,poly_preds,tweedie_preds
39518,367584.0,305203.5,382336.0,382371.938598,379792.0,382586.600292
36470,414224.0,305203.5,334848.0,335263.900366,322984.0,354963.621868
14646,565097.0,305203.5,529600.0,530739.880620,541168.0,388688.209291
6714,238731.0,305203.5,275200.0,274215.856123,286280.0,349663.390637
44146,256532.0,305203.5,339264.0,338117.607110,360784.0,334972.420109
...,...,...,...,...,...,...
14332,176100.0,305203.5,314048.0,313181.452828,314384.0,335389.463138
54643,77121.0,305203.5,374016.0,372960.605076,394256.0,349343.400717
46604,125361.0,305203.5,367744.0,367003.300448,375952.0,336523.027522
927,114100.0,305203.5,327232.0,331086.371150,309968.0,329723.547148


In [68]:
poly_rmse = sqrt(mean_squared_error(preds_df['actual'], preds_df['poly_preds']))

poly_rmse

199157.55520772957

Polynomial with a degree of 2 and Linear Regression is best so far

In [69]:
results = pd.DataFrame({'model':['linear', 'lasso', 'tweedie_norm', 'linear_poly'],
              'rmse':[lm_rmse, lasso_rmse, tweedie_norm, poly_rmse]})

results

Unnamed: 0,model,rmse
0,linear,201151.254824
1,lasso,201145.616295
2,tweedie_norm,224743.78429
3,linear_poly,199157.555208


In [70]:
get_rmses(preds_df)

Linear Regression RMSE is: 201151.25482385387
Lasso-Lars Regression RMSE is: 201145.61629488488
GLM Regression RMSE is: 224743.78428993176
Polynomial Regression RMSE is: 199157.55520772957


# Again, but with feature engineering

## selectkbest

In [41]:
cols = ['sqft', 'year_built']

X_train_scaled, X_validate_scaled, X_test_scaled = prepare.scale_data(X_train, X_validate, X_test, MinMaxScaler(), cols)

In [42]:
X_train_scaled = f_selector(4, X_train_scaled, y_train)

In [43]:
X_train_scaled = pd.get_dummies(X_train_scaled, columns=['bedrooms', 'bathrooms'])

In [44]:
X_train_scaled

Unnamed: 0,sqft,year_built,bedrooms_2.0,bedrooms_3.0,bedrooms_4.0,bedrooms_5.0,bathrooms_1.0,bathrooms_1.5,bathrooms_2.0,bathrooms_2.5,bathrooms_3.0,bathrooms_3.5,bathrooms_4.0,bathrooms_4.5
39518,0.410496,0.603604,0,0,1,0,0,0,0,0,1,0,0,0
36470,0.378717,0.819820,0,0,1,0,0,0,1,0,0,0,0,0
14646,0.664431,0.765766,0,0,1,0,0,0,0,0,1,0,0,0
6714,0.281633,0.378378,0,0,1,0,0,0,1,0,0,0,0,0
44146,0.346064,0.324324,0,1,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14332,0.300000,0.468468,0,1,0,0,0,0,1,0,0,0,0,0
54643,0.521283,0.450450,0,0,0,1,0,0,1,0,0,0,0,0
46604,0.394169,0.423423,0,1,0,0,0,0,1,0,0,0,0,0
927,0.362391,0.405405,0,1,0,0,1,0,0,0,0,0,0,0


In [45]:
preds_df = create_preds_df(y_train)

In [46]:
preds_df = lin_regression(X_train_scaled, y_train)

In [47]:
preds_df

Unnamed: 0,actual,baseline_median,lm_preds
39518,367584.0,305203.5,382336.0
36470,414224.0,305203.5,334848.0
14646,565097.0,305203.5,529600.0
6714,238731.0,305203.5,275200.0
44146,256532.0,305203.5,339264.0
...,...,...,...
14332,176100.0,305203.5,314048.0
54643,77121.0,305203.5,374016.0
46604,125361.0,305203.5,367744.0
927,114100.0,305203.5,327232.0


In [48]:
preds_df = lasso_lars(X_train_scaled, y_train)

In [49]:
preds_df = glm_model(X_train_scaled, y_train, 1)

In [50]:
poly_subset(X_train_scaled, y_train, 3)

array([[1.        , 0.41049563, 0.6036036 , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.3787172 , 0.81981982, ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.66443149, 0.76576577, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.3941691 , 0.42342342, ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.36239067, 0.40540541, ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.44693878, 0.53153153, ..., 0.        , 0.        ,
        0.        ]])

In [51]:
poly_model(X_train_scaled, y_train, LinearRegression())

Unnamed: 0,actual,baseline_median,lm_preds,lasso_preds,tweedie_preds,poly_preds
39518,367584.0,305203.5,382336.0,382371.938598,373393.240084,382336.0
36470,414224.0,305203.5,334848.0,335263.900366,325013.299560,334848.0
14646,565097.0,305203.5,529600.0,530739.880620,532005.673971,529600.0
6714,238731.0,305203.5,275200.0,274215.856123,282799.455621,275200.0
44146,256532.0,305203.5,339264.0,338117.607110,330480.117703,339264.0
...,...,...,...,...,...,...
14332,176100.0,305203.5,314048.0,313181.452828,310459.997083,314048.0
54643,77121.0,305203.5,374016.0,372960.605076,361461.451270,374016.0
46604,125361.0,305203.5,367744.0,367003.300448,353644.608524,367744.0
927,114100.0,305203.5,327232.0,331086.371150,296333.990329,327232.0


In [52]:
sqrt(mean_squared_error(y_train, preds_df['poly_preds']))

201151.25482385387

In [53]:
get_rmses(preds_df)

Linear Regression RMSE is: 201151.25482385387
Lasso-Lars Regression RMSE is: 201145.61629488488
GLM Regression RMSE is: 201574.06010070135
Polynomial Regression RMSE is: 201151.25482385387


## rfe 

In [33]:
rfe(3, X_train_scaled, y_train)

Unnamed: 0,ranking,feature
7,1,bathrooms_1.0
8,1,bathrooms_1.5
9,1,bathrooms_2.0


In [34]:
preds_df = create_preds_df(y_train)

In [35]:
preds_df = lin_regression(X_train_scaled, y_train)

In [36]:
preds_df = lasso_lars(X_train_scaled, y_train)

In [37]:
preds_df = glm_model(X_train_scaled, y_train, 1)

In [38]:
preds_df

Unnamed: 0,actual,baseline_median,lm_preds,lasso_preds,tweedie_preds
39518,367584.0,305203.5,411279.992724,411217.187658,351356.679349
36470,414224.0,305203.5,317456.467542,317661.299439,351356.679349
14646,565097.0,305203.5,517087.201223,517134.905453,351356.679349
6714,238731.0,305203.5,265101.887586,265172.263586,351356.679349
44146,256532.0,305203.5,371335.813727,371239.208445,351356.679349
...,...,...,...,...,...
14332,176100.0,305203.5,305047.062896,305052.515004,351356.679349
54643,77121.0,305203.5,398487.895525,398631.707498,351356.679349
46604,125361.0,305203.5,359456.009697,359448.783816,351356.679349
927,114100.0,305203.5,326357.659000,326454.630197,351356.679349


In [39]:
poly_model(X_train_scaled, y_train, LinearRegression())

Unnamed: 0,actual,baseline_median,lm_preds,lasso_preds,tweedie_preds,poly_preds
39518,367584.0,305203.5,411279.992724,411217.187658,351356.679349,411279.992724
36470,414224.0,305203.5,317456.467542,317661.299439,351356.679349,317456.467542
14646,565097.0,305203.5,517087.201223,517134.905453,351356.679349,517087.201223
6714,238731.0,305203.5,265101.887586,265172.263586,351356.679349,265101.887586
44146,256532.0,305203.5,371335.813727,371239.208445,351356.679349,371335.813727
...,...,...,...,...,...,...
14332,176100.0,305203.5,305047.062896,305052.515004,351356.679349,305047.062896
54643,77121.0,305203.5,398487.895525,398631.707498,351356.679349,398487.895525
46604,125361.0,305203.5,359456.009697,359448.783816,351356.679349,359456.009697
927,114100.0,305203.5,326357.659000,326454.630197,351356.679349,326357.659000


In [40]:
get_rmses(preds_df)

Linear Regression RMSE is: 200631.59415393448
Lasso-Lars Regression RMSE is: 200631.61237402717
GLM Regression RMSE is: 231806.95778944652
Polynomial Regression RMSE is: 200631.59415393448


# Top 3

- polynomial, 2deg, LinReg is #1
- LinReg, no rfe/selectkbest, is #2
- LassoLars no rfe/selectkbest, alpha .1 is #3

In [95]:
X_validate_scaled = pd.get_dummies(X_validate_scaled, columns=['bedrooms', 'bathrooms', 'fips'])

In [96]:
X_validate_scaled

Unnamed: 0,sqft,year_built,bedrooms_2.0,bedrooms_3.0,bedrooms_4.0,bedrooms_5.0,bathrooms_1.0,bathrooms_1.5,bathrooms_2.0,bathrooms_2.5,bathrooms_3.0,bathrooms_3.5,bathrooms_4.0,bathrooms_4.5,fips_6037.0,fips_6059.0
5483,0.369096,0.477477,0,0,1,0,0,0,1,0,0,0,0,0,1,0
4272,0.541108,0.576577,0,0,0,1,0,0,0,0,1,0,0,0,1,0
1071,0.713120,0.612613,0,0,0,1,0,0,0,0,1,0,0,0,1,0
19292,0.329155,0.504505,0,0,1,0,0,0,1,0,0,0,0,0,0,1
8910,0.137609,0.486486,1,0,0,0,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13874,0.638776,0.864865,0,0,1,0,0,0,0,0,0,0,1,0,1,0
9654,0.463557,0.378378,0,0,1,0,0,0,1,0,0,0,0,0,1,0
7807,0.365015,0.513514,0,0,1,0,0,0,1,0,0,0,0,0,0,1
27834,0.206997,0.144144,0,1,0,0,1,0,0,0,0,0,0,0,1,0


In [26]:
preds_df = create_preds_df(y_validate)

In [27]:
preds_df = lin_regression(X_validate_scaled, y_validate)

In [28]:
preds_df = lasso_lars(X_validate_scaled, y_validate)

In [29]:
preds_df = glm_model(X_validate_scaled, y_validate, 1)

In [30]:
X_polynomial = poly_subset(X_validate_scaled, y_validate, 3)

In [31]:
poly_model(X_validate_scaled, y_validate, LinearRegression())

Unnamed: 0,actual,baseline_median,lm_preds,lasso_preds,tweedie_preds,poly_preds
5483,350016.0,300000.0,315847.848183,315879.817479,345486.436585,315847.848183
4272,304817.0,300000.0,412819.810416,412855.956434,345486.436585,412819.810416
1071,747874.0,300000.0,510700.988441,510723.396052,345486.436585,510700.988441
19292,97324.0,300000.0,330122.432971,330138.574889,345486.436585,330122.432971
8910,162994.0,300000.0,257687.369571,257663.912346,345486.436585,257687.369571
...,...,...,...,...,...,...
13874,1040710.0,300000.0,538879.655238,538851.003568,345486.436585,538879.655238
9654,514602.0,300000.0,369295.493206,369323.189602,345486.436585,369295.493206
7807,203539.0,300000.0,350531.997103,350545.233735,345486.436585,350531.997103
27834,272110.0,300000.0,223912.139646,223945.643340,345486.436585,223912.139646


In [32]:
get_rmses(preds_df)

Linear Regression RMSE is: 197846.735294878
Lasso-Lars Regression RMSE is: 197846.73687149206
GLM Regression RMSE is: 227428.42957956833
Polynomial Regression RMSE is: 197846.735294878


- Polynomial with a Degree of 2, using LinearRegression has an rmse of 197846.7

- LinearRegression rmse is 197846.7 as well

- LassoLars with no rfe or selectkbest using an alpha of .1 has an rmse of 197846.7 (the same, rounded, but very slightly different