In [6]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from wrangle import Wrangle
from prepare import Prepare
import pandas as pd
import numpy as np
from scipy import stats
# Load the three data splits from the project's Wrangle helper and preview the training split.
# NOTE(review): how wrangle_zillow() acquires and splits the data is not visible here — see wrangle.py.
train, val, test = Wrangle().wrangle_zillow()
train.head()

Unnamed: 0,bedrooms,bathrooms,square_feet,home_tax_value,year_built,fips,county
24082,3.0,2.0,1040.0,315000.0,1971.0,6037.0,Los Angeles
9432,4.0,1.0,1354.0,51513.0,1947.0,6037.0,Los Angeles
50142,3.0,1.0,1180.0,64709.0,1949.0,6037.0,Los Angeles
38310,3.0,2.0,1922.0,218663.0,1952.0,6037.0,Los Angeles
7652,4.0,4.0,3710.0,1700000.0,1987.0,6037.0,Los Angeles


In [7]:
# Split each data split into a feature frame (X_*) and a target (y_*) using the project's Prepare helper.
# NOTE(review): the displayed X_train keeps bedrooms, bathrooms and square_feet; the target is
# presumably home_tax_value — confirm against Prepare.get_Xy.
p = Prepare()
(X_train, y_train), (X_val, y_val), (X_test, y_test) = p.get_Xy(train, val, test)
X_train.head()

Unnamed: 0,bedrooms,bathrooms,square_feet
24082,3.0,2.0,1040.0
9432,4.0,1.0,1354.0
50142,3.0,1.0,1180.0
38310,3.0,2.0,1922.0
7652,4.0,4.0,3710.0


In [8]:
# Scale the three feature frames; p.scaling returns four values and the fourth is discarded here
# (presumably the fitted scaler — confirm against Prepare.scaling).
# NOTE(review): the displayed scaled frame has a fresh 0..n-1 index, unlike y_train, which keeps the
# original row labels. Do not align the two by index; pass them positionally (as the model fits do).
X_train_scaled, X_val_scaled, X_test_scaled, _ = p.scaling(X_train, X_val, X_test)
X_train_scaled.head()

Unnamed: 0,bedrooms,bathrooms,square_feet
0,-0.304758,-0.258425,-1.01884
1,0.830145,-1.417729,-0.6148
2,-0.304758,-1.417729,-0.838695
3,-0.304758,-0.258425,0.116074
4,0.830145,2.060183,2.416785


In [11]:
# Results table: start with the observed training targets in an "actual" column.
act_pred_res = pd.DataFrame(data=dict(actual=y_train))
act_pred_res

Unnamed: 0,actual
24082,315000.0
9432,51513.0
50142,64709.0
38310,218663.0
7652,1700000.0
...,...
8132,512700.0
40424,252760.0
10848,530114.0
21149,199838.0


In [9]:
# Baseline model: a single constant prediction equal to the mean home_tax_value of the training split.
baseline_pred = train["home_tax_value"].mean()
baseline_pred

455123.24122306815

In [12]:
# Attach the constant baseline prediction to every row of the results table.
# .assign builds a new frame, so re-running this cell always yields the same result.
act_pred_res = act_pred_res.assign(baseline_prediction=baseline_pred)
act_pred_res

Unnamed: 0,actual,baseline_prediction
24082,315000.0,455123.241223
9432,51513.0,455123.241223
50142,64709.0,455123.241223
38310,218663.0,455123.241223
7652,1700000.0,455123.241223
...,...,...
8132,512700.0,455123.241223
40424,252760.0,455123.241223
10848,530114.0,455123.241223
21149,199838.0,455123.241223


In [14]:
# Baseline RMSE on the training split: error of always predicting the training mean.
# The root is taken explicitly with np.sqrt instead of passing `squared=False`, because that
# keyword is deprecated (and later removed) in newer scikit-learn releases; the value is the same.
baseline_rmse = np.sqrt(
    mean_squared_error(train["home_tax_value"], act_pred_res["baseline_prediction"])
)
baseline_rmse

386069.73075485

### Baseline RMSE is 386069.73

In [10]:
def grid_search(X, y, model, params_dic):
    """Fit a cross-validated grid search for ``model`` and return it.

    Parameters
    ----------
    X : feature matrix passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.
    model : the estimator whose hyper-parameters are searched.
    params_dic : parameter grid (name -> list of candidate values).

    Returns
    -------
    The fitted ``GridSearchCV`` object (exposes ``best_estimator_`` and
    ``best_params_``). Cross-validation and scoring use the GridSearchCV
    defaults; ``n_jobs=-1`` runs the fits on all available cores.
    """
    search = GridSearchCV(estimator=model, param_grid=params_dic, n_jobs=-1)
    search.fit(X, y)
    return search

In [22]:
# Hyper-parameter grids searched for each candidate model.
lasso_grid = {"alpha": [0.01, 0.1, 0.25, 0.5, 0.75, 1.0]}
tweedie_grid = {"power": [0, 1, 2, 3], "alpha": [0.01, 0.1, 0.25, 0.5, 0.75, 1.0]}
models = [LassoLars(random_state=123, normalize=False), TweedieRegressor()]

# These searches must actually run: the cells that follow read `lasso` and `tweddie`
# (best_estimator_ / best_params_), so leaving this cell commented out makes a
# fresh-kernel "Run All" fail with NameError.
# NOTE(review): `tweddie` is misspelled, but later cells use that exact name, so it is kept.
lasso = grid_search(X_train_scaled, y_train, models[0], lasso_grid)
tweddie = grid_search(X_train_scaled, y_train, models[1], tweedie_grid)

In [23]:
# Show the best LassoLars estimator selected by the grid search.
lasso.best_estimator_


LassoLars(normalize=False, random_state=123)

In [24]:
# Show the winning hyper-parameters from the LassoLars grid search.
lasso.best_params_

{'alpha': 1.0}

In [25]:
# Show the best TweedieRegressor selected by the grid search
# (`tweddie` is the name the grid-search cell assigns, spelling included).
tweddie.best_estimator_

TweedieRegressor(power=1)

In [26]:
# Show the winning hyper-parameters from the TweedieRegressor grid search.
tweddie.best_params_

{'alpha': 1.0, 'power': 1}

In [33]:
# Fit the three candidate models on the scaled training features.
# The LassoLars and Tweedie hyper-parameters are hard-coded to the grid-search winners shown above.
# Note: `lasso` is rebound here from the grid-search result to a plain fitted LassoLars.
lasso = LassoLars(alpha=1.0, normalize=False)
lasso.fit(X_train_scaled, y_train)

tweedie = TweedieRegressor(power=1, alpha=1.0)
tweedie.fit(X_train_scaled, y_train)

lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

In [34]:
# In-sample predictions from each fitted model (order: lasso, tweedie, linear regression).
lass_pred_train, tweed_pred_train, lr_pred_train = (
    fitted.predict(X_train_scaled) for fitted in (lasso, tweedie, lr)
)


In [38]:
# Training RMSE per model, in the order (tweedie, lasso, linear regression).
# np.sqrt(MSE) replaces `squared=False`, which newer scikit-learn releases deprecate and remove.
tuple(
    np.sqrt(mean_squared_error(y_train, train_pred))
    for train_pred in (tweed_pred_train, lass_pred_train, lr_pred_train)
)

(319198.00647314545, 318897.305862654, 318897.30585312896)

In [39]:
# Validation-split predictions from each fitted model (order: lasso, tweedie, linear regression).
lass_pred_val, tweed_pred_val, lr_pred_val = (
    fitted.predict(X_val_scaled) for fitted in (lasso, tweedie, lr)
)


In [41]:
# Validation RMSE per model, in the order (tweedie, lasso, linear regression).
# np.sqrt(MSE) replaces `squared=False`, which newer scikit-learn releases deprecate and remove.
tuple(
    np.sqrt(mean_squared_error(y_val, val_pred))
    for val_pred in (tweed_pred_val, lass_pred_val, lr_pred_val)
)

(318701.7141916137, 318602.79952656466, 318602.77199369104)

In [42]:
# Train-minus-validation RMSE gap per model, in the order (tweedie, lasso, linear regression).
# A positive value means the model scored a lower RMSE on validation than on train.
# np.sqrt(MSE) replaces `squared=False`, which newer scikit-learn releases deprecate and remove.
tuple(
    np.sqrt(mean_squared_error(y_train, train_pred))
    - np.sqrt(mean_squared_error(y_val, val_pred))
    for train_pred, val_pred in (
        (tweed_pred_train, tweed_pred_val),
        (lass_pred_train, lass_pred_val),
        (lr_pred_train, lr_pred_val),
    )
)

(496.2922815317288, 294.50633608934004, 294.53385943791363)

### TEST

In [44]:
# Final evaluation: RMSE of the Tweedie model on the held-out test split.
# NOTE(review): in the train and validation RMSE outputs earlier in this notebook, the Tweedie
# model scored slightly worse than LassoLars and LinearRegression — confirm it is the intended
# final model before reporting this number.
# np.sqrt(MSE) replaces `squared=False`, which newer scikit-learn releases deprecate and remove.
tweed_pred_test = tweedie.predict(X_test_scaled)
np.sqrt(mean_squared_error(y_test, tweed_pred_test))

315397.4604281871