In [1]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from wrangle import Wrangle
from prepare import Prepare
import pandas as pd
train, val, test = Wrangle().wrangle_zillow()
train.head()

Unnamed: 0,bedrooms,bathrooms,square_feet,home_tax_value,year_built,fips
24082,3.0,2.0,1040.0,315000.0,1971.0,6037.0
9432,4.0,1.0,1354.0,51513.0,1947.0,6037.0
50142,3.0,1.0,1180.0,64709.0,1949.0,6037.0
38310,3.0,2.0,1922.0,218663.0,1952.0,6037.0
7652,4.0,4.0,3710.0,1700000.0,1987.0,6037.0


In [2]:
p = Prepare()
(X_train, y_train), (X_val, y_val), (X_test, y_test) = p.get_Xy(train, val, test)
X_train.head()

Unnamed: 0,bedrooms,bathrooms,square_feet
24082,3.0,2.0,1040.0
9432,4.0,1.0,1354.0
50142,3.0,1.0,1180.0
38310,3.0,2.0,1922.0
7652,4.0,4.0,3710.0


In [3]:
X_train_scaled, X_val_scaled, X_test_scaled, _ = p.scaling(X_train, X_val, X_test)

In [5]:
baseline_pred = train["home_tax_value"].mean()
train['baseline_prediction'] = baseline_pred
train.head()

Unnamed: 0,bedrooms,bathrooms,square_feet,home_tax_value,year_built,fips,baseline_prediction
24082,3.0,2.0,1040.0,315000.0,1971.0,6037.0,455123.241223
9432,4.0,1.0,1354.0,51513.0,1947.0,6037.0,455123.241223
50142,3.0,1.0,1180.0,64709.0,1949.0,6037.0,455123.241223
38310,3.0,2.0,1922.0,218663.0,1952.0,6037.0,455123.241223
7652,4.0,4.0,3710.0,1700000.0,1987.0,6037.0,455123.241223


In [6]:
baseline_rmse = mean_squared_error(train["home_tax_value"], train["baseline_prediction"], squared=False)
baseline_rmse

386069.73075485

### Baseline RMSE is 386069.73

In [6]:
def grid_search(X, y, model, params_dic):
    grid = GridSearchCV(model, params_dic, n_jobs=-1)
    return grid.fit(X, y)

In [7]:
lasso_grid = {"alpha": [0.01, 0.1, 0.5, 1.0]}
tweedie_grid = {"power": [0, 1, 2, 3], "alpha": [0.01, 0.1, 0.5, 1.0]}
models = [LassoLars(random_state=123, normalize=False), TweedieRegressor()]

lasso_fit_model = grid_search(X_train_scaled, y_train, models[0], lasso_grid)
tweddie = grid_search(X_train_scaled, y_train, models[1], tweedie_grid)
lr = LinearRegression().fit(X_train_scaled, y_train)

In [8]:
lasso_fit_model.best_estimator_


LassoLars(normalize=False, random_state=123)

In [9]:
lasso_fit_model.best_params_

{'alpha': 1.0}

In [10]:
tweddie.best_estimator_

TweedieRegressor(power=1)

In [11]:
tweddie.best_params_

{'alpha': 1.0, 'power': 1}

In [12]:
lasso = LassoLars(alpha=0.5, normalize=False).fit(X_train_scaled, y_train)
tweedie = TweedieRegressor(power=1, alpha=0.5).fit(X_train_scaled, y_train)

In [13]:
lr.score(X_train_scaled, y_train)

0.3177081322343941

In [14]:
lasso.score(X_train_scaled, y_train)

0.31770813222420446

In [15]:
tweedie.score(X_train_scaled, y_train)

0.3240118509982971