# Regression model exercises

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

import wrangle

In [2]:
df = wrangle.wrangle_zillow_data()

Returning saved csv file.


Scale the data

In [3]:
df, scalers = wrangle.zillow_scale(df, columns=['calculatedfinishedsquarefeet'], return_scalers=True)
df

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,taxvaluedollarcnt,county,calculatedfinishedsquarefeet_scaled
4,4,2.0,3633,2005,296425,Los Angeles,2.236203
6,3,4.0,1620,2011,847770,Los Angeles,0.014349
7,3,2.0,2077,1926,646760,Los Angeles,0.518764
18,3,1.0,1244,1950,169471,Los Angeles,-0.400662
19,3,2.0,1300,1950,233266,Los Angeles,-0.338852
...,...,...,...,...,...,...,...
2152856,4,4.0,4375,2015,422400,Los Angeles,3.055188
2152858,4,3.0,2262,2015,960756,Orange,0.722958
2152859,4,4.5,3127,2014,536061,Orange,1.677704
2152861,3,2.5,1974,2015,424353,Orange,0.405077


In [4]:
train, validate, test = wrangle.split_zillow_data(df)

In [5]:
train

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,taxvaluedollarcnt,county,calculatedfinishedsquarefeet_scaled
1174234,4,2.0,2015,1968,273762,Ventura,0.450331
892222,3,2.0,1423,1940,276396,Orange,-0.203091
2056706,4,2.0,2508,1987,456054,Ventura,0.994481
666486,3,2.5,1789,1974,357041,Orange,0.200883
2068010,4,4.0,2829,1961,690548,Los Angeles,1.348786
...,...,...,...,...,...,...,...
421920,5,3.0,2834,1956,481909,Los Angeles,1.354305
1204808,2,1.5,1366,1977,436161,Orange,-0.266004
2002848,3,2.0,1699,1971,496344,Ventura,0.101545
1731210,3,2.0,1362,1968,66093,Orange,-0.270419


Make `X_train`, `y_train`, `X_validate` and `y_validate`

In [6]:
X_train = train.drop(columns = ['taxvaluedollarcnt', 'county', 'calculatedfinishedsquarefeet'])
y_train = pd.DataFrame(train[['taxvaluedollarcnt']])

X_validate = validate.drop(columns = ['taxvaluedollarcnt', 'county', 'calculatedfinishedsquarefeet'])
y_validate = pd.DataFrame(validate[['taxvaluedollarcnt']])

In [7]:
y_train

Unnamed: 0,taxvaluedollarcnt
1174234,273762
892222,276396
2056706,456054
666486,357041
2068010,690548
...,...
421920,481909
1204808,436161
2002848,496344
1731210,66093


In [8]:
X_train

Unnamed: 0,bedroomcnt,bathroomcnt,yearbuilt,calculatedfinishedsquarefeet_scaled
1174234,4,2.0,1968,0.450331
892222,3,2.0,1940,-0.203091
2056706,4,2.0,1987,0.994481
666486,3,2.5,1974,0.200883
2068010,4,4.0,1961,1.348786
...,...,...,...,...
421920,5,3.0,1956,1.354305
1204808,2,1.5,1977,-0.266004
2002848,3,2.0,1971,0.101545
1731210,3,2.0,1968,-0.270419


In [9]:
y_train['taxvaluedollarcnt'].mean()

398120.95571158384

## Baseline model

use the mean and the median to see which is better

In [10]:
#mean baseline model
y_train['baseline_mean_pred'] = y_train['taxvaluedollarcnt'].mean()
y_validate['baseline_mean_pred'] = y_validate['taxvaluedollarcnt'].mean()

#median baseline model
y_train['baseline_median_pred'] = y_train['taxvaluedollarcnt'].median()
y_validate['baseline_median_pred'] = y_validate['taxvaluedollarcnt'].median()


In [11]:
#determine RSMSE
results = []

baseline_mean = {
    'model':'baseline_mean',
    'RMSE_train': mean_squared_error(y_train['taxvaluedollarcnt'], y_train['baseline_mean_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['baseline_mean_pred'])**(0.5)

}
baseline_median = {
    'model':'baseline_median',
    'RMSE_train': mean_squared_error(y_train['taxvaluedollarcnt'], y_train['baseline_median_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['baseline_median_pred'])**(0.5)

}
results.append(baseline_mean)
results.append(baseline_median)

In [12]:
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,312099.231468,311678.06514
1,baseline_median,321236.237971,320680.947753


## Linear Regression model

In [13]:
lm = LinearRegression(normalize = True)

lm.fit(X_train, y_train['taxvaluedollarcnt'])

y_train['lm_train_pred'] = lm.predict(X_train)

y_validate['lm_validate_pred'] = lm.predict(X_validate)

y_train

Unnamed: 0,taxvaluedollarcnt,baseline_mean_pred,baseline_median_pred,lm_train_pred
1174234,273762,398120.955712,322050.0,393222.793630
892222,276396,398120.955712,322050.0,323918.088586
2056706,456054,398120.955712,322050.0,503264.979786
666486,357041,398120.955712,322050.0,428034.805547
2068010,690548,398120.955712,322050.0,695257.988865
...,...,...,...,...
421920,481909,398120.955712,322050.0,581050.652406
1204808,436161,398120.955712,322050.0,334903.303898
2002848,496344,398120.955712,322050.0,380231.602378
1731210,66093,398120.955712,322050.0,302414.285242


In [14]:
lm_rmse = {
    'model':'linear regression',
    'RMSE_train': mean_squared_error(y_train['taxvaluedollarcnt'], y_train['lm_train_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['lm_validate_pred'])**(0.5)

}
results.append(lm_rmse)
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,312099.231468,311678.06514
1,baseline_median,321236.237971,320680.947753
2,linear regression,249432.591414,249380.823468


## LassoLars model

In [15]:
lars = LassoLars(alpha=1.0)

lars.fit(X_train, y_train['taxvaluedollarcnt'])

y_train['lars_train_pred'] = lars.predict(X_train)

y_validate['lars_validate_pred'] = lars.predict(X_validate)

y_train

Unnamed: 0,taxvaluedollarcnt,baseline_mean_pred,baseline_median_pred,lm_train_pred,lars_train_pred
1174234,273762,398120.955712,322050.0,393222.793630,396462.835722
892222,276396,398120.955712,322050.0,323918.088586,322322.770932
2056706,456054,398120.955712,322050.0,503264.979786,507245.462213
666486,357041,398120.955712,322050.0,428034.805547,427507.086379
2068010,690548,398120.955712,322050.0,695257.988865,689778.328269
...,...,...,...,...,...
421920,481909,398120.955712,322050.0,581050.652406,581484.699338
1204808,436161,398120.955712,322050.0,334903.303898,335305.136364
2002848,496344,398120.955712,322050.0,380231.602378,381124.915766
1731210,66093,398120.955712,322050.0,302414.285242,303818.777400


In [16]:
lars_rmse = {
    'model':'LassoLars regression',
    'RMSE_train': mean_squared_error(y_train['taxvaluedollarcnt'], y_train['lars_train_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['lars_validate_pred'])**(0.5)

}
results.append(lars_rmse)
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,312099.231468,311678.06514
1,baseline_median,321236.237971,320680.947753
2,linear regression,249432.591414,249380.823468
3,LassoLars regression,249453.620291,249390.926181


## Polynomial Regression

In [17]:
# make the polynomial features to get a new set of features
pf = PolynomialFeatures(degree=2)

# fit and transform X_train_scaled
X_train_degree2 = pf.fit_transform(X_train)

# transform X_validate_scaled & X_test_scaled
X_validate_degree2 = pf.transform(X_validate)

In [18]:
# create the model object
lm2 = LinearRegression(normalize=True)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lm2.fit(X_train_degree2, y_train['taxvaluedollarcnt'])

# predict train
y_train['poly_train_pred'] = lm2.predict(X_train_degree2)

# predict validate
y_validate['poly_validate_pred'] = lm2.predict(X_validate_degree2)

In [19]:
y_train

Unnamed: 0,taxvaluedollarcnt,baseline_mean_pred,baseline_median_pred,lm_train_pred,lars_train_pred,poly_train_pred
1174234,273762,398120.955712,322050.0,393222.793630,396462.835722,381352.305427
892222,276396,398120.955712,322050.0,323918.088586,322322.770932,322648.179255
2056706,456054,398120.955712,322050.0,503264.979786,507245.462213,496754.156838
666486,357041,398120.955712,322050.0,428034.805547,427507.086379,408772.507546
2068010,690548,398120.955712,322050.0,695257.988865,689778.328269,696678.845422
...,...,...,...,...,...,...
421920,481909,398120.955712,322050.0,581050.652406,581484.699338,547956.930804
1204808,436161,398120.955712,322050.0,334903.303898,335305.136364,328771.226790
2002848,496344,398120.955712,322050.0,380231.602378,381124.915766,365508.922674
1731210,66093,398120.955712,322050.0,302414.285242,303818.777400,290501.476413


In [20]:
poly_rmse = {
    'model':'Polynomial regression',
    'RMSE_train': mean_squared_error(y_train['taxvaluedollarcnt'], y_train['poly_train_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['poly_validate_pred'])**(0.5)

}
results.append(poly_rmse)
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,312099.231468,311678.06514
1,baseline_median,321236.237971,320680.947753
2,linear regression,249432.591414,249380.823468
3,LassoLars regression,249453.620291,249390.926181
4,Polynomial regression,247593.831931,247453.300464


## TweedieRegressor 

In [21]:
glm = TweedieRegressor()

glm.fit(X_train, y_train['taxvaluedollarcnt'])

y_train['glm_train_pred'] = glm.predict(X_train)

y_validate['glm_validate_pred'] = glm.predict(X_validate)

y_train

Unnamed: 0,taxvaluedollarcnt,baseline_mean_pred,baseline_median_pred,lm_train_pred,lars_train_pred,poly_train_pred,glm_train_pred
1174234,273762,398120.955712,322050.0,393222.793630,396462.835722,381352.305427,422078.084825
892222,276396,398120.955712,322050.0,323918.088586,322322.770932,322648.179255,327519.456800
2056706,456054,398120.955712,322050.0,503264.979786,507245.462213,496754.156838,488452.118020
666486,357041,398120.955712,322050.0,428034.805547,427507.086379,408772.507546,432594.783641
2068010,690548,398120.955712,322050.0,695257.988865,689778.328269,696678.845422,557811.198738
...,...,...,...,...,...,...,...
421920,481909,398120.955712,322050.0,581050.652406,581484.699338,547956.930804,510878.911480
1204808,436161,398120.955712,322050.0,334903.303898,335305.136364,328771.226790,356574.859222
2002848,496344,398120.955712,322050.0,380231.602378,381124.915766,365508.922674,398698.559920
1731210,66093,398120.955712,322050.0,302414.285242,303818.777400,290501.476413,370013.838181


In [22]:
glm_rmse = {
    'model':'TweedieRegressor regression',
    'RMSE_train': mean_squared_error(y_train['taxvaluedollarcnt'], y_train['glm_train_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['glm_validate_pred'])**(0.5)

}
results.append(glm_rmse)
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,312099.231468,311678.06514
1,baseline_median,321236.237971,320680.947753
2,linear regression,249432.591414,249380.823468
3,LassoLars regression,249453.620291,249390.926181
4,Polynomial regression,247593.831931,247453.300464
5,TweedieRegressor regression,267666.907672,267314.089061


In [23]:
results_df = pd.DataFrame(results)
results_df.sort_values(['RMSE_])