# ML Project: Regression of Zillow Home Value Index for Virginia Housing
## Nanda, Siddharth (sn9dq); Wilson, Dale (dsw6ru)

In [1]:
# Import some common packages
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# to make this notebook's output stable across runs
np.random.seed(42)

In [10]:
# Read in data/split
housing = pd.read_csv('Zip_Zhvi_Summary_AllHomes.csv', encoding='latin-1')
housing = housing.drop(columns=['County', 'City', 'PeakMonth', 'Metro', 'PeakQuarter', 'LastTimeAtCurrZHVI'])
housing = housing.loc[housing['State'] == 'VA'].dropna()
train_set, test_set = train_test_split(housing, test_size = 0.5)
print("Description of label")
display(housing['Zhvi'].describe())
y_train = train_set['Zhvi']
X_train = train_set.drop(columns=['Zhvi'])

y_test = test_set['Zhvi']
X_test = test_set.drop(columns=['Zhvi'])

Description of label


count    4.270000e+02
mean     2.744948e+05
std      1.704222e+05
min      4.280000e+04
25%      1.703000e+05
50%      2.279000e+05
75%      3.273500e+05
max      1.072900e+06
Name: Zhvi, dtype: float64

In [11]:
# Pipelines for data transformation
housing = housing.drop(columns=['Zhvi'])
num_pipeline = Pipeline([
    ('standard_scaler', StandardScaler())
])

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, list(housing.select_dtypes(include=[np.number]))),
    ('cat', OneHotEncoder(), list(housing.select_dtypes(include=[np.object]).columns))
])

In [12]:
# fit_transform train set, and transform test set (scaling!)
X_train = full_pipeline.fit_transform(X_train)
X_test = full_pipeline.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  Xt = transform.transform(Xt)


In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)
print(r2_score(y_test, y_pred))
print(np.sqrt(mean_squared_error(y_test, y_pred)))

0.9938725219034839
13840.250146965576


In [14]:
from sklearn.linear_model import SGDRegressor
mySGDModel = SGDRegressor() # CV?
mySGDModel.fit(X_train, y_train)
y_predict = mySGDModel.predict(X_test)
print(r2_score(y_test, y_predict))
print(np.sqrt(mean_squared_error(y_test, y_predict)))

0.9563282047216678
36949.07568207716




In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
grid_search = GridSearchCV(Lasso(max_iter=10000), [{'alpha' : np.arange(0.1, 1.1, 0.1)}], cv=5)
grid_search.fit(X_train, y_train)
print("The best estimator is: ", grid_search.best_estimator_)
print("The best parameters are: ", grid_search.best_params_)
y_pred = grid_search.best_estimator_.predict(X_test)
print(r2_score(y_test, y_pred))
print(np.sqrt(mean_squared_error(y_test, y_pred)))

The best estimator is:  Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=10000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
The best parameters are:  {'alpha': 1.0}
0.9938729649433092
13839.749786723238




In [16]:
from sklearn.linear_model import Ridge
grid_search = GridSearchCV(Ridge(max_iter=10000), [{'solver' : ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}], cv=5)
grid_search.fit(X_train, y_train)
print("The best estimator is: ", grid_search.best_estimator_)
print("The best parameters are: ", grid_search.best_params_)
y_pred = grid_search.best_estimator_.predict(X_test)
print(r2_score(y_test, y_pred))
print(np.sqrt(mean_squared_error(y_test, y_pred)))

The best estimator is:  Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=10000,
   normalize=False, random_state=None, solver='saga', tol=0.001)
The best parameters are:  {'solver': 'saga'}
0.993564054259069
14184.343594946205


In [17]:
from sklearn.linear_model import ElasticNet
grid_search = GridSearchCV(ElasticNet(), [{'l1_ratio': np.arange(0.1, 1.0, 0.1)}], cv=5)
grid_search.fit(X_train, y_train)
print("The best estimator is: ", grid_search.best_estimator_)
print("The best parameters are: ", grid_search.best_params_)
y_pred = grid_search.best_estimator_.predict(X_test)
print(r2_score(y_test, y_pred))
print(np.sqrt(mean_squared_error(y_test, y_pred)))

The best estimator is:  ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.9,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
The best parameters are:  {'l1_ratio': 0.9}
0.9779735963898899
26240.66593169994


