In [14]:
#Here we import any Python libraries that we plan to use
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor 
from sklearn.dummy import DummyRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

In [15]:
# set random seed to ensure that results are repeatable
np.random.seed(1)

In [16]:
#load the cleaned data

X_train = pd.read_csv("kc_house_data_X_train.csv")

X_test = pd.read_csv("kc_house_data_X_test.csv")

y_train = pd.read_csv("kc_house_data_y_train.csv")

y_test = pd.read_csv("kc_house_data_y_test.csv")

# 7.0 Fitting Models the data

### As we selected price as the target variable, regression modeling would provide the best results and house price prediction. Because the target variable price includes continuous values, we are unable to choose classification models. Instead, depending on the data set we have chosen, we select MSC, RMSE, and R-Squared as performance measures. This dataset includes details on King County home sales, including attributes like square footage, location, and the number of bedrooms and bathrooms. The top performance measures that may be utilized to develop a model to forecast home prices using this dataset are MSC, RMSE, and R-Squared.

7.1 Fit and test a Linear Regression Using Stochastic Gradient Descent

In [17]:
# create an instance of the SGDRegressor class
reg = SGDRegressor(max_iter=1000, tol=1e-3, alpha=0.01, random_state=42)

# fit the model on the training data
reg.fit(X_train, y_train)

# make predictions on the test data
y_pred = reg.predict(X_test)

# evaluate the model using RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

MSE: 45729427451.15249
RMSE: 213844.40009304075
R-squared: 0.6975101074201431


  y = column_or_1d(y, warn=True)


7.2 Fit and test SGDRegression using L2 Regularization

In [18]:
# create an instance of the SGDRegressor class with L2 regularization
reg = SGDRegressor(max_iter=1000, tol=1e-3, alpha=0.01, penalty='l2', random_state=42)

# fit the model on the training data
reg.fit(X_train, y_train)

# make predictions on the test data
y_pred = reg.predict(X_test)

# evaluate the model using RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

MSE: 45729427451.15249
RMSE: 213844.40009304075
R-squared: 0.6975101074201431


  y = column_or_1d(y, warn=True)


7.3 Fit and test SGDRegression Using L1 Regularization

In [19]:
# create an instance of the SGDRegressor class with L1 regularization
reg = SGDRegressor(max_iter=1000, tol=1e-3, alpha=0.01, penalty='l1', random_state=42)

# fit the model on the training data
reg.fit(X_train, y_train)

# make predictions on the test data
y_pred = reg.predict(X_test)

# evaluate the model using RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

MSE: 45664161353.02602
RMSE: 213691.7437642971
R-squared: 0.6979418280016543


  y = column_or_1d(y, warn=True)


7.4 Fit and test SGDRegression using ElasticNet Regularization

In [20]:
# create an instance of the SGDRegressor with Elastic Net regularization:
reg = SGDRegressor(max_iter=1000, tol=1e-3, alpha=0.01, penalty='elasticnet', random_state=42)

# fit the model on the training data
reg.fit(X_train, y_train)

# make predictions on the test data
y_pred = reg.predict(X_test)

# evaluate the model using RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

MSE: 45719010762.441444
RMSE: 213820.0429390132
R-squared: 0.6975790114765651


  y = column_or_1d(y, warn=True)


7.5 Fit and test SGDRegression using Early Stopping

In [21]:
# create an instance of the SGDRegressor with Elastic Net regularization:
reg = SGDRegressor(max_iter=1000, tol=1e-3, early_stopping=True, validation_fraction=0.1, n_iter_no_change=5, random_state=42)

# fit the model on the training data
reg.fit(X_train, y_train)

# make predictions on the test data
y_pred = reg.predict(X_test)

# evaluate the model using MSE, RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

MSE: 45326063272.027756
RMSE: 212899.18570071552
R-squared: 0.7001782708766905


  y = column_or_1d(y, warn=True)


# 8.0 Polynomial Regression

8.1 Fit and test a polynomial Regression Using Stochastic Gradient Descent

In [22]:
#Create a pipeline with PolynomialFeatures and SGDRegressor
pipeline = make_pipeline(PolynomialFeatures(degree=2, include_bias=False), SGDRegressor(random_state=42))

#Fit the model on the training set
pipeline.fit(X_train, y_train)

#Make predictions on the testing set
y_pred = pipeline.predict(X_test)

# evaluate the model using MSE, RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

MSE: 1.6878132959399367e+25
RMSE: 4108300495265.5747
R-squared: -111645059000373.33


  y = column_or_1d(y, warn=True)


8.2 Fit and test a polynomial Regression Using using L2 Regularization

In [23]:
#Create a pipeline with PolynomialFeatures and SGDRegressor
poly = PolynomialFeatures(degree=2, include_bias=False)
sgd_reg = SGDRegressor(penalty='l2', alpha=0.01, random_state=42)
pipeline = make_pipeline(poly, sgd_reg)

#Fit the model on the training set
pipeline.fit(X_train, y_train)

#Make predictions on the testing set
y_pred = pipeline.predict(X_test)

# evaluate the model using MSE, RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

MSE: 1.5006435785918402e+25
RMSE: 3873814113495.6904
R-squared: -99264202547424.02


  y = column_or_1d(y, warn=True)


8.3 Fit and test a polynomial Regression Using using L1 Regularization

In [24]:
poly = PolynomialFeatures(degree=2, include_bias=False)
sgd_reg = SGDRegressor(penalty='l1', alpha=0.01, random_state=42)
pipeline = make_pipeline(poly, sgd_reg)

#Fit the model on the training set
pipeline.fit(X_train, y_train)

#Make predictions on the testing set
y_pred = pipeline.predict(X_test)

# evaluate the model using MSE, RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

  y = column_or_1d(y, warn=True)


MSE: 1.2529455728353555e+25
RMSE: 3539697123816.3237
R-squared: -82879535751943.39


8.4 Fit and test SGDRegression using ElasticNet Regularization

In [25]:
poly = PolynomialFeatures(degree=2, include_bias=False)
sgd_reg = SGDRegressor(penalty='elasticnet', alpha=0.01, random_state=42)
pipeline = make_pipeline(poly, sgd_reg)

#Fit the model on the training set
pipeline.fit(X_train, y_train)

#Make predictions on the testing set
y_pred = pipeline.predict(X_test)

# evaluate the model using MSE, RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

MSE: 1.0296670844118014e+24
RMSE: 1014725127515.7235
R-squared: -6811016518616.939


  y = column_or_1d(y, warn=True)


8.5 Fit and test SGDRegression using Early Stopping

In [26]:
pipeline = SGDRegressor(alpha=0.01, max_iter=1000, tol=1e-3, early_stopping=True, 
                     validation_fraction=0.2, n_iter_no_change=10, random_state=42)

#Fit the model on the training set
pipeline.fit(X_train, y_train)

#Make predictions on the testing set
y_pred = pipeline.predict(X_test)

# evaluate the model using MSE, RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

MSE: 46348214716.046524
RMSE: 215286.3551552827
R-squared: 0.6934169686313945


  y = column_or_1d(y, warn=True)


### On comparing with all the predected models I suggest SGDRegressor with Elastic Net regularization is the best model with RMSE: 212899.18570071552 and R-squared: 0.7001782708766905