# House price predection using regression model

# 1.0 Import and install python libraries

In [49]:
#Here we import any Python libraries that we plan to use
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor 
from sklearn.dummy import DummyRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression

# 2.0 Set Random Seed

In [50]:
# set random seed to ensure that results are repeatable
np.random.seed(1)

# 3.0 Load data

In [51]:
# load data
housedata = pd.read_csv(r"C:\DSP\Assignment 1\kc_house_data.csv")

# 4.0 Conduct initial exploration of the data

In [52]:
# look at the data
housedata.head(3)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062


### Determine the input and target variables. The  "price" for the house serves as the dataset's target variable. We are attempting to predict this variable using the other variables in the dataset.

The input variables, also known as predictor variables or features, are all the other variables in the dataset that can be used to predict the sale price of the house. These include the number of bedrooms and bathrooms, the square footage of the living area, the year the house was built, and the location of the house (latitude and longitude), among others.

In [53]:
# generate a basic summary of the data
housedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

In [54]:
# Check the shape of the dataset
housedata.shape

(21613, 21)

In [55]:
# Check the data types of each column
housedata.dtypes

id                 int64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

In [56]:
# generate a statistical summary of the numeric value in the data
housedata.describe()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
count,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,4580302000.0,540088.1,3.370842,2.114757,2079.899736,15106.97,1.494309,0.007542,0.234303,3.40943,7.656873,1788.390691,291.509045,1971.005136,84.402258,98077.939805,47.560053,-122.213896,1986.552492,12768.455652
std,2876566000.0,367127.2,0.930062,0.770163,918.440897,41420.51,0.539989,0.086517,0.766318,0.650743,1.175459,828.090978,442.575043,29.373411,401.67924,53.505026,0.138564,0.140828,685.391304,27304.179631
min,1000102.0,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,1.0,290.0,0.0,1900.0,0.0,98001.0,47.1559,-122.519,399.0,651.0
25%,2123049000.0,321950.0,3.0,1.75,1427.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,0.0,1951.0,0.0,98033.0,47.471,-122.328,1490.0,5100.0
50%,3904930000.0,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,1560.0,0.0,1975.0,0.0,98065.0,47.5718,-122.23,1840.0,7620.0
75%,7308900000.0,645000.0,4.0,2.5,2550.0,10688.0,2.0,0.0,0.0,4.0,8.0,2210.0,560.0,1997.0,0.0,98118.0,47.678,-122.125,2360.0,10083.0
max,9900000000.0,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,4820.0,2015.0,2015.0,98199.0,47.7776,-121.315,6210.0,871200.0


In [57]:
# Check the missing values by summing the total na's for each variable
housedata.isna().sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

### Categorical variables are frequently used in regression analysis as explanatory or predictor variables in order to explain the variance in the response variable. In regression models, dummy variables are frequently used to represent categorical variables. To denote the existence or absence of a specific category, a dummy variable has the value of 1 or 0.

In [58]:
# encode categorical variables
cat_cols = ["waterfront", "view", "condition", "grade"]
for col in cat_cols:
    housedata[col] = LabelEncoder().fit_transform(housedata[col])

# 5.0 Preprocess the dataset

### In order to enhance the model's performance, decrease overfitting, and improve interpretability, it is essential to drop predictor variables from a regression model.

In [59]:
#Drop id and Zipcode columns
housedata.drop(['id','date'], axis=1, inplace = True)

In [60]:
housedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          21613 non-null  float64
 1   bedrooms       21613 non-null  int64  
 2   bathrooms      21613 non-null  float64
 3   sqft_living    21613 non-null  int64  
 4   sqft_lot       21613 non-null  int64  
 5   floors         21613 non-null  float64
 6   waterfront     21613 non-null  int64  
 7   view           21613 non-null  int64  
 8   condition      21613 non-null  int64  
 9   grade          21613 non-null  int64  
 10  sqft_above     21613 non-null  int64  
 11  sqft_basement  21613 non-null  int64  
 12  yr_built       21613 non-null  int64  
 13  yr_renovated   21613 non-null  int64  
 14  zipcode        21613 non-null  int64  
 15  lat            21613 non-null  float64
 16  long           21613 non-null  float64
 17  sqft_living15  21613 non-null  int64  
 18  sqft_l

In [61]:
# scale numeric variables
#num_cols = ["bedrooms", "bathrooms", "sqft_living", "sqft_lot", "floors"]
#scaler = StandardScaler()
#housedata[num_cols] = scaler.fit_transform(housedata[num_cols])

6.0 Split and Save the data

In [62]:
# split the data into training and testing sets
target = 'price'
X = housedata.drop(target, axis=1)
y = housedata[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [63]:
#standardizing the numerical variables
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [64]:
# convert the numpy array to a pandas DataFrame

X_train = pd.DataFrame(X_train)

y_train = pd.DataFrame(y_train)

X_test = pd.DataFrame(X_test)

y_test = pd.DataFrame(y_test)

In [65]:
# save the DataFrame to a CSV file

X_train.to_csv(r'C:\DSP\Assignment 1\kc_house_data_X_train.csv', index=False)

y_train.to_csv(r'C:\DSP\Assignment 1\kc_house_data_y_train.csv', index=False)

X_test.to_csv(r'C:\DSP\Assignment 1\kc_house_data_X_test.csv', index=False)

y_test.to_csv(r'C:\DSP\Assignment 1\kc_house_data_y_test.csv', index=False)

In [66]:
# set random seed to ensure that results are repeatable
np.random.seed(1)

In [67]:
#load the cleaned data

X_train = pd.read_csv(r"C:\DSP\Assignment 1\kc_house_data_X_train.csv")

X_test = pd.read_csv(r"C:\DSP\Assignment 1\kc_house_data_X_test.csv")

y_train = pd.read_csv(r"C:\DSP\Assignment 1\kc_house_data_y_train.csv")

y_test = pd.read_csv(r"C:\DSP\Assignment 1\kc_house_data_y_test.csv")

## 7.0 Fitting Models the data

As we selected price as the target variable, regression modeling would provide the best results and house price prediction. Because the target variable price includes continuous values, we are unable to choose classification models. Instead, depending on the data set we have chosen, we select MSC, RMSE, and R-Squared as performance measures. This dataset includes details on King County home sales, including attributes like square footage, location, and the number of bedrooms and bathrooms. The top performance measures that may be utilized to develop a model to forecast home prices using this dataset are MSC, RMSE, and R-Squared.

## MLPRegressor

In [69]:
mlp_regr = MLPRegressor(random_state=1, max_iter=500)
mlp_regr.fit(X_train, y_train)
y_pred = mlp_regr.predict(X_test)

# evaluate the model using RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

  y = column_or_1d(y, warn=True)


MSE: 94147177993.96973
RMSE: 306834.12130004336
R-squared: 0.37723756133809094




7.1 Fit and test a Linear Regression Using Stochastic Gradient Descent

In [70]:
# create an instance of the SGDRegressor class
reg = SGDRegressor(max_iter=1000, tol=1e-3, alpha=0.01, random_state=42)

# fit the model on the training data
reg.fit(X_train, y_train)

# make predictions on the test data
y_pred = reg.predict(X_test)

# evaluate the model using RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

MSE: 45729427451.15249
RMSE: 213844.40009304075
R-squared: 0.6975101074201431


  y = column_or_1d(y, warn=True)


7.2 Fit and test SGDRegression using L2 Regularization

In [71]:
# create an instance of the SGDRegressor class with L2 regularization
reg = SGDRegressor(max_iter=1000, tol=1e-3, alpha=0.01, penalty='l2', random_state=42)

# fit the model on the training data
reg.fit(X_train, y_train)

# make predictions on the test data
y_pred = reg.predict(X_test)

# evaluate the model using RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

MSE: 45729427451.15249
RMSE: 213844.40009304075
R-squared: 0.6975101074201431


  y = column_or_1d(y, warn=True)


7.3 Fit and test SGDRegression Using L1 Regularization

In [72]:
# create an instance of the SGDRegressor class with L1 regularization
reg = SGDRegressor(max_iter=1000, tol=1e-3, alpha=0.01, penalty='l1', random_state=42)

# fit the model on the training data
reg.fit(X_train, y_train)

# make predictions on the test data
y_pred = reg.predict(X_test)

# evaluate the model using RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

MSE: 45664161353.02602
RMSE: 213691.7437642971
R-squared: 0.6979418280016543


  y = column_or_1d(y, warn=True)


7.4 Fit and test SGDRegression using ElasticNet Regularization

In [73]:
# create an instance of the SGDRegressor with Elastic Net regularization:
reg = SGDRegressor(max_iter=1000, tol=1e-3, alpha=0.01, penalty='elasticnet', random_state=42)

# fit the model on the training data
reg.fit(X_train, y_train)

# make predictions on the test data
y_pred = reg.predict(X_test)

# evaluate the model using RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

MSE: 45719010762.441444
RMSE: 213820.0429390132
R-squared: 0.6975790114765651


  y = column_or_1d(y, warn=True)


7.5 Fit and test SGDRegression using Early Stopping

In [74]:
# create an instance of the SGDRegressor with Elastic Net regularization:
reg = SGDRegressor(max_iter=1000, tol=1e-3, early_stopping=True, validation_fraction=0.1, n_iter_no_change=5, random_state=42)

# fit the model on the training data
reg.fit(X_train, y_train)

# make predictions on the test data
y_pred = reg.predict(X_test)

# evaluate the model using MSE, RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

MSE: 45326063272.027756
RMSE: 212899.18570071552
R-squared: 0.7001782708766905


  y = column_or_1d(y, warn=True)


## 8.0 Polynomial Regression

8.1 Fit and test a polynomial Regression Using Stochastic Gradient Descent

In [75]:
#Create a pipeline with PolynomialFeatures and SGDRegressor
pipeline = make_pipeline(PolynomialFeatures(degree=2, include_bias=False), SGDRegressor(random_state=42))

#Fit the model on the training set
pipeline.fit(X_train, y_train)

#Make predictions on the testing set
y_pred = pipeline.predict(X_test)

# evaluate the model using MSE, RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

MSE: 1.6878132959399367e+25
RMSE: 4108300495265.5747
R-squared: -111645059000373.33


  y = column_or_1d(y, warn=True)


8.2 Fit and test a polynomial Regression Using using L2 Regularization

In [76]:
#Create a pipeline with PolynomialFeatures and SGDRegressor
poly = PolynomialFeatures(degree=2, include_bias=False)
sgd_reg = SGDRegressor(penalty='l2', alpha=0.01, random_state=42)
pipeline = make_pipeline(poly, sgd_reg)

#Fit the model on the training set
pipeline.fit(X_train, y_train)

#Make predictions on the testing set
y_pred = pipeline.predict(X_test)

# evaluate the model using MSE, RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

MSE: 1.5006435785918402e+25
RMSE: 3873814113495.6904
R-squared: -99264202547424.02


  y = column_or_1d(y, warn=True)


8.3 Fit and test a polynomial Regression Using using L1 Regularization

In [77]:
poly = PolynomialFeatures(degree=2, include_bias=False)
sgd_reg = SGDRegressor(penalty='l1', alpha=0.01, random_state=42)
pipeline = make_pipeline(poly, sgd_reg)

#Fit the model on the training set
pipeline.fit(X_train, y_train)

#Make predictions on the testing set
y_pred = pipeline.predict(X_test)

# evaluate the model using MSE, RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

  y = column_or_1d(y, warn=True)


MSE: 1.2529455728353555e+25
RMSE: 3539697123816.3237
R-squared: -82879535751943.39


8.4 Fit and test SGDRegression using ElasticNet Regularization

In [78]:
poly = PolynomialFeatures(degree=2, include_bias=False)
sgd_reg = SGDRegressor(penalty='elasticnet', alpha=0.01, random_state=42)
pipeline = make_pipeline(poly, sgd_reg)

#Fit the model on the training set
pipeline.fit(X_train, y_train)

#Make predictions on the testing set
y_pred = pipeline.predict(X_test)

# evaluate the model using MSE, RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

MSE: 1.0296670844118014e+24
RMSE: 1014725127515.7235
R-squared: -6811016518616.939


  y = column_or_1d(y, warn=True)


8.5 Fit and test SGDRegression using Early Stopping

In [79]:
pipeline = SGDRegressor(alpha=0.01, max_iter=1000, tol=1e-3, early_stopping=True, 
                     validation_fraction=0.2, n_iter_no_change=10, random_state=42)

#Fit the model on the training set
pipeline.fit(X_train, y_train)

#Make predictions on the testing set
y_pred = pipeline.predict(X_test)

# evaluate the model using MSE, RMSE and R-squared metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print("RMSE:", rmse)
print("R-squared:", r2)

MSE: 46348214716.046524
RMSE: 215286.3551552827
R-squared: 0.6934169686313945


  y = column_or_1d(y, warn=True)


As per the results above the best model is the one with lowest MSE and RMSE value and highest r-squared values. Based on that after using neural network model also we can clearly see that the best performed model is SGDRegression using ElasticNet Regularization model.