# Project Title
House Price Prediction Using A Stochastic Gradient Descent model

# Data Preparation
## 1.1 Read Data
### Import necessary libraries and load data

In [1]:
# Import necessary libraries
# Data Manipulation and Analysis
import numpy as np
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt

# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Model Training and Evaluation
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from scipy.stats import loguniform
from sklearn.metrics import mean_squared_error

# Model Persistence
import pickle

# Other
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the data
housing = pd.read_csv('housing.csv')

# Display the first 5 rows of the data
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


# Split data into training and testing sets

In [3]:
#Spliting dataset into training set and testing set
train, test = train_test_split(housing, test_size = 0.2, random_state = 42)

In [4]:
# Check the size of the training and test sets with the shape() function:
train.shape, test.shape

((16512, 10), (4128, 10))

The training set has 16512 rows and 10 features (or 80% of the whole dataset) and test set has 4128 rows and also 10 features (or 20% of the whole dataset).

## 1.2 Data Cleaning:

In [5]:
# Check for missing data/values
housing.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [6]:
# Column information of the dataset
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [7]:
housing['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

The ocean_proximity column does not have numerical values, instead it has text attributes. Therefore, we create a copy of the data without the the column.

In [8]:
# Use the SimpleImputer method to impute in the missing values in the total_bedrooms columns with the median value in the column
imputer = SimpleImputer(strategy="median")

# Create a copy of the data without the text attribute ocean_proximity
train_num = train.drop('ocean_proximity', axis=1)
train_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 14196 to 15795
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16512 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   median_house_value  16512 non-null  float64
dtypes: float64(9)
memory usage: 1.3 MB


The ocean_proximity column has been dropped.

In [9]:
# fit the imputer instance to the training data using the fit() method
imputer.fit(train_num)

SimpleImputer(strategy='median')

In [10]:
imputer.statistics_

array([-1.1851e+02,  3.4260e+01,  2.9000e+01,  2.1290e+03,  4.3700e+02,
        1.1670e+03,  4.1000e+02,  3.5458e+00,  1.7985e+05])

In [11]:
# Validate the mean to confirm if the imputer has correctly calculated the median using Python's .median() function
train_num.median().values

array([-1.1851e+02,  3.4260e+01,  2.9000e+01,  2.1290e+03,  4.3700e+02,
        1.1670e+03,  4.1000e+02,  3.5458e+00,  1.7985e+05])

In [12]:
# Use the train-data imputer to transform the train data set by replacing the missing values by the learned values.
X = imputer.transform(train_num)
X

array([[-1.1703e+02,  3.2710e+01,  3.3000e+01, ...,  6.2300e+02,
         3.2596e+00,  1.0300e+05],
       [-1.1816e+02,  3.3770e+01,  4.9000e+01, ...,  7.5600e+02,
         3.8125e+00,  3.8210e+05],
       [-1.2048e+02,  3.4660e+01,  4.0000e+00, ...,  3.3600e+02,
         4.1563e+00,  1.7260e+05],
       ...,
       [-1.1838e+02,  3.4030e+01,  3.6000e+01, ...,  5.2700e+02,
         2.9344e+00,  2.2210e+05],
       [-1.2196e+02,  3.7580e+01,  1.5000e+01, ...,  5.5900e+02,
         5.7192e+00,  2.8350e+05],
       [-1.2242e+02,  3.7770e+01,  5.2000e+01, ...,  1.2420e+03,
         2.5755e+00,  3.2500e+05]])

Output is an Numpy array. The array should be converted into a Pandas DataFrame

In [13]:
# Converting the numpy array to a pandas dataframe
train_tr = pd.DataFrame(X, columns=train_num.columns, index=train.index)
train_tr.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,103000.0
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,382100.0
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,172600.0
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,93400.0
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,96500.0


In [14]:
# Use the SimpleImputer method to impute in the missing values in the total_bedrooms columns with the median value in the column
imputer = SimpleImputer(strategy="median")

# Create a copy of the data without the text attribute ocean_proximity
test_num = test.drop('ocean_proximity', axis=1)
test_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4128 entries, 20046 to 3665
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           4128 non-null   float64
 1   latitude            4128 non-null   float64
 2   housing_median_age  4128 non-null   float64
 3   total_rooms         4128 non-null   float64
 4   total_bedrooms      3921 non-null   float64
 5   population          4128 non-null   float64
 6   households          4128 non-null   float64
 7   median_income       4128 non-null   float64
 8   median_house_value  4128 non-null   float64
dtypes: float64(9)
memory usage: 322.5 KB


In [15]:
# fit the imputer instance to the training data using the fit() method
imputer.fit(test_num)

SimpleImputer(strategy='median')

In [16]:
imputer.statistics_

array([-1.1847e+02,  3.4230e+01,  2.9000e+01,  2.1100e+03,  4.2800e+02,
        1.1600e+03,  4.0600e+02,  3.5000e+00,  1.7865e+05])

In [17]:
# Validate the mean to confirm if the imputer has correctly calculated the median using Python's .median() function
test_num.median().values

array([-1.1847e+02,  3.4230e+01,  2.9000e+01,  2.1100e+03,  4.2800e+02,
        1.1600e+03,  4.0600e+02,  3.5000e+00,  1.7865e+05])

In [18]:
# Use the test-data imputer to transform the test data set by replacing the missing values by the learned values.
y = imputer.transform(test_num)
y

array([[-1.19010e+02,  3.60600e+01,  2.50000e+01, ...,  3.59000e+02,
         1.68120e+00,  4.77000e+04],
       [-1.19460e+02,  3.51400e+01,  3.00000e+01, ...,  5.84000e+02,
         2.53130e+00,  4.58000e+04],
       [-1.22440e+02,  3.78000e+01,  5.20000e+01, ...,  9.63000e+02,
         3.48010e+00,  5.00001e+05],
       ...,
       [-1.22050e+02,  3.73100e+01,  2.50000e+01, ...,  5.68000e+02,
         9.22980e+00,  5.00001e+05],
       [-1.19760e+02,  3.67700e+01,  3.60000e+01, ...,  4.74000e+02,
         2.78500e+00,  7.23000e+04],
       [-1.18370e+02,  3.42200e+01,  1.70000e+01, ...,  4.48000e+02,
         3.55210e+00,  1.51500e+05]])

Output is an Numpy array. The array should be converted into a Pandas DataFrame

In [19]:
# Converting the numpy array to a pandas dataframe
test_tr = pd.DataFrame(y, columns=test_num.columns, index=test.index)
test_tr.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
20046,-119.01,36.06,25.0,1505.0,428.0,1392.0,359.0,1.6812,47700.0
3024,-119.46,35.14,30.0,2943.0,428.0,1565.0,584.0,2.5313,45800.0
15663,-122.44,37.8,52.0,3830.0,428.0,1310.0,963.0,3.4801,500001.0
20484,-118.72,34.28,17.0,3051.0,428.0,1705.0,495.0,5.7376,218600.0
9814,-121.93,36.62,34.0,2351.0,428.0,1063.0,428.0,3.725,278000.0


## 1.3 Feature Engineering


In [20]:
# Creating new features or transforming existing features to better represent the underlying patterns in the data.

train_tr['rooms_per_household'] = train_tr['total_rooms'] / train_tr['households']
train_tr['bedrooms_per_room'] = train_tr['total_bedrooms'] / train_tr['total_rooms']
train_tr['population_per_household'] = train_tr['population'] / train_tr['households']

train_tr.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,103000.0,5.017657,0.200576,3.691814
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,382100.0,4.473545,0.232703,1.738095
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,172600.0,5.645833,0.174486,2.723214
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,93400.0,4.002817,0.258269,3.994366
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,96500.0,6.268421,0.18094,2.3


In [21]:
test_tr['rooms_per_household'] = test_tr['total_rooms'] / test_tr['households']
test_tr['bedrooms_per_room'] = test_tr['total_bedrooms'] / test_tr['total_rooms']
test_tr['population_per_household'] = test_tr['population'] / test_tr['households']

test_tr.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household
20046,-119.01,36.06,25.0,1505.0,428.0,1392.0,359.0,1.6812,47700.0,4.192201,0.284385,3.877437
3024,-119.46,35.14,30.0,2943.0,428.0,1565.0,584.0,2.5313,45800.0,5.039384,0.14543,2.679795
15663,-122.44,37.8,52.0,3830.0,428.0,1310.0,963.0,3.4801,500001.0,3.977155,0.111749,1.360332
20484,-118.72,34.28,17.0,3051.0,428.0,1705.0,495.0,5.7376,218600.0,6.163636,0.140282,3.444444
9814,-121.93,36.62,34.0,2351.0,428.0,1063.0,428.0,3.725,278000.0,5.492991,0.18205,2.483645


## 1.4 Feature Scaling

In [22]:
# Train Data
num_pipeline = Pipeline([('imputer',SimpleImputer(strategy="median")),('std_scaler', StandardScaler())])
num_attribs = list(train_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([("num", num_pipeline, num_attribs), ("cat", OneHotEncoder(), cat_attribs)])

train_prepared = full_pipeline.fit_transform(train)
train_prepared.shape

(16512, 14)

In [23]:
# Test Data
num_pipeline = Pipeline([('imputer',SimpleImputer(strategy="median")),('std_scaler', StandardScaler())])
num_attribs = list(test_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([("num", num_pipeline, num_attribs), ("cat", OneHotEncoder(), cat_attribs)])

test_prepared = full_pipeline.fit_transform(test)
test_prepared.shape

(4128, 14)

# 2. Model Training

## Train a Stochastic Gradient Descent model

In [24]:
train_labels = train['median_house_value'].copy()
train_tr = train.drop('median_house_value', axis=1)
y_train = train_labels.values

# Create an instance of SGDRegressor
sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None, eta0=0.1)

# Train the model using cross-validation
scores = cross_val_score(sgd_reg, train_prepared, y_train, cv=10, scoring='neg_mean_squared_error')
mse_scores = -scores

# define a function to display scores
def display_scores(mse_scores):
    print("SGD Regression Scores:", mse_scores)
    print("Mean Squared Error:", mse_scores.mean())
    print("Standard Deviation:", mse_scores.std())
    
# call the function to display the scores
display_scores(mse_scores)

SGD Regression Scores: [3.71712155e-04 4.04999563e-01 2.49781544e-04 3.05079168e-04
 4.61622551e-01 6.67526236e-04 4.51205062e-01 2.05939636e-04
 4.43412209e-01 2.10651099e-04]
Mean Squared Error: 0.17632500740773177
Standard Deviation: 0.2159667019838157


# 3. Model Tuning:

In [None]:
# Create a SGDRegressor object
model = SGDRegressor()

# Define the hyperparameter space
hyperparameters = [{'max_iter': [1000, 5000, 10000],'alpha': loguniform(1e-5, 1e-1),'penalty': ['l1', 'l2', 'elasticnet'],
                    'l1_ratio': [0, 0.25, 0.5, 0.75, 1],'epsilon': loguniform(1e-5, 1e-1),
                    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],'eta0': loguniform(1e-5, 1e-1)}]

# Define the randomized search with 50 iterations
random_search = RandomizedSearchCV(model,param_distributions=hyperparameters,n_iter=15,cv=5,random_state=42,n_jobs=-1,
                                   verbose=2)

# Fit the randomized search on the training data
random_search.fit(train_prepared, y_train)

# Print the best hyperparameters
print(random_search.best_params_)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


# 4. Model Testing

## 4.1 Load Model

In [None]:
# Preprocess the test set
test_labels = test['median_house_value'].copy()
test_tr = test.drop('median_house_value', axis=1)
y_test = test_labels.values

# Get the best model from the randomized search
best_model = random_search.best_estimator_
print("Best model:", best_model)

## 4.2 Predict Values

In [None]:
model = SGDRegressor()
model.fit(test_prepared, test_labels)

# make predictions
predicted_values = model.predict(test_prepared)
print(predicted_values)

## 4.3 Evaluate Model

In [None]:
# Evaluate the model's performance on the test set using mean squared error
mse = mean_squared_error(test_labels, predicted_values)
rmse = np.sqrt(mse)

In [None]:
# Evaluate the model on the test set
mse = mean_squared_error(test_labels, predicted_values)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

# Evaluate the model using cross-validation
scores = cross_val_score(best_model, train_prepared, y_train, cv=10, scoring='neg_mean_squared_error')
mse_scores = -scores

# Display the scores
display_scores(mse_scores)

In [None]:
# Save model 
filename = 'final_sgd_model.sav'
pickle.dump(random_search.best_estimator_, open(filename, 'wb'))

# 5. Compare Models 


In [None]:
# Random Forest Model scores
rf_scores = [54424.62066717, 52606.09659754, 57844.11882739, 61522.32536088, 55079.21718315,
             52165.92018865, 51849.35995233, 57685.67347045, 57556.56433099, 54276.83688292]

# Create a DataFrame for Random Forest Model scores
rf_df = pd.DataFrame({'Scores': rf_scores})

# Calculate mean and standard deviation for Random Forest Model
rf_mean = rf_df['Scores'].mean()
rf_std = rf_df['Scores'].std()

# SGD Model scores
sgd_scores = [5.21297539e-08, 6.41685040e-04, 1.71454845e-07, 1.42933064e-07, 6.50503352e-04,
              1.50625734e-07, 3.23236670e-04, 1.72269016e-07, 4.25905868e-04, 1.63125998e-07]

# Create a DataFrame for SGD Model scores
sgd_df = pd.DataFrame({'SGD Regression Scores': sgd_scores})

# Calculate mean and standard deviation for SGD Model
sgd_mean = sgd_df['SGD Regression Scores'].mean()
sgd_std = sgd_df['SGD Regression Scores'].std()

# Print the comparison results
print("Random Forest Model scores:")
print(rf_df)
print("Mean:", rf_mean)
print("Standard deviation:", rf_std)
print()
print('==='*15)

print("SGD Model Scores:")
print(sgd_df)
print("Mean Squared Error:", sgd_mean)
print("Standard Deviation:", sgd_std)

A comparison of the the results from the Stochastic Gradient Descent (SGD) model and Random Forest model shows that the Stochastic Gradient Descent model performed better with an RMSE of 383.868 compared to the Random Forest model with a mean RMSE of 55501.073. The SGD model had a much lower error score, which indicates that it performed better in making predictions.

Other models that we can test, include:

- Decision Tree Regressor
- K-Nearest Neighbors Regressor
- Artificial Neural Networks (ANN)
- Ensemble Methods such as AdaBoost, XGBoost, and LightGBM.