# Introduction

This notebook is an end-to-end data science process for predicting house price. This uses the Kaggle House Prices dataseet.

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/home-data-for-ml-course/sample_submission.csv
/kaggle/input/home-data-for-ml-course/sample_submission.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv.gz
/kaggle/input/home-data-for-ml-course/data_description.txt
/kaggle/input/home-data-for-ml-course/test.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv
/kaggle/input/home-data-for-ml-course/test.csv


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import SGDRegressor
import matplotlib.pyplot as plt


In [3]:
# Read in the data 
df = pd.read_csv('/kaggle/input/home-data-for-ml-course/train.csv',index_col = 'Id')

# Exploratory Data Analysis

In [4]:
df.describe()

# The dataframe has 1460 rows and 80 columns, with SalePrice being the target variable (supervised learning).

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [5]:
df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
df.shape

# 1460 rows, 80 columns

(1460, 80)

In [7]:
# Create a function to find the most correlated features
# Finds 10 most correlated features to the target column
def find_most_correlated_features(df, target_col, num_features=10):
      corr_with_target = df.corr()[target_col].abs().sort_values(ascending=False)
      most_correlated = corr_with_target.drop(target_col).head(num_features)
      return most_correlated

In [8]:
# Function to get only numerical features from the df.
def isolate_numerical_features(df):
  numerical_features = df.select_dtypes(include=['number'])
  return numerical_features

In [9]:
def isolate_categorical_features(df):
    categorical_features = df.select_dtypes(include=['object'])
    return categorical_features

In [10]:
df_numerical  = isolate_numerical_features(df)
df_numerical.shape

(1460, 37)

In [11]:
df_categorical = isolate_categorical_features(df)
df_categorical.shape

(1460, 43)

In [12]:
find_most_correlated_features(df_numerical,'SalePrice',num_features=20)

OverallQual     0.790982
GrLivArea       0.708624
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
1stFlrSF        0.605852
FullBath        0.560664
TotRmsAbvGrd    0.533723
YearBuilt       0.522897
YearRemodAdd    0.507101
GarageYrBlt     0.486362
MasVnrArea      0.477493
Fireplaces      0.466929
BsmtFinSF1      0.386420
LotFrontage     0.351799
WoodDeckSF      0.324413
2ndFlrSF        0.319334
OpenPorchSF     0.315856
HalfBath        0.284108
LotArea         0.263843
Name: SalePrice, dtype: float64

# Build model

In [13]:
# Building a decision tree
from sklearn.tree import DecisionTreeRegressor

house_model = DecisionTreeRegressor(random_state =1)

In [14]:
X = df[['OverallQual','GrLivArea','GarageCars','GarageArea','TotalBsmtSF','1stFlrSF','FullBath','TotRmsAbvGrd','YearBuilt','YearRemodAdd','GarageYrBlt','MasVnrArea','Fireplaces','BsmtFinSF1','LotFrontage','WoodDeckSF','2ndFlrSF','OpenPorchSF','HalfBath','LotArea']]
y = df['SalePrice']

In [15]:
# Combine X and y for easier row-wise dropping
data = pd.concat([X, y], axis=1)

# Drop rows with any NaN values
data_cleaned = data.dropna()

In [16]:
data_cleaned.head()

Unnamed: 0_level_0,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,FullBath,TotRmsAbvGrd,YearBuilt,YearRemodAdd,...,MasVnrArea,Fireplaces,BsmtFinSF1,LotFrontage,WoodDeckSF,2ndFlrSF,OpenPorchSF,HalfBath,LotArea,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7,1710,2,548,856,856,2,8,2003,2003,...,196.0,0,706,65.0,0,854,61,1,8450,208500
2,6,1262,2,460,1262,1262,2,6,1976,1976,...,0.0,1,978,80.0,298,0,0,0,9600,181500
3,7,1786,2,608,920,920,2,6,2001,2002,...,162.0,1,486,68.0,0,866,42,1,11250,223500
4,7,1717,3,642,756,961,1,7,1915,1970,...,0.0,1,216,60.0,0,756,35,0,9550,140000
5,8,2198,3,836,1145,1145,2,9,2000,2000,...,350.0,1,655,84.0,192,1053,84,1,14260,250000


In [17]:
X_cleaned = data_cleaned.iloc[:, :-1]  # Assuming the target is the last column
y_cleaned = data_cleaned.iloc[:, -1] # Target column is SalePrice.

In [18]:
# Split the cleaned data into train, test, cross-validation sets

#X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y_cleaned, test_size=0.2, random_state=42)

# Get 60% of the dataset as the training set
X_train, X_, y_train, y_ = train_test_split(X_cleaned, y_cleaned, test_size = 0.4, random_state =42)
# Random state ensures that the split is the same every time you run the code, for reproducibility.

# Split the 40% subset into 20% for cross validation and 20% for test set 
X_cv, X_test, y_cv, y_test = train_test_split(X_, y_, test_size =0.5, random_state=42)

# Delete temporary variables
del X_, y_

print(f"the shape of the training set (input) is: {X_train.shape}")
print(f"the shape of the training set (target) is: {y_train.shape}\n")
print(f"the shape of the cross validation set (input) is: {X_cv.shape}")
print(f"the shape of the cross validation set (target) is: {y_cv.shape}\n")
print(f"the shape of the test set (input) is: {X_test.shape}")
print(f"the shape of the test set (target) is: {y_test.shape}")

the shape of the training set (input) is: (672, 20)
the shape of the training set (target) is: (672,)

the shape of the cross validation set (input) is: (224, 20)
the shape of the cross validation set (target) is: (224,)

the shape of the test set (input) is: (225, 20)
the shape of the test set (target) is: (225,)


In [19]:
house_model.fit(X_train, y_train)

In [20]:
# Make predictions
predictions = house_model.predict(X_test)

In [21]:
# Evaluate the model: MAE, MSE, RMSE
MAE = mean_absolute_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(MSE)

print(MAE)
# With an MAE of 22,868, this means that on average my model's predictions are off by £22,688. Avg house price in dataset is £180,921. On average 12.5% error. 
print(MSE)
print(RMSE)
# RMSE of first iteration of the model is £33,189. RMSE penalises larger errors more heavily than MAE.

30277.933333333334
2504992869.3377776
50049.903789495715


In [22]:
# Improving the model through hyperparameter tuning
# Using GridSearchCV

from sklearn.model_selection import GridSearchCV

In [23]:
# Create a 2nd model that will use for hyperparameter training
# This is the model that will be tuned.

house_model_v2 = DecisionTreeRegressor(random_state=42)

In [24]:
# Parameters that we will vary with GridSearch.
# Parameter grid. It's a dictionary that specifies the hyperparemeters that will be tuned.

# Max_depth is the maximum depth of the tree.
# Min_samples_splt is the min number of samples required to split an internal node.
# Min_samples_leaf is the min number of samples required to be at a leaf node.
# Max_features is the number of features to consider when looking for the best split.

param_grid = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 3, 5, 10],
    'max_features': ['auto', 'sqrt', 'log2', None]  
}

In [25]:
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error
# Have to choose a scoring metric to evaluate the performance of each hyperparameter combination.

# Using negative mean squared error as the scoring metric
scoring_metric = 'neg_mean_squared_error'

# Alternatively, to minimize MAE:
# scoring_metric = 'neg_mean_absolute_error'

# Or to maximize R-squared:
# scoring_metric = 'r2'

In [26]:
grid_search = GridSearchCV(estimator=house_model_v2,
                           param_grid=param_grid,
                           scoring=scoring_metric,
                           cv=5, # The number of cross-validation folds. Common values is 5 or 10.
                           n_jobs=-1, # uses all CPU cores
                           verbose=2) # Controls the amount of output during the search.

In [27]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 320 candidates, totalling 1600 fits




In [28]:
print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10}


In [29]:
print("Best Score:", grid_search.best_score_)

Best Score: -1770726204.4919262


In [30]:
best_model = grid_search.best_estimator_

In [31]:
predictions = best_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print("Performance on Test Set (Best Model):")
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

Performance on Test Set (Best Model):
Mean Squared Error: 2157101966.741212
Mean Absolute Error: 28862.319156966492
R-squared: 0.7170942124981097


In [32]:
print(np.sqrt(mse))

46444.6118160246


In [33]:
# With GridSearch, I've gotten my MSE down to £46,444 from £50k and my MAE is now £28k down from £30k.