In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file under the Kaggle input directory, one full path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/home-data-for-ml-course/sample_submission.csv
/kaggle/input/home-data-for-ml-course/sample_submission.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv.gz
/kaggle/input/home-data-for-ml-course/data_description.txt
/kaggle/input/home-data-for-ml-course/test.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv
/kaggle/input/home-data-for-ml-course/test.csv


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import SGDRegressor
import matplotlib.pyplot as plt


In [3]:
# Load the training data; the 'Id' column is used as the row index.
df = pd.read_csv(
    '/kaggle/input/home-data-for-ml-course/train.csv',
    index_col='Id',
)

In [4]:
## Exploratory Data Analysis
# The dataframe has 1460 rows and 80 columns; SalePrice is the target variable (supervised learning).
summary_stats = df.describe()
summary_stats

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [5]:
# Create a function to find the most correlated features

def find_most_correlated_features(df, target_col, num_features=10):
    """Rank features by the absolute value of their correlation with the target.

    Only numeric and boolean columns take part in the ranking. Filtering here
    (instead of relying on the caller) keeps the function usable on a
    mixed-dtype frame: since pandas 2.0, ``DataFrame.corr()`` no longer drops
    string columns silently and raises on them instead.

    Args:
        df: DataFrame holding the candidate features and the target column.
        target_col: Name of the numeric target column.
        num_features: Maximum number of features to return.

    Returns:
        A Series indexed by feature name containing the absolute correlation
        with ``target_col``, sorted from strongest to weakest. The target
        itself is excluded.

    Raises:
        KeyError: If ``target_col`` is missing from ``df`` or is not a
            numeric/boolean column.
    """
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    if target_col not in numeric_df.columns:
        raise KeyError(f"{target_col!r} is not a numeric column of the DataFrame")

    # Sign is discarded: a strong negative relationship is as informative as a positive one.
    corr_with_target = numeric_df.corr()[target_col].abs().sort_values(ascending=False)
    return corr_with_target.drop(target_col).head(num_features)

In [6]:
def isolate_numerical_features(df):
    """Return the subset of ``df`` made up of its numeric columns.

    Args:
        df: Source pandas DataFrame.

    Returns:
        A pandas DataFrame holding only the columns whose dtype is numeric.
    """
    return df.select_dtypes(include='number')

In [7]:
# Keep only the numeric columns, then show how many rows and columns remain.
df_numerical = isolate_numerical_features(df=df)
df_numerical.shape

(1460, 37)

In [8]:
# Rank the 20 numeric features most strongly correlated with SalePrice.
find_most_correlated_features(df_numerical, target_col='SalePrice', num_features=20)

OverallQual     0.790982
GrLivArea       0.708624
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
1stFlrSF        0.605852
FullBath        0.560664
TotRmsAbvGrd    0.533723
YearBuilt       0.522897
YearRemodAdd    0.507101
GarageYrBlt     0.486362
MasVnrArea      0.477493
Fireplaces      0.466929
BsmtFinSF1      0.386420
LotFrontage     0.351799
WoodDeckSF      0.324413
2ndFlrSF        0.319334
OpenPorchSF     0.315856
HalfBath        0.284108
LotArea         0.263843
Name: SalePrice, dtype: float64

In [9]:
##### Building model #####

In [10]:
# Baseline model: a decision tree regressor with a fixed random_state
from sklearn.tree import DecisionTreeRegressor

house_model = DecisionTreeRegressor(random_state=1)

In [11]:
# Model inputs: the 20 features from the correlation ranking, in the same order.
feature_columns = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
    '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd',
    'GarageYrBlt', 'MasVnrArea', 'Fireplaces', 'BsmtFinSF1', 'LotFrontage',
    'WoodDeckSF', '2ndFlrSF', 'OpenPorchSF', 'HalfBath', 'LotArea',
]
target_column = 'SalePrice'

X = df[feature_columns]
y = df[target_column]

In [12]:
# Put features and target side by side so incomplete rows are removed from both together
data = pd.concat(objs=[X, y], axis='columns')

# Keep only the rows in which every column has a value
data_cleaned = data.dropna(axis='index', how='any')

In [13]:
# Preview the first five complete rows.
data_cleaned.head(n=5)

Unnamed: 0_level_0,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,FullBath,TotRmsAbvGrd,YearBuilt,YearRemodAdd,...,MasVnrArea,Fireplaces,BsmtFinSF1,LotFrontage,WoodDeckSF,2ndFlrSF,OpenPorchSF,HalfBath,LotArea,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7,1710,2,548,856,856,2,8,2003,2003,...,196.0,0,706,65.0,0,854,61,1,8450,208500
2,6,1262,2,460,1262,1262,2,6,1976,1976,...,0.0,1,978,80.0,298,0,0,0,9600,181500
3,7,1786,2,608,920,920,2,6,2001,2002,...,162.0,1,486,68.0,0,866,42,1,11250,223500
4,7,1717,3,642,756,961,1,7,1915,1970,...,0.0,1,216,60.0,0,756,35,0,9550,140000
5,8,2198,3,836,1145,1145,2,9,2000,2000,...,350.0,1,655,84.0,192,1053,84,1,14260,250000


In [14]:
# Select by column name rather than by position, so the feature/target split
# stays correct even if the column order of data_cleaned ever changes.
X_cleaned = data_cleaned.drop(columns='SalePrice')
y_cleaned = data_cleaned['SalePrice']  # Target column is SalePrice.

In [15]:
# Split the cleaned data into train, cross-validation and test sets (60% / 20% / 20%)

# First keep 60% of the rows for training; the remaining 40% is held out.
# A fixed random_state makes the split identical on every run, for reproducibility.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_cleaned, y_cleaned, test_size=0.4, random_state=42
)

# Halve the held-out 40% into a cross-validation set and a test set (20% each)
X_cv, X_test, y_cv, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

# The intermediate hold-out objects are no longer needed
del X_holdout, y_holdout

split_summaries = [
    f"the shape of the {set_name} set (input) is: {inputs.shape}\n"
    f"the shape of the {set_name} set (target) is: {targets.shape}"
    for set_name, inputs, targets in (
        ("training", X_train, y_train),
        ("cross validation", X_cv, y_cv),
        ("test", X_test, y_test),
    )
]
# A blank line separates the report for each set
print("\n\n".join(split_summaries))

the shape of the training set (input) is: (672, 20)
the shape of the training set (target) is: (672,)

the shape of the cross validation set (input) is: (224, 20)
the shape of the cross validation set (target) is: (224,)

the shape of the test set (input) is: (225, 20)
the shape of the test set (target) is: (225,)


In [16]:
# Fit the baseline tree on the training split only.
house_model.fit(X=X_train, y=y_train)

In [17]:
# Predict sale prices for the rows of the test split
predictions = house_model.predict(X=X_test)

In [18]:
# Evaluate the model on the test split: MAE, MSE, RMSE
MAE = mean_absolute_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(MSE)

print(MAE)
# MAE is about 30,278: on average the model's predictions are off by that amount.
# The average house price in the dataset is 180,921, so the typical error is roughly 16.7% of the mean price.
# NOTE(review): an earlier version of this note quoted the figures in GBP; the currency of SalePrice is not shown in this notebook - confirm the unit.
print(MSE)
print(RMSE)
# RMSE of this first iteration of the model is about 50,050. RMSE penalises larger errors more heavily than MAE.

30277.933333333334
2504992869.3377776
50049.903789495715


In [19]:
# Improving the model through hyperparameter tuning
# Using GridSearchCV

from sklearn.model_selection import GridSearchCV

In [20]:
# Fresh, unfitted tree with random_state=42; presumably the base estimator for the GridSearchCV tuning - confirm against the cells that follow.
house_model_v2 = DecisionTreeRegressor(random_state=42)

In [21]:
# Hyperparameter search space for the decision tree.
# 'auto' is intentionally not listed under max_features: for DecisionTreeRegressor
# it meant "use all features" (the same as None), it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where every candidate using it fails to fit.
param_grid = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 3, 5, 10],
    'max_features': ['sqrt', 'log2', None],
}