<a href="https://colab.research.google.com/github/riya1606/House_Price_Prediction/blob/main/HousingDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A House Price Valuation System using Gradient Boosting

In [51]:
# Set up the development environment by downloading all the libraries necessary
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
import joblib

In [52]:
# Import the dataset - in this case we are using a dataset from kaggle.com
df=pd.read_csv("Melbourne_housing_FULL.csv")

In [53]:
# Preview all the columns and its type in the dataframe.
df.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

In [54]:
# Preview the data to understand about it.
df.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [55]:
# Scrubbing the Data: Remove the columns that will not be used.
del df['Address']
del df['Method']
del df['SellerG']
del df['Date']
del df['Postcode']
del df['Lattitude']
del df['Longtitude']
del df['Regionname']
del df['Propertycount']

In [56]:
df.dtypes

Suburb           object
Rooms             int64
Type             object
Price           float64
Distance        float64
Bedroom2        float64
Bathroom        float64
Car             float64
Landsize        float64
BuildingArea    float64
YearBuilt       float64
CouncilArea      object
dtype: object

In [57]:
# decision trees (including gradient boosting and random forests) are adept at managing large and highdimensional datasets with a high number of variables.

# Scrubbing the Data: Delete the rows with missing values.
df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)


In [58]:
# Scrubbing the Data: Convert columns that contain non-numeric data to numeric values using one-hot encoding.
# With Pandas, one-hot encoding can be performed using the get_dummies function.
features_df = pd.get_dummies(df, columns=['Suburb', 'CouncilArea', 'Type'])

In [59]:
# Delete the dependent variable (Price in this case).
del features_df['Price']

In [60]:
# Scrubbing the Data: Create X and Y arrays from dataset using matrix data type. X (independent variables) and Y (dependent variables).
X = features_df.to_numpy()
y = df['Price'].to_numpy()

In [61]:
# Split the dataset into 70:30 (Training and Testing)
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.3,random_state=0)

In [62]:
# Select the appropriate algorithm and configure its parameters.
model = ensemble.GradientBoostingRegressor(
n_estimators=250,
learning_rate=0.1,
max_depth=5,
min_samples_split=4,
min_samples_leaf=6,
max_features=0.6,
loss='huber'
)

In [63]:
# Fit your model on the training data
model.fit(X_train,y_train)

GradientBoostingRegressor(loss='huber', max_depth=5, max_features=0.6,
                          min_samples_leaf=6, min_samples_split=4,
                          n_estimators=250)

In [64]:
# Save your model
joblib.dump(model, 'house_trained_model.pkl')

['house_trained_model.pkl']

In [65]:
# Evaluate the accuracy of the model on training data.
mse = mean_absolute_error(y_train, model.predict(X_train))
print ("Training Set Mean Absolute Error: %.2f" % mse)

Training Set Mean Absolute Error: 125308.42


In [66]:
# Evaluate the accuracy on test data.
mse = mean_absolute_error(y_test, model.predict(X_test))
print ("Test Set Mean Absolute Error: %.2f" % mse)

Test Set Mean Absolute Error: 160119.92


In [67]:
# If your model does not give accurate data, start by modifying the hyperparameters of the model or you can also use grid search.