# Bengaluru House Price Prediction - Supervised Regression Problem

## Model Building

### Split the dataset into train and test sets

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the prepared dataset from the working directory. The displayed frame
# includes numeric columns plus 0/1 indicator columns for area type,
# availability and location.
df = pd.read_csv(filepath_or_buffer="ohe_data_reduce_cat_class.csv")
df

Unnamed: 0,bath,balcony,price,total_sqft_int,bhk,price_per_sqft,area_typeSuper built-up Area,area_typeBuilt-up Area,area_typePlot Area,availability_ready_to_move,...,location_Kalena Agrahara,location_Horamavu Agara,location_Vidyaranyapura,location_Hosur Road,location_Hebbal Kempapura,location_BTM 2nd Stage,location_Domlur,location_Horamavu Banaswadi,location_Tumkur Road,location_Mahadevpura
0,3.0,2.0,150.00,1672.0,3,8971.291866,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,3.0,3.0,149.00,1750.0,3,8514.285714,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,3.0,2.0,150.00,1750.0,3,8571.428571,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,2.0,2.0,40.00,1250.0,2,3200.000000,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,2.0,2.0,83.00,1200.0,2,6916.666667,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7115,3.0,2.0,325.00,2900.0,3,11206.896552,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7116,3.0,1.0,84.83,1780.0,3,4765.730337,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7117,2.0,1.0,48.00,880.0,2,5454.545455,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7118,2.0,1.0,55.00,1000.0,2,5500.000000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# The target is the `price` column; every other column is used as a feature.
# NOTE(review): `Unnamed: 0` and `price_per_sqft` stay in X. The former looks
# like a saved row index, and the latter appears to be computed from `price`
# (e.g. price 150.00 with 1672 sqft -> 8971.29), which would leak the target
# into the features -- confirm, and consider dropping both before training.
y = df['price']
X = df.drop(columns="price")
print("Shape of X = ", X.shape)
print("Shape of y = ", y.shape)

Shape of X =  (7120, 107)
Shape of y =  (7120,)


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=51
)

# Report the shape of each of the four resulting pieces.
for _label, _part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {_label} = ", _part.shape)

Shape of X_train =  (5696, 107)
Shape of X_test =  (1424, 107)
Shape of y_train =  (5696,)
Shape of y_test =  (1424,)


### Feature Scaling

In [5]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits. X_train and X_test are overwritten by their
# scaled versions, so re-running this cell scales already-scaled data.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Machine Learning Model Training

### Linear regression

In [6]:
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error

# One estimator per linear variant, each created with default arguments.
# NOTE(review): lr_ridge is not fitted or scored in the cells shown here --
# confirm whether Ridge was meant to be part of the comparison.
lr, lr_lasso, lr_ridge = LinearRegression(), Lasso(), Ridge()

In [7]:
def rmse(y_test, y_pred):
    """Return the root mean squared error between true and predicted values.

    Parameters
    ----------
    y_test : true target values.
    y_pred : predicted target values, aligned with ``y_test``.

    Returns
    -------
    The square root of ``mean_squared_error(y_test, y_pred)``.
    """
    mse = mean_squared_error(y_test, y_pred)
    return np.sqrt(mse)

In [8]:
# Fit the plain linear model on the training split, then measure its
# test-set RMSE and score.
lr.fit(X=X_train, y=y_train)
lr_rmse = rmse(y_test, lr.predict(X=X_test))
lr_score = lr.score(X=X_test, y=y_test)
(lr_score, lr_rmse)

(0.7918162437298752, 64.67629458509695)

In [9]:
# Lasso: same fit/evaluate steps as the plain linear model above.
lr_lasso.fit(X=X_train, y=y_train)
lr_lasso_rmse = rmse(y_test, lr_lasso.predict(X=X_test))
lr_lasso_score = lr_lasso.score(X=X_test, y=y_test)
(lr_lasso_score, lr_lasso_rmse)

(0.8038760090599336, 62.77505059562723)

### Support Vector Machine

In [10]:
from sklearn.svm import SVR

# Support vector regressor created with default arguments; fit() returns the
# estimator itself, so it can be created and trained in one statement.
svr = SVR().fit(X_train, y_train)
svr_rmse = rmse(y_test, svr.predict(X_test))
svr_score = svr.score(X_test, y_test)
(svr_score, svr_rmse)

(0.2161021361572849, 125.50223235155066)

### Random Forest Regressor

In [11]:
from sklearn.ensemble import RandomForestRegressor

# A random forest is a randomized learner: without a seed, every run trains a
# different forest, so the metrics here, the cross-validation scores and the
# model saved at the end of the notebook would not be reproducible. The seed
# matches the one used for train_test_split.
rfr = RandomForestRegressor(random_state=51)
rfr.fit(X_train, y_train)

# Evaluate on the held-out test split.
rfr_score = rfr.score(X_test, y_test)
rfr_rmse = rmse(y_test, rfr.predict(X_test))
rfr_score, rfr_rmse

(0.8881319390068746, 47.41052512077909)

### XGBoost

In [14]:
import xgboost

# Gradient-boosted trees created with default arguments, trained on the
# training split and evaluated on the test split.
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)
xgb_reg_rmse = rmse(y_test, xgb_reg.predict(X_test))
xgb_reg_score = xgb_reg.score(X_test, y_test)
(xgb_reg_score, xgb_reg_rmse)

(0.8742964229428543, 50.256880560528465)

In [15]:
# Collect every model's test-set score and RMSE into one comparison table,
# built column by column with one entry per model.
print(pd.DataFrame(
    {
        'Model': ['Linear Regression', 'Lasso', 'Support Vector Machine',
                  'Random Forest', 'XGBoost'],
        'Score': [lr_score, lr_lasso_score, svr_score, rfr_score, xgb_reg_score],
        'RMSE': [lr_rmse, lr_lasso_rmse, svr_rmse, rfr_rmse, xgb_reg_rmse],
    },
    columns=['Model', 'Score', 'RMSE'],
))

                    Model     Score        RMSE
0       Linear Regression  0.791816   64.676295
1                   Lasso  0.803876   62.775051
2  Support Vector Machine  0.216102  125.502232
3           Random Forest  0.888132   47.410525
4                 XGBoost  0.874296   50.256881


### Cross Validation

In [16]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold cross-validation of the XGBoost regressor on the training split.
# The cell displays the per-fold scores together with their mean.
cvs = cross_val_score(estimator=xgb_reg, X=X_train, y=y_train, cv=10)
(cvs, cvs.mean())

(array([0.98490739, 0.97045611, 0.99629293, 0.97237394, 0.96982053,
        0.98805319, 0.971068  , 0.88372362, 0.98899998, 0.91387018]),
 0.9639565854902301)

In [17]:
# 10-fold cross-validation of the random forest on the training split.
# The cell displays the per-fold scores together with their mean.
cvs_rfr = cross_val_score(estimator=rfr, X=X_train, y=y_train, cv=10)
(cvs_rfr, cvs_rfr.mean())

(array([0.99459353, 0.96291064, 0.99754348, 0.94857855, 0.96476818,
        0.90289066, 0.92460034, 0.91070892, 0.99638751, 0.98672899]),
 0.9589710799016272)

### Random Forest has the highest test score (R²) and the lowest test RMSE, and the gap between its cross-validation score and its test score is also smaller than XGBoost's, so it is the model saved below

### Save model and Load model

In [18]:
import joblib

# Save model
# The forest was trained on features standardized by `sc`, so the fitted
# scaler is saved as well: without it, new inputs cannot be scaled the same
# way before being passed to the loaded model.
joblib.dump(sc, "Bengaluru_House_Price_Prediction_scaler.pkl")
# Kept as the last expression so the cell still displays the model file name.
joblib.dump(rfr, "Bengaluru_House_Price_Prediction_rfr_model.pkl")

['Bengaluru_House_Price_Prediction_rfr_model.pkl']

In [19]:
# Load model
# Reads back the file written by the save cell above. joblib files are
# pickle-based, so only load files that come from a trusted source.
bengaluru_house_price_prediction_model = joblib.load(
    filename="Bengaluru_House_Price_Prediction_rfr_model.pkl"
)