## House Rent Prediction using Random Forest Regressor

In [1]:
# Importing libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Run this code to fetch data from MongoDB
import pymongo
import pymongoarrow, pymongoarrow.api

client = pymongo.MongoClient('mongodb://127.0.0.1:27017/')
mydb = client['Project']
collection = mydb.HouseRent

df = pymongoarrow.api.find_pandas_all(collection, {})
df = df.drop('_id',axis=1)

In [3]:
# Run this code to fetch data from local system
#df=pd.read_csv("../Dataset/ProcessedFile.csv")

In [4]:
df.head()

Unnamed: 0,size_sq_ft,propertyType,bedrooms,latitude,longitude,localityName,suburbName,companyName,closest_metro_station_km,AP_dist_km,Aiims_dist_km,NDRLW_dist_km,price
0,400,Independent Floor,1,28.64101,77.284386,Swasthya Vihar,East Delhi,Other,0.577495,21.741188,11.119239,6.227231,9000
1,1050,Apartment,2,28.594969,77.298668,mayur vihar phase 1,East Delhi,Other,0.417142,21.401856,9.419061,9.217502,20000
2,2250,Independent Floor,2,28.641806,77.293922,Swasthya Vihar,East Delhi,Other,0.125136,22.620365,11.829486,7.159184,28000
3,1350,Independent Floor,2,28.644363,77.293228,Krishna Nagar,East Delhi,Other,0.371709,22.681201,11.982708,7.097348,28000
4,450,Apartment,2,28.594736,77.31115,New Ashok Nagar,East Delhi,Other,1.08776,22.59281,10.571573,10.263271,12500


In [5]:
X = df.drop(['price','localityName'], axis=1)
y = df['price']

In [6]:
# Mapping PropertyTypes
X['propertyType'] = X['propertyType'].map({'Independent Floor':1,
                                           'Apartment':2,
                                           'Independent House':3,
                                           'Villa':4})

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.15, 
                                                    random_state=2023,
                                                    stratify=X['propertyType']) 

In [8]:
X_train['propertyType'] = X_train['propertyType'].astype(str)
X_test['propertyType'] = X_test['propertyType'].astype(str)

In [9]:
# Using Once Hot Encoding
ohc = OneHotEncoder(handle_unknown='ignore')
ct = make_column_transformer((ohc,
       make_column_selector(dtype_include=object)),
       ("passthrough",
        make_column_selector(dtype_include=['int32','int64','float64'])))
X_transf_trn = ct.fit_transform(X_train).toarray()
X_transf_trn = pd.DataFrame(X_transf_trn, columns=ct.get_feature_names_out())

In [10]:
print(ct.get_feature_names_out())

['onehotencoder__propertyType_1' 'onehotencoder__propertyType_2'
 'onehotencoder__propertyType_3' 'onehotencoder__propertyType_4'
 'onehotencoder__suburbName_Central Delhi'
 'onehotencoder__suburbName_East Delhi'
 'onehotencoder__suburbName_North Delhi'
 'onehotencoder__suburbName_North West Delhi'
 'onehotencoder__suburbName_Other' 'onehotencoder__suburbName_South Delhi'
 'onehotencoder__suburbName_South West Delhi'
 'onehotencoder__suburbName_West Delhi'
 'onehotencoder__companyName_A R Realtors'
 'onehotencoder__companyName_AB Estate'
 'onehotencoder__companyName_AMIT CHHABRA'
 'onehotencoder__companyName_AMPM Realtors'
 'onehotencoder__companyName_AVS Realtors'
 'onehotencoder__companyName_Abhishek yadav'
 'onehotencoder__companyName_Ahuja properties'
 'onehotencoder__companyName_Angel Property Services'
 'onehotencoder__companyName_Arjun Raj'
 'onehotencoder__companyName_Ashish Bansal'
 'onehotencoder__companyName_Ashish Talwar'
 'onehotencoder__companyName_Atul Kumar'
 'onehotenc

In [11]:
# Implementing Random Forest Regressor using KFolds and Grid Search
rf=RandomForestRegressor(random_state=2023, oob_score=True, n_jobs=-1)
params={'max_features':[5,10,15,20,30,35],
        'n_estimators':[25,50,75,100,125]}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023)
gcv = GridSearchCV(rf, param_grid=params, cv=kfold, scoring='r2',n_jobs=-1)
gcv.fit(X_transf_trn, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2023, shuffle=True),
             estimator=RandomForestRegressor(n_jobs=-1, oob_score=True,
                                             random_state=2023),
             n_jobs=-1,
             param_grid={'max_features': [5, 10, 15, 20, 30, 35],
                         'n_estimators': [25, 50, 75, 100, 125]},
             scoring='r2')

In [12]:
# Printing best R2 score of train data
print(gcv.best_score_)

0.7661411869023725


In [13]:
# Printing best performing parameters
print(gcv.best_params_)

{'max_features': 30, 'n_estimators': 100}


In [14]:
# Getting best performing model from GCV
best_model = gcv.best_estimator_

X_transf_tst = ct.transform(X_test).toarray()

#Calculating Predictions on Test Data
y_pred = best_model.predict(X_transf_tst)
y_pred

array([15529.16666667, 56406.66666667, 27028.33333333, ...,
       14226.50052725, 40970.        , 19173.277     ])

In [15]:
# Printing R2 Score on Test Data
print(r2_score(y_test, y_pred))

0.9017881980216116


#### Calculating Prediction Interval

In [16]:
#Residual Error
resid = y_train - best_model.oob_prediction_

In [17]:
# Calculating Lower quantile and Upper quantile
lowq = resid.quantile(0.25)
higq = resid.quantile(0.75)

In [18]:
test_y = best_model.predict(X_transf_tst)
lowt = (test_y + lowq).clip(1000) #cant have negative numbers
higt = (test_y + higq)

In [19]:
# Lower Bound
lowt

array([12414.02998776, 53291.52998776, 23913.19665443, ...,
       11111.36384834, 37854.86332109, 16058.14032109])

In [20]:
# Upper Bound
higt

array([17667.18675842, 58544.68675842, 29166.35342509, ...,
       16364.52061901, 43108.02009176, 21311.29709176])

In [21]:
import io
import pickle

In [22]:
# Creating Pickle Files

pickle.dump(best_model,open("../Pickle_Files/best_model_final.pkl","wb"))
pickle.dump(df,open("../Pickle_Files/dataFrame_final.pkl","wb"))
pickle.dump(ct,open("../Pickle_Files/column_transformer.pkl","wb"))
pickle.dump(y_train,open("../Pickle_Files/y_train.pkl","wb"))