In [31]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error , mean_squared_error , r2_score

import seaborn as sns
import matplotlib.pyplot as plt

import pickle
import json

### Data Gathering

In [2]:
# Load the housing dataset from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='Housing_V1.csv')
# Bare last expression so the notebook renders the frame as a table.
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420.0,4,2,3,1,0,0,0,1,2,1,2
1,12250000,8960.0,4,4,4,1,0,0,0,1,3,0,2
2,12250000,9960.0,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500.0,4,2,2,1,0,1,0,1,3,1,2
4,11410000,7420.0,4,1,2,1,1,1,0,1,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000.0,2,1,1,1,0,1,0,0,2,0,0
541,1767150,2400.0,3,1,1,0,0,0,0,0,0,0,1
542,1750000,3620.0,2,1,1,1,0,0,0,0,0,0,0
543,1750000,2910.0,3,1,1,0,0,0,0,0,0,0,2


### Model Training

In [3]:
# Target is the sale price; every remaining column is used as a predictor.
y = df['price']
x = df.drop(columns='price')

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=32,
)

In [5]:
# Baseline forest with library-default hyperparameters; n_jobs=-1 uses all
# available cores and the fixed seed makes the fit repeatable.
rf_reg = RandomForestRegressor(
    random_state=32,
    n_jobs=-1,
)

In [6]:
# Train the baseline forest on the training split; the fitted estimator is the
# cell's displayed value.
rf_reg.fit(X=x_train, y=y_train)

### Model Evaluation

In [7]:
# Training-set evaluation: predict on the rows the model was fitted on.
y_train_pred = rf_reg.predict(x_train)

# Compute the three regression metrics first, then report them together.
mae = mean_absolute_error(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
r2 = r2_score(y_train, y_train_pred)

print("Mean absolute error : ", mae)
print("Mean squared error : ", mse)
print("R2 Score : ", r2)

Mean absolute error :  324242.34403669724
Mean squared error :  207297585192.32053
R2 Score :  0.9410255934994507


In [8]:
# Test-set evaluation: predict on the held-out rows.
y_test_pred = rf_reg.predict(x_test)

# Same three metrics as for the training split, so the two are comparable.
mae = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
r2 = r2_score(y_true=y_test, y_pred=y_test_pred)

print("Mean absolute error : ", mae)
print("Mean squared error : ", mse)
print("R2 Score : ", r2)

Mean absolute error :  692615.6840978593
Mean squared error :  909897706274.6866
R2 Score :  0.7323108990993862


### Hyperparameter Tuning

In [12]:
# Fresh, unfitted forest to serve as the base estimator of the search.
rf_reg = RandomForestRegressor(random_state=32, n_jobs=-1)

# Search space: 2 criteria x 7 depths (3..9) x 15 split sizes (5..19)
# x 8 leaf sizes (2..9) = 1680 candidate combinations.
parameters = {
    "criterion": ['squared_error', 'absolute_error'],
    "max_depth": np.arange(3, 10),
    "min_samples_split": np.arange(5, 20),
    "min_samples_leaf": np.arange(2, 10),
}

# Exhaustive cross-validated search over the grid, run on all cores.
gscv_rf_reg = GridSearchCV(estimator=rf_reg, param_grid=parameters, n_jobs=-1)
gscv_rf_reg.fit(x_train, y_train)

In [15]:
# The search was built without a `refit` argument, so GridSearchCV's default
# (refit=True) applies: best_estimator_ has already been refit on the whole of
# x_train / y_train with the winning parameters. Calling fit() on it again
# would only repeat that training and, because random_state is fixed, produce
# the same forest - so the estimator is used as-is.
rf_reg = gscv_rf_reg.best_estimator_
# Bare last expression so the cell still displays the selected estimator.
rf_reg

In [16]:
# Training-set evaluation of the grid-search winner.
y_train_pred = rf_reg.predict(x_train)

# Compute the three regression metrics first, then report them together.
mae = mean_absolute_error(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
r2 = r2_score(y_train, y_train_pred)

print("Mean absolute error : ", mae)
print("Mean squared error : ", mse)
print("R2 Score : ", r2)

Mean absolute error :  606036.0266055046
Mean squared error :  803162589457.089
R2 Score :  0.7715070487061763


In [17]:
# Test-set evaluation of the grid-search winner on the held-out rows.
y_test_pred = rf_reg.predict(x_test)

# Same three metrics as for the training split, so the two are comparable.
mae = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
r2 = r2_score(y_true=y_test, y_pred=y_test_pred)

print("Mean absolute error : ", mae)
print("Mean squared error : ", mse)
print("R2 Score : ", r2)

Mean absolute error :  689877.5344036698
Mean squared error :  957490138782.4706
R2 Score :  0.718309352134462


In [41]:
# Final model with hand-set hyperparameters.
# NOTE(review): max_depth=10 is outside the np.arange(3, 10) range explored by
# the grid search, and random_state=5 differs from the 32 used elsewhere in
# this notebook. These values look manually chosen rather than taken from
# gscv_rf_reg.best_params_ - confirm they were not tuned against the test split.
rf_reg = RandomForestRegressor(
    criterion='squared_error',
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=5,
    random_state=5,
    n_jobs=-1,
)
rf_reg.fit(x_train, y_train)

# score() on a regressor reports R^2; show train and test side by side to
# gauge the generalisation gap.
train = rf_reg.score(x_train, y_train)
test = rf_reg.score(x_test, y_test)
print(train, test, sep='\n')

0.7570383887114862
0.7289685790192486


In [42]:
# Training-set evaluation of the final model.
y_train_pred = rf_reg.predict(x_train)

# Compute the three regression metrics first, then report them together.
mae = mean_absolute_error(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
r2 = r2_score(y_train, y_train_pred)

print("Mean absolute error : ", mae)
print("Mean squared error : ", mse)
# R^2 is rounded to two decimals for display only; `r2` keeps full precision.
print("R2 Score : ", np.around(r2, decimals=2))

Mean absolute error :  649928.3690349759
Mean squared error :  854020554052.969
R2 Score :  0.76


In [43]:
# Test-set evaluation of the final model on the held-out rows.
y_test_pred = rf_reg.predict(x_test)

# Same three metrics as for the training split, so the two are comparable.
mae = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
r2 = r2_score(y_true=y_test, y_pred=y_test_pred)

print("Mean absolute error : ", mae)
print("Mean squared error : ", mse)
# R^2 is rounded to two decimals for display only; `r2` keeps full precision.
print("R2 Score : ", np.around(r2, decimals=2))

Mean absolute error :  706868.9902588865
Mean squared error :  921258532562.8002
R2 Score :  0.73


In [44]:
# Serialise the fitted forest to disk so it can be reloaded without retraining.
with open('rf_pkl.pickle', mode='wb') as model_file:
    pickle.dump(rf_reg, model_file)

In [45]:
# Feature names in the exact order the model was trained on.
column_list = list(x.columns)
column_list

['area',
 'bedrooms',
 'bathrooms',
 'stories',
 'mainroad',
 'guestroom',
 'basement',
 'hotwaterheating',
 'airconditioning',
 'parking',
 'prefarea',
 'furnishingstatus']

In [46]:
# Metadata to store alongside the pickled model: the ordered feature names.
project_data = dict(column_list=column_list)
project_data

{'column_list': ['area',
  'bedrooms',
  'bathrooms',
  'stories',
  'mainroad',
  'guestroom',
  'basement',
  'hotwaterheating',
  'airconditioning',
  'parking',
  'prefarea',
  'furnishingstatus']}

In [47]:
# Write the metadata next to the pickled model as JSON.
with open('project_data.json', mode='w') as json_file:
    json_file.write(json.dumps(project_data))