#### Business Problem: Estimate the sale price of a house using two models, Random Forest and Gradient Boosting

# Data Understanding

In [1]:
# Import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Import the dataset
# `df` is the raw frame as read from disk. Later cells derive new frames from
# it and read the original sale_price from it again when building y.
df = pd.read_csv('House_Pricing.csv')

In [3]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,year_built,stories,num_bedrooms,full_bathrooms,half_bathrooms,livable_sqft,total_sqft,garage_type,garage_sqft,carport_sqft,has_fireplace,has_pool,has_central_heating,has_central_cooling,house_number,street_name,unit_number,city,zip_code,sale_price
0,1978,1,4,1,1,1689,1859,attached,508,0,True,False,True,True,42670,Lopez Crossing,,Hallfort,10907,270897.0
1,1958,1,3,1,1,1984,2002,attached,462,0,True,False,True,True,5194,Gardner Park,,Hallfort,10907,302404.0
2,2002,1,3,2,0,1581,1578,none,0,625,False,False,True,True,4366,Harding Islands,,Lake Christinaport,11203,2721596.0
3,2004,1,4,2,0,1829,2277,attached,479,0,True,False,True,True,3302,Michelle Highway,,Lake Christinaport,11203,212968.0
4,2006,1,4,2,0,1580,1749,attached,430,0,True,False,True,True,582,Jacob Cape,,Lake Christinaport,11203,224529.0


In [4]:
# Remove the address-level fields that we don't want to include in our model.
# DataFrame.drop returns a new frame, so the raw `df` is left untouched.
df1 = df.drop(columns = ['house_number','street_name','unit_number','zip_code'])
# Take a real copy: a plain `df2 = df1` only binds a second name to the same
# object, so the later in-place edits of df1['sale_price'] would silently
# change df2 as well.
df2 = df1.copy()

In [5]:
# Preview the frame after the address columns were dropped.
df1.head()

Unnamed: 0,year_built,stories,num_bedrooms,full_bathrooms,half_bathrooms,livable_sqft,total_sqft,garage_type,garage_sqft,carport_sqft,has_fireplace,has_pool,has_central_heating,has_central_cooling,city,sale_price
0,1978,1,4,1,1,1689,1859,attached,508,0,True,False,True,True,Hallfort,270897.0
1,1958,1,3,1,1,1984,2002,attached,462,0,True,False,True,True,Hallfort,302404.0
2,2002,1,3,2,0,1581,1578,none,0,625,False,False,True,True,Lake Christinaport,2721596.0
3,2004,1,4,2,0,1829,2277,attached,479,0,True,False,True,True,Lake Christinaport,212968.0
4,2006,1,4,2,0,1580,1749,attached,430,0,True,False,True,True,Lake Christinaport,224529.0


In [6]:
# Summary statistics of the numeric columns of the raw data.
# NOTE(review): the saved output shows suspicious values (negative minimums for
# livable_sqft and garage_sqft, a maximum of 31 bedrooms, a minimum sale price
# of 664) - consider cleaning these rows before modelling.
df.describe()

Unnamed: 0,year_built,stories,num_bedrooms,full_bathrooms,half_bathrooms,livable_sqft,total_sqft,garage_sqft,carport_sqft,house_number,unit_number,zip_code,sale_price
count,42703.0,42703.0,42703.0,42703.0,42703.0,42703.0,42703.0,42703.0,42703.0,42703.0,3088.0,42703.0,42703.0
mean,1990.993209,1.365759,3.209283,1.923659,0.527153,1987.758986,2127.155446,455.8498,41.656324,18211.767347,2027.395402,11030.991476,441986.2
std,19.199987,0.513602,1.043396,0.759699,0.499268,846.76627,922.807342,243.453463,168.715867,27457.109993,1141.38377,573.576228,344285.7
min,1852.0,0.0,0.0,0.0,0.0,-3.0,5.0,-4.0,0.0,0.0,3.0,10004.0,664.0
25%,1980.0,1.0,3.0,1.0,0.0,1380.0,1466.0,412.0,0.0,674.0,1063.0,10537.0,285591.5
50%,1994.0,1.0,3.0,2.0,1.0,1808.0,1937.0,464.0,0.0,4530.0,2033.0,11071.0,402191.0
75%,2005.0,2.0,4.0,2.0,1.0,2486.0,2640.0,606.0,0.0,24844.5,2921.0,11510.0,532715.0
max,2017.0,4.0,31.0,8.0,1.0,12406.0,15449.0,8318.0,9200.0,99971.0,3998.0,11989.0,22935780.0


In [7]:
# Lowest sale price in the data.
df1['sale_price'].min()

664.0

In [8]:
# Highest sale price in the data.
df1['sale_price'].max()

22935778.0

In [9]:
# Mean sale price, used as the threshold for the above/below-average flag.
# It is read from the untouched raw frame `df`: df1['sale_price'] is overwritten
# with a flag in a later cell, so taking the mean from df1 would give a
# different number if this cell were ever re-run.
m = df['sale_price'].mean()
m

441986.20551249327

In [10]:
# Flag houses priced at or above the mean (True) versus below it (False).
# The comparison reads the raw prices from `df` (same row index as df1) rather
# than df1's own column, so re-running this cell can never compare an
# already-converted flag against the mean.
df1['sale_price'] = df['sale_price'] >= m

In [11]:
# Convert Y variable (Price) into 2 categories
from sklearn.preprocessing import LabelEncoder

In [12]:
# Encoder used in the next cell to turn the price flag into integer labels.
le = LabelEncoder()

In [13]:
# Encode the above/below-mean flag as integer class labels and store it back
# in df1, then preview the result.
price_labels = le.fit_transform(df1['sale_price'])
df1['sale_price'] = price_labels
df1.head()

Unnamed: 0,year_built,stories,num_bedrooms,full_bathrooms,half_bathrooms,livable_sqft,total_sqft,garage_type,garage_sqft,carport_sqft,has_fireplace,has_pool,has_central_heating,has_central_cooling,city,sale_price
0,1978,1,4,1,1,1689,1859,attached,508,0,True,False,True,True,Hallfort,0
1,1958,1,3,1,1,1984,2002,attached,462,0,True,False,True,True,Hallfort,0
2,2002,1,3,2,0,1581,1578,none,0,625,False,False,True,True,Lake Christinaport,1
3,2004,1,4,2,0,1829,2277,attached,479,0,True,False,True,True,Lake Christinaport,0
4,2006,1,4,2,0,1580,1749,attached,430,0,True,False,True,True,Lake Christinaport,0


In [34]:
# Replace categorical data with one-hot encoded data
# • Garage type
# • city
from sklearn.preprocessing import LabelBinarizer
df_one_hot = df1.copy()
lb = LabelBinarizer()
# One indicator column per garage type, named after the fitted classes.
lb_results = lb.fit_transform(df_one_hot['garage_type'])
# Reuse the source frame's index: pd.concat(axis=1) aligns on index labels, so
# a default 0..n-1 index would only line up while the source index is also
# 0..n-1 (rows would be misaligned after any filtering or re-indexing).
lb_results_df = pd.DataFrame(lb_results, columns=lb.classes_, index=df_one_hot.index)
lb_results_df.head()

Unnamed: 0,attached,detached,none
0,1,0,0
1,1,0,0
2,0,0,1
3,1,0,0
4,1,0,0


In [15]:
# Append the garage-type indicator columns to the right of the data.
final_df = pd.concat([df_one_hot, lb_results_df], axis='columns')

In [16]:
# One-hot encode the city column, one indicator column per fitted class.
df_one_hot_en = final_df.copy()
lb_en = LabelBinarizer()
lb_results_en = lb_en.fit_transform(df_one_hot_en['city'])
# Reuse the source frame's index so the column-wise concat that follows aligns
# row by row even if the source index is not the default 0..n-1.
lb_results_df_en = pd.DataFrame(lb_results_en, columns=lb_en.classes_, index=df_one_hot_en.index)
lb_results_df_en.head()

Unnamed: 0,Amystad,Brownport,Chadstad,Clarkberg,Coletown,Davidfort,Davidtown,East Amychester,East Janiceville,East Justin,...,South Anthony,South Stevenfurt,Toddshire,Wendybury,West Ann,West Brittanyview,West Gerald,West Gregoryview,West Lydia,West Terrence
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Append the city indicator columns to the right of the data.
final_df_en = pd.concat([df_one_hot_en, lb_results_df_en], axis='columns')

In [18]:
# Compare the shape of the raw data with the shape after one-hot encoding.
for label, frame in (("Original dimensions :", df), ("One hot Encoded dimensions :", final_df_en)):
    print(label, frame.shape)
final_df_en.head()

Original dimensions : (42703, 20)
One hot Encoded dimensions : (42703, 66)


Unnamed: 0,year_built,stories,num_bedrooms,full_bathrooms,half_bathrooms,livable_sqft,total_sqft,garage_type,garage_sqft,carport_sqft,...,South Anthony,South Stevenfurt,Toddshire,Wendybury,West Ann,West Brittanyview,West Gerald,West Gregoryview,West Lydia,West Terrence
0,1978,1,4,1,1,1689,1859,attached,508,0,...,0,0,0,0,0,0,0,0,0,0
1,1958,1,3,1,1,1984,2002,attached,462,0,...,0,0,0,0,0,0,0,0,0,0
2,2002,1,3,2,0,1581,1578,none,0,625,...,0,0,0,0,0,0,0,0,0,0
3,2004,1,4,2,0,1829,2277,attached,479,0,...,0,0,0,0,0,0,0,0,0,0
4,2006,1,4,2,0,1580,1749,attached,430,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Column dtypes and non-null counts after encoding (the raw garage_type and
# city columns are still present at this point).
final_df_en.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42703 entries, 0 to 42702
Data columns (total 66 columns):
year_built              42703 non-null int64
stories                 42703 non-null int64
num_bedrooms            42703 non-null int64
full_bathrooms          42703 non-null int64
half_bathrooms          42703 non-null int64
livable_sqft            42703 non-null int64
total_sqft              42703 non-null int64
garage_type             42703 non-null object
garage_sqft             42703 non-null int64
carport_sqft            42703 non-null int64
has_fireplace           42703 non-null bool
has_pool                42703 non-null bool
has_central_heating     42703 non-null bool
has_central_cooling     42703 non-null bool
city                    42703 non-null object
sale_price              42703 non-null int64
attached                42703 non-null int32
detached                42703 non-null int32
none                    42703 non-null int32
Amystad                 42703 non-null

In [20]:
# The raw text columns are now represented by their indicator columns.
raw_categoricals = ['garage_type', 'city']
final_df_en = final_df_en.drop(raw_categoricals, axis=1)

In [21]:
# Confirm that only numeric/boolean columns remain.
final_df_en.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42703 entries, 0 to 42702
Data columns (total 64 columns):
year_built              42703 non-null int64
stories                 42703 non-null int64
num_bedrooms            42703 non-null int64
full_bathrooms          42703 non-null int64
half_bathrooms          42703 non-null int64
livable_sqft            42703 non-null int64
total_sqft              42703 non-null int64
garage_sqft             42703 non-null int64
carport_sqft            42703 non-null int64
has_fireplace           42703 non-null bool
has_pool                42703 non-null bool
has_central_heating     42703 non-null bool
has_central_cooling     42703 non-null bool
sale_price              42703 non-null int64
attached                42703 non-null int32
detached                42703 non-null int32
none                    42703 non-null int32
Amystad                 42703 non-null int32
Brownport               42703 non-null int32
Chadstad                42703 non-null i

In [22]:
# Step 1 of moving the label to the last position: add a copy of sale_price
# as a new trailing column named 'sale'.
final_df_en = final_df_en.assign(sale=final_df_en['sale_price'])
final_df_en.head()

Unnamed: 0,year_built,stories,num_bedrooms,full_bathrooms,half_bathrooms,livable_sqft,total_sqft,garage_sqft,carport_sqft,has_fireplace,...,South Stevenfurt,Toddshire,Wendybury,West Ann,West Brittanyview,West Gerald,West Gregoryview,West Lydia,West Terrence,sale
0,1978,1,4,1,1,1689,1859,508,0,True,...,0,0,0,0,0,0,0,0,0,0
1,1958,1,3,1,1,1984,2002,462,0,True,...,0,0,0,0,0,0,0,0,0,0
2,2002,1,3,2,0,1581,1578,0,625,False,...,0,0,0,0,0,0,0,0,0,1
3,2004,1,4,2,0,1829,2277,479,0,True,...,0,0,0,0,0,0,0,0,0,0
4,2006,1,4,2,0,1580,1749,430,0,True,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Step 2: remove the original sale_price column (its copy 'sale' stays last).
final_df_en = final_df_en.drop('sale_price', axis=1)

In [24]:
# Check that 'sale' is now the last column.
final_df_en.head()

Unnamed: 0,year_built,stories,num_bedrooms,full_bathrooms,half_bathrooms,livable_sqft,total_sqft,garage_sqft,carport_sqft,has_fireplace,...,South Stevenfurt,Toddshire,Wendybury,West Ann,West Brittanyview,West Gerald,West Gregoryview,West Lydia,West Terrence,sale
0,1978,1,4,1,1,1689,1859,508,0,True,...,0,0,0,0,0,0,0,0,0,0
1,1958,1,3,1,1,1984,2002,462,0,True,...,0,0,0,0,0,0,0,0,0,0
2,2002,1,3,2,0,1581,1578,0,625,False,...,0,0,0,0,0,0,0,0,0,1
3,2004,1,4,2,0,1829,2277,479,0,True,...,0,0,0,0,0,0,0,0,0,0
4,2006,1,4,2,0,1580,1749,430,0,True,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Step 3: give the trailing column its original name back.
final_df_en = final_df_en.rename({'sale': 'sale_price'}, axis='columns')

In [26]:
# sale_price is now the last column of the encoded frame.
final_df_en.head()

Unnamed: 0,year_built,stories,num_bedrooms,full_bathrooms,half_bathrooms,livable_sqft,total_sqft,garage_sqft,carport_sqft,has_fireplace,...,South Stevenfurt,Toddshire,Wendybury,West Ann,West Brittanyview,West Gerald,West Gregoryview,West Lydia,West Terrence,sale_price
0,1978,1,4,1,1,1689,1859,508,0,True,...,0,0,0,0,0,0,0,0,0,0
1,1958,1,3,1,1,1984,2002,462,0,True,...,0,0,0,0,0,0,0,0,0,0
2,2002,1,3,2,0,1581,1578,0,625,False,...,0,0,0,0,0,0,0,0,0,1
3,2004,1,4,2,0,1829,2277,479,0,True,...,0,0,0,0,0,0,0,0,0,0
4,2006,1,4,2,0,1580,1749,430,0,True,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Cast the boolean fireplace flag to integers.
final_df_en = final_df_en.astype({'has_fireplace': int})

In [28]:
# Cast the boolean pool flag to integers.
final_df_en = final_df_en.astype({'has_pool': int})

In [29]:
# Cast the boolean central-heating flag to integers.
final_df_en = final_df_en.astype({'has_central_heating': int})

In [30]:
# Cast the boolean central-cooling flag to integers.
final_df_en = final_df_en.astype({'has_central_cooling': int})

In [31]:
# The four has_* flags should now show as 0/1.
final_df_en.head()

Unnamed: 0,year_built,stories,num_bedrooms,full_bathrooms,half_bathrooms,livable_sqft,total_sqft,garage_sqft,carport_sqft,has_fireplace,...,South Stevenfurt,Toddshire,Wendybury,West Ann,West Brittanyview,West Gerald,West Gregoryview,West Lydia,West Terrence,sale_price
0,1978,1,4,1,1,1689,1859,508,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1958,1,3,1,1,1984,2002,462,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2002,1,3,2,0,1581,1578,0,625,0,...,0,0,0,0,0,0,0,0,0,1
3,2004,1,4,2,0,1829,2277,479,0,1,...,0,0,0,0,0,0,0,0,0,0
4,2006,1,4,2,0,1580,1749,430,0,1,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Verify that every remaining column has an integer dtype.
final_df_en.dtypes

year_built              int64
stories                 int64
num_bedrooms            int64
full_bathrooms          int64
half_bathrooms          int64
livable_sqft            int64
total_sqft              int64
garage_sqft             int64
carport_sqft            int64
has_fireplace           int32
has_pool                int32
has_central_heating     int32
has_central_cooling     int32
attached                int32
detached                int32
none                    int32
Amystad                 int32
Brownport               int32
Chadstad                int32
Clarkberg               int32
Coletown                int32
Davidfort               int32
Davidtown               int32
East Amychester         int32
East Janiceville        int32
East Justin             int32
East Lucas              int32
Fosterberg              int32
Hallfort                int32
Jeffreyhaven            int32
                        ...  
Lake Carolyn            int32
Lake Christinaport      int32
Lake Dariu

In [35]:
# Fit a StandardScaler on the square-footage columns of the raw frame.
# Note: the scaler is only fitted and printed here; nothing in this cell
# writes scaled values back into the data used for modelling.
from sklearn.preprocessing import StandardScaler
sqft_columns = ['livable_sqft','total_sqft','garage_sqft','carport_sqft']
scaler = StandardScaler()
scaler.fit(df[sqft_columns])
print(scaler)

StandardScaler(copy=True, with_mean=True, with_std=True)


In [36]:
# Display the standardized square-footage values. They are printed only and
# are not stored back into any frame.
scaled_sqft = scaler.transform(df[['livable_sqft','total_sqft','garage_sqft','carport_sqft']])
print(scaled_sqft)

[[-0.35282757 -0.29059     0.21421265 -0.24690512]
 [-0.00443928 -0.13562626  0.02526262 -0.24690512]
 [-0.48037311 -0.59509916 -1.87245288  3.45759125]
 ...
 [-1.64599767 -1.63541516 -1.87245288  0.92075214]
 [-0.69649195 -0.78690742 -0.22530155 -0.24690512]
 [-0.69294902 -0.78148911 -0.24583959 -0.24690512]]


# Modelling

In [37]:
# Create the X and y arrays
# .values turns the whole encoded frame into one NumPy array.
array = final_df_en.values
# Rows: only the first 10,000 houses are used.
# Columns 1..62: the slice stops before index 63 (the 0/1 sale_price flag that
# was moved to the last position), so the label is not part of the features.
# NOTE(review): the slice also starts at 1, which leaves out column 0
# (year_built) - confirm this is intentional. The hand-written prediction rows
# near the end of the notebook supply 62 values and rely on this width.
X = array[0:10000,1:63]

In [38]:
# Regression target: the original sale_price from the raw frame `df` (not the
# 0/1 flag stored in the encoded frame), for the same first 10,000 rows as X.
y = df['sale_price'].iloc[:10000]

In [39]:
# Inspect the feature matrix.
X

array([[1, 4, 1, ..., 0, 0, 0],
       [1, 3, 1, ..., 0, 0, 0],
       [1, 3, 2, ..., 0, 0, 0],
       ...,
       [1, 2, 1, ..., 0, 0, 0],
       [2, 3, 2, ..., 0, 0, 0],
       [3, 3, 2, ..., 0, 0, 0]], dtype=int64)

In [40]:
# Hold out 30% of the rows as a test set and train on the remaining 70%.
# The fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, test_size=0.3, random_state=7)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Look again at the encoded frame that the feature matrix was sliced from.
final_df_en.head()

In [None]:
# Baseline model: a random forest with default hyper-parameters and a fixed
# seed, fitted on the training split.
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(random_state=7)
regressor.fit(X=x_train, y=y_train)

In [None]:
# Predicting the Test set results with the baseline random forest
y_pred = regressor.predict(x_test)

In [None]:
from sklearn.metrics import r2_score
# R^2 of the baseline random forest on the held-out test set.
r2_score(y_test, y_pred)

In [None]:
from sklearn.metrics import make_scorer,mean_squared_error
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for the random forest:
#   n_estimators     - number of trees in the forest
#   max_features     - fraction of features to consider at every split
#   max_depth        - maximum number of levels in a tree
#   min_samples_leaf - minimum number of samples required at each leaf node
# (Earlier notes also listed n_estimators = [500,1000,3000] and a
#  learning_rate range; learning_rate is not part of this grid.)
param_grid = {
    'n_estimators': [500,1000],
    'max_features': [1.0,0.3,0.1],
    'max_depth' : [4,6],
    'min_samples_leaf' : [3,5,9,17]
}
# GridSearchCV keeps the candidate with the HIGHEST score. Mean squared error
# is a loss (lower is better), so the scorer has to be created with
# greater_is_better=False: it then reports the negated MSE and the search
# selects the lowest-error parameters. With the default (True) the search
# would select the combination with the largest error.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
grid = GridSearchCV(estimator = regressor,scoring = mse_scorer, param_grid=param_grid,n_jobs = -1,cv = 4,
                    verbose = 2)
grid.fit(x_train,y_train)

In [None]:
# Predicting the Test set results with the grid-searched random forest.
# GridSearchCV.predict delegates to best_estimator_, i.e. the model refitted on
# the training data with the selected parameters (refit=True is the default).
# Calling regressor.predict here would just reproduce y_pred from the untuned
# model and ignore the search entirely.
y_pred1 = grid.predict(x_test)

In [None]:
# Best cross-validated score found by the random-forest grid search (in units
# of the scorer given to GridSearchCV) and the parameters that produced it.
best_accuracy, best_parameters = grid.best_score_, grid.best_params_
print(best_accuracy, best_parameters, sep='\n')

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
# RMSE of the baseline random-forest predictions (y_pred) on the test set.
# mean_squared_error is imported in this cell so it runs top-to-bottom on a
# fresh kernel; the `metrics` module alias is only imported in a later cell.
print(np.sqrt(mean_squared_error(y_test, y_pred)))

In [None]:
# R^2 of y_pred1 on the test set; reused further down to derive test_error.
x = r2_score(y_test, y_pred1)
x

In [None]:
# Fit a gradient boosting model on the training split.
# Despite its name, `classifier` holds a GradientBoostingRegressor.
classifier = GradientBoostingRegressor(random_state=7)
classifier.fit(X=x_train, y=y_train)

In [None]:
# Test-set predictions of the default-parameter gradient boosting model.
y_pred3 = classifier.predict(x_test)

In [None]:
# R^2 of the default-parameter gradient boosting model on the test set.
r2_score(y_test, y_pred3)

In [None]:
# Hyper-parameter grid for gradient boosting: the same tree-shape parameters as
# the random-forest grid plus the boosting learning rate.
param_grid = {
    'n_estimators': [500,1000],
    'max_features': [1.0,0.3,0.1],
    'max_depth' : [4,6],
    'min_samples_leaf' : [3,5,9,17],
    'learning_rate' : [0.1, 0.05, 0.02, 0.01]
}
# GridSearchCV keeps the candidate with the HIGHEST score. Mean squared error
# is a loss (lower is better), so the scorer has to be created with
# greater_is_better=False; otherwise the search selects the parameters with
# the largest error. The reported scores are then negated MSE values.
mse_scorer_gb = make_scorer(mean_squared_error, greater_is_better=False)
grid_gb = GridSearchCV(estimator = classifier,scoring = mse_scorer_gb, param_grid=param_grid,n_jobs = -1,cv = 2,
                      verbose = 2)
grid_gb.fit(x_train,y_train)

In [None]:
# Best cross-validated score found by the gradient-boosting grid search (in
# units of the scorer given to GridSearchCV) and the matching parameters.
best_accuracy_gb, best_parameters_gb = grid_gb.best_score_, grid_gb.best_params_
print(best_accuracy_gb, best_parameters_gb, sep='\n')

In [None]:
# Predicting the Test set results with the grid-searched gradient boosting model.
# GridSearchCV.predict delegates to best_estimator_ (refitted on the training
# data with the selected parameters). Calling classifier.predict here would
# only repeat y_pred3 from the untuned model.
y_pred4 = grid_gb.predict(x_test)
y_pred4

In [None]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package (installed as a scikit-learn
# dependency) and fall back to the bundled copy only on very old setups.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Save the model as a pickle in a file.
# Note: this persists `regressor`, the random forest fitted with default
# parameters, not a grid-searched estimator.
joblib.dump(regressor, 'House_Price_Estimation.pkl')

In [None]:
from sklearn import metrics
# RMSE of y_pred1 on the held-out test set.
mse_pred1 = metrics.mean_squared_error(y_test, y_pred1)
print(np.sqrt(mse_pred1))

In [None]:
# R^2 of y_pred4 on the test set.
r2_score(y_test, y_pred4)

In [None]:
# Training error
# Defined here as 1 - R^2 of `regressor` on the training split
# (regressor.score returns the R^2 of its predictions).
train_error = 1 - (regressor.score(x_train,y_train))

In [None]:
# Display the training error computed above.
train_error

In [None]:
# Test error
# Defined as 1 - x, where x is the R^2 of y_pred1 on the test set.
test_error = 1 - x
test_error

In [None]:
# df1 at this point carries the 0/1 sale_price flag, not the original prices.
df1.head()

In [None]:
### Realtime predictions
Real_predictions_rf = regressor.predict([[5,10,15,23,21,26,12,30,8,20,28,17,19,29,31,35,37,38,36,43,45,46,41,49,40,50,25,20,13,23,
                                       65,14,78,15,43,12,45,47,49,1,12,19,10,18,59,56,15,23,27,54,52,53,59,50,44,46,78,74,42,5,
                                      75,72]])
Real_predictions_rf

In [None]:
### Realtime predictions
Real_predictions_gb = classifier.predict([[5,10,15,23,21,26,12,30,8,20,28,17,19,29,31,35,37,38,36,43,45,46,41,49,40,50,25,20,13,23,
                                       65,14,78,15,43,12,45,47,49,1,12,19,10,18,59,56,15,23,27,54,52,53,59,50,44,46,78,74,42,5,
                                      75,72]])
Real_predictions_gb

In [None]:
# Compute train and test errors
# from sklearn import linear_model
# alphas = np.logspace(0,100)
# enet = linear_model.ElasticNet(l1_ratio=0.7, max_iter=100)
# train_errors = list()
# test_errors = list()
# for alpha in alphas:
#     enet.set_params(alpha=alpha)
#     enet.fit(x_train, y_train)
#     train_errors.append(enet.score(x_train, y_train))
#     test_errors.append(enet.score(x_test, y_test))