# Random Forest Regression

Predict the price of a house using Random Forest Regression.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
# Load the house-sales CSV into the raw, untouched frame used as the source for later cells.
data_path = "../input/housesalesprediction/kc_house_data.csv"
raw_df = pd.read_csv(data_path)

# Section 1 - EDA

In [None]:
# Show the loaded frame as the cell output.
raw_df

In [None]:
# (rows, columns) of the raw frame.
raw_df.shape

In [None]:
# Column dtypes and non-null counts of the raw frame.
raw_df.info()

In [None]:
# First rows of the raw frame.
raw_df.head()

The independent variables such as bedrooms, bathrooms, sqft_living, and sqft_lot all look relevant here. The identifier, date, and location columns (id, date, zipcode, lat, long) are not used as features and are left out of the working frame below.

In [None]:
# Summary statistics of the numeric columns.
raw_df.describe()

In [None]:
# All column names available in the raw frame.
raw_df.columns

In [None]:
# Working frame: the target (price) plus the columns kept for analysis.
# An explicit copy is taken so later edits to df never touch raw_df.
selected_columns = [
    'price',
    'bedrooms', 'bathrooms',
    'sqft_living', 'sqft_lot',
    'floors', 'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement',
    'yr_built', 'yr_renovated',
    'sqft_living15', 'sqft_lot15',
]
df = raw_df[selected_columns].copy()

In [None]:
# Show the working frame after column selection.
df

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Drop every row that contains at least one missing value.
# This mutates df in place; raw_df is unaffected because df is a copy.
df.dropna(inplace=True)

In [None]:
# Re-check missing values per column after dropping incomplete rows.
df.isna().sum()

In [None]:
# Number of rows that exactly duplicate an earlier row (across the selected columns).
df.duplicated().sum()

In [None]:
# Remove duplicated rows, keeping the first occurrence (pandas default).
# This mutates df in place; the original row index labels are preserved.
df.drop_duplicates(inplace=True)

In [None]:
# Show the working frame after removing missing and duplicated rows.
df

In [None]:
# Columns of the working frame.
df.columns

**price** is the dependent variable. Its distribution is checked against a normal distribution below; first, let's see which columns are noticeably skewed.

In [None]:
from scipy.stats import skew

# Rank the non-object columns by sample skewness and report the ones whose
# absolute skewness exceeds 0.5.
non_object_mask = df.dtypes != 'object'
numeric_features = df.dtypes[non_object_mask].index
skewed_features = (
    df[numeric_features]
    .apply(skew)
    .sort_values(ascending=False)
)
high_skew = skewed_features[skewed_features.abs() > 0.5]
print(high_skew)

In [None]:
#Transforming skewed columns
# for feature in high_skew.index:
#     if feature in ['waterfront', 'sqft_lot']:
#         raw_df[feature] = np.log1p(raw_df[feature])

In [None]:
# First rows of the working frame.
df.head()

In [None]:
# Convert categorical (object) columns to one-hot indicator columns.
# NOTE(review): `data` is not referenced by any later cell in this notebook —
# the model below is trained on `df`. Confirm whether this cell is still needed.
data = pd.get_dummies(df)
data.head()

In [None]:
# df itself is unchanged by the get_dummies call above (it returned a new frame).
df.head()

In [None]:
# Visualize the dependent variable against a fitted normal distribution:
# a density histogram (with KDE and the fitted normal curve) next to a normal
# Q-Q plot.
#
# `sns.distplot` is deprecated in seaborn, so the histogram is drawn with
# `sns.histplot` and the normal fit is overlaid explicitly via scipy.
# No blanket `warnings.filterwarnings('ignore')` is installed: silencing every
# warning for the rest of the session hides genuine deprecation notices.
price_values = df['price']
fit_mean, fit_std = stats.norm.fit(price_values)
pdf_grid = np.linspace(price_values.min(), price_values.max(), 200)

fig, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(14, 5))

sns.histplot(price_values, stat='density', kde=True, ax=ax_hist)
ax_hist.plot(pdf_grid, stats.norm.pdf(pdf_grid, fit_mean, fit_std),
             color='black', label='normal fit')
ax_hist.set_title('Distribution of price')
ax_hist.legend()

# probplot accepts a matplotlib Axes as its `plot` target.
res = stats.probplot(price_values, plot=ax_qq)

fig.tight_layout()
plt.show()

In [None]:
# Columns of the working frame.
df.columns

In [None]:
# The raw price distribution is right-skewed (long right tail), so check
# whether log(price) is closer to a normal distribution.
# This cell only plots the transformed values; df is not modified here.
#
# `sns.distplot` is deprecated in seaborn, so the histogram is drawn with
# `sns.histplot` and the normal fit is overlaid explicitly via scipy.
attr = 'price'
df_attr = np.log(df[attr])

log_fit_mean, log_fit_std = stats.norm.fit(df_attr)
log_pdf_grid = np.linspace(df_attr.min(), df_attr.max(), 200)

fig, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(14, 5))

sns.histplot(df_attr, stat='density', kde=True, ax=ax_hist)
ax_hist.plot(log_pdf_grid, stats.norm.pdf(log_pdf_grid, log_fit_mean, log_fit_std),
             color='black', label='normal fit')
ax_hist.set_title('Distribution of log(price)')
ax_hist.legend()

# probplot accepts a matplotlib Axes as its `plot` target.
res = stats.probplot(df_attr, plot=ax_qq)

fig.tight_layout()
plt.show()

The distribution of log(price) looks approximately normal now. Let's move ahead with the log-transformed price as the target.

In [None]:
# Replace price with its natural log. The right-hand side is read from raw_df,
# so re-running this cell does not apply the log twice. pandas aligns the
# assignment on the row index, so only the rows still present in df are filled.
df['price'] = np.log(raw_df['price'])
# df['price'] = np.log1p(df['price'])

In [None]:
# Show the working frame; price now holds log(price).
df

In [None]:
# Outlier analysis: one horizontal boxplot per column of df.
# The number of axes is derived from the column count; a fixed count smaller
# than len(cols) makes axs[i] raise IndexError part-way through the loop.
# Values are read from raw_df, so price is shown on its original (non-log) scale.
cols = df.columns
fig, axs = plt.subplots(len(cols), figsize=(10, 2 * len(cols)))
axs = np.atleast_1d(axs)  # plt.subplots returns a bare Axes when there is one row
for ax, col in zip(axs, cols):
    # Pass the series as `x=` explicitly so the orientation does not depend on
    # how a given seaborn version interprets a positional argument.
    sns.boxplot(x=raw_df[col], ax=ax)
plt.tight_layout()

For now, let's assume that there are no outliers, and review the model prediction to recheck outliers.

In [None]:
# sns.pairplot(raw_df, 
#              x_vars=['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
#                      'floors', 'waterfront', 'view', 'condition', 'sqft_above',
#                      'sqft_basement', 'yr_built', 'yr_renovated'], 
#              y_vars=['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
#                      'floors', 'waterfront', 'view', 'condition', 'sqft_above',
#                      'sqft_basement', 'yr_built', 'yr_renovated'], 
#              kind='scatter')
# plt.show()

In [None]:
# sns.pairplot(raw_df, 
#              x_vars=['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
#                      'floors', 'waterfront', 'view', 'condition', 'sqft_above',
#                      'sqft_basement', 'yr_built', 'yr_renovated'], 
#              y_vars='price', 
#              kind='scatter')
# plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
corr_matrix = df.corr()
plt.figure(figsize=(16, 6))
sns.heatmap(corr_matrix, annot=True)
plt.show()

We can see some correlations here; for example, sqft_above is positively correlated with sqft_living.

# Section 2 - ML

In [None]:
# First rows of the frame that feeds the model.
df.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Columns available for feature selection.
df.columns

In [None]:
# Feature matrix and target. Every column of df except the target ('price')
# and 'sqft_above' is used as a feature.
feature_columns = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_basement', 'yr_built', 'yr_renovated',
    'sqft_living15', 'sqft_lot15',
]
X = df[feature_columns]
y = df['price']

# 70 % train / 30 % test, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)

In [None]:
# Baseline random forest with default hyperparameters.
# random_state is fixed so the bootstrap samples and feature draws — and
# therefore the reported scores — are the same on every run.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out split, then print the estimator's own score on it.
model_predictions = model.predict(X_test)
baseline_score = model.score(X_test, y_test)
print(baseline_score)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, r2_score
# Print, in order: MAE, MSE, explained variance and R² of the baseline
# predictions on the test split (targets are on the log-price scale).
for metric_fn in (mean_absolute_error, mean_squared_error,
                  explained_variance_score, r2_score):
    print(metric_fn(y_test, model_predictions))

In [None]:
from pprint import pprint
# Show every hyperparameter of the baseline model with its current value.
pprint(model.get_params())

**Important Hyperparameters in Random Forest Regression**

**n_estimators** = number of trees in the forest

**max_features** = max number of features considered for splitting a node

**max_depth** = max number of levels in each decision tree

**min_samples_split** = min number of data points placed in a node before the node is split

**min_samples_leaf** = min number of data points allowed in a leaf node

**bootstrap** = method for sampling data points (with or without replacement)

Let's adjust the hyperparameters to see if the model prediction changes.

In [None]:
from sklearn.model_selection import RandomizedSearchCV # Number of trees in random forest

# Candidate values for the randomized hyperparameter search.

# Number of trees in the forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=500, num=50)]
# Number of features to consider at every split.
# 1.0 means "all features"; it replaces the string 'auto', which had the same
# meaning for regressors but is deprecated and rejected by newer scikit-learn.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
# Use the random grid to search for the best hyperparameters.
# First create the base model to tune. It is seeded so that, together with the
# seeded search below, repeated runs select the same hyperparameters.
rf = RandomForestRegressor(random_state=42)
# Random search of parameters, using 3-fold cross validation,
# across 100 sampled combinations, using all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validation score.
rf_random.best_params_

In [None]:
# Refit a forest with the hyperparameters selected by the randomized search.
# The values are read from rf_random.best_params_ rather than copied by hand,
# so this cell always matches the search that was actually run.
# random_state is fixed so the reported scores are repeatable.
optimized_model = RandomForestRegressor(**rf_random.best_params_, random_state=42)
optimized_model.fit(X_train, y_train)

In [None]:
# Predict on the held-out split with the tuned model, then print its own score.
opt_model_predictions = optimized_model.predict(X_test)
optimized_score = optimized_model.score(X_test, y_test)
print(optimized_score)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, r2_score
# Print, in order: MAE, MSE, explained variance and R² of the tuned model's
# predictions on the test split (targets are on the log-price scale).
for metric_fn in (mean_absolute_error, mean_squared_error,
                  explained_variance_score, r2_score):
    print(metric_fn(y_test, opt_model_predictions))

In [None]:
# Per-feature importances of the tuned forest, in the column order of X_train.
importances = optimized_model.feature_importances_
print(importances)

In [None]:
# Pair every training column with its importance (expressed as a percentage)
# and print the table sorted from most to least important.
featureImp = [
    [feat, importance * 100]
    for feat, importance in zip(X_train.columns, importances)
]

fT_df = pd.DataFrame(featureImp, columns=['Feature', 'Importance'])
ranked_features = fT_df.sort_values('Importance', ascending=False)
print(ranked_features)

# References

* https://www.kaggle.com/kunalprompt/simple-linear-regression-advertising-and-sales
* https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
* https://medium.com/hackerdawn/house-prices-prediction-using-random-forest-aa8722347276
* https://www.kaggle.com/subhradeep88/house-price-predict-decision-tree-random-forest