In [1]:
# Importing the relevant packages

import numpy as np
import seaborn
import numpy as np
import matplotlib.pyplot as matplotlib
import xlrd
 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
 
from matplotlib.lines import Line2D
from scipy.stats import pearsonr
 
# set seed to make results reproducible
RF_SEED = 30

In [2]:
# Parsing out the input data into 3 lists:
# 1. house price for each record
# 2. raw data to train the model
# 3. feature names

def load_input(excel_file):
    y_prediction = []
    data = []
    feature_names = []
 
    loc = (excel_file)
    wb = xlrd.open_workbook(loc)
    sheet = wb.sheet_by_index(0)
    sheet.cell_value(0, 0)
 
    for index_row in range(0, 415):
        row = sheet.row_values(index_row)
        row = row[1:]
 
        if index_row == 0:
            feature_names = row
        else:
            row[0] = str(row[0]).split(".")[0]
            data.append([float(x) for x in row[:-1]])
            y_prediction.append(float(row[-1]))
 
    return y_prediction, data, feature_names[:-1]

In [3]:
# Splitting the data into test and train
# train - 80%, test - 20%

def split_data_train_model(labels, data):
    # 20% examples in test data
    train, test, train_labels, test_labels = train_test_split(data,
                                                              labels,
                                                              test_size=0.2,
                                                              random_state=RF_SEED)
 
    # training data fit
    regressor = RandomForestRegressor(n_estimators=1000, random_state=RF_SEED)
    regressor.fit(x_data, y_data)
 
    return test, test_labels, regressor

In [4]:
# Running the predictions using Random Forest regression

y_data, x_data, feature_names = load_input("Real_estate_valuation_data_ set.xlsx")
x_test, x_test_labels, regressor = split_data_train_model(y_data, x_data)
 
predictions = regressor.predict(x_test)

In [5]:
# Code to run a Scatter plot

import seaborn
import numpy as np
import matplotlib.pyplot as matplotlib
 
from matplotlib.lines import Line2D

def simple_scatter_plot(x_data, y_data, output_filename, title_name, x_axis_label, y_axis_label):
    """Simple scatter plot.
 
    Args:
        x_data (list): List with x-axis data.
        y_data (list): List with y-axis data.
        output_filename (str): Path to output image in PNG format.
        title_name (int): Plot title.
        x_axis_label (str): X-axis Label.
        y_axis_label (str): Y-axis Label.
 
    """
    seaborn.set(color_codes=True)
    matplotlib.figure(1, figsize=(9, 6))
 
    matplotlib.title(title_name)
 
    ax = seaborn.scatterplot(x=x_data, y=y_data)
 
    ax.set(xlabel=x_axis_label, ylabel=y_axis_label)
 
    matplotlib.savefig(output_filename, bbox_inches='tight', dpi=300)
    matplotlib.close()

In [6]:
# Correlation and scatter plot between actual and predicted values 

correlation = round(pearsonr(predictions, x_test_labels)[0], 5)
 
output_filename = "rf_regression.png"
title_name = "Random Forest Regression - Real House Price vs Predicted House Price - correlation ({})".format(correlation)
x_axis_label = "Real House Price"
y_axis_label = "Predicted House Price"
 
# plot data
simple_scatter_plot(x_test_labels, predictions, output_filename, title_name, x_axis_label, y_axis_label)

In [7]:
# Feature importances of the predictors

features_importance = np.round(regressor.feature_importances_, 2)
 
print("Feature ranking:")
for i, data_class in enumerate(feature_names):
    print("{}. {} ({})".format(i + 1, data_class, features_importance[i]))

Feature ranking:
1. X1 transaction date (0.01)
2. X2 house age (0.19)
3. X3 distance to the nearest MRT station (0.6)
4. X4 number of convenience stores (0.02)
5. X5 latitude (0.1)
6. X6 longitude (0.08)
