In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

IMPORT ADDITIONAL LIBRARIES

In [None]:
import matplotlib.pyplot as plt

CLEAN UP AND REARRANGE DATA 

In [None]:
# Map the raw column headers to the shorter names used by the rest of the notebook.
column_renames = {
    'X1 transaction date': 'transaction date',
    'X2 house age': 'house age',
    'X3 distance to the nearest MRT station': 'distance to the nearest MRT station',
    'X4 number of convenience stores': 'number of convenience stores',
    'X5 latitude': 'latitude',
    'X6 longitude': 'longitude',
    'Y house price of unit area': 'house price of unit area',
}

# Read the CSV, use the 'No' column as the row index, and apply the renames
# in a single chain so that re-running the cell always rebuilds the frame.
dataset = (
    pd.read_csv('/kaggle/input/real-estate-price-prediction/Real estate.csv')
    .set_index('No')
    .rename(columns=column_renames)
)
dataset.head()

DESCRIBE AND SUMMARISE DATA

In [None]:
# Summary statistics for the columns of the loaded frame.
dataset.describe()

In [None]:
# Column dtypes and non-null counts, to spot missing values early.
dataset.info()

In [None]:
# One 50-bin histogram per column, drawn on a single large figure.
dataset.hist(bins=50, figsize=(20,15))
plt.show()

SPLIT DATA INTO TEST AND TRAINING SET

Create a stratified shuffle split so that the training and test sets contain the same proportions of each MRT-distance category

In [None]:
# Bucket the MRT distance into five categories; the category column is used
# only to stratify the train/test split.
distance_bin_edges = [0, 200, 500, 1000, 3000, np.inf]
distance_bin_labels = [1, 2, 3, 4, 5]

dataset['distance_cat'] = pd.cut(
    dataset['distance to the nearest MRT station'],
    bins=distance_bin_edges,
    labels=distance_bin_labels,
)
dataset['distance_cat'].hist()

Split the Data

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratify on 'distance_cat' so the train and test sets share the same
# distribution of MRT-distance categories.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split.split() yields *positional* row numbers (0..n-1), whereas `dataset`
# is indexed by the 'No' column. Selecting with .iloc picks exactly the rows
# the splitter chose; reindex() would look the positions up as index labels,
# which selects the wrong rows and yields all-NaN rows for any position that
# is not an existing label.
train_index, test_index = next(split.split(dataset, dataset['distance_cat']))

# .copy() gives each set its own data so later in-place edits neither touch
# `dataset` nor trigger chained-assignment warnings.
strat_train_set = dataset.iloc[train_index].copy()
strat_test_set = dataset.iloc[test_index].copy()

Drop possible NaN values 

In [None]:
# Discard any training rows that contain missing values.
strat_train_set = strat_train_set.dropna(axis=0)

Drop the helper column (`distance_cat`) that was only needed for the stratified split

In [None]:
# The stratification helper has served its purpose; remove it from both
# splits so it is not used as a model feature.
strat_train_set.drop(columns='distance_cat', inplace=True)
strat_test_set.drop(columns='distance_cat', inplace=True)

Check for correlation in the Data 

In [None]:
# `dataset` still carries the categorical helper column 'distance_cat' that
# was added for stratification. Depending on the pandas version it is either
# silently skipped or included in the correlation, so drop it explicitly to
# correlate only the real variables (errors='ignore' keeps the cell
# re-runnable if the column is already gone).
corr_matrix = dataset.drop(columns='distance_cat', errors='ignore').corr()

# Show the correlation of every variable with the target, strongest positive first.
corr_matrix['house price of unit area'].sort_values(ascending=False)

Prepare the data for Machine Learning Algorithm 

Create inputs and outputs

In [None]:
# Separate the training set into the target (labels) and the predictors.
# Note: `dataset` is rebound here and from now on holds the training features only.
target_column = 'house price of unit area'
dataset_labels = strat_train_set[target_column].copy()
dataset = strat_train_set.drop(columns=target_column)

Perform feature scaling on the data, keeping the scaler in a pipeline so the same fitted transformation can be reused on the test set

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# A single-step pipeline that standardises every feature; it is fitted on the
# training features here and kept for reuse in later cells.
scaling_steps = [('std_scaler', StandardScaler())]
num_pipeline = Pipeline(steps=scaling_steps)
prepared_data = num_pipeline.fit_transform(dataset)

Perform Random forest Regression on data

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so that the fitted model, and every score derived from it,
# is identical on each Restart & Run All (same seed as the train/test split).
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(prepared_data, dataset_labels)

Evaluate algorithm performance

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [None]:
# RMSE on the same data the forest was fitted on, so this is a training error,
# not an estimate of generalisation performance.
dataset_predictions = forest_reg.predict(prepared_data)
forest_mse = mean_squared_error(y_true=dataset_labels, y_pred=dataset_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [None]:
def display_scores(scores):
    """Print a set of scores followed by their mean and standard deviation.

    Parameters
    ----------
    scores : array-like exposing ``.mean()`` and ``.std()``
        The scores to report (e.g. per-fold RMSE values from cross-validation).
    """
    report = (
        ('Scores:', scores),
        ('Mean:', scores.mean()),
        ('Standard deviation:', scores.std()),
    )
    for label, value in report:
        print(label, value)

In [None]:
# 10-fold cross-validation of the forest. The scorer returns *negative* MSE,
# so flip the sign before taking the square root to obtain per-fold RMSE.
forest_scores = cross_val_score(
    estimator=forest_reg,
    X=prepared_data,
    y=dataset_labels,
    scoring='neg_mean_squared_error',
    cv=10,
)
forest_rmse_scores = np.sqrt(np.negative(forest_scores))
display_scores(forest_rmse_scores)

Perform Grid Search to determine best parameters to use

In [None]:
from sklearn.model_selection import GridSearchCV

# A max_features candidate larger than the number of input columns is either
# rejected by scikit-learn or behaves no differently from using every feature
# (version-dependent), so keep only candidates that fit the prepared data.
n_features = prepared_data.shape[1]
max_features_candidates = (
    [m for m in (2, 4, 6, 8) if m <= n_features] or [n_features]
)

param_grid = {
    'n_estimators': [3, 10, 30],
    'max_features': max_features_candidates,
}

# Seed the forest so the search picks the same best parameters on every run.
forest_reg = RandomForestRegressor(random_state=42)

# 5-fold CV over every parameter combination, scored by negative MSE;
# train scores are kept so over-fitting can be inspected in cv_results_.
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True, verbose=3)

grid_search.fit(prepared_data, dataset_labels)

Test on test Data

In [None]:
# Evaluate the best model found by the grid search on the held-out test set.
final_model = grid_search.best_estimator_

# Split the test set into target and predictors.
target_column = 'house price of unit area'
y_test = strat_test_set[target_column].copy()
X_test = strat_test_set.drop(columns=target_column)

# Reuse the already-fitted pipeline: transform only, never re-fit on test data.
X_test_prepared = num_pipeline.transform(X_test)
final_prediction = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_true=y_test, y_pred=final_prediction)
final_rmse = np.sqrt(final_mse)
final_rmse

View Confidence Interval

In [None]:
from scipy import stats

# t-based confidence interval for the mean squared test error, reported on
# the RMSE scale by taking the square root of both bounds.
confidence = 0.95
squared_errors = (final_prediction - y_test) ** 2

mse_interval = stats.t.interval(
    confidence,
    len(squared_errors) - 1,
    loc=squared_errors.mean(),
    scale=stats.sem(squared_errors),
)
np.sqrt(mse_interval)