In [None]:
import pandas as pd; import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly
from plotly.subplots import make_subplots
import os
from catboost import CatBoostClassifier,CatBoostRegressor
from sklearn.feature_selection import SelectKBest,f_regression
from xgboost import plot_importance,XGBClassifier,XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
import seaborn as sns
import shap

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Diverging seaborn palette (hues 220 and 10) exposed as a matplotlib colormap.
# It is a module-level global: the corrMat2 heatmaps below read it via `cmap=cmap`.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Plot Correlation Matrix
def corrMat(df,id=False,figsize=(10,10)):
    """Draw the lower triangle of the pairwise correlation matrix as a heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; only numeric/boolean columns take part in the correlation.
    id : bool, default False
        Unused; kept so existing calls keep working.
    figsize : tuple, default (10, 10)
        Size handed to ``plt.subplots``.

    Returns
    -------
    None
        The heatmap is drawn on a newly created matplotlib figure.
    """
    # Drop non-numeric columns explicitly: pandas >= 2.0 raises on them in
    # .corr() instead of silently ignoring them.
    numeric_df = df.select_dtypes(include=['number','bool'])
    corr_mat = numeric_df.corr().round(2)

    f, ax = plt.subplots(figsize=figsize)

    # Built-in `bool` replaces the `np.bool` alias (removed in NumPy 1.24).
    mask = np.triu(np.ones_like(corr_mat, dtype=bool))

    # Trim the first row and last column from both the mask and the matrix so
    # that the (always 1.0) diagonal is not drawn.
    mask = mask[1:,:-1]
    corr = corr_mat.iloc[1:,:-1].copy()
    sns.heatmap(corr,mask=mask,vmin=-0.3,vmax=0.3,center=0,
                cmap='Blues',square=False,lw=2,annot=True,cbar=False,ax=ax)

# Plot Correlation to Target Variable only
def corrMat2(df,target='demand',figsize=(9,0.5),ret_id=False):
    """Correlation of every numeric feature with a single target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the features and the target column.
    target : str, default 'demand'
        Name of the (numeric) column to correlate against.
    figsize : tuple, default (9, 0.5)
        Figure size of the one-row heatmap.
    ret_id : bool, default False
        If truthy, skip plotting and return the correlation row instead.

    Returns
    -------
    pandas.DataFrame or None
        One-row frame (index ``[target]``, one column per numeric feature,
        including the target itself) when ``ret_id`` is truthy, else ``None``.

    Raises
    ------
    KeyError
        If ``target`` is not a numeric column of ``df``.
    """
    # Correlate numeric/boolean columns only; pandas >= 2.0 raises on object columns.
    corr_mat = df.select_dtypes(include=['number','bool']).corr().round(2)

    # Select the target row by label. The previous boolean mask was built from
    # df.columns, which has a different length than the correlation matrix as
    # soon as df carries any non-numeric column.
    corr = corr_mat.loc[[target]].copy()

    if ret_id:
        return corr

    f, ax = plt.subplots(figsize=figsize)
    # `cmap` is the module-level diverging palette defined at the top of this cell.
    sns.heatmap(corr,vmin=-0.3,vmax=0.3,center=0,
                cmap=cmap,square=False,lw=2,annot=True,cbar=False,ax=ax)
    plt.title(f'Feature Correlation to {target}')
    
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

def plotlyoff_corr(corr,size=None):
    """Show a correlation frame as an annotated Plotly heatmap.

    Parameters
    ----------
    corr : pandas.DataFrame
        Matrix to plot; its column labels are used for BOTH axes, so it is
        presumably expected to be square (e.g. the output of ``df.corr()``).
    size : sequence of two numbers, optional
        ``(width, height)`` of the figure; defaults to 700 x 500.

    Returns
    -------
    Whatever ``plotly.offline.iplot`` returns for the rendered figure.
    """
    labels = corr.columns.tolist()
    width, height = (700, 500) if size is None else (size[0], size[1])

    fig = ff.create_annotated_heatmap(
        z=corr.values,
        x=labels,
        y=labels,
        colorscale='viridis',
        showscale=False,
    )

    # Tilted tick labels, x axis on top of the matrix.
    fig['layout'].update(dict(
        width=width,
        height=height,
        yaxis=dict(tickangle=-30,side='left'),
        xaxis=dict(tickangle=-30,side='top'),
    ))
    fig.update_layout(margin={"r":0,"t":60,"l":0,"b":0})

    return py.iplot(fig)

''' Draw a Bivariate Seaborn Pairgrid /w KDE density w/ '''
def snsPairGrid(df):
    """Draw a seaborn PairGrid: KDE on the diagonal, scatter + KDE contours below it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are paired against each other.
    """
    grid = sns.PairGrid(df,diag_sharey=False)
    grid.fig.set_size_inches(14,13)

    # Diagonal: univariate KDE of each feature.
    grid.map_diag(sns.kdeplot, lw=2)

    # Lower triangle: scatter points first, then bivariate KDE contours on top.
    lower_layers = (
        (sns.scatterplot, dict(s=15,edgecolor="k",linewidth=1,alpha=0.4)),
        (sns.kdeplot, dict(cmap='plasma',n_levels=10)),
    )
    for plot_fn, plot_kwargs in lower_layers:
        grid.map_lower(plot_fn, **plot_kwargs)

    plt.tight_layout()

from sklearn.base import BaseEstimator,TransformerMixin

# Basic Transformer
class transformer(BaseEstimator,TransformerMixin):
    """Small multi-purpose helper exposed through the sklearn transformer interface.

    Exactly which action ``transform`` performs is chosen by the constructor
    flags, checked in this order: ``show_nan``, ``show_counts``, ``drop_nan``,
    ``select_dtype``.

    Parameters
    ----------
    drop_nan : bool, default False
        Drop every row containing a NaN and return the reduced frame.
    select_dtype : bool, default False
        Split the frame into (float64/int64 columns, all other columns).
    show_nan : bool, default False
        Plot the percentage of missing values per column; returns ``None``.
    title : str, default 'Title'
        Title used by the two plotting modes.
    show_counts : bool, default False
        Plot ``X.value_counts()`` as a bar chart.
    figsize : tuple, optional
        Figure size for the two plotting modes.
    """

    def __init__(self,drop_nan=False,select_dtype=False,show_nan=False,title='Title',show_counts=False,figsize=None):
        self.drop_nan = drop_nan
        self.select_dtype = select_dtype
        self.show_nan = show_nan
        self.title = title
        self.show_counts = show_counts
        self.figsize = figsize

    @staticmethod
    def _style_axes(ax):
        """Colour all spines black and leave only the top spine visible."""
        for side in ['top', 'right', 'bottom', 'left']:
            ax.spines[side].set_color('black')
            ax.spines[side].set_visible(side == 'top')

    def fit(self,X,y=None):
        """No-op fit; nothing is learned from the data."""
        return self

    def transform(self,X):
        """Run the action selected by the constructor flags.

        Returns
        -------
        None
            For ``show_nan``, and for ``show_counts`` when neither ``drop_nan``
            nor ``select_dtype`` is set.
        pandas.DataFrame
            For ``drop_nan``: ``X`` without NaN rows (its shape is printed).
        tuple of pandas.DataFrame
            For ``select_dtype``: ``(numeric, other)``.
        """
        # Percentage of missing values per column, largest first.
        if self.show_nan:
            fig, ax = plt.subplots(figsize=self.figsize)
            nan_val = (X.isnull().sum()/len(X)*100).sort_values(ascending=False)
            self._style_axes(ax)
            sns.barplot(x=nan_val,y=nan_val.index,edgecolor='k',palette='rainbow',ax=ax)
            ax.set_title(self.title);ax.grid(ls='--',alpha=0.9);plt.show()
            return

        # Bar chart of value counts.
        if self.show_counts:
            tdf = X.value_counts()
            fig, ax = plt.subplots(figsize=self.figsize)
            self._style_axes(ax)
            # x/y must be passed by keyword: seaborn >= 0.12 no longer accepts
            # positional data arguments.
            sns.barplot(x=tdf.index,y=tdf.values,edgecolor='k',palette='rainbow',ax=ax)
            ax.set_title(self.title);ax.grid(ls='--',alpha=0.9);plt.show()

        # Drop every row that contains at least one NaN.
        if self.drop_nan:
            X = X.dropna(); print(X.shape)
            return X

        # Split into numerical / non-numerical features. Only float64 and
        # int64 count as numerical here; any other dtype lands in the second frame.
        if self.select_dtype:
            X1 = X.select_dtypes(include=['float64','int64'])
            X2 = X.select_dtypes(exclude=['float64','int64'])
            return X1,X2

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
sns.set(style='white')

def function(ldf,id=None):
    """Run one of several quick EDA views on a DataFrame.

    Parameters
    ----------
    ldf : pandas.DataFrame
        Frame to analyse (numeric columns are expected by every mode).
    id : str, optional
        Which view to produce:

        * ``'boxplot'``     - horizontal boxplot per column on a 5-wide grid
        * ``'outliers'``    - print the share of rows outside 1.5 * IQR per
          column (the legacy misspelling ``'outiers'`` is still accepted)
        * ``'histograms'``  - 20-bin histogram per column on a 5-wide grid
        * ``'correlation'`` - lower-triangle correlation heatmap

        Any other value does nothing.

    Returns
    -------
    None
    """
    # Strings are compared with ==; `is` tests object identity and is only
    # accidentally true for interned literals.
    if id == 'boxplot':
        # At least the original 3 rows, more when there are over 15 columns.
        n_rows = max(3, -(-ldf.shape[1] // 5))
        fig, axs = plt.subplots(ncols=5, nrows=n_rows, figsize=(14,4))
        axs = axs.flatten()
        flierprops = dict(marker='o', mfc='k',ms=3,ls='none', mec='k')
        # Iterate the frame that was passed in (the old code read an undefined global `X`).
        for index, k in enumerate(ldf.columns):
            sns.boxplot(x=k,data=ldf, orient='h',flierprops=flierprops,
                        ax=axs[index], width=.5)
        plt.tight_layout()

    elif id in ('outliers', 'outiers'):
        n_obs = np.shape(ldf)[0]
        for k, v in ldf.items():
            q1 = v.quantile(0.25); q3 = v.quantile(0.75); irq = q3 - q1
            v_col = v[(v <= q1 - 1.5 * irq) | (v >= q3 + 1.5 * irq)]
            # Guard the empty frame so the percentage does not divide by zero.
            perc = np.shape(v_col)[0] * 100.0 / n_obs if n_obs else 0.0
            print("Column %s outliers = %.2f%%" % (k, perc))

    elif id == 'histograms':
        n_rows = max(3, -(-ldf.shape[1] // 5))
        fig, axs = plt.subplots(ncols=5, nrows=n_rows, figsize=(14, 8))
        axs = axs.flatten()
        for index, (k, v) in enumerate(ldf.items()):
            sns.histplot(v,ax=axs[index],bins=20)
        plt.tight_layout()

    elif id == 'correlation':
        # Numeric columns only: pandas >= 2.0 raises on object columns in .corr().
        corr_mat = ldf.select_dtypes(include=['number','bool']).corr().round(2)
        f, ax = plt.subplots(figsize=(9,7))
        # Built-in bool replaces the np.bool alias removed in NumPy 1.24.
        mask = np.triu(np.ones_like(corr_mat, dtype=bool))
        # Drop first row / last column so the diagonal is not drawn.
        mask = mask[1:,:-1]
        corr = corr_mat.iloc[1:,:-1].copy()
        sns.heatmap(corr,mask=mask,vmin=-0.3,vmax=0.3,center=0,
                    cmap='Blues',square=False,lw=2,annot=True,cbar=False,ax=ax)
        
''' Draw a Bivariate Seaborn Pairgrid /w KDE density w/ '''
def snsPairGrid(df):

    ''' Plot a seaborn PairGrid of the columns of df.

    The diagonal shows a KDE per feature; the lower triangle shows a scatter
    plot overlaid with bivariate KDE contours. Nothing is returned.

    NOTE(review): this re-defines snsPairGrid with the same body as the
    definition in the earlier helper cell; after a top-to-bottom run this
    later definition is the one bound to the name. One of the two can go.
    '''
    g = sns.PairGrid(df,diag_sharey=False)
    g.fig.set_size_inches(14,13)
    g.map_diag(sns.kdeplot, lw=2) # KDE approximation on the diagonal
    g.map_lower(sns.scatterplot,s=15,edgecolor="k",linewidth=1,alpha=0.4) # scatter plot on the lower half
    g.map_lower(sns.kdeplot,cmap='plasma',n_levels=10) # KDE contours drawn over the scatter on the lower half
    plt.tight_layout()
    
    
# Class to Visualise Things Only
class visualise(BaseEstimator,TransformerMixin):
    """Plot-only helper exposed through the sklearn transformer interface.

    Every constructor flag switches on one visualisation inside ``transform``;
    several flags may be combined, except that the ``vis_mean_AUD`` mode
    returns early.

    Parameters
    ----------
    vis_mean_AUD : {'error', 'value'}, optional
        Suburb-mean bar chart of the model error (``'error'``) or of the
        prediction next to the target (``'value'``). ``None`` disables it.
    model_name : str, optional
        Name of the prediction column; ``model_name + '_error'`` is the column
        holding its error.
    target : str, optional
        Name of the target column (used by ``vis_mean_AUD='value'``).
    eval_fi : bool, default False
        Stored only; not read by any method of this class.
    vis_box : bool, default False
        Univariate boxplots of the numeric columns.
    vis_compare_mean_AUD : bool, default False
        Compare suburb-mean errors of two frames (``X`` is a list of two frames).
    vis_compare_bar : bool, default False
        Compare the raw error columns of two frames (``X`` is a list of two frames).
    top_scat : int, default 50
        Number of highest-error suburbs flagged with ``group_id2 = 1``.
    top_bar : int, optional
        Show only this many highest-error suburbs in the bar chart.
    option : str or False, default False
        ``'histogram'`` or ``'outliers'`` for the respective extra view.
    """

    # Titles are shared by several charts; kept in one place so they stay identical.
    _ERROR_TITLE = 'Suburb Based Mean Error |(y_pred-y_target)|'
    _VALUE_TITLE = 'Suburb-Based Mean Prediction/Target Variable'

    def __init__(self,vis_mean_AUD=None,model_name=None,target=None,eval_fi=False,vis_box=False,
                 vis_compare_mean_AUD=False,vis_compare_bar=False,top_scat=50,top_bar=None,
                 option=False):
        self.vis_mean_AUD = vis_mean_AUD # 'error' / 'value' selector for the suburb-mean chart (None = off)
        self.model_name = model_name     # prediction column name; '<model_name>_error' is its error column
        self.target = target             # target variable [str]
        # Every __init__ argument is stored under its own name; eval_fi used to
        # be dropped, which breaks sklearn's get_params()/clone()/repr.
        self.eval_fi = eval_fi
        self.vis_box = vis_box # boxplots for univariate analysis
        self.vis_compare_mean_AUD = vis_compare_mean_AUD
        self.top_scat = top_scat # flag only the top n suburbs for the scatter matrix
        self.top_bar = top_bar   # show only top cases for bar sort
        self.vis_compare_bar = vis_compare_bar # compare two bar plots (general)
        self.option = option

    @staticmethod
    def corrMat2(df,target='demand',figsize=(9,0.5),ret_id=False):
        """Correlation of every numeric feature with ``target``.

        Returns the one-row correlation frame when ``ret_id`` is truthy;
        otherwise draws it as a heatmap and returns ``None``.
        """
        # Numeric columns only, and select the target row by label: a boolean
        # mask built from df.columns does not line up with the correlation
        # matrix when df holds non-numeric columns.
        corr_mat = df.select_dtypes(include=['number','bool']).corr().round(2)
        corr = corr_mat.loc[[target]].copy()

        if ret_id:
            return corr

        f, ax = plt.subplots(figsize=figsize)
        # `cmap` is the module-level diverging palette.
        sns.heatmap(corr,vmin=-0.3,vmax=0.3,center=0,
                    cmap=cmap,square=False,lw=2,annot=True,cbar=False,ax=ax)
        plt.title(f'Feature Correlation to {target}')

    def fit(self,X=None,y=None):
        """No-op fit. ``X``/``y`` are optional so both ``fit()`` and ``fit(X, y)`` work."""
        return self

    def _suburb_means(self,frame):
        """Mean of the numeric columns per suburb, sorted by model error (largest first)."""
        # numeric_only=True: pandas >= 2.0 raises on object columns in mean().
        return (frame.groupby(['suburb'])
                     .mean(numeric_only=True)
                     .sort_values(by=self.model_name+'_error',ascending=False))

    def _plot_suburb_means(self,X):
        """Bar chart of suburb means; returns the full table with a top-n flag column."""
        err_col = self.model_name+'_error'
        if self.vis_mean_AUD == 'error':
            y_cols, title = [err_col], self._ERROR_TITLE
        elif self.vis_mean_AUD == 'value':
            y_cols, title = [self.model_name,self.target], self._VALUE_TITLE
        else:
            # Previously any other value crashed later with a NameError on `fig`.
            raise ValueError("vis_mean_AUD must be 'error' or 'value', got %r" % (self.vis_mean_AUD,))

        tdfx = self._suburb_means(X)
        dfx = tdfx if self.top_bar is None else tdfx.iloc[:self.top_bar]

        fig = px.bar(dfx,x=dfx.index, y=y_cols,template='plotly_white',height=400)
        fig.update_layout(barmode="group",title=title,showlegend=True)
        fig.show() # stack/overlay/group

        # Flag the top_scat highest-error suburbs (used for the scatter matrix import).
        # Positional indexing is required: the index holds suburb names, so an
        # integer .loc slice raises a TypeError.
        tdfx['group_id2'] = 0
        tdfx.iloc[:self.top_scat, tdfx.columns.get_loc('group_id2')] = 1
        return tdfx

    def _plot_error_comparison(self,series_a,series_b):
        """Grouped bar chart of two error series, labelled 'A' and 'B'."""
        dfx_all = pd.concat([series_a.rename('A'),series_b.rename('B')],axis=1)
        fig = px.bar(dfx_all,x=dfx_all.index, y=['A','B'],template='plotly_white',height=400)
        fig.update_layout(barmode="group",title=self._ERROR_TITLE,showlegend=True)
        fig.show()

    @staticmethod
    def _print_outlier_share(lX):
        """Print, per column, the percentage of rows outside 1.5 * IQR."""
        n_obs = np.shape(lX)[0]
        for k, v in lX.items():
            q1 = v.quantile(0.25); q3 = v.quantile(0.75); irq = q3 - q1
            v_col = v[(v <= q1 - 1.5 * irq) | (v >= q3 + 1.5 * irq)]
            perc = np.shape(v_col)[0] * 100.0 / n_obs if n_obs else 0.0
            print("Column %s outliers = %.2f%%" % (k, perc))

    # X -> Numerical (feature matrix + target variable)
    def transform(self,X):
        """Produce the visualisations selected in the constructor.

        Parameters
        ----------
        X : pandas.DataFrame or list of two pandas.DataFrame
            A single frame for most modes; a list ``[A, B]`` for the two
            comparison modes.

        Returns
        -------
        pandas.DataFrame or None
            The suburb-mean table (with a ``group_id2`` flag column) when
            ``vis_mean_AUD`` is set, otherwise ``None``.
        """
        # Strings are compared with ==, not `is` (identity only works by accident).
        if self.option == 'histogram':
            num_df,_ = transformer(select_dtype=True).transform(X=X)
            num_df.hist(bins=60, figsize=(20,15));plt.show()

        # Suburb based mean prediction / error (returns early, as before).
        if self.vis_mean_AUD is not None:
            return self._plot_suburb_means(X)

        # Compare suburb-mean errors of two frames.
        if self.vis_compare_mean_AUD:
            err_col = self.model_name+'_error'
            self._plot_error_comparison(self._suburb_means(X[0])[err_col],
                                        self._suburb_means(X[1])[err_col])

        # Compare two raw error columns (general). Reads the columns directly:
        # the old code referenced dfx1/dfx2, which only exist in the branch
        # above, and renamed the caller's columns in place.
        if self.vis_compare_bar:
            err_col = self.model_name+'_error'
            self._plot_error_comparison(X[0][err_col],X[1][err_col])

        # Univariate boxplots for data distribution.
        if self.vis_box:
            lX,_ = transformer(select_dtype=True).transform(X=X)
            # At least the original 3 rows; more when there are over 15 columns.
            n_rows = max(3, -(-lX.shape[1] // 5))
            fig,axs = plt.subplots(ncols=5,nrows=n_rows,figsize=(14,4))
            axs = axs.flatten()
            flierprops = dict(marker='o',mfc='k',ls='none',mec='k')
            for index,k in enumerate(lX.columns):
                sns.boxplot(x=k,data=lX,orient='h',flierprops=flierprops,
                            ax=axs[index],width=0.5)
            plt.tight_layout()

        if self.option == 'outliers':
            lX,_ = transformer(select_dtype=True).transform(X=X)
            self._print_outlier_share(lX)


![](https://images-wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/8cc1eeaa-4046-4c4a-ae93-93d656f68688/deogdrn-5d90efc3-ff4d-4793-9978-361a212b41f3.jpg?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1cm46YXBwOjdlMGQxODg5ODIyNjQzNzNhNWYwZDQxNWVhMGQyNmUwIiwiaXNzIjoidXJuOmFwcDo3ZTBkMTg4OTgyMjY0MzczYTVmMGQ0MTVlYTBkMjZlMCIsIm9iaiI6W1t7InBhdGgiOiJcL2ZcLzhjYzFlZWFhLTQwNDYtNGM0YS1hZTkzLTkzZDY1NmY2ODY4OFwvZGVvZ2Rybi01ZDkwZWZjMy1mZjRkLTQ3OTMtOTk3OC0zNjFhMjEyYjQxZjMuanBnIn1dXSwiYXVkIjpbInVybjpzZXJ2aWNlOmZpbGUuZG93bmxvYWQiXX0.XjawmWiLz0CEYyx7uo_hqqrYXkB4qg7a3NamnjT4h2c)
Photography of <b>Perth</b> by [@harrycunningham](https://unsplash.com/photos/pNN9i2tR_8w)

<div style="color:white;
       display:fill;
       border-radius:5px;
       background-color:#82409C;
       font-size:220%;
       font-family:Nexa;
       letter-spacing:0.5px">
    <p style="padding: 20px;
          color:white;">
        <b>1 |</b> INTRODUCTION
    </p>
</div>

I've decided to split the notebook into two parts __(I) EDA notebook (this)__ & __(II) ML models to predict price.__

The aim of this notebook is to build some models that can predict [Perth](https://www.australia.com/en/places/perth-and-surrounds/guide-to-perth.html) (located in Western Australia) housing prices based on a set of scraped features made available in the [Perth Housing Dataset](https://www.kaggle.com/syuzai/perth-house-prices). 

#### <b><span style='color:#5D2ECC'>PART I - EDA</span></b>

Having learned how to use <b>Plotly</b> for geographic data analysis, thanks to a great article by Dr. Halupka, found on [towardsdatascience](https://towardsdatascience.com/how-to-create-maps-in-plotly-with-non-us-locations-ca974c3bc997), we'll take a look at suburb-based data as well. Let's see what we can learn about the dataset, whilst we try to reach our goal of predicting the target variable <code>price</code>.

#### <b><span style='color:#5D2ECC'>PART II - Prediction Models</span></b>

- Let's also use our own sklearn-compatible class: a simplistic <b>XGBoost</b>-style model (a tree ensemble approach, one of the more powerful model families) is used, and it might be interesting to see how well it performs compared to the [XGBoost Library](https://xgboost.ai) as well.
- The follow-up notebook can be found at [Models | Perth Housing Price Prediction](https://www.kaggle.com/shtrausslearning/models-perth-housing-price-prediction/)

### <b><span style='color:#5D2ECC'>1.1</span> | Perth Housing Dataset</b>

As far as the [Perth Housing Dataset](https://www.kaggle.com/syuzai/perth-house-prices) goes, the current dataset is relatively extensive from an informative point of view; we would probably benefit from some form of <b>geotagging</b>, especially if we had <code>address</code> coordinates, however we will limit ourselves to <b>suburb</b> locations only, which will be extracted from a separate dataset.

In [None]:
# Load the raw listings, lower-case every column name, drop the two coordinate
# columns and restore the capitalised spelling 'CBD_dist' used by the rest of
# the notebook.
df_perth0 = (
    pd.read_csv('/kaggle/input/perth-house-prices/all_perth_310121.csv')
      .rename(columns=str.lower)
      .drop(columns=['longitude','latitude'])
      .rename(columns={'cbd_dist':'CBD_dist'})
)
df_perth0.columns

As far as these features are concerned, they are quite self explanatory:
***
- <code>address</code> : Physical address of the property ( we will set to index )
- <code>suburb</code> : Specific locality in Perth; a list of all Perth suburb can be found [here](https://www.homely.com.au/find-suburb-by-region/perth-greater-western-australia)
- <code>price</code> : Price at which a property was sold (AUD)
- <code>bedrooms</code> : Number of bedrooms
- <code>bathrooms</code> : Number of bathrooms
- <code>garage</code> : Number of garage places
- <code>land_area</code> : Total land area (m^2)
- <code>floor_area</code> : Internal floor area (m^2)
- <code>build_year</code> : Year in which the property was built
- <code>CBD_dist</code> : Distance from the centre of Perth (m)
- <code>nearest_stn</code> : The nearest public transport station from the property
- <code>nearest_stn_dist</code> : The nearest station distance (m)
- <code>date_sold</code> : Month & year in which the property was sold

### <b><span style='color:#5D2ECC'>1.2</span> | Additional Dataset containing Latitude & Longitude</b>

- Aside from <code>CBD_dist</code> (distance to some point in the centre of Perth), we don't have other geographical information; two identical <code>CBD_dist</code> values don't necessarily mean they are located near each other, which is why even rough estimates of GPS locations can be beneficial to include in order to improve our models.
- I've loaded some data which contains approximate GPS coordinates for each suburb (averaged coordinates within a suburb); ideally, per-property coordinates would have been more useful.
- You can download the dataset containing [Australian Postcodes](https://www.matthewproctor.com/australian_postcodes) & create your own, the <code>long,lat</code> values can be extracted based on two conditions <code>state == WA</code> & corresponding <code>suburbs</code>, which are included in column <code>locality</code>.  I don't own the dataset, so I will not upload it, the code to get the coordinates is included below.

In [None]:
# Australian postcode table. In the visible part of the notebook it is only
# referenced by the commented-out get_locs() cell below.
df0 = pd.read_csv('/kaggle/input/auspostGPS/australian_postcodes.csv')
df0.head()

In [None]:
#  # Get the latitude, logitude from the secondary dataset
# def get_locs(ldf,city): 
    
#     # tdf : all related GPS data
#     tdf = ldf[(ldf['locality'] == city.upper()) & (ldf['state'] == 'WA')] 
#     if(tdf.shape[0]>1):
#         llong = tdf['long'].mean(); llat = tdf['lat'].mean()
#         return llong, llat 
#     elif(tdf.shape[0]==1):
#         llong = tdf.long.values; llat = tdf.lat.values
#         return llong, llat 
#     else: # if city is not in dataset['City']
#         print('City Name Not Found')
#         return 0,_

# df_perth0['long'] = -777; df_perth0['lat'] = -777; df_perth0['pass'] = -1  
# for index, row in df_perth0.iterrows():
#     name = df_perth0.loc[index,'suburb'].upper() # name of the surb CAPS    
#     lng,lat = get_locs(df0,name)
#     if(abs(lng)>0):
#         df_perth0.loc[index,'long'] = lng
#         df_perth0.loc[index,'lat'] = lat 
#         df_perth0.loc[index,'pass'] = 1
        
# df_perth0[['long','lat']].to_csv('PHP_GPS2.csv')

In [None]:
# Pre-computed long/lat columns; presumably the file written by the
# commented-out cell above (it saves 'PHP_GPS2.csv') — TODO confirm.
df_gps = pd.read_csv('/kaggle/input/php-gps2/PHP_GPS2.csv')
df_gps.columns

In [None]:
# Column-wise concat: pandas aligns the two frames on their index labels, so
# this presumably relies on df_gps having one row per df_perth0 row, in the
# same order and with the same default index — TODO confirm.
df_perth = pd.concat([df_perth0,df_gps[['long','lat']]],axis=1)
df_perth.head()

In [None]:
df_perth.info()

Let's split the data into numerical & categorical/ordinal variables.

### <b><span style='color:#5D2ECC'>1.3</span> | Data Assembly & Cleaning</b>

The scraped data needs quite a bit of cleaning; it contains occasional errors due to incorrect data, as well as standard NaN missing data. Let's do some of the more obvious data cleaning here, dealing with <b>missing data</b> & <b>repeated addresses.</b>

In [None]:
# Data cleaning: some addresses have multiple entries, so keep one row per
# address, then move 'address' out of the columns and into the index (it is an
# identifier rather than a useful feature).
df_perth = (
    df_perth
      .drop_duplicates(subset=['address'])
      .set_index('address')
)

In [None]:
''' Remove Missing Data '''
# This call only PLOTS the percentage of NaN values per column (show_nan mode
# returns None); nothing is removed from df_perth here.
transformer(show_nan=True,figsize=(8,3),title='Feature (NaN) %').transform(X=df_perth)

In [None]:
transformer(show_counts=True,title='Garage Value_Counts',figsize=(13,2)).transform(X=df_perth['garage'])

We have two features with missing data, <code>garage</code> probably makes sense to set to zero, whilst <code>build_year</code> is a little more tricky. Ideally it would be best to obtain this data & not try to predict it using imputation, let's drop it here.

In [None]:
# Shape before cleaning; drop_nan mode below prints the shape after dropping.
print(df_perth.shape)
df_perth['garage'] = df_perth['garage'].fillna(0)  # missing garage count is filled with 0
df_perth = transformer(drop_nan=True).transform(X=df_perth)  # drop every row that still has a NaN in any column

For a start, we have the following features as we start some <b>EDA</b>, more features would be useful to add.

In [None]:
df_perth.columns

In [None]:
# Split the features by dtype: df_num gets the float64/int64 columns, df_cat
# gets every other column (select_dtype mode of the transformer helper).
df_num,df_cat = transformer(select_dtype=True).transform(X=df_perth)

### <b><span style='color:#5D2ECC'>1.4</span> | Categorical & Ordinal Features</b>

We have a few <b>categorical features</b> & we should decide how we will go about them; we can use it during <b>EDA</b>.
- Let's split the feature <code>date_sold</code> into two features; <code>sold_month</code> & <code>sold_year</code>.
- <code>suburb</code>, <code>nearest_stn</code> are quite interesting features for which we can use <b>mapping</b>; we might consider introducing some form of scoring system & attempt to influence the model accuracy this way, mapping will be introduced in this <b>models</b> section.

In [None]:
df_cat.columns

In [None]:
# Split 'date_sold' on its first '-' into two numeric features. The two parts
# are stored as sold_month and sold_year, in that order.
# The guard makes the cell safe to re-run: once 'date_sold' has been consumed
# there is nothing left to split.
if 'date_sold' in df_cat.columns:
    # `n` must be passed by keyword: positional arguments after the separator
    # are no longer accepted by Series.str.split in pandas >= 2.0.
    df_num[['sold_month', 'sold_year']] = (
        df_cat['date_sold'].str.split('-', n=1, expand=True).astype('float64')
    )
    df_cat = df_cat.drop(['date_sold'],axis=1)

In [None]:
# Frame used for EDA: the numeric features (now including sold_month and
# sold_year) side by side with the remaining non-numeric features.
df_EDA = pd.concat([df_num,df_cat],axis=1)

In [None]:
df_EDA.info()

In [None]:
def px_stats(df, n_cols=4, to_plot='box',height=800):
    """Plot one Plotly subplot per numeric column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; only its float64/int64 columns are plotted.
    n_cols : int, default 4
        Number of subplot columns in the grid.
    to_plot : str, default 'box'
        ``'histogram'`` draws ``go.Histogram`` traces; any other value is taken
        as the name of a ``plotly.express`` function (e.g. ``'box'``) and the
        first trace of the figure it returns is used.
    height : int, default 800
        Total figure height in pixels.

    Raises
    ------
    ValueError
        If ``df`` has no float64/int64 column to plot.
    """
    ldf,_ = transformer(select_dtype=True).transform(X=df)
    numeric_cols = ldf.columns
    if len(numeric_cols) == 0:
        raise ValueError('px_stats: no float64/int64 columns to plot')

    n_rows = -(-len(numeric_cols) // n_cols)  # ceiling division without importing math
    fig = make_subplots(rows=n_rows, cols=n_cols,subplot_titles=numeric_cols.to_list())

    for pos, col in enumerate(numeric_cols):
        # == instead of `is`: identity comparison of strings is unreliable.
        if to_plot == 'histogram':
            trace = go.Histogram(x=ldf[col],showlegend=False)
        else:
            trace = getattr(px, to_plot)(ldf[col],x=ldf[col])["data"][0]

        # Fill the grid row by row; plotly rows/cols are 1-based.
        grid_row, grid_col = divmod(pos, n_cols)
        fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

    fig.update_layout(template='plotly_white',
                      margin={"r":0,"t":60,"l":0,"b":0},
                      height=height)
    fig.show()

<div style="color:white;
       display:fill;
       border-radius:5px;
       background-color:#82409C;
       font-size:220%;
       font-family:Nexa;
       letter-spacing:0.5px">
    <p style="padding: 20px;
          color:white;">
        <b>2 |</b> EXPLORATORY DATA ANALYSIS
    </p>
</div>

### <b><span style='color:#5D2ECC'>2.1</span> | Univariate Analyses</b>

#### <b><span style='color:#5D2ECC'>EDA - Feature Distributions</span></b>

The number of <code>bedrooms</code> & <code>bathrooms</code> are most certainly among the most important features of a property, alongside the number of <code>garages</code> (car slots). Some observations:
- We can note how uncommon __1 bedroom apartment__ properties are in Perth (at least according to this dataset), the most common being a __4 bedroom property__, typically having __1 or 2 bathrooms__ & a __garage__ with __two car slots__.
- We can see a very rapid __increase in property sales__ in the last 6 years or so.  I'm guessing real estate agents in Perth are kept busy. 
- A steady increase in properties built since the 1950s can also be noted.
- Some of the properties have a very large number of garages slots, so it could make sense to just remove them but lets just keep them anyway.

In [None]:
px_stats(df_EDA, to_plot='histogram')

#### <b><span style='color:#5D2ECC'>EDA - Data Distributions Boxplots</span></b>
Let's take a look at the distribution of our feature data again, this time using boxplots; <b>boxplots</b> give us a good indication of how our data is distributed & of the outliers in the data. Whilst it is often stated that <b>tree-based</b> models, such as RF, are not sensitive to outliers, as in this article on [medium](https://arsrinevetha.medium.com/ml-algorithms-sensitivity-towards-outliers-f3862a13c94d), it might be worth taking them into account, as there are good references that state otherwise, as shown on [stackexchange](https://stats.stackexchange.com/questions/187200/how-are-random-forests-not-sensitive-to-outliers). So we'll have to look into the effect of these outliers as well, when getting to the model generation stage.

In [None]:
px_stats(df_EDA, to_plot='box',height=400)

#### <b><span style='color:#5D2ECC'>EDA - Suburb Sorted Price Ranges</span></b>
- We can visualise the statistics data of <code>Price</code> in different <code>suburbs</code> (sorted alphabetically) if we are interested, in a particular suburb it is quicker to find it by name when required.
- We can see that there are quite a number of property <code>price</code> cases which exceed the <b>q1</b> & <b>q3</b> thresholds, even in different suburbs, which is an indicator that we have a number of 'outlier' cases. 
- Ideally we should be looking into them, as they are likely going to complicate the fitting process for our models, especially if our <b>features</b> aren't chosen very well.

In [None]:
# Box plot of price per suburb, with the rows pre-sorted by suburb name and a
# range slider on the x axis for scrolling through the suburbs.
ldf = df_EDA.sort_values(by='suburb')
fig = px.box(
    ldf,
    x="suburb",
    y="price",
    template='plotly_white',
    title="Suburb Based Price Range (Alphabetically Arranged)",
    height=600,
)
fig.update_traces(marker=dict(size=4))
fig.update_layout(
    yaxis={'categoryorder':'total ascending'},
    margin=dict(l=80, r=80, t=100, b=80),
    xaxis=dict(rangeslider=dict(visible=True)),
)
fig.show()

In [None]:
# Per-suburb aggregates. The median is restricted to numeric columns:
# df_EDA also carries the non-numeric columns from df_cat, and pandas >= 2.0
# raises on those in median() instead of silently dropping them.
# min()/max() are defined for strings too, so they are left as-is.
df_submed = df_EDA.groupby(['suburb']).median(numeric_only=True)
df_submin = df_EDA.groupby(['suburb']).min()
df_submax = df_EDA.groupby(['suburb']).max()

In [None]:
df_submed.head()

#### <b><span style='color:#5D2ECC'>EDA - Suburb Median Property Prices sorted by CBD_dist</span></b>
Looking at the top 100 suburbs that are on average closest to the CBD, <code>West Perth</code>, <code>Glendalough</code> & <code>Nollamara</code> are amongst the cheaper alternatives, but we can also see a trend of higher property <code>price</code> the closer to the city centre we get.

In [None]:
# Median price per suburb, bars ordered by median distance to the CBD
# (closest first); the distance is printed on each bar where it fits.
dfx = df_submed.sort_values(by=['CBD_dist'],ascending=True)
fig = px.bar(
    dfx,
    x=dfx.index,
    y='price',
    template='plotly_white',
    text='CBD_dist',
    height=300,
)
fig.update_layout(
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    margin={"r":0,"t":60,"l":0,"b":0},
)
fig.show()

#### <b><span style='color:#5D2ECC'>EDA - Suburb Median Property Prices sorted by nearest_stn_dist</span></b>
We can look at which suburbs would be cheapest, if you wanted to live near public transport, <code>Armadale</code>, <code>Parmelia</code>, <code>Warnbro</code> among the cheaper suburbs overall.

In [None]:
# Median price per suburb, bars ordered by median distance to the nearest
# station (closest first).
# numeric_only=True: df_EDA carries non-numeric columns, on which median()
# raises in pandas >= 2.0.
dfx = (df_EDA.groupby(['suburb'])
             .median(numeric_only=True)
             .sort_values(by=['nearest_stn_dist'],ascending=True))
fig = px.bar(dfx, x=dfx.index, y='price',template='plotly_white',hover_data=['nearest_stn_dist','price'],height=300)
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_layout(margin={"r":0,"t":60,"l":0,"b":0});fig.show()

#### <b><span style='color:#5D2ECC'>EDA - Property Build Year Min/Med/Max Property Prices</span></b>

- Despite the market <code>median house price</code> being not exactly very low, hovering in the region of 400-500k in the last 60 years or so, there has tended to be properties that are "relatively affordable" built every year in the range 100-300k range, which is a positive thing, given the number people near the poverty line will only likely to increase as stated in the [Canberra Times](https://www.canberratimes.com.au/story/6641748/one-in-eight-australians-living-in-poverty/?cs=14231), (which likely is an indicator that more and more people will not be able to afford housing in general & is backed by an article from [Melbourne Uni](https://fbe.unimelb.edu.au/exchange/edition1/house-prices-outpacing-income-growth) which states that many would-be first time buyers are priced out of the market).  
- It is also interesting to note that nothing was built in 1944 (at least according to this data) & there tends to be a <b>common trend of the median price reducing</b> the newer the property is in the dataset; a similar trend can be observed for the <b>minimum price</b>.

In [None]:
# Price statistics per build year: median, lowest and highest sale price.
# numeric_only=True on the median: df_EDA carries non-numeric columns, on
# which median() raises in pandas >= 2.0 (min()/max() accept strings).
dfx = df_EDA.groupby(['build_year']).median(numeric_only=True).sort_values(by=['price'],ascending=True)
dfx2 = df_EDA.groupby(['build_year']).min().sort_values(by=['price'],ascending=True)
dfx3 = df_EDA.groupby(['build_year']).max().sort_values(by=['price'],ascending=True)

# Overlay mode: the tallest bars (max) are added first so that the median and
# minimum bars are drawn on top of them.
fig = go.Figure()
fig.add_trace(go.Bar(x=dfx3.index, y=dfx3['price'],name='max price',marker_color='rgb(27,38,49)'))
fig.add_trace(go.Bar(x=dfx.index, y=dfx['price'],name='median price',marker=dict(color='#566573')))
fig.add_trace(go.Bar(x=dfx2.index, y=dfx2['price'],name='lowest price',marker_color='#CACFD2'))
fig.update_layout(barmode='overlay',template='plotly_white',height=300,title='Property Build Year Grouped House Prices')
fig.update_layout(margin={"r":0,"t":60,"l":0,"b":0});fig.show()

#### <b><span style='color:#5D2ECC'>EDA - Feature Importance</span></b>

- We can use various <b>feature importance</b> approaches, including (<b>SHAP</b> values (with <b>CatBoost</b>), <b>RandomForest()</b> Feature Importance, <b>XGB</b> Feature Importance & <b>KBest</b> ) to get a quick idea of which features are quite influential in our models.
- Interestingly enough, other <b>feature importance</b> methods also evaluated garage as a less impactful feature; we can also see that <code>long</code> (longitude coordinates) is quite impactful in the <b>XGB</b> & <b>CAT</b> models.

In [None]:
# Plot Relative Feature Importance
def feature_importance(tldf,feature='price',n_est=500):
    """Plot a scaled comparison of feature-importance scores for `feature`.

    Five scores are computed for every remaining column of the transformed
    frame: absolute linear correlation with the target, mean absolute SHAP
    value of a CatBoost regressor, `feature_importances_` of a RandomForest
    and of an XGBoost regressor, and the `f_regression` score from SelectKBest.
    Each score column is min-max scaled independently and the result is shown
    as a plotly bar chart.

    Parameters
    ----------
    tldf : DataFrame
        Input frame; it is passed through `transformer(select_dtype=True)`.
        NOTE(review): `transformer` is defined outside this section - its first
        return value is assumed to be the numeric frame containing `feature`.
    feature : str, default 'price'
        Name of the target column.
    n_est : int, default 500
        Number of estimators for the CatBoost, RandomForest and XGB models.

    Returns
    -------
    None
        The figure is displayed with `fig.show()`.

    Raises
    ------
    KeyError
        If `feature` is not a column of the transformed frame.

    Notes
    -----
    Sets `pd.options.plotting.backend` to "plotly" for the rest of the session.
    """

    # X : Numerical / Object DataFrame (the second return value is not needed here)
    ldf, _ = transformer(select_dtype=True).transform(X=tldf)
    if feature not in ldf.columns:
        raise KeyError(f"target column '{feature}' not found in the transformed frame")

    # Split into feature matrix & target variable
    X = ldf.drop(columns=[feature])
    y = ldf[feature].copy()

#   CORRELATION
    imp = corrMat2(ldf,feature,figsize=(15,0.5),ret_id=True)
    del imp[feature]                       # drop the target's self-correlation
    s1 = imp.squeeze(axis=0).abs()
    s1.name = 'Correlation'

#   SHAP
    cat_model = CatBoostRegressor(silent=True,n_estimators=n_est).fit(X,y)
    shap_values = shap.TreeExplainer(cat_model).shap_values(X)
    s2 = pd.Series(np.abs(shap_values).mean(axis=0),index=X.columns,name='Cat_SHAP')

#   RANDOMFOREST
    rf_model = RandomForestRegressor(n_est,random_state=0, n_jobs=-1)
    rf_model.fit(X,y)
    # Sorted as before; pd.concat below re-aligns every score on the feature name
    s3 = pd.Series(rf_model.feature_importances_,index=X.columns,
                   name='RandForest').sort_values(ascending=False)

#   XGB
    xgb_model = XGBRegressor(n_estimators=n_est,learning_rate=0.5,verbosity = 0)
    xgb_model.fit(X,y)
    s4 = pd.Series(xgb_model.feature_importances_,index=X.columns,name='XGB')

#   KBEST
    kbest = SelectKBest(k=X.shape[1], score_func=f_regression).fit(X,y)
    s5 = pd.Series(kbest.scores_,index=X.columns,name='K_best')

    # Combine Scores (the former `df0.rename(...)` call was removed: its result was
    # discarded and no column named 'target' exists, so it never had any effect)
    df0 = pd.concat([s1,s2,s3,s4,s5],axis=1)

    # Scale every score column independently so the methods are comparable
    x_scaled = preprocessing.MinMaxScaler().fit_transform(df0.values)
    df = pd.DataFrame(x_scaled,index=df0.index,columns=df0.columns)
    df = df.rename_axis('Feature Importance via', axis=1)
    df = df.rename_axis('Feature', axis=0)

    # Global side effect: switches the pandas plotting backend for the whole session
    pd.options.plotting.backend = "plotly"
    fig = df.plot(kind='bar',title='Scaled Feature Importance')
    fig.update_layout(template='plotly_white');fig.update_layout(margin={"r":0,"t":60,"l":0,"b":0});fig.show()

In [None]:
# Scaled feature-importance comparison on the full EDA frame (target defaults to 'price')
feature_importance(df_EDA)

### <b><span style='color:#5D2ECC'>2.2</span> | Bivariate Analyses</b>

#### <b><span style='color:#5D2ECC'>EDA - Pearson Correlation</span></b>
<b>Linear correlation</b> can be useful to understand how our variables change relative to one another in the input dataset. Most features relate to price in an expected manner; perhaps the most surprising of the lot was <code>land_area</code> having a much smaller value than expected, in comparison to the other features.

In [None]:
# Plot the rounded Pearson correlation matrix. Numeric/bool columns are selected explicitly:
# since pandas 2.0 DataFrame.corr() no longer drops text columns (e.g. `suburb`) silently
# and would raise instead.
plotlyoff_corr(df_EDA.select_dtypes(include=['number','bool']).corr().round(2))

#### <b><span style='color:#5D2ECC'>EDA - Area Features Relation to Price</span></b>
<code>Floor_area</code> & <code>Land_area</code> have a relatively high <b>feature importance</b> as we saw previously, so let's look at their bivariate relations to <code>price</code>.

In [None]:
# Price against each area feature: one panel per feature on a shared price axis
titles = ['Floor Area','Land Area']
fig = make_subplots(rows=1, cols=2, shared_yaxes=True,
                    subplot_titles=titles, horizontal_spacing=0.05)

for panel_no, (panel_name, area_col) in enumerate(zip(titles, ['floor_area', 'land_area']), start=1):
    fig.add_trace(
        go.Scattergl(x=df_EDA[area_col].values, y=df_EDA['price'].values,
                     mode='markers', name=panel_name, text=df_EDA.index, opacity=0.1),
        row=1, col=panel_no,
    )

fig.update_traces(marker=dict(size=4, line=dict(width=1.2, color='black')))
fig.update_layout(template='plotly_white', title='Area Feature Relation to Property Sold Price',
                  height=500, showlegend=False)
fig.update_layout(margin={"r":0,"t":60,"l":0,"b":0})
fig.show()

In [None]:
# Show the rows for four individual addresses (looked up by index label)
addresses_to_check = ['60 Semerwater Crescent', '12 Gailey Way',
                      '3 Longstaff Avenue', '11 Semerwater Crescent']
df_EDA[df_EDA.index.isin(addresses_to_check)]

- We can see a very big difference between the two feature relations, when it comes to <code>price</code> value. 
- <code>floor_area</code> relation to <code>price</code> tends to be quite straightforward, whilst <code>land area</code> tends to be quite spread out.
- If we actually zoom into the <code>land_area</code> relation to <code>price</code>, we can note some lines at 10k, 20k & 40k; the relation, as the <b>correlation</b> value suggests, is quite spread out and seemingly inconsistent, apart from these noted patterns.
- Both relations contain a lot of outlier properties. 

A good rule of thumb is to <b>verify the data</b> if possible; some things that can be noted:
- 60 Semerwater Crescent which has a value of 365k is likely an error according to [realestate](https://www.realestate.com.au/property/60-semerwater-cres-aveley-wa-6069?pid=p4ep-pdp|sold-pdp:property-history-cta#timeline) 
- Similarly; 12 Gailey Way on [reiwa](https://reiwa.com.au/12-gailey-way-aveley-4382054/)
- Similarly; 3 Longstaff Avenue on [reiwa](https://reiwa.com.au/3-longstaff-avenue-alkimos-3732905/)
- 1264 Chittering Road (554,500m^2) on [reiwa](https://reiwa.com.au/1264-chittering-road-bullsbrook-wa-3781179/) is probably an indicator anything below would probably be genuine, even though different sources do vary the value a little.

Of course, many realistic datasets require a thorough investigation, which is unfortunately quite a laborious task, so I'll limit my efforts here.

In [None]:
# Manually corrected land_area values (m^2), keyed by address
land_area_fixes = {
    '60 Semerwater Crescent': 375,
    '12 Gailey Way': 375,
    '3 Longstaff Avenue': 574,
    '11 Semerwater Crescent': 375,
}
for address, area_m2 in land_area_fixes.items():
    df_EDA.loc[address, 'land_area'] = area_m2

- <code>land_area</code> is one of the features that contains quite a large number of outliers. For smaller <code>land_area</code> properties, this feature might not be the most impactful since they are all very similar; however, the land area value does increase quite exponentially, thus it should be impactful only for a specific subset of properties. Let's take a look at two subset groups.
- Somewhere after the region of 1100-1300 (m^2), the values start to very rapidly increase, so lets make a division somewhere there.

In [None]:
# Split at 1200 m^2: lots strictly below the threshold vs. lots at or above it
is_small_lot = df_EDA['land_area'] < 1200
is_big_lot = df_EDA['land_area'] >= 1200
df_EDA_smlland = df_EDA.loc[is_small_lot].copy()
df_EDA_bigland = df_EDA.loc[is_big_lot].copy()

In [None]:
# Price against land_area for the two subsets. The second title now states the filter
# that actually defines df_EDA_bigland (land_area >= 1200, no upper bound); it used to
# read '> 1000 | < 30000', which matched neither the threshold nor any applied limit.
titles = ['Land_Area < 1200','Land_Area >= 1200']
fig = make_subplots(rows=1, cols=2,shared_yaxes=True,subplot_titles=titles,horizontal_spacing = 0.05)

fig.add_trace(go.Scattergl(y=df_EDA_smlland['price'],x=df_EDA_smlland['land_area'],mode='markers',name='Land Area',text=df_EDA_smlland.index,opacity=0.05),row=1, col=1)
fig.add_trace(go.Scattergl(y=df_EDA_bigland['price'],x=df_EDA_bigland['land_area'],mode='markers',name='Land Area',text=df_EDA_bigland.index,opacity=0.05),row=1, col=2)

fig.update_traces(marker=dict(size=4,line=dict(width=1.2,color='black')))
fig.update_layout(template='plotly_white',title='Land Area Subset Data',height=500,showlegend=False)
fig.update_layout(margin={"r":0,"t":60,"l":0,"b":0});fig.show()

- We can note that for smaller <code>land_area</code> properties and for larger ones, we have quite a different set of features that are more impactful.
- Whilst Perth may not be the most concentrated city when it comes to area values/population near the CBD (compared to other cities in general), there certainly is a difference between properties that are used only for building a house on & properties that allow one to build cottages, farms and the like. It may be beneficial to classify these properties, with the addition of a categorical feature that would define the property type.
- Let's also create models for these two property types separately in the models section.

In [None]:
# Feature importance for the subset with land_area < 1200
feature_importance(df_EDA_smlland)

In [None]:
# Feature importance for the subset with land_area >= 1200
feature_importance(df_EDA_bigland)

#### <b><span style='color:#5D2ECC'>EDA - Distance Feature Relation to Property Price</span></b>
- <code>CBD_dist</code> as we saw above, has a significant weight according to the <b>feature importance</b>, however <code>nearest_stn_dist</code> is slightly less impactful, let's view the scatter relation.
- Distance based features usually are very informative for models, let's take a look at two related features here (<code>CBD_dist</code> & <code>nearest_stn_dist</code>)
- Lowering the opacity of the scatter data, we can get a sense of the <b>general trend</b> of increasing price as the <code>CBD_dist</code> reduces, with an acute increase very close to the CBD.
- <code>nearest_stn_dist</code>, on the other hand, is slightly more scattered, showing no visible linear relation in the scatter data; however, there tends to be a two-direction scatter relation in addition to the central scatter region.

In [None]:
# Price against each distance feature: one panel per feature on a shared price axis
titles= ['CBD_dist','nearest_stn_dist']
fig = make_subplots(rows=1, cols=2, shared_yaxes=True,
                    subplot_titles=titles, horizontal_spacing=0.05)

for panel_no, dist_col in enumerate(titles, start=1):
    fig.add_trace(
        go.Scattergl(x=df_EDA[dist_col], y=df_EDA['price'], mode='markers',
                     name=dist_col, text=df_EDA.index, opacity=0.05),
        row=1, col=panel_no,
    )

fig.update_traces(marker=dict(size=3, line=dict(width=1.2, color='black')))
fig.update_layout(template='plotly_white', barmode='stack',
                  title='Distance Feature Relations to Price', height=500, showlegend=False)
fig.update_layout(margin={"r":0,"t":60,"l":0,"b":0})
fig.show()

In [None]:
import geopandas as gpd

def plot_geo(ldf,feature,title=None,lst_val=None):
    """Show a Mapbox choropleth of the per-suburb median of `feature`.

    Parameters
    ----------
    ldf : DataFrame
        Must contain a `suburb` column and the numeric column `feature`.
    feature : str
        Column whose suburb median is used as the colour value.
    title : str, optional
        Passed to the trace as `text`; it is not set as a figure title.
    lst_val : sequence of two numbers, optional
        `[zmin, zmax]` of the colour scale. Defaults to the min and max of
        the plotted medians.

    Returns
    -------
    None
        The figure is displayed with `fig.show()`.

    Notes
    -----
    The locality shapefile is read from disk on every call. The Mapbox token
    is taken from the `MAPBOX_ACCESSTOKEN` environment variable when set.
    """

    # Load Geometry File and drop the attribute columns that are not needed
    wa_gdf = gpd.read_file('/kaggle/input/wa-gda2020/WA_LOCALITY_POLYGON_SHP-GDA2020.shp')
    wa_gdf = wa_gdf.drop(['POSTCODE','PRIM_PCODE','LOCCL_CODE','DT_GAZETD','STATE_PID','DT_RETIRE','DT_CREATE','LOC_PID'],axis=1)

    wa_gdf.index = wa_gdf['NAME']
    # numeric_only=True: text columns can't be averaged and raise under pandas >= 2.0
    median_price = ldf.groupby(['suburb']).median(numeric_only=True)
    median_price.index = median_price.index.str.upper()  # suburb names are upper-cased to join on the NAME index
    df_merged = wa_gdf.join(median_price).dropna() # some perth suburbs don't have data & drop other WA region suburbs to speed up map load

    # Convert geometry to GeoJSON. GeoJSON / Mapbox use 2D WGS 84 lon/lat, i.e. EPSG:4326;
    # the previous code 4327 is a different (3D) WGS 84 entry - presumably a typo.
    df_merged = df_merged.to_crs(epsg=4326)
    lga_json = df_merged.__geo_interface__

    # Prefer a token supplied via the environment; the public token that was hard-coded
    # here is kept only as a fallback so the notebook still runs unchanged.
    # NOTE(review): consider removing the literal and rotating the token.
    MAPBOX_ACCESSTOKEN = os.environ.get(
        'MAPBOX_ACCESSTOKEN',
        'pk.eyJ1Ijoic2h0cmF1c3NhcnQiLCJhIjoiY2tqcDU2dW56MDVkNjJ6angydDF3NXVvbyJ9.nx2c5XzUH9MwIv4KcWVGLA')

    # Default colour range: full range of the plotted medians
    if(lst_val is None):
        lst_val = [df_merged[feature].min(),df_merged[feature].max()]

    # Set the data for the map
    data = go.Choroplethmapbox(geojson = lga_json,
                               locations = df_merged.index,    
                               z = df_merged[feature], 
                               text = title,
                               colorbar=dict(thickness=20, ticklen=3,outlinewidth=0),
                               marker_line_width=1, marker_opacity=0.8, colorscale="viridis",
                               zmin=lst_val[0], zmax=lst_val[1])

    layout = go.Layout(mapbox1 = dict(domain = {'x': [0, 1],'y': [0, 1]},center = dict(lat=-31.95, lon=115.8),
                       accesstoken = MAPBOX_ACCESSTOKEN,zoom = 9),
                       autosize=True,height=650)

    # Generate the map
    fig=go.Figure(data=data, layout=layout)
    fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
    fig.show()

### <b><span style='color:#5D2ECC'>2.3</span> | Geospatial Data Exploration (GDE)</b>

- In this dataset, we have each individual address as our index, but don't have the individual GPS coordinates. Let's look at <b>suburb</b> based data instead using <b>Choropleth Maps</b>; some examples of how to create them are shown in the notebook [Australian Geographic Data Plots](https://www.kaggle.com/shtrausslearning/australian-geographic-data-plots).
- We'll use the <b>General District Area</b> datafile for <b>Western Australia</b>, where Perth is located. The geographic data is available in this source [data.gov](https://data.gov.au/dataset/ds-dga-6a0ec945-c880-4882-8a81-4dbcb85e74e5/distribution/dist-dga-9fff5439-7af5-42f4-9102-42c4199c5c1c/details?q=).

#### <b><span style='color:#5D2ECC'>GDE - Entire Dataset</span></b>

Let's take a look at the suburb based median data: <code>price</code>, <code>floor_area</code>, <code>land_area</code>, <code>nearest_stn_dist</code> & <code>build_year</code>.

In [None]:
# One suburb-median choropleth per feature; `lst_val` pins the colour range where given,
# otherwise plot_geo falls back to the min/max of the plotted medians
geo_plot_specs = [
    dict(feature='price', title='Median House Price'),
    dict(feature='floor_area', title='Median Floor Area', lst_val=[0,250]),
    dict(feature='land_area', title='Median Land Area', lst_val=[0,1000]),
    dict(feature='build_year', title='Median Build Year'),
    dict(feature='nearest_stn_dist', title='Median Nearest Station Distance', lst_val=[0,10000]),
]
for geo_spec in geo_plot_specs:
    plot_geo(ldf=df_EDA, **geo_spec)

#### <b><span style='color:#5D2ECC'>GDE - Lower Price Properties : All built_year</span></b>

- This dataset includes around 4.5k properties that cost less than 350,000 AUD <b>at time of purchase</b>. This includes properties sold a while back as well, and is <b>not necessarily a reflection of "current market price values"</b>, which is likely to have changed over time. This is a significant issue that can affect the model accuracy, since some of our <b>price</b> data is "frozen in time", and <b>we need features that can correct for this on top of some more obvious features like floor_area</b>. 
- As an example, <code>10 Mardja Loop</code>, a 5 bedroom house, was apparently sold for 87k in 2003 (not too long after the suburb was officially formed); [realestate](https://www.realestate.com.au/property/10-mardja-loop-mardella-wa-6125) attempts to evaluate the property in the current market by comparing it to similarly sized houses & suburb based sales, i.e. somewhere in the region of perhaps 500k... So there would potentially be a significant shift in price, if the property was to be sold now.
- We now should have a clear indication that our model, if trained on the current set of data, will be predicting the property 'price' based on the evolution of sales data, in case that wasn't already clear. 

In [None]:
# Properties sold for less than 350,000; the cell output is (rows, columns)
is_low_price = df_EDA['price'] < 350000
df_EDA_low = df_EDA.loc[is_low_price]
df_EDA_low.shape

- We can see that a lot of housing in this area was sold before being built, which is a common way to buy more affordable housing, usually associated with early sales before price hikes, backed up by this [article](https://www.newhomesource.com/learn/why-buy-into-new-community-early/). It's also a nice way to increase the value of the property very early on if the construction projects are successful. We can see some smart owners got a sizable return on their investment within a short period of time; [18 Mardja Loop](https://www.domain.com.au/property-profile/18-mardja-loop-mardella-wa-6125), as an example. 
- We can see that if there is a need to build affordable housing, the cost associated with building quite a sizable house actually is not very high at all, as we can see some built in 2005. The question these days really is why would a contractor sell a house and not attempt to cash in, since the buyer most likely will be able to. Only at a government level is this issue tackleable (which it can be, if it really wanted to), as we can see the Australian Government does attempt to address home ownership rights in general; [reference](https://www.dss.gov.au/housing-support/programmes-services/housing).

In [None]:
# First few low-price rows for the suburb of Mardella
in_mardella = df_EDA_low['suburb'] == 'Mardella'
display(df_EDA_low.loc[in_mardella].head())

In [None]:
# Suburb-median price map restricted to the properties sold below 350,000
plot_geo(ldf=df_EDA_low,feature='price',title='Median House Price')

#### <b><span style='color:#5D2ECC'>GDE - Lower Price Properties : build_year >= 2015</span></b>
- Let's look at properties which were built after 2014 (over a period of three years here), it's actually quite interesting to know if low cost properties are actually available these days in Perth. We saw roughly 10 years earlier, it was quite possible, what about more recently?
- So, over a period of several years, 338 properties that are more or less affordable were built. Is that enough? Definitely an interesting question to explore ... 
- Most of them are located <b>north and south of Perth</b> & some further to the east of Perth & the Indian Ocean, tough nuggies to those wanting to live by the ocean in close proximity to central Perth. <code>Waikiki</code>, <code>Alkimos</code>, <code>Jindalee</code>, <code>Eglinton</code> are among some suburbs with relatively affordable property prices.

Just some interesting properties that caught my attention:
- [1A Forster Avenue](https://www.realestate.com.au/property/1a-forster-ave-lathlain-wa-6100), which was sold at a very big loss.
- [1 Eastfield Court](https://www.realestate.com.au/property/1-eastfield-ct-ferndale-wa-6148), which was originally built in 1997 and rebuilt in 2017, once again sold before being rebuilt, and not too far from the CBD.
- [101 Pine Crest Way](https://www.realestate.com.au/property/101-pine-crest-way-gnangara-wa-6077), which was sold much earlier, and perhaps only had a property built more recently.

In [None]:
# Low-price (< 350,000) properties restricted by build year; the cell output is (rows, columns).
# NOTE(review): the section heading and text say build_year >= 2015 ("built after 2014"),
# but this filter keeps build_year >= 2010 - confirm which threshold is intended.
df_EDA_low2 = df_EDA[(df_EDA['price']<350000) & (df_EDA['build_year'] >= 2010)]
df_EDA_low2.shape

In [None]:
# Westminster rows of the recent low-price subset, cheapest first
in_westminster = df_EDA_low2['suburb'] == 'Westminster'
df_EDA_low2.loc[in_westminster].sort_values(by='price')

In [None]:
# Suburb-median price map for the low-price subset filtered by build year (df_EDA_low2)
plot_geo(ldf=df_EDA_low2,feature='price',title='Median House Price')

#### <b><span style='color:#5D2ECC'>EDA - Presold Properties</span></b>
- We saw that in early 2000s, some properties were constantly being built and purchased at a very affordable price, as a result of <b>prepurchasing the property</b> before it was built ([Presold Homes](https://ibcbuilt.com/pre-sold-homes)) 
- Preselling, generally is a way that allows buyers to purchase properties at more reasonable prices, especially with early upfront payments.
- This practice was still noted to exist more recently as well. Let's take a look at a year by year basis, see how many properties were sold before they were built.
- We can note that <b>only 633 properties out of 30000 were presold</b>, which is not a lot. More importantly, they are probably acting like outliers, having been sold under "market value" prices.
- As the data shows, this kind of practice didn't really exist in Perth before 2000. 2013 & 2014 being the busiest years. 

In [None]:
# Work on a separate frame and flag the properties whose sale year precedes their build year
sold_before_built = df_EDA['sold_year'].astype('int') < df_EDA['build_year']
df_EDA_presold = df_EDA.assign(presold=sold_before_built)
df_EDA_presold['presold'].value_counts()

- We can see that after 2014, there was a sudden drop in presold properties. Our data also seems to be lacking properties built after 2016, despite containing sold data. 
- It might also be interesting to create separate models for <b>prebuilt</b> & non <b>prebuilt properties</b> on top of a general model, as they tend to be in a category of their own.

In [None]:
# Number of presold properties per sale year (summing the boolean flag counts the True rows).
# numeric_only=True keeps the aggregation to numeric/bool columns: under pandas >= 2.0 the
# default would also try to "sum" text columns such as `suburb` (string concatenation, or an
# error on mixed values); only `presold` is used below.
dfx = df_EDA_presold.groupby(['sold_year']).sum(numeric_only=True)

fig = go.Figure()
fig.add_trace(go.Bar(x=dfx.index, y=dfx['presold'],name='Presold Properties by Year'))
fig.update_layout(barmode='overlay',template='plotly_white',height=300,title='Presold Properties by Year')
fig.update_layout(margin={"r":0,"t":60,"l":0,"b":0});fig.show()

In [None]:
# Plot Scatter Matrix using Plotly Express
def scat_mat(ldf,dim=None,colour=None,hov_name=None,title=None):
    """Display a plotly-express scatter matrix of `ldf`.

    `dim` is forwarded as `dimensions`, `colour` as `color` and `hov_name`
    as `hover_name`; `title` becomes the figure title. Nothing is returned,
    the figure is shown directly.
    """
    marker_style = dict(size=6, line=dict(width=1, color='black'))

    matrix_fig = px.scatter_matrix(
        ldf,
        dimensions=dim,
        color=colour,
        hover_name=hov_name,
        opacity=0.5,
        height=1000,
    )
    matrix_fig.update_traces(marker=marker_style)
    matrix_fig.update_layout(template='plotly_white', title=title)
    matrix_fig.show()

- Comparing <b>presold</b> properties to <b>non presold</b>, through a scatter matrix, we can generally get an idea they don't tend be very different when it comes to basic features like <code>bedrooms</code>,<code>bathrooms</code>,<code>land_area</code> & <code>floor_area</code>. 
- The prices of the current set of presold properties, on the other hand, are certainly on the lower end.

In [None]:
# Scatter matrix of price and the basic size features, coloured by the presold flag
tlist = ['price','bedrooms','bathrooms','land_area','floor_area']
scat_mat(ldf=df_EDA_presold,dim=tlist,colour='presold',hov_name=df_EDA_presold.index,title='Presold Property Scatter Matrix Relations')

Let's also take a look at where these properties were built; we can note that quite a lot of them are further away from Perth, and some are probably even closer to <b>Mandurah</b> than <b>Perth</b> itself, and upon reviewing some properties there, it seems to be [quite a bit more affordable](https://www.realestate.com.au/buy/property-house-in-mandurah/list-1?activeSort=price-asc&source=refinement) and in close proximity to the ocean.

In [None]:
# Suburb-median price map for the presold properties only (boolean flag used directly as mask)
presold_only = df_EDA_presold[df_EDA_presold['presold']]
plot_geo(ldf=presold_only, feature='price', title='Presold Properties Median House Price')