In [29]:
import numpy as np
import pandas as pd
import seaborn as sns
from esda import Moran
import geopandas as gpd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from libpysal.weights import KNN
from spreg import OLS, GM_Error_Het
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [30]:
gdf = gpd.read_file('output/hedonic_gdf.gpkg')

In [31]:
gdf['log_price'] = np.log(gdf['GrossSalePrice'])

In [32]:
gdf.columns

Index(['GrossSalePrice', 'AgeAtSale', 'LandArea', 'TotalFloorArea',
       'water_DIST', 'bus_DIST', 'Census_Pop', 'RnkIMDNoEm', 'RnkIMDNoIn',
       'RnkIMDNoCr', 'RnkIMDNoHo', 'RnkIMDNoHe', 'RnkIMDNoEd', 'RnkIMDNoAc',
       'DECILE_high', 'DECILE_prim', 'Median_Income', 'CBD_DIST',
       'cycleways_DIST', 'cycle_DENS', 'year_2018', 'year_2019', 'canopy_0_20',
       'canopy_20_50', 'canopy_50_100', 'canopy_100_200', 'residuals',
       'geometry', 'log_price'],
      dtype='object')

In [33]:
w = KNN.from_dataframe(gdf, k=8)
w.transform = 'R'

 There are 11 disconnected components.
  W.__init__(self, neighbors, id_order=ids, **kwargs)


In [34]:
gdf.shape

(12431, 29)

In [35]:
y = gdf['log_price'].values.reshape(-1, 1)

non_x_cols = [
  # unneccessary columns
  'GrossSalePrice', 'residuals', 'geometry', 'log_price',
  # removed columns
  "Census_Pop",
  "RnkIMDNoEm",
  "RnkIMDNoIn",
  "RnkIMDNoCr",
  "RnkIMDNoHo",
  "RnkIMDNoHe",
  "RnkIMDNoEd",
  "RnkIMDNoAc",
  "DECILE_high",
  # "DECILE_prime",
  "Median_Income"
]

x_cols = [col for col in gdf.columns if col not in non_x_cols]

print(x_cols)

X = gdf.loc[:, x_cols]

X = X.values

['AgeAtSale', 'LandArea', 'TotalFloorArea', 'water_DIST', 'bus_DIST', 'DECILE_prim', 'CBD_DIST', 'cycleways_DIST', 'cycle_DENS', 'year_2018', 'year_2019', 'canopy_0_20', 'canopy_20_50', 'canopy_50_100', 'canopy_100_200']


In [36]:
ols_spreg = OLS(y, X, w=w, spat_diag=True, moran=True, name_y='log_price', name_x=x_cols)

print(ols_spreg.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :     unknown
Dependent Variable  :   log_price                Number of Observations:       12431
Mean dependent var  :     13.1505                Number of Variables   :          16
S.D. dependent var  :      0.3376                Degrees of Freedom    :       12415
R-squared           :      0.5916
Adjusted R-squared  :      0.5911
Sum squared residual:     578.441                F-statistic           :   1199.0515
Sigma-square        :       0.047                Prob(F-statistic)     :           0
S.E. of regression  :       0.216                Log likelihood        :    1427.923
Sigma-square ML     :       0.047                Akaike info criterion :   -2823.846
S.E of regression ML:      0.2157                Schwarz criterion     :   -2704.999

------------------------------------------------------------

# VIF

In [37]:
x_cols

['AgeAtSale',
 'LandArea',
 'TotalFloorArea',
 'water_DIST',
 'bus_DIST',
 'DECILE_prim',
 'CBD_DIST',
 'cycleways_DIST',
 'cycle_DENS',
 'year_2018',
 'year_2019',
 'canopy_0_20',
 'canopy_20_50',
 'canopy_50_100',
 'canopy_100_200']

In [38]:
X = gdf.loc[:, x_cols]

In [39]:
X.head()

Unnamed: 0,AgeAtSale,LandArea,TotalFloorArea,water_DIST,bus_DIST,DECILE_prim,CBD_DIST,cycleways_DIST,cycle_DENS,year_2018,year_2019,canopy_0_20,canopy_20_50,canopy_50_100,canopy_100_200
0,24,685.0,211.0,189.601703,216.79891,10.0,6354.291389,590.0,1.6,0,1,1.077267,163.500474,3673.623939,10193.36043
1,42,609.0,120.0,139.5812,154.824709,10.0,6227.397408,471.071045,1.6,0,0,20.957896,450.415001,3568.941048,19805.408025
2,22,704.0,245.0,201.942495,192.941029,10.0,6352.89185,610.836121,1.6,1,0,10.299413,144.270775,2970.7922,9872.392485
3,44,869.0,131.0,157.854014,179.645498,10.0,6229.065719,483.016235,1.6,0,1,74.306779,392.823676,3602.928325,19098.845813
4,22,1149.0,279.0,199.693219,216.7306,10.0,6258.811869,533.936462,1.6,1,0,0.0,279.813206,6163.843751,14113.51833


In [40]:
vif_df = pd.DataFrame({
  'variable': X.columns,
  'VIF': [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
})

print(vif_df)

          variable       VIF
0        AgeAtSale  3.521190
1         LandArea  1.191485
2   TotalFloorArea  6.101419
3       water_DIST  2.520637
4         bus_DIST  2.786214
5      DECILE_prim  1.517704
6         CBD_DIST  6.092187
7   cycleways_DIST  3.855113
8       cycle_DENS  2.003174
9        year_2018  1.962587
10       year_2019  1.954268
11     canopy_0_20  2.985803
12    canopy_20_50  5.605742
13   canopy_50_100  2.576599
14  canopy_100_200  3.625759


# training GM_Error_Het heteroskedasticity-robust estimation

In [41]:
X = X.values

In [42]:
sem = GM_Error_Het(y, X, w, name_y = 'log_price', name_x = x_cols)

In [43]:
print(sem.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: GM SPATIALLY WEIGHTED LEAST SQUARES (HET)
------------------------------------------------------------
Data set            :     unknown
Weights matrix      :     unknown
Dependent Variable  :   log_price                Number of Observations:       12431
Mean dependent var  :     13.1505                Number of Variables   :          16
S.D. dependent var  :      0.3376                Degrees of Freedom    :       12415
Pseudo R-squared    :      0.5846
N. of iterations    :           1                Step1c computed       :          No

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     z-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT        12.79131         0.03012       424.73841         0.00000
           AgeAtSale        -0.00174         0.00012    

In [44]:
gdf["e_sem"] = sem.e_filtered.flatten()  # or sem.e_filtered if already 1D
mi_sem = Moran(gdf["e_sem"], w)
print(mi_sem.I, mi_sem.p_sim)

-0.018002164938573557 0.001
