# Imports

In [41]:
# User-defined functions
# All data preparation functions for GDP data and explanatory variables
from Functions.data_prep import data_GDP, data_explanatory
from Functions.spatial_functions import spatial_weight_matrix
from Functions.spatiotemporal_autocorrelation import year_df_creation
from libpysal.weights import W
import pandas as pd
import numpy as np
import libpysal
import spreg
import folium

# Spatial econometrics
from spreg import OLS
from spreg import ML_Lag, ML_Error, GM_Error, GM_Error_Het, GM_Lag, GM_Combo, GM_Combo_Het
from sklearn.metrics import mean_squared_error

In [42]:
# Example data
time_period_option = 'all_years'
NUTS_level = 2
gdf_lvl = data_GDP(NUTS_level, time_period_option)

In [43]:
exp_data = data_explanatory(True, 'all_years')
# Explanatory df
X_df = exp_data[0]
X_df = X_df.dropna()
# Explanatory variables list
exp_var_list = list(exp_data[1])
# list of NUTS regions where explanatory variables have values
exp_regions = list(X_df['NUTS_ID'].unique())

In [44]:
# Define the list of required years
required_years = set(range(2015, 2020))
# Find NUTS_IDs with complete years
def has_all_years(group):
    return required_years.issubset(group['TIME_PERIOD'].unique())
# Filter NUTS_IDs
valid_nuts_ids = X_df.groupby('NUTS_ID').filter(has_all_years)['NUTS_ID'].unique()
# Convert to list if needed
valid_nuts_ids_list = valid_nuts_ids.tolist()
X_df = X_df[X_df['NUTS_ID'].isin(valid_nuts_ids_list)]

# Filter out final df with years 2015 to 2019:
X_df = X_df[(X_df['TIME_PERIOD'] >= 2015) & (X_df['TIME_PERIOD'] <= 2019)]

In [45]:
# Merging with gdf_lvl to get the geometries of each location
final_df = pd.merge(X_df, gdf_lvl[['NUTS_ID', 'NAME_LATN', 'NUTS_NAME','geometry', 'TIME_PERIOD', 'GDP_VALUE']], on=['NUTS_ID', 'TIME_PERIOD'], how='left')

In [46]:
# Extract only for one year to calculate spatial weight (preventing repeated regions)!!
region_df = year_df_creation(final_df, 2015)
region_df = region_df.drop(columns=['TIME_PERIOD'])
# Making sure that the index of region is NUTS_ID
region_df.set_index('NUTS_ID', inplace=True)
# Spatial Weights Creation
w_adaptive_exp = spatial_weight_matrix(region_df, 15)

# Removing own regions as neighbours from w_adaptive results
for key in w_adaptive_exp.weights:
    w_adaptive_exp.weights[key] = w_adaptive_exp.weights[key][1:]
for key in w_adaptive_exp.neighbors:
    w_adaptive_exp.neighbors[key] = w_adaptive_exp.neighbors[key][1:]

In [47]:
final_df

Unnamed: 0,NUTS_ID,TIME_PERIOD,DISP_INCOME_VAL,PRIM_INCOME_VAL,POP_DENS_VAL,POP_COUNT_VAL,TOTAL_DEATH_VAL,TOUR_PLCS_VAL,TOUR_NIGHTS_VAL,SCI_TECH_VAL,EDU_VAL,NAME_LATN,NUTS_NAME,geometry,GDP_VALUE
0,AT11,2015,-0.774547,-0.722311,-0.317250,-0.948915,-0.104939,-0.268372,-0.539341,0.337501,-0.102844,Burgenland,Burgenland,"POLYGON ((4846489.857 2803511.391, 4854633.294...",25000.0
1,AT11,2016,-0.769789,-0.722459,-0.316794,-0.947317,-0.196806,-0.269549,-0.527673,0.317100,-0.165883,Burgenland,Burgenland,"POLYGON ((4846489.857 2803511.391, 4854633.294...",25600.0
2,AT11,2017,-0.764390,-0.716013,-0.316451,-0.946756,-0.215735,-0.269549,-0.527977,0.531307,-0.155376,Burgenland,Burgenland,"POLYGON ((4846489.857 2803511.391, 4854633.294...",26400.0
3,AT11,2018,-0.759596,-0.710199,-0.318962,-0.946315,-0.287216,-0.270726,-0.531453,0.449705,-0.113351,Burgenland,Burgenland,"POLYGON ((4846489.857 2803511.391, 4854633.294...",27100.0
4,AT11,2019,-0.756658,-0.707431,-0.318620,-0.945859,-0.291452,-0.271510,-0.526076,0.674112,0.128297,Burgenland,Burgenland,"POLYGON ((4846489.857 2803511.391, 4854633.294...",27600.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970,SK04,2015,-0.459419,-0.521702,-0.290088,-0.150130,1.414827,-0.220511,-0.518666,-1.345553,-1.027409,Východné Slovensko,Východné Slovensko,"POLYGON ((5234032.669 2964367.443, 5224941.452...",15200.0
971,SK04,2016,-0.490577,-0.540160,-0.289404,-0.149044,1.243141,-0.227834,-0.485621,-1.386354,-1.069435,Východné Slovensko,Východné Slovensko,"POLYGON ((5234032.669 2964367.443, 5224941.452...",14300.0
972,SK04,2017,-0.502221,-0.547703,-0.289175,-0.147199,1.163894,-0.229926,-0.477852,-1.294551,-0.985384,Východné Slovensko,Východné Slovensko,"POLYGON ((5234032.669 2964367.443, 5224941.452...",14800.0
973,SK04,2018,-0.472183,-0.527053,-0.288947,-0.145616,0.990927,-0.220772,-0.466630,-0.917139,-0.617659,Východné Slovensko,Východné Slovensko,"POLYGON ((5234032.669 2964367.443, 5224941.452...",15300.0


In [48]:
import scipy.sparse as sp

# Convert to panel data structure
final_df = final_df.sort_values(by=['NUTS_ID', 'TIME_PERIOD'])
final_df = final_df.set_index(['NUTS_ID', 'TIME_PERIOD'])

# Spatiotemporal Model

## Data Loading

In [49]:
# Removing variables that tested for multicollinearity!
exp_var_list.remove('DISP_INCOME_VAL')
exp_var_list.remove('POP_COUNT_VAL')

## Temporal Lags Creation

In [50]:
# # Initializing list to store lag variables
# lag_list = list()
# # Sort the DataFrame by 'Region' and 'Year'
# final_df.sort_values(by=['NUTS_ID', 'TIME_PERIOD'], inplace=True)
# # Create temporal lags for 'HumanResources_SciTech'
# lag_columns = [2]  # Define the lag periods you want
# for lag in lag_columns:
#     for var in exp_var_list:
#         final_df[var+'_lag'+str(lag)] = final_df.groupby('NUTS_ID')[var].shift(lag)
#         lag_list.append(var+'_lag'+str(lag))
#
# # Fill NaN values with the first non-null value within each region
# for lag in lag_columns:
#     for var in exp_var_list:
#         lag_col = var+'_lag'+str(lag)
#         final_df[lag_col] = final_df.groupby('NUTS_ID')[lag_col].transform(lambda x: x.fillna(method='bfill').fillna(method='ffill'))
# final_df = final_df.dropna()
#
# # Appending explanatory variable list with the temporal lag variables!
# for var in lag_list:
#     exp_var_list.append(var)
#
# len(exp_var_list)

In [51]:
# Extract dependent variable (GDP)
y = final_df['GDP_VALUE'].values
# Extract independent variables
X = final_df[exp_var_list].values

## Spatial Weight Matrix Tranformation

In [59]:
# Number of time periods
num_periods = len(final_df.index.get_level_values(1).unique())
# Convert the Kernel weights matrix to a sparse matrix
W_sparse = w_adaptive_exp.full()[0]
# Extend the spatial weights' matrix for each time period using the Kronecker product
W_extended = sp.kron(sp.identity(num_periods), W_sparse)
# Fit the panel model
# Convert the extended sparse matrix to a dense format
W_extended_dense = W_extended.toarray()
# Create a PySAL weights object from the dense matrix
# Extract the neighbors and weights
neighbors = {}
weights_i = {}
for i, row in enumerate(W_extended_dense):
    neighbors[i] = np.where(row != 0)[0].tolist()
    weights_i[i] = row[row != 0].tolist()

W_pysal = W(neighbors, weights_i)
# Now W_pysal is a PySAL-compatible weights object that can be used in your model

 There are 5 disconnected components.


In [53]:
exp_temp = exp_var_list.copy()
exp_temp

['EDU_VAL',
 'POP_DENS_VAL',
 'PRIM_INCOME_VAL',
 'SCI_TECH_VAL',
 'TOTAL_DEATH_VAL',
 'TOUR_NIGHTS_VAL',
 'TOUR_PLCS_VAL']

In [54]:
# for var in exp_temp:
#     exp_var_list.remove(var)
#
# exp_var_list

In [55]:
gm_lag_model = GM_Lag(y, X, w=W_pysal, w_lags=6, name_y='GDP_VALUE', name_x=exp_var_list)
print(gm_lag_model.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: SPATIAL TWO STAGE LEAST SQUARES
--------------------------------------------------
Data set            :     unknown
Weights matrix      :     unknown
Dependent Variable  :   GDP_VALUE                Number of Observations:         975
Mean dependent var  :  28118.9744                Number of Variables   :           9
S.D. dependent var  :  10876.0639                Degrees of Freedom    :         966
Pseudo R-squared    :      0.7121
Spatial Pseudo R-squared:  0.6921

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     z-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT     25542.91133      1209.46325        21.11921         0.00000
             EDU_VAL     -4974.00032       508.06404        -9.79011         0.00000
        POP_DENS_VAL      1617.32902      

In [56]:
y_pred = gm_lag_model.predy
# Calculate RMSE
rmse = mean_squared_error(y, y_pred, squared=False)
# Extract pseudo-R-squared
pseudo_r_squared = gm_lag_model.pr2

In [57]:
print("RMSE (Root Mean Squared Error): "+str(rmse))
print("Pseudo R-squared: "+str(pseudo_r_squared))

RMSE (Root Mean Squared Error): 5833.011594542761
Pseudo R-squared: 0.7121397061618294


In [58]:
print("RMSE (Root Mean Squared Error): "+str(rmse))
print("Pseudo R-squared: "+str(pseudo_r_squared))

RMSE (Root Mean Squared Error): 5833.011594542761
Pseudo R-squared: 0.7121397061618294
