# Skater Regression

### This notebook shows the use of the Skater Regression function (Skater_reg), introduced by Anselin & Amaral (2021). For more information on the method, see:
https://www.researchgate.net/publication/353411566_Endogenous_Spatial_Regimes

In [1]:
# Required imports
import libpysal as ps
import numpy as np
import spreg
from spreg.skater_reg import Skater_reg

In [2]:
# Optional imports
import geopandas as gpd
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

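### To load the data, we can use pure PySAL or (Geo)Pandas. The next two cells are alternatives that define the same variables (`data`, `y`, `x`, `w`); the plotting cells further down use `data.geometry` and `data.index`, so they need `data` from the GeoPandas cell.
### The data can be any set of discrete spatial objects, such as points or polygons. Here we use the Columbus dataset for simplicity.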

In [3]:
# Pure PySAL alternative: read the attribute columns straight from the DBF table
data = ps.io.open(ps.examples.get_path('columbus.dbf'))
try:
    # Dependent variable, reshaped into a single-column (n, 1) array
    y = np.array(data.by_col('HOVAL')).reshape((-1,1))
    # np.array stacks one row per variable; transpose so each variable is a column
    x = np.array([data.by_col(name) for name in ['INC','CRIME']]).T
finally:
    # The columns are now in memory, so release the file handle
    data.close()
# Queen contiguity weights built from the shapefile that accompanies the DBF
w = ps.weights.Queen.from_shapefile(ps.examples.get_path("columbus.shp"))

In [4]:
# GeoPandas approach: read geometry and attributes into a single GeoDataFrame
data = gpd.read_file(ps.examples.get_path('columbus.shp'))
# Select HOVAL with a list of columns so y is a single-column (n, 1) array,
# the same shape the pure PySAL cell builds with reshape((-1,1))
y = data[['HOVAL']].to_numpy()
# Explanatory variables, one column each
x = data[['INC','CRIME']].to_numpy()
# Queen contiguity weights derived from the GeoDataFrame
w = ps.weights.Queen.from_dataframe(data)

In [5]:
# Standardize each column of x (subtract the column mean, divide by the column
# standard deviation). These standardized variables are the ones used to compute
# the minimum spanning tree; any variable could be added or removed here.
col_means = x.mean(axis=0)
col_stds = x.std(axis=0)
x_std = (x - col_means) / col_stds

# Call the Skater_reg method based on OLS, with the fit arguments named
n_clusters = 4
quorum = 10
ols_spec = {'reg': spreg.OLS, 'y': y, 'x': x}
results = Skater_reg().fit(n_clusters, w, x_std, ols_spec, quorum=quorum)

# The attribute current_labels_ stores the cluster allocations from the final step
results.current_labels_

array([0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 2, 2, 1, 2, 0,
       1, 2, 0, 0, 3, 3, 0, 0, 2, 1, 3, 2, 3, 2, 3, 0, 2, 1, 1, 2, 3, 3,
       3, 2, 1, 3, 3], dtype=int32)

### Optionally, to see the decrease in the Sum of Squared Residuals (SSR), we can use the trace from Skater_reg to plot the SSR from each step

In [None]:
# Skip the first entry of results._trace and take element [1][2] of every later
# entry; these values are plotted as the total SSR of each step
trace = [step[1][2] for step in results._trace[1:]]
# The first retained value is plotted at 2 clusters, the next at 3, and so on
cluster_counts = list(range(2, 2 + len(trace)))

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=cluster_counts,
        y=trace,
        mode='lines+markers',
        line={'color': 'black', 'width': 2},
    )
)
fig.update_layout(
    xaxis_title='Number of clusters',
    yaxis_title='Total sum of squared residuals',
)
fig.show()

### We can also plot the cluster allocations from the final step using the attribute current_labels_

In [None]:
# Final cluster labels as a string-valued column aligned on the GeoDataFrame index
# (presumably cast to str so plotly colors them as categories -- confirm)
final_labels = pd.Series(
    results.current_labels_.astype(str),
    name='cluster',
    index=data.index,
)
# Append the labels as a new 'cluster' column next to the original attributes
clustered = pd.concat([data, final_labels], axis=1)

fig = px.choropleth(
    clustered,
    geojson=data.geometry,
    locations=data.index,
    color="cluster",
)
# Fit the view to the plotted locations and hide the base map layers
fig.update_geos(fitbounds="locations", visible=False)
fig.show()

### The trace can also be used to recall the cluster allocations from any intermediate step

In [None]:
# _trace[k-1][0] contains the cluster allocation for k clusters,
# so index 2 recalls the step with 3 clusters.
trace_index = 2
step_labels = pd.Series(
    results._trace[trace_index][0].astype(str),
    name='cluster',
    index=data.index,
)
# Append the recalled labels as a new 'cluster' column next to the original attributes
clustered_step = pd.concat([data, step_labels], axis=1)

fig = px.choropleth(
    clustered_step,
    geojson=data.geometry,
    locations=data.index,
    color="cluster",
)
# Fit the view to the plotted locations and hide the base map layers
fig.update_geos(fitbounds="locations", visible=False)
fig.show()

### With the cluster allocations, we can call the Regimes methods in spreg to get the full regression results.

In [None]:
# Estimate an OLS regimes model using the final Skater_reg cluster labels as regimes.
# name_y is a single string (one dependent variable); a one-element list would be
# printed as "['HOVAL']" in the summary. name_x stays a list, one name per column of x.
reg = spreg.OLS_Regimes(
    y,
    x,
    regimes=results.current_labels_,
    w=w,
    name_y='HOVAL',
    name_x=['INC','CRIME'],
    name_regimes='skater_reg',
)
# summary is a preformatted text report, so print it rather than echoing its repr
print(reg.summary)