# Segregation Analysis with PySAL

In [None]:
%load_ext watermark
%load_ext autoreload
%autoreload 2

In [None]:
%watermark -v -a "author: eli knaap" -d -u -p segregation,libpysal,geopandas

Here, we'll use PySAL's `segregation` package to analyze racial segregation in Southern California.

In [None]:
import geopandas as gpd

## Data Prep

In [None]:
# Load the "tracts" layer of the SCAG region GeoPackage; `scag` is the base table for everything below.
scag = gpd.read_file("data/scag_region.gpkg", layer="tracts")

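We need to reproject the data into a projected coordinate system so that distances are measured in meters rather than degrees. UTM zone 11N (EPSG:26911) is a good fit for Southern California.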

In [None]:
# Reproject to EPSG:26911 (the UTM zone 11 system mentioned in the text above).
scag = scag.to_crs(epsg=26911)
# Echo the CRS so the reprojection can be confirmed visually.
scag.crs

In [None]:
# Choropleth of `p_hispanic_persons`, classified into 8 quantile bins.
# Rows with a missing value in that column are dropped first.
(
    scag
    .dropna(subset=['p_hispanic_persons'])
    .plot(
        column='p_hispanic_persons',
        scheme='quantiles',
        cmap='Blues',
        k=8,
        legend=True,
    )
)

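Some background on [FIPS codes](https://www.policymap.com/2012/08/tips-on-fips-a-quick-guide-to-geographic-place-codes-part-iii/): the first five characters of each tract's `geoid` identify its state and county, so we can use them to label every tract with its county.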

In [None]:
# Keep the first five characters of each `geoid` as the tract's county code.
scag['county'] = scag['geoid'].str.slice(stop=5)

In [None]:
# List the distinct county codes present in the data.
scag.county.unique()

In [None]:
# Display names for the seven counties in the data.
# The spelling "San Bernadino" is matched verbatim by the `inland` filter
# further down, so any change here must be mirrored there.
county_names = [
    "Los Angeles",
    "Imperial",
    "Orange",
    "San Bernadino",
    "San Diego",
    "Riverside",
    "Ventura",
]

In [None]:
# Map each 5-character county code (state "06" + 3-digit county FIPS) to its name.
# An explicit lookup replaces zipping `scag.county.unique()` with `county_names`:
# `unique()` returns codes in order of first appearance, so a positional pairing
# silently mislabels counties whenever the row order of the file changes.
namer = {
    "06025": "Imperial",
    "06037": "Los Angeles",
    "06059": "Orange",
    "06065": "Riverside",
    "06071": "San Bernadino",  # spelling kept in sync with `county_names` and the `inland` filter
    "06073": "San Diego",
    "06111": "Ventura",
}

# Sanity checks: the lookup must cover exactly the names listed in `county_names`,
# and every county value in the data must be a known code (or an already-replaced
# name, so that re-running this cell after the replacement step does not fail).
assert sorted(namer.values()) == sorted(county_names), "namer and county_names disagree"
unmapped = set(scag.county.dropna().unique()) - set(namer) - set(namer.values())
assert not unmapped, f"county codes without a name: {sorted(unmapped)}"

In [None]:
# Show the code -> name lookup for a visual check.
namer

Now that we know which county is which, we could just use these codes to divide the region into pieces. But let's go ahead and replace the codes with their names. It's more to type, but if we want to subset later, we won't have to look up the codes again.

In [None]:
# Swap each county code for its display name using the `namer` lookup.
scag['county'] = scag['county'].replace(namer)

In [None]:
# Display the county column to confirm the codes were replaced by names.
scag.county

In [None]:
# Tracts belonging to the four coastal counties.
coastal = scag.loc[
    scag['county'].isin(["Los Angeles", "Orange", "San Diego", "Ventura"])
]

In [None]:
# Tracts belonging to the three inland counties
# ("San Bernadino" is spelled exactly as in `county_names`).
inland = scag.loc[
    scag['county'].isin(['Riverside', "San Bernadino", "Imperial"])
]

In [None]:
# Map the coastal subset, coloured by county, to check the selection.
coastal.plot(column='county')

In [None]:
# Map the inland subset, coloured by county, to check the selection.
inland.plot(column='county')

## Calculating Segregation Measures

### Classic (aspatial) Single-Group Indices

In [None]:
from segregation.aspatial import Dissim, GiniSeg, Entropy

In [None]:
# Fit the three aspatial single-group indices on the same inputs:
# Hispanic population count against total population, for every tract in `scag`.
dissim, gini, entropy = (
    index_class(scag, "n_hispanic_persons", "n_total_pop")
    for index_class in (Dissim, GiniSeg, Entropy)
)

In [None]:
# Value of the dissimilarity index.
dissim.statistic

In [None]:
# Value of the Gini segregation index.
gini.statistic

In [None]:
# Value of the entropy index.
entropy.statistic

### Multigroup Indices

In [None]:
from segregation.aspatial import MultiInformationTheory, MultiGiniSeg, MultiDiversity

In [None]:
# Population-count columns for the four groups used by every multigroup index below.
pop_groups = [
    'n_asian_persons',
    'n_hispanic_persons',
    'n_nonhisp_black_persons',
    'n_nonhisp_white_persons',
]

In [None]:
# Fit two multigroup indices on the full region using the four `pop_groups` columns.
# NOTE(review): `MultiGiniSeg` is imported above but not used in this cell.
multi_div = MultiDiversity(scag, pop_groups)
multi_info = MultiInformationTheory(scag, pop_groups)

In [None]:
# Value of the multigroup diversity index.
multi_div.statistic

In [None]:
# Value of the multigroup information-theory index.
multi_info.statistic

### Spatial Indices

In [None]:
from libpysal import weights

In [None]:
from segregation.spatial import SpatialDissim,  SpatialInformationTheory

In [None]:
# Two alternative neighbour definitions built from the tract table:
# Queen contiguity, and each tract's 10 nearest neighbours.
w_queen = weights.Queen.from_dataframe(scag)
w_knn = weights.KNN.from_dataframe(scag, k=10)

#### Single Group

In [None]:
# Spatial dissimilarity for the Hispanic population using Queen contiguity weights.
spatial_dissim = SpatialDissim(scag, 'n_hispanic_persons', 'n_total_pop', w=w_queen)

In [None]:
# Index value under Queen contiguity.
spatial_dissim.statistic

In [None]:
# Same index, but with the 10-nearest-neighbour weights.
# NOTE(review): despite the `_dist` suffix this uses `w_knn`, not a distance-band weights object.
spatial_dissim_dist = SpatialDissim(scag, 'n_hispanic_persons', 'n_total_pop', w=w_knn)

In [None]:
# Index value under the KNN weights, for comparison with the Queen result.
spatial_dissim_dist.statistic

#### Multi Group

We can also look at how different definitions of neighborhood (here, contiguity versus nearest neighbors) influence the resulting index statistic.

In [None]:
# Multigroup spatial information-theory index under each neighbour definition.
# NOTE(review): `spatial_info_dist` is computed with the KNN weights (`w_knn`).
spatial_info_queen = SpatialInformationTheory(scag, pop_groups, w=w_queen)
spatial_info_dist = SpatialInformationTheory(scag, pop_groups, w=w_knn)

In [None]:
# Index value under Queen contiguity.
spatial_info_queen.statistic

In [None]:
# Index value under the KNN weights.
spatial_info_dist.statistic

#### Multiscalar Profile

The multiscalar segregation profile is a way of measuring how global versus local the segregation patterns are in a region. 

In [None]:
from segregation.spatial import compute_segregation_profile

In [None]:
# Distance thresholds for the multiscalar profile: 1500 to 5500 in steps of 1000.
# Presumably interpreted in the units of the data's CRS (meters after the UTM reprojection) — confirm.
distances = [float(threshold) for threshold in range(1500, 5501, 1000)]

In [None]:
# Segregation profile for the whole region over the listed distances and the four `pop_groups`.
prof = compute_segregation_profile(scag, pop_groups, distances)

In [None]:
# pandas is first needed here; the later comparison cell also relies on this import.
# NOTE(review): consider moving it to the import cell at the top of the notebook.
import pandas as pd
# Wrap the profile in a Series and draw it with pandas' default plot.
pd.Series(prof).plot()

We can also look at how the segregation profiles differ by region. If we plot them all on the same graph, we can compare the slopes of the lines to see how the shape of segregation differs between places in the Southern California region.

In [None]:
# Same profile computation, run separately on the coastal and inland subsets
# with identical groups and distances so the curves are directly comparable.
coastal_prof = compute_segregation_profile(coastal, pop_groups, distances)
inland_prof = compute_segregation_profile(inland, pop_groups, distances)

In [None]:

# Overlay the three profiles on a single set of axes; each Series' name becomes its legend entry.
# The first call creates the axes and the other two draw onto it explicitly.
ax = pd.Series(prof, name='socal').plot(legend=True)
pd.Series(coastal_prof, name='coastal').plot(ax=ax, legend=True)
pd.Series(inland_prof, name='inland').plot(ax=ax, legend=True)

This shows that segregation in the coastal region is considerably higher than in the inland region at every scale, although the two segregation profiles have similar overall shapes.

## Single-Value Inference

In [None]:
from segregation.inference import SingleValueTest

In [None]:
# Inference on the single-group entropy index fitted earlier.
# NOTE(review): if the test is simulation-based, results may vary between runs unless
# a random seed is set beforehand — confirm against the segregation documentation.
entropy_test = SingleValueTest(entropy)

In [None]:
# Same test applied to the dissimilarity index fitted earlier.
dissim_test = SingleValueTest(dissim)

In [None]:
# p-value reported by the entropy test.
entropy_test.p_value

In [None]:
# Built-in visual summary of the entropy test.
entropy_test.plot()

In [None]:
# Built-in visual summary of the dissimilarity test.
dissim_test.plot()

## Comparative Inference

In [None]:
from segregation.inference import TwoValueTest

In [None]:
# Compare the multigroup information-theory index between the coastal and inland subsets.
# Both indices are fitted inline on the same four `pop_groups`.
info_test = TwoValueTest(MultiInformationTheory(coastal, pop_groups),
            MultiInformationTheory(inland, pop_groups))

In [None]:
# `est_point_diff` attribute of the test — presumably the point estimate of the
# coastal-minus-inland difference; confirm sign convention in the segregation docs.
info_test.est_point_diff

In [None]:
# Built-in visual summary of the two-value test.
info_test.plot()

## Decomposition

In [None]:
from segregation.decomposition import DecomposeSegregation

In [None]:
# Distance-band weights for the coastal tracts with a threshold of 2000
# (in the units of the data's CRS, i.e. the EPSG:26911 projection set earlier).
w_coastal = weights.DistanceBand.from_dataframe(coastal, 2000)

In [None]:
# Matching distance-band weights (same 2000 threshold) for the inland tracts.
w_inland = weights.DistanceBand.from_dataframe(inland, 2000)

In [None]:
# Spatial dissimilarity for the non-Hispanic Black population in the coastal subset.
# NOTE(review): `one` is not referenced again in the visible notebook; the identical
# index is rebuilt inline inside the DecomposeSegregation call in the next cell.
one = SpatialDissim(coastal, 'n_nonhisp_black_persons', 'n_total_pop', w=w_coastal)

In [None]:
# Decompose the coastal-vs-inland difference in spatial dissimilarity for the
# non-Hispanic Black population. Each index uses the distance-band weights built for its own subset.
decomp = DecomposeSegregation(SpatialDissim(coastal, 'n_nonhisp_black_persons', 'n_total_pop', w=w_coastal),
                    SpatialDissim(inland,'n_nonhisp_black_persons', 'n_total_pop', w=w_inland))

In [None]:
# Map-style view of the decomposition using 10 equal-interval classes.
# `city_a` / `city_b` presumably label the first (coastal) and second (inland) index — confirm.
decomp.plot(plot_type='maps',  scheme='equalinterval', k=10, city_a='Coastal', city_b='inland')

In [None]:
# Default decomposition plot (no `plot_type` given).
decomp.plot()

## Exercise

1. Which county in the Southern California region has the greatest level of multiracial segregation (using the four categories above), according to the MultiInformationTheory index?

2. According to the Gini index, is Hispanic/Latino segregation in Riverside County greater or less than in Ventura County? Is that difference significant?

3. According to the Spatial Dissimilarity index, does the difference in segregation between Riverside and Ventura result from the demographic structure or the spatial structure?

In [None]:
# %load solutions/06.py
#### 1
# Multigroup information-theory index for each county, computed on that county's
# tracts only, then the county with the largest value is reported.
results = {
    county: MultiInformationTheory(scag[scag.county == county], pop_groups).statistic
    for county in scag.county.unique()
}

for county, value in results.items():
    print(f"{county} Info Theory: {value}")

most_segregated = max(results, key=results.get)
print(f"Greatest multigroup segregation: {most_segregated} ({results[most_segregated]})")


#### 2
# Subset each county once and reuse the subsets for questions 2 and 3.
riverside = scag[scag.county == 'Riverside']
ventura = scag[scag.county == 'Ventura']

rside_gini = GiniSeg(riverside, group_pop_var='n_hispanic_persons', total_pop_var='n_total_pop')
vent_gini = GiniSeg(ventura, group_pop_var='n_hispanic_persons', total_pop_var='n_total_pop')

print(f"\nRiverside Gini: {rside_gini.statistic}")
print(f"Ventura Gini: {vent_gini.statistic}")
ginitest = TwoValueTest(rside_gini, vent_gini)
# `p_value` is the test's p-value, not a chosen significance level, so label it as such.
print(f"p-value = {ginitest.p_value}")
ginitest.plot()

#### 3
# Stored under its own name so the coastal/inland `decomp` from the Decomposition
# section is not overwritten (re-running those plot cells still shows the same result).
# No `w` is passed here, so SpatialDissim falls back to its default weights.
rside_vent_decomp = DecomposeSegregation(
    SpatialDissim(riverside, group_pop_var='n_hispanic_persons', total_pop_var='n_total_pop'),
    SpatialDissim(ventura, group_pop_var='n_hispanic_persons', total_pop_var='n_total_pop'),
)
rside_vent_decomp.plot('maps', figsize=(20,20))