In [1]:
# Import libraries
import numpy as np
import pandas as pd
import geopandas as gpd
import libpysal as ps
import matplotlib.pyplot as plt
import matplotlib as mpl

#For basemap:
import contextily as cx

#For regression:
import statsmodels.api as sm
import numpy as np
from patsy import dmatrices

import esda as esda
from esda.moran import Moran_Local
import splot as splot

#Plot spatial weights:
from libpysal.weights.contiguity import Queen
from splot.libpysal import plot_spatial_weights
from splot.esda import moran_scatterplot, plot_moran
from splot.esda import plot_local_autocorrelation

# For GWR
from mgwr.gwr import GWR, MGWR
from mgwr.sel_bw import Sel_BW
from mgwr.utils import compare_surfaces, truncate_colormap



In [2]:
def standardize(raw_data):
    return ((raw_data - np.mean(raw_data, axis = 0)) / np.std(raw_data, axis = 0))

In [3]:
geogon_od = pd.read_csv('geogon_od.csv')
geogon_od 

# Rename our columns to have spaces 
geogon_od.columns = ['Year', 'FIPS', 'State', 'County', 'Deaths', 'Population', 'Crude_Rate',
       'Cruder_Rate', 'Deathrate_per_100', 'Unemployment_rate',
       'Dispense_rate', 'SUMLEV', 'AGEGRP', 'TOT_POP', 'TOT_MALE',
       'TOT_FEMALE', 'WA_MALE', 'WA_FEMALE', 'BA_MALE', 'BA_FEMALE', 'IA_MALE',
       'IA_FEMALE', 'AA_MALE', 'AA_FEMALE', 'NA_MALE', 'NA_FEMALE', 'TOM_MALE',
       'TOM_FEMALE', 'NH_MALE', 'NH_FEMALE', 'H_MALE', 'H_FEMALE',
       'Urbanicity', 'Jail_Population', 'Incarceration_Rate_per_100k',
       'PovertyCount', 'PovertyPercentage', 'MedianHHI', 'Latitude',
       'Longitude', 'geometry']


#
nonnum_features = ['Year', 'FIPS', 'State', 'County', 'Urbanicity', 'Latitude',
                   'Longitude', 'geometry']
stzd_geogon = standardize(geogon_od.drop(nonnum_features, axis = 1))
stzd_geogon[nonnum_features] = geogon_od[nonnum_features]
stzd_geogon





# Convert to geopandas 
# geogon_od = gpd.GeoDataFrame(geogon_od)
stzd_geogon['geometry'] = gpd.GeoSeries.from_wkt(stzd_geogon['geometry'])
stzd_geogon = gpd.GeoDataFrame(stzd_geogon, geometry = 'geometry')


# Convert FIPS to string 
geogon_od['FIPS'] = geogon_od['FIPS'].astype(str).str.zfill(5)

stzd_geogon['FIPS'] = stzd_geogon['FIPS'].astype(str).str.zfill(5)
stzd_geogon.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 15896 entries, 0 to 15895
Data columns (total 41 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   AA_FEMALE                    8835 non-null   float64 
 1   AA_MALE                      8835 non-null   float64 
 2   AGEGRP                       0 non-null      float64 
 3   BA_FEMALE                    8835 non-null   float64 
 4   BA_MALE                      8835 non-null   float64 
 5   Crude_Rate                   0 non-null      object  
 6   Cruder_Rate                  15896 non-null  float64 
 7   Deathrate_per_100            15896 non-null  float64 
 8   Deaths                       15896 non-null  float64 
 9   Dispense_rate                12810 non-null  float64 
 10  H_FEMALE                     8835 non-null   float64 
 11  H_MALE                       8835 non-null   float64 
 12  IA_FEMALE                    8835 non-null   float64

In [4]:
print(stzd_geogon.columns)

Index(['AA_FEMALE', 'AA_MALE', 'AGEGRP', 'BA_FEMALE', 'BA_MALE', 'Crude_Rate',
       'Cruder_Rate', 'Deathrate_per_100', 'Deaths', 'Dispense_rate',
       'H_FEMALE', 'H_MALE', 'IA_FEMALE', 'IA_MALE',
       'Incarceration_Rate_per_100k', 'Jail_Population', 'MedianHHI',
       'NA_FEMALE', 'NA_MALE', 'NH_FEMALE', 'NH_MALE', 'Population',
       'PovertyCount', 'PovertyPercentage', 'SUMLEV', 'TOM_FEMALE', 'TOM_MALE',
       'TOT_FEMALE', 'TOT_MALE', 'TOT_POP', 'Unemployment_rate', 'WA_FEMALE',
       'WA_MALE', 'Year', 'FIPS', 'State', 'County', 'Urbanicity', 'Latitude',
       'Longitude', 'geometry'],
      dtype='object')


In [5]:
"Cruder_Rate ~ AA_FEMALE + AA_MALE + BA_FEMALE + BA_MALE + \
H_FEMALE + H_MALE + IA_FEMALE + IA_MALE + NA_FEMALE + NA_MALE + NH_FEMALE + NH_MALE + \
TOM_FEMALE + TOM_MALE + WA_FEMALE + WA_MALE + \
Unemployment_rate + Dispense_rate + Incarceration_Rate_per_100k + PovertyPercentage + MedianHHI"

'Cruder_Rate ~ AA_FEMALE + AA_MALE + BA_FEMALE + BA_MALE + H_FEMALE + H_MALE + IA_FEMALE + IA_MALE + NA_FEMALE + NA_MALE + NH_FEMALE + NH_MALE + TOM_FEMALE + TOM_MALE + WA_FEMALE + WA_MALE + Unemployment_rate + Dispense_rate + Incarceration_Rate_per_100k + PovertyPercentage + MedianHHI'

In [6]:
stzd_geogon15 = stzd_geogon[stzd_geogon['Year'] == 2015]

y, X = dmatrices("Cruder_Rate ~ AA_FEMALE + AA_MALE + BA_FEMALE + BA_MALE + \
H_FEMALE + H_MALE + IA_FEMALE + IA_MALE + NA_FEMALE + NA_MALE + NH_FEMALE + NH_MALE + \
TOM_FEMALE + TOM_MALE + WA_FEMALE + WA_MALE + \
Unemployment_rate + Dispense_rate + Incarceration_Rate_per_100k + PovertyPercentage + MedianHHI", 
                 data=stzd_geogon15, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:            Cruder_Rate   R-squared:                       0.238
Model:                            OLS   Adj. R-squared:                  0.221
Method:                 Least Squares   F-statistic:                     13.75
Date:                Wed, 16 Mar 2022   Prob (F-statistic):           3.91e-38
Time:                        22:29:30   Log-Likelihood:                -1016.2
No. Observations:                 857   AIC:                             2072.
Df Residuals:                     837   BIC:                             2167.
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept         

In [7]:
stzd_geogon15.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 886 entries, 9952 to 10837
Data columns (total 41 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   AA_FEMALE                    886 non-null    float64 
 1   AA_MALE                      886 non-null    float64 
 2   AGEGRP                       0 non-null      float64 
 3   BA_FEMALE                    886 non-null    float64 
 4   BA_MALE                      886 non-null    float64 
 5   Crude_Rate                   0 non-null      object  
 6   Cruder_Rate                  886 non-null    float64 
 7   Deathrate_per_100            886 non-null    float64 
 8   Deaths                       886 non-null    float64 
 9   Dispense_rate                886 non-null    float64 
 10  H_FEMALE                     886 non-null    float64 
 11  H_MALE                       886 non-null    float64 
 12  IA_FEMALE                    886 non-null    float6

In [11]:
#Now that we have the model we will calculate Morrison I !! 
# we pull out the counties 

# stzd_geogon15 = stzd_geogon15.drop(['AGEGRP', 'Crude_Rate', 'SUMLEV'], axis = 1)
# stzd_geogon15 = stzd_geogon15.dropna()
stzd_geogon15 = stzd_geogon15.reset_index(drop = True)
w = Queen.from_dataframe(stzd_geogon15)

stzd_geogon15_2 = stzd_geogon15.drop(w.islands)
w2 = Queen.from_dataframe(stzd_geogon15_2)

y2 = y.reset_index(drop = True)
y3 = y2.drop(w.islands)
moran = esda.Moran(y3, w2)
moran.I

 There are 105 disconnected components.
 There are 51 islands with ids: 6, 12, 30, 97, 102, 119, 144, 173, 174, 177, 178, 204, 228, 229, 230, 232, 233, 250, 262, 280, 281, 310, 334, 344, 357, 360, 362, 363, 383, 387, 395, 397, 435, 450, 553, 658, 662, 673, 700, 717, 739, 744, 751, 752, 754, 755, 794, 819, 832, 841, 843.
 There are 54 disconnected components.


0.514680965386146

Here we are veryifing that under the null hypothesis 