# Modified Dissimilarity Index (index of systematic dissimilarity)

In [1]:
# !IPython.load_extensions('calico-document-tools') To construct notebook Table of Contents
import pandas as pd
import numpy as np

import os

# NOTE: machine-specific absolute path. The relative path 'data/std_2010_fullcount.csv'
# read later in this notebook is resolved against this working directory, so adjust
# it to the local copy of the supplementary files before running.
os.chdir('C:/Users/renan/Desktop/inequality-segregation-supplementary-files/')

In [2]:
def calculate_segregation(data, group_pop_var, total_pop_var, b = 0.5, m = 1000):
    '''
    Compute the classical Dissimilarity (D) and Gini (G) segregation indices.

    data: a pandas DataFrame with one row per unit
    group_pop_var: the name of the variable that contains the population size of the group of interest
    total_pop_var: the name of the variable that contains the total population of the unit
    b: Atkinson's Index shape parameter (not used by this function; kept so existing calls keep working)
    m: a numeric value indicating the number of thresholds to be used in the Concentration Profile Index
       (not used by this function; kept so existing calls keep working)

    Returns a dict with the keys 'Dissimilarity (D)' and 'Gini (G)', in that order.
    '''

    # Work on a renamed copy so the rest of the code can use fixed column names
    data = data.rename(columns={group_pop_var: 'group_pop_var', total_pop_var: 'total_pop_var'})

    # Unevenness
    T = data.total_pop_var.sum()
    P = data.group_pop_var.sum() / T

    # Share of the group in each unit; units with zero population get a share of zero
    pi = np.where(data.total_pop_var == 0, 0, data.group_pop_var / data.total_pop_var)

    D = ((data.total_pop_var * abs(pi - P)) / (2 * T * P * (1 - P))).sum()

    # Use float weights: the pairwise product ti * tj of narrow integer columns
    # (e.g. int32) would otherwise wrap around silently for large units.
    ti = np.asarray(data.total_pop_var, dtype=float)
    pi = np.asarray(pi, dtype=float)

    # sum_i sum_j ti * tj * |pi - pj|, evaluated on an n x n array
    num = (np.outer(ti, ti) * np.abs(pi[:, np.newaxis] - pi[np.newaxis, :])).sum()
    # 2.0 * T * T keeps the denominator in floating point (no integer overflow of T**2)
    den = 2.0 * T * T * P * (1 - P)
    G = num / den

    return {'Dissimilarity (D)': D,
            'Gini (G)': G}

In [3]:
# Load the 2010 census table (one row per tract, judging by the preview below).
# The path is relative, so it depends on the working directory set with os.chdir above.
census_2010 = pd.read_csv('data/std_2010_fullcount.csv', encoding = "ISO-8859-1", sep = ",")
census_2010.head()

Unnamed: 0,trtid10,state,county,tract,pop10,nhwht10,nhblk10,ntv10,asian10,hisp10,...,a15hsp10,a60hsp10,ageasn10,a15asn10,a60asn10,agentv10,a15ntv10,a60ntv10,globd10,globg10
0,1001020100,AL,Autauga County,Census Tract 201,1912,1601,228,21,16,44,...,14,2,14,4,1,13,1,3,bw,White Black
1,6083002402,CA,Santa Barbara County,Census Tract 24.02,11406,1980,207,54,703,8439,...,2972,414,624,119,75,26,4,3,wha,Dual immig
2,1001020200,AL,Autauga County,Census Tract 202,2170,844,1226,9,13,75,...,14,1,5,0,0,5,3,1,bw,White Black
3,6083002102,CA,Santa Barbara County,Census Tract 21.02,2084,853,24,24,88,1088,...,342,84,61,4,16,11,0,4,wha,Dual immig
4,1001020300,AL,Autauga County,Census Tract 203,3373,2538,668,30,42,87,...,34,9,22,2,7,9,1,3,bw,White Black


In [4]:
# Riverside County rows only, restricted to the tract identifiers plus the two
# population columns later passed to the index functions ('pop10' and 'nhblk10')
df = census_2010.loc[census_2010['county'] == "Riverside County",
                     ['trtid10', 'tract', 'pop10', 'nhblk10']]
df.head()

Unnamed: 0,trtid10,tract,pop10,nhblk10
5727,6065042012,Census Tract 420.12,6242,677
5729,6065041911,Census Tract 419.11,10258,844
5731,6065041910,Census Tract 419.10,6342,405
5733,6065040816,Census Tract 408.16,2594,346
5735,6065040815,Census Tract 408.15,3586,429


# Segregation Indices: Preliminary look at distance from randomness

In [5]:
# Simulation settings
m = 500             # number of simulated samples
n = 100             # number of units in each sample
n_unit_each = 10    # total population of every unit
p = 0.3             # probability used for the binomial draws

# Containers for the index values of each simulated sample (filled in the next cell)
Ds, Gs = np.empty(m), np.empty(m)

# Template frame: n units of equal size; group_pop_var is a placeholder that is
# overwritten by each simulated draw
d = dict(group_pop_var=1, total_pop_var=[n_unit_each] * n)
data = pd.DataFrame(d)
data.head()

Unnamed: 0,group_pop_var,total_pop_var
0,1,10
1,1,10
2,1,10
3,1,10
4,1,10


In [6]:
for i in range(m):
    # Draw the group count of every unit from Binomial(n_unit_each, p)
    freq_sim = np.random.binomial(n=np.full((1, n), n_unit_each),
                                  p=np.full((1, n), p),
                                  size=(1, n))[0].tolist()
    data = data.assign(group_pop_var=freq_sim)

    # Store both indices of the simulated sample
    test = calculate_segregation(data, 'group_pop_var', 'total_pop_var')
    Ds[i], Gs[i] = test['Dissimilarity (D)'], test['Gini (G)']

In [7]:
# Mean of the simulated Dissimilarity (D) values stored in Ds, to compare with the table of
# Carrington, William J., and Kenneth R. Troske. "On measuring segregation in samples with small units." Journal of Business & Economic Statistics 15.4 (1997): 402-409.
Ds.mean()

0.2726069812270854

In [8]:
# Mean of the simulated Gini (G) values stored in Gs, to compare with the table of
# Carrington, William J., and Kenneth R. Troske. "On measuring segregation in samples with small units." Journal of Business & Economic Statistics 15.4 (1997): 402-409.
Gs.mean()

0.3802093168937524

## Construction of the Modified Dissimilarity Index

The Modified Dissimilarity Index ($Dct$), based on *Carrington, William J., and Kenneth R. Troske. "On measuring segregation in samples with small units." Journal of Business & Economic Statistics 15.4 (1997): 402-409.*, evaluates the deviation from simulated evenness. It is estimated by taking the mean of the classical $D$ over several simulations drawn under evenness from the global minority proportion.

Let $D^*$ be the average of the classical Dissimilarity Index ($D$) over simulations drawn assuming evenness from the global minority proportion. The value of $Dct$ can be evaluated with the following equation:

\begin{equation}
  Dct =
  \begin{cases}
    \frac{D-D^*}{1-D^*} & \text{if $D \geqslant D^*$} \\
    \frac{D-D^*}{D^*} & \text{if $D < D^*$}
  \end{cases}
\end{equation}

In [9]:
def calculate_dissim(data, group_pop_var, total_pop_var):
    """
    Calculation of Dissimilarity index

    Parameters
    ----------

    data          : a pandas DataFrame
                    One row per unit.

    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest

    total_pop_var : string
                    The name of variable in data that contains the total population of the unit

    Returns
    -------

    d : float
        Dissimilarity Index

    Raises
    ------

    TypeError
        If group_pop_var or total_pop_var is not a string.

    ValueError
        If one of the variables is not a column of data, or if the group
        population exceeds the total population in any unit.

    Notes
    -----
    Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315.

    The input frame is not modified; the computation runs on a renamed copy.

    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_
    if not (isinstance(group_pop_var, str) and isinstance(total_pop_var, str)):
        raise TypeError('group_pop_var and total_pop_var must be strings')

    if (group_pop_var not in data.columns) or (total_pop_var not in data.columns):
        raise ValueError('group_pop_var and total_pop_var must be variables of data')

    data = data.rename(columns={group_pop_var: 'group_pop_var',
                                total_pop_var: 'total_pop_var'})

    if (data.total_pop_var < data.group_pop_var).any():
        raise ValueError('Group of interest population must equal or lower than the total population of the units.')

    T = data.total_pop_var.sum()
    P = data.group_pop_var.sum() / T

    # If a unit has zero population, the group of interest frequency is zero
    pi = np.where(data.total_pop_var == 0, 0, data.group_pop_var / data.total_pop_var)

    # Population-weighted absolute deviation of each unit's share from the overall share
    D = ((data.total_pop_var * abs(pi - P)) / (2 * T * P * (1 - P))).sum()

    return D

In [10]:
def calculate_modified_dissimilarity(data, group_pop_var, total_pop_var, iterations = 500):
    """
    Calculation of the Modified Dissimilarity index (Dct)

    Parameters
    ----------

    data          : a pandas DataFrame

    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest

    total_pop_var : string
                    The name of variable in data that contains the total population of the unit

    iterations    : int
                    Number of simulated samples used to estimate D_star. Must be at least 1.

    Returns
    -------

    dct : float
          (D - D_star) / (1 - D_star) if D >= D_star, otherwise (D - D_star) / D_star,
          where D is the Dissimilarity index of data and D_star is the mean
          Dissimilarity index of the simulated samples.

    Raises
    ------

    ValueError
        If iterations is lower than 1 (an empty simulation would silently give NaN).

    Notes
    -----
    Based on Carrington, William J., and Kenneth R. Troske. "On measuring segregation in samples with small units." Journal of Business & Economic Statistics 15.4 (1997): 402-409.

    The simulated samples are drawn with np.random.binomial, i.e. from NumPy's
    global random state; call np.random.seed beforehand for reproducible results.

    """
    if iterations < 1:
        raise ValueError('iterations must be at least 1')

    D = calculate_dissim(data, group_pop_var, total_pop_var)

    data = data.rename(columns={group_pop_var: 'group_pop_var',
                                total_pop_var: 'total_pop_var'})

    # Overall share of the group: success probability of every simulated draw
    p_null = data.group_pop_var.sum() / data.total_pop_var.sum()

    # Loop invariants: the unit totals never change between simulations
    n_units = data.shape[0]
    unit_totals = np.array([data.total_pop_var.tolist()])
    unit_probs = np.full((1, n_units), p_null)

    Ds = np.empty(iterations)

    for i in range(iterations):
        # Group count of each unit drawn from Binomial(total of the unit, p_null)
        freq_sim = np.random.binomial(n = unit_totals,
                                      p = unit_probs,
                                      size = (1, n_units))[0]
        simulated = data.assign(group_pop_var = freq_sim)
        Ds[i] = calculate_dissim(simulated, 'group_pop_var', 'total_pop_var')

    D_star = Ds.mean()

    if D >= D_star:
        Dct = (D - D_star)/(1 - D_star)
    else:
        Dct = (D - D_star)/D_star

    return Dct

In [11]:
# Seed NumPy's global random state first: the function draws its simulated samples
# with np.random.binomial, so the result is only reproducible with a fixed seed.
np.random.seed(1234)
calculate_modified_dissimilarity(df, 'nhblk10', 'pop10')

0.30009504639081996

# Calculating Modified Gini

The Modified Gini ($Gct$), based on *Carrington, William J., and Kenneth R. Troske. "On measuring segregation in samples with small units." Journal of Business & Economic Statistics 15.4 (1997): 402-409.*, evaluates the deviation from simulated evenness. It is estimated by taking the mean of the classical Gini segregation index $G$ over several simulations drawn under evenness from the global minority proportion.

Let $G^*$ be the average of the Gini Segregation Index ($G$) over simulations drawn assuming evenness from the global minority proportion. The value of $Gct$ can be evaluated with the following equation:

\begin{equation}
  Gct =
  \begin{cases}
    \frac{G-G^*}{1-G^*} & \text{if $G \geqslant G^*$} \\
    \frac{G-G^*}{G^*} & \text{if $G < G^*$}
  \end{cases}
\end{equation}

In [12]:
def calculate_gini_seg(data, group_pop_var, total_pop_var):
    """
    Calculation of Gini Segregation index

    Parameters
    ----------

    data          : a pandas DataFrame
                    One row per unit.

    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest

    total_pop_var : string
                    The name of variable in data that contains the total population of the unit

    Returns
    -------

    g : float
        Gini Segregation Index

    Raises
    ------

    TypeError
        If group_pop_var or total_pop_var is not a string.

    ValueError
        If one of the variables is not a column of data, or if the group
        population exceeds the total population in any unit.

    Notes
    -----
    Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315.

    The pairwise term is evaluated on an n x n array (n = number of units),
    so memory use grows quadratically with the number of units.

    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_
    if not (isinstance(group_pop_var, str) and isinstance(total_pop_var, str)):
        raise TypeError('group_pop_var and total_pop_var must be strings')

    if (group_pop_var not in data.columns) or (total_pop_var not in data.columns):
        raise ValueError('group_pop_var and total_pop_var must be variables of data')

    data = data.rename(columns={group_pop_var: 'group_pop_var',
                                total_pop_var: 'total_pop_var'})

    if (data.total_pop_var < data.group_pop_var).any():
        raise ValueError('Group of interest population must equal or lower than the total population of the units.')

    T = data.total_pop_var.sum()
    P = data.group_pop_var.sum() / T

    # If a unit has zero population, the group of interest frequency is zero
    pi = np.asarray(np.where(data.total_pop_var == 0, 0, data.group_pop_var / data.total_pop_var),
                    dtype=float)

    # Float weights: the pairwise product ti * tj of narrow integer columns
    # (e.g. int32) would otherwise wrap around silently for large units.
    ti = np.asarray(data.total_pop_var, dtype=float)

    # sum_i sum_j ti * tj * |pi - pj|
    num = (np.outer(ti, ti) * np.abs(pi[:, np.newaxis] - pi[np.newaxis, :])).sum()
    # 2.0 * T * T keeps the denominator in floating point (no integer overflow of T**2)
    den = 2.0 * T * T * P * (1 - P)
    G = num / den

    return G

In [13]:
def calculate_modified_gini_seg(data, group_pop_var, total_pop_var, iterations = 500):
    """
    Calculation of the Modified Gini Segregation index (Gct)

    Parameters
    ----------

    data          : a pandas DataFrame

    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest

    total_pop_var : string
                    The name of variable in data that contains the total population of the unit

    iterations    : int
                    Number of simulated samples used to estimate G_star. Must be at least 1.

    Returns
    -------

    gct : float
          (G - G_star) / (1 - G_star) if G >= G_star, otherwise (G - G_star) / G_star,
          where G is the Gini Segregation index of data and G_star is the mean
          Gini Segregation index of the simulated samples.

    Raises
    ------

    ValueError
        If iterations is lower than 1 (an empty simulation would silently give NaN).

    Notes
    -----
    Based on Carrington, William J., and Kenneth R. Troske. "On measuring segregation in samples with small units." Journal of Business & Economic Statistics 15.4 (1997): 402-409.

    The simulated samples are drawn with np.random.binomial, i.e. from NumPy's
    global random state; call np.random.seed beforehand for reproducible results.

    """
    if iterations < 1:
        raise ValueError('iterations must be at least 1')

    G = calculate_gini_seg(data, group_pop_var, total_pop_var)

    data = data.rename(columns={group_pop_var: 'group_pop_var',
                                total_pop_var: 'total_pop_var'})

    # Overall share of the group: success probability of every simulated draw
    p_null = data.group_pop_var.sum() / data.total_pop_var.sum()

    # Loop invariants: the unit totals never change between simulations
    n_units = data.shape[0]
    unit_totals = np.array([data.total_pop_var.tolist()])
    unit_probs = np.full((1, n_units), p_null)

    Gs = np.empty(iterations)

    for i in range(iterations):
        # Group count of each unit drawn from Binomial(total of the unit, p_null)
        freq_sim = np.random.binomial(n = unit_totals,
                                      p = unit_probs,
                                      size = (1, n_units))[0]
        simulated = data.assign(group_pop_var = freq_sim)
        Gs[i] = calculate_gini_seg(simulated, 'group_pop_var', 'total_pop_var')

    G_star = Gs.mean()

    if G >= G_star:
        Gct = (G - G_star)/(1 - G_star)
    else:
        Gct = (G - G_star)/G_star

    return Gct

In [14]:
# Seed NumPy's global random state first: the function draws its simulated samples
# with np.random.binomial, so the result is only reproducible with a fixed seed.
np.random.seed(1234)
calculate_modified_gini_seg(df, 'nhblk10', 'pop10')

0.4280279611418648