In [11]:
import pandas as pd
import numpy as np

In [12]:
# pandas display limits used for every frame shown in this notebook.
display_options = {
    'display.max_columns': 100,
    'display.max_colwidth': 500,
    'display.width': 5000,
    'display.max_rows': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

### Rates Calculator

TODO: revisit the rate definitions later in the research to check what other ways these rates are calculated.

In [23]:
import pandas as pd
import  numpy as np


def create_rates(df, var_list, pop_var=None, var_group=None):
    """Add a ``<var>_rate`` column to ``df`` for every variable in ``var_list``.

    Each rate is ``(df[var] / df[pop_var]) * multiplier``; the multiplier is
    100000 when ``var_group == 'crime'`` and 10000 for any other value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the count columns and the denominator column. It is
        modified in place: the rate columns are added to this object.
    var_list : iterable
        Names of the count columns to turn into rates.
    pop_var : str
        Name of the denominator column. Required whenever ``var_list`` is
        non-empty, even though the signature defaults it to ``None``.
    var_group : str, optional
        Pass ``'crime'`` to use the 100000 multiplier instead of 10000.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with the rate columns added.

    Raises
    ------
    KeyError
        If ``pop_var`` or any entry of ``var_list`` is not a column of ``df``.
        The check runs before anything is written, so ``df`` is never left
        with only some of the rate columns.

    Notes
    -----
    A zero denominator is not special-cased here; the division result
    (``inf`` or ``NaN``) is stored as-is and left for the caller to handle.
    """
    rate_multiplier = 100000 if var_group == 'crime' else 10000

    # Names are passed through string formatting, matching how the
    # '<name>_rate' output columns are built below.
    count_cols = [f'{rate_var}' for rate_var in var_list]
    if not count_cols:
        # Nothing to compute; the denominator is never needed in this case.
        return df

    pop_col = f'{pop_var}'

    # Validate everything up front so a bad name cannot leave df half-updated.
    missing = [col for col in [pop_col, *count_cols] if col not in df.columns]
    if missing:
        hint = ' (pop_var was not provided)' if pop_var is None else ''
        raise KeyError(f'create_rates: column(s) not found in df: {missing}{hint}')

    # The denominator is the same for every rate, so look it up once.
    denominator = df[pop_col]
    for col in count_cols:
        df[f'{col}_rate'] = (df[col] / denominator) * rate_multiplier

    return df
        
def get_all_rates(fl_path):
    """Read the CSV at ``fl_path`` and append a ``<var>_rate`` column per variable.

    The variables are organised into groups, each divided by its own
    denominator column through ``create_rates``. Groups are processed in the
    order listed below, so the rate columns are appended in that order.

    Parameters
    ----------
    fl_path : str
        Path of the CSV file to load.

    Returns
    -------
    pandas.DataFrame
        The loaded frame plus the rate columns, with ``np.inf`` replaced by 0.
    """
    # Crime counts (the only group passed with var_group='crime').
    crime_vars = ['rape', 'robbery', 'simple_assault', 'burglary', 'larceny',
                  'motor_vehicle_theft', 'agg_assault', 'murder',
                  'violent_crime', 'property_crime', 'total_crime']

    # Total arrests, school enrolment and employment.
    arrests_total_vars = ['agg_assault_tot_arrests', 'burglary_tot_arrests',
                          'mtr_veh_theft_tot_arrests', 'murder_tot_arrests',
                          'rape_tot_arrests', 'robbery_tot_arrests',
                          'drug_tot_arrests', 'sale_drug_tot_arrests',
                          'poss_drug_tot_arrests', 'larceny_theft_tot_arrests',
                          'disorder_tot_arrests', 'violent_tot_arrests',
                          'property_tot_arrests', 'total_crime_tot_arrests',
                          'schl_enrl_tot', 'schl_enrl_ugrad', 'schl_enrl_grad',
                          'emp_total']

    # Black arrests and employment.
    arrests_black_vars = ['agg_assault_tot_arrests_black',
                          'burglary_tot_arrests_black',
                          'mtr_veh_theft_tot_arrests_black',
                          'murder_tot_arrests_black',
                          'rape_tot_arrests_black',
                          'robbery_tot_arrests_black',
                          'drug_tot_arrests_black',
                          'sale_drug_tot_arrests_black',
                          'poss_drug_tot_arrests_black',
                          'larceny_theft_tot_arrests_black',
                          'disorder_tot_arrests_black',
                          'violent_tot_arrests_black',
                          'property_tot_arrests_black',
                          'total_crime_tot_arrests_black',
                          'emp_total_black']

    # White arrests and employment.
    arrests_white_vars = ['burglary_tot_arrests_white',
                          'agg_assault_tot_arrests_white',
                          'mtr_veh_theft_tot_arrests_white',
                          'murder_tot_arrests_white',
                          'rape_tot_arrests_white',
                          'robbery_tot_arrests_white',
                          'drug_tot_arrests_white',
                          'sale_drug_tot_arrests_white',
                          'poss_drug_tot_arrests_white',
                          'larceny_theft_tot_arrests_white',
                          'disorder_tot_arrests_white',
                          'violent_tot_arrests_white',
                          'property_tot_arrests_white',
                          'total_crime_tot_arrests_white',
                          'emp_total_white']

    # Educational attainment.
    # NOTE(review): the variable is named 18_24 but is divided by total_15_24 - confirm intended.
    educ_vars = ['educ_atnmnt_18_24_total']

    # Incarceration counts.
    incarc_vars = ['prison_occupancy_count', 'jail_occupancy_count']

    # (variables, denominator column, var_group), applied top to bottom.
    rate_groups = [
        (crime_vars, 'population', 'crime'),
        (arrests_total_vars, 'population', None),
        (arrests_black_vars, 'black_total', None),
        (arrests_white_vars, 'white_total', None),
        (educ_vars, 'total_15_24', None),
        (incarc_vars, 'population', None),
    ]

    main_df = pd.read_csv(fl_path)
    for group_vars, denominator, group_name in rate_groups:
        main_df = create_rates(main_df, var_list=group_vars,
                               pop_var=denominator, var_group=group_name)

    # Some populations are zero, so the division yields infinity; those become 0.
    # The replacement covers every column of the frame (not just the new rate
    # columns) and only touches +inf; NaN and -inf are left as they are.
    return main_df.replace(np.inf, 0)

### Perc Change Calculator

In [32]:
def calc_perc_change(df):
    """Compute, per ORI, each value's percent change relative to that ORI's first row.

    Rows are ordered by ``ORI`` then ``year`` before the calculation, so the
    baseline of each ORI is its lowest-year row. For every measured column
    the result is ``((x - x[0]) / x[0]) * 100`` and is stored under the
    original column name prefixed with ``pc_``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ORI``, ``year`` and every column listed in
        ``value_cols`` below. NOTE: it is sorted in place, so the caller's
        frame is reordered as a side effect.

    Returns
    -------
    pandas.DataFrame
        ``ORI`` and ``year`` followed by one ``pc_<column>`` per measured
        column, in the sorted row order.
    """
    id_cols = ['ORI', 'year']
    value_cols = [
        'population', 'male_total', 'female_total',
        'black_total', 'black_m_total', 'black_f_total',
        'white_total', 'white_m_total', 'white_f_total',
        'males_15_24', 'females_15_24',
        'black_15_24_m', 'black_15_24_f', 'white_15_24_m', 'white_15_24_f',
        'rape', 'robbery', 'simple_assault', 'burglary', 'larceny',
        'motor_vehicle_theft', 'agg_assault', 'arson', 'murder',
        'schl_enrl_tot', 'schl_enrl_ugrad', 'schl_enrl_grad',
        'educ_atnmnt_25ao_b', 'educ_atnmnt_25ao_w',
        'emp_total', 'emp_total_black', 'emp_total_white',
        'pci', 'pci_black', 'pci_white',
        'agg_assault_tot_arrests', 'agg_assault_tot_arrests_black', 'agg_assault_tot_arrests_white',
        'burglary_tot_arrests', 'burglary_tot_arrests_black', 'burglary_tot_arrests_white',
        'mtr_veh_theft_tot_arrests', 'mtr_veh_theft_tot_arrests_black', 'mtr_veh_theft_tot_arrests_white',
        'murder_tot_arrests', 'murder_tot_arrests_black', 'murder_tot_arrests_white',
        'rape_tot_arrests', 'rape_tot_arrests_black', 'rape_tot_arrests_white',
        'robbery_tot_arrests', 'robbery_tot_arrests_black', 'robbery_tot_arrests_white',
        'drug_tot_arrests', 'drug_tot_arrests_black', 'drug_tot_arrests_white',
        'sale_drug_tot_arrests', 'sale_drug_tot_arrests_black', 'sale_drug_tot_arrests_white',
        'poss_drug_tot_arrests', 'poss_drug_tot_arrests_black', 'poss_drug_tot_arrests_white',
        'larceny_theft_tot_arrests', 'larceny_theft_tot_arrests_black', 'larceny_theft_tot_arrests_white',
        'disorder_tot_arrests', 'disorder_tot_arrests_black', 'disorder_tot_arrests_white',
        'white_15_24', 'black_15_24', 'total_15_24',
        'educ_atnmnt_18_24_total',
        'tot_felonies_agency', 'tot_misdemeanors_agency',
        'tot_felonies_cnty', 'tot_misdemeanors_cnty', 'tot_major_offenses_cnty',
        'perc_felonies', 'perc_misdemeanors',
        'total_jail_pop', 'total_prison_pop',
        'prison_occupancy_count', 'jail_occupancy_count',
        'violent_crime', 'property_crime', 'total_crime',
        'violent_tot_arrests', 'property_tot_arrests', 'total_crime_tot_arrests',
        'violent_tot_arrests_black', 'property_tot_arrests_black', 'total_crime_tot_arrests_black',
        'violent_tot_arrests_white', 'property_tot_arrests_white', 'total_crime_tot_arrests_white',
    ]

    def _pct_change_from_first(values):
        # ((x - x[0]) / x[0]) * 100, computed as (x / x[0] - 1) * 100
        return values.div(values.iloc[0]).subtract(1).mul(100)

    # Order each ORI's rows from lowest to highest year (mutates the caller's frame).
    df.sort_values(id_cols, ascending=[True, True], inplace=True)

    # Group by ORI and express every value relative to the group's first row.
    measured = df.loc[:, ['ORI'] + value_cols]
    pct_change = measured.groupby('ORI').transform(_pct_change_from_first)

    # Prefix the percent-change columns with 'pc_'.
    pct_change = pct_change.add_prefix('pc_')

    # Put the identifier columns from the original frame in front.
    identifiers = df.loc[:, id_cols]
    return pd.concat([identifiers, pct_change], axis=1)

https://www3.nd.edu/~rwilliam/stats2/Panel.pdf

If the xtreg command did not exist, we could estimate a fixed effects model by using OLS
regression with the demeaning approach. 

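Stata's `xtreg` with the `fe` option performs this demeaning (the within transformation) automatically, so it does not have to be done by hand.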

### Agency Categorizer

In [36]:
def categorize_agencies_by_pop_mean(fl_path, op_path, fl_name):
    """Split agencies into size buckets by their mean population and write each bucket to CSV.

    The mean of ``population`` is computed per ``ORI`` over all rows in the
    input file and attached to every row of that ORI as ``population_mean``.
    Four files are written to ``op_path``:

    * ``<fl_name>_pop_mean.csv``      - all rows plus ``population_mean``
    * ``<fl_name>_small_cities.csv``  - 10000 <= population_mean < 50000
    * ``<fl_name>_medium_cities.csv`` - 50000 <= population_mean < 100000
    * ``<fl_name>_large_cities.csv``  - population_mean >= 100000

    Agencies whose mean population is below 10000 appear only in the
    ``_pop_mean`` file. The number of distinct ORIs in each bucket is printed.

    Parameters
    ----------
    fl_path : str
        CSV file to read; must contain ``ORI`` and ``population`` columns.
    op_path : str
        Directory the output files are written to.
    fl_name : str
        Prefix used for the output file names.

    Returns
    -------
    None
    """
    fnl_main = pd.read_csv(fl_path)

    # Mean population per ORI across every year present in the file. The
    # built-in groupby mean is used rather than passing np.mean as a callable,
    # which newer pandas releases warn about.
    population_mean_by_ori = (
        fnl_main.groupby('ORI')['population']
        .mean()
        .rename('population_mean')
        .reset_index()
    )

    # Attach the per-ORI mean to every row of that ORI.
    fnl_main_population_mn = fnl_main.merge(population_mean_by_ori, on='ORI')
    fnl_main_population_mn.to_csv(f'{op_path}/{fl_name}_pop_mean.csv', index=False)

    # (printed label, output file suffix, row filter)
    size_buckets = [
        ('small_size_cities', 'small_cities',
         'population_mean >= 10000 & population_mean < 50000'),
        ('med_size_cities', 'medium_cities',
         'population_mean >= 50000 & population_mean < 100000'),
        ('large_size_cities', 'large_cities',
         'population_mean >= 100000'),
    ]
    for label, file_suffix, row_filter in size_buckets:
        bucket = fnl_main_population_mn.query(row_filter).reset_index(drop=True)
        print(f'{label}: ', bucket['ORI'].nunique())
        bucket.to_csv(f'{op_path}/{fl_name}_{file_suffix}.csv', index=False)
    

### 10_20 

### pc

In [33]:
# Input core file and the output that gains the pc_ columns.
core_csv_path = '/Users/salma/Research/dissertation_work/data/final/final_10_20_all_rep_oris_gte_10k_core.csv'
counts_pc_csv_path = '/Users/salma/Research/dissertation_work/data/final/10_20/final_10_20_all_rep_oris_core_counts_pc.csv'

df = pd.read_csv(core_csv_path)

# calc_perc_change sorts `df` in place by ORI/year and returns ORI, year and the pc_ columns.
df_pc = calc_perc_change(df)

# Attach the percent-change columns to the original rows and save the result.
df_pc_merged = df.merge(df_pc, on=['ORI', 'year'], how='left')
df_pc_merged.to_csv(counts_pc_csv_path, index=False)

# Row count and full column list, as a quick check of the merge.
print(len(df_pc_merged))
print(df_pc_merged.columns.tolist())

48928
['ORI', 'agency_name', 'state', 'year', 'state_fips', 'longitude', 'latitude', 'county_fips', 'place_fips', 'population', 'male_total', 'female_total', 'black_total', 'black_m_total', 'black_f_total', 'white_total', 'white_m_total', 'white_f_total', 'males_15_24', 'females_15_24', 'black_15_24_m', 'black_15_24_f', 'white_15_24_m', 'white_15_24_f', 'rape', 'robbery', 'simple_assault', 'burglary', 'larceny', 'motor_vehicle_theft', 'agg_assault', 'arson', 'murder', 'schl_enrl_tot', 'schl_enrl_ugrad', 'schl_enrl_grad', 'educ_atnmnt_25ao_b', 'educ_atnmnt_25ao_w', 'emp_total', 'emp_total_black', 'emp_total_white', 'pci', 'pci_black', 'pci_white', 'agg_assault_tot_arrests', 'agg_assault_tot_arrests_black', 'agg_assault_tot_arrests_white', 'burglary_tot_arrests', 'burglary_tot_arrests_black', 'burglary_tot_arrests_white', 'mtr_veh_theft_tot_arrests', 'mtr_veh_theft_tot_arrests_black', 'mtr_veh_theft_tot_arrests_white', 'murder_tot_arrests', 'murder_tot_arrests_black', 'murder_tot_arrests

### rates

In [34]:
# Input: the counts + pc file; output: the same data with the rate columns added.
rates_input_csv_path = '/Users/salma/Research/dissertation_work/data/final/10_20/final_10_20_all_rep_oris_core_counts_pc.csv'
rates_output_csv_path = '/Users/salma/Research/dissertation_work/data/final/10_20/final_10_20_all_rep_oris_core_counts_pc_rates.csv'

df_rates = get_all_rates(fl_path=rates_input_csv_path)
df_rates.to_csv(rates_output_csv_path, index=False)

# Row count and full column list, as a quick check of the result.
print(len(df_rates))
print(df_rates.columns.tolist())

48928
['ORI', 'agency_name', 'state', 'year', 'state_fips', 'longitude', 'latitude', 'county_fips', 'place_fips', 'population', 'male_total', 'female_total', 'black_total', 'black_m_total', 'black_f_total', 'white_total', 'white_m_total', 'white_f_total', 'males_15_24', 'females_15_24', 'black_15_24_m', 'black_15_24_f', 'white_15_24_m', 'white_15_24_f', 'rape', 'robbery', 'simple_assault', 'burglary', 'larceny', 'motor_vehicle_theft', 'agg_assault', 'arson', 'murder', 'schl_enrl_tot', 'schl_enrl_ugrad', 'schl_enrl_grad', 'educ_atnmnt_25ao_b', 'educ_atnmnt_25ao_w', 'emp_total', 'emp_total_black', 'emp_total_white', 'pci', 'pci_black', 'pci_white', 'agg_assault_tot_arrests', 'agg_assault_tot_arrests_black', 'agg_assault_tot_arrests_white', 'burglary_tot_arrests', 'burglary_tot_arrests_black', 'burglary_tot_arrests_white', 'mtr_veh_theft_tot_arrests', 'mtr_veh_theft_tot_arrests_black', 'mtr_veh_theft_tot_arrests_white', 'murder_tot_arrests', 'murder_tot_arrests_black', 'murder_tot_arrests

### categories

In [37]:
# Split the counts + pc + rates file into small / medium / large city files
# written to the same 10_20 directory.
categories_output_dir = '/Users/salma/Research/dissertation_work/data/final/10_20'
categories_file_stem = 'final_10_20_all_rep_oris_core_counts_pc_rates'
categories_input_csv_path = '/Users/salma/Research/dissertation_work/data/final/10_20/final_10_20_all_rep_oris_core_counts_pc_rates.csv'

categorize_agencies_by_pop_mean(
    fl_path=categories_input_csv_path,
    op_path=categories_output_dir,
    fl_name=categories_file_stem,
)


small_size_cities:  3323
med_size_cities:  680
large_size_cities:  445
