# City Analysis Report Data Prep

This notebook contains the data-preparation pipeline for the city analysis report. It takes the existing data from all departments and produces several useful helper tables.

#### Imports and global variables

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

In [2]:
# Short display label for each awarding agency, keyed by the agency's full name.
# The labels are used as prefixes when program matches are formatted.
agencies = dict([
    ("Department of Labor", "Labor"),
    ("Department of Justice", "DOJ"),
    ("Department of Health and Human Services", "HHS"),
    ("Department of Housing and Urban Development", "HUD"),
    ("Department of Education", "DOE"),
])

In [3]:
# Load the combined contract data for all departments (one parquet file, all columns)
all_data = pd.read_parquet(path='clean_data/clean_all_dept_all_contracts_all_cols.parquet')

#### Basic Data Cleaning

In [4]:
# Rows without a program match are labelled 'other'; the city-level CVI summary
# later excludes exactly this label.
all_data['program_match'] = all_data['program_match'].fillna('other')

In [5]:
# Award permalinks that are removed from the analysis entirely.
# NOTE(review): the reason these four awards are excluded is not recorded in
# this notebook -- TODO document it here.
grants_to_exclude = [
    'https://www.usaspending.gov/award/ASST_NON_B-21-DF-22-0001_8620/',
    'https://www.usaspending.gov/award/ASST_NON_B-18-DP-22-0001_8620/',
    'https://www.usaspending.gov/award/ASST_NON_B-21-DZ-22-0001_8620/',
    'https://www.usaspending.gov/award/ASST_NON_B-22-DF-22-0001_8620/',
]

# Take an explicit copy of the filtered rows: new columns are assigned to
# `all_data` in later cells, and assigning to a boolean-mask slice can raise
# SettingWithCopyWarning (and is ambiguous about which object gets modified).
all_data = all_data[~all_data.usaspending_permalink.isin(grants_to_exclude)].copy()

In [None]:
# Build a title-cased "City, State" key from the place-of-performance columns.
# If either part is missing the concatenation is missing too; `.str.title()`
# keeps it missing, whereas `str(x).title()` would turn it into the literal
# string 'Nan' and make it look like a real city to groupby/dropna downstream.
all_data['city_state'] = (
    all_data['primary_place_of_performance_city_name']
    + ', '
    + all_data['primary_place_of_performance_state_name']
).str.title()

#### Summary Table Generation

In [6]:
# One row per (city, awarding agency, program match) with:
#   count                            - number of rows in the group
#   count_open                       - sum of the `grant_is_open` column
#   total_estimated_remaining_funds  - sum of `estimated_remaining_funds`
#   total_obligated_funds            - sum of `total_obligated_amount`
grant_summary_table = (
    all_data
    .groupby(['city_state', 'awarding_agency_name', 'program_match'])
    .agg(
        count=pd.NamedAgg(column='program_match', aggfunc='size'),
        count_open=pd.NamedAgg(column='grant_is_open', aggfunc='sum'),
        total_estimated_remaining_funds=pd.NamedAgg(column='estimated_remaining_funds', aggfunc='sum'),
        total_obligated_funds=pd.NamedAgg(column='total_obligated_amount', aggfunc='sum'),
    )
    .reset_index()
)

In [None]:
# Row-wise formatter for program matches, e.g. 'DOJ - Cops Hiring Program'
def gen_new_program_match(row):
    """Return '<agency label> - <program match>' for one summary-table row.

    `row` must expose `awarding_agency_name` and `program_match`. The agency
    name is looked up in the module-level `agencies` mapping, so a name that
    is not in that mapping raises KeyError.
    """
    agency_label = agencies[row.awarding_agency_name]
    return '{} - {}'.format(agency_label, row.program_match)

In [None]:
# Step 1: title-case every program match.
grant_summary_table['program_match'] = grant_summary_table['program_match'].map(str.title)
# Step 2: prefix each one with its agency label (row-wise, needs the agency column).
grant_summary_table['program_match'] = grant_summary_table.apply(gen_new_program_match, axis=1)

In [7]:
# City-level obligated funding across all grants.
city_summary_table_total = (
    all_data
    .groupby(['city_state'])
    .agg(total_obligated_funds=pd.NamedAgg(column='total_obligated_amount', aggfunc='sum'))
    .reset_index()
)

# City-level obligated funding for CVI grants only, i.e. every row whose
# program match is not the 'other' placeholder.
city_summary_table_cvi = (
    all_data
    .loc[all_data['program_match'] != 'other']
    .groupby(['city_state'])
    .agg(cvi_obligated_funds=pd.NamedAgg(column='total_obligated_amount', aggfunc='sum'))
    .reset_index()
)

# Left join keeps every city; a city with no CVI rows gets NaN (not 0) in
# `cvi_obligated_funds`.
city_summary_table = pd.merge(
    city_summary_table_total,
    city_summary_table_cvi,
    how='left',
    on='city_state',
)

#### Import gun violence dataframe and cleaning

In [8]:
# Cities in the gun data set that have no USA Spending match and must be dropped.
gun_df_cities_to_exclude = [
    'Louisville/Jefferson County, Kentucky',
    'Nashville-Davidson, Tennessee',
    'Urban Honolulu CDP, Hawaii'
]

# Load, keep only the four columns we use, drop rows with any missing value,
# remove the unmatched cities, and renumber the index from 0.
gun_df = (
    pd.read_csv("usa_spending_contract_data/gun_data.csv")
    [['city_state', 'fatal_shootings', 'avg_popn', 'rate_per_100k']]
    .dropna(axis=0)
    .loc[lambda frame: ~frame['city_state'].isin(gun_df_cities_to_exclude)]
    .reset_index(drop=True)
)

#### Validation: do all gun dataset cities match a city in the USA Spending data set?

In [9]:
# Title-cased set of every city key present in the spending data
unique_city_states = all_data['city_state'].dropna().str.title().unique()

# For each gun-data city: does its title-cased key appear in the spending data?
is_in_unique_city_states = gun_df['city_state'].str.title().isin(unique_city_states)

cities_not_in_unique_city_states = gun_df.loc[~is_in_unique_city_states, 'city_state']

# Fail loudly, and name the offending cities, so a mismatch can be fixed
# (or added to the exclusion list) without re-running the check by hand.
assert len(cities_not_in_unique_city_states) == 0, (
    'gun_df cities with no match in the USA Spending data: '
    f'{cities_not_in_unique_city_states.tolist()}'
)

#### Produce a merged dataframe with gun data and spending data, by city, combined

In [None]:
# The spending tables key cities by a title-cased 'City, State' string, so the
# gun data must be joined on the same title-cased key. Filtering on the
# title-cased key but merging on the raw one would let a city whose raw name is
# not already title-cased pass the filter and then receive all-NaN gun columns.
gun_df_title_keyed = gun_df.assign(city_state=gun_df['city_state'].str.title())

# Subset to just our city matches, then merge data together
city_summary_table = city_summary_table[
    city_summary_table.city_state.isin(gun_df_title_keyed['city_state'])
]

merged_df = city_summary_table.merge(gun_df_title_keyed, how='left', on='city_state')

In [10]:
# Derived metrics for the city-level table.
# Funding per homicide: funding total divided by fatal shootings. Per the
# original notes both cover the same multi-year window, so the ratio reads as
# annual funding per one annual homicide and needs no extra scaling.
merged_df['total_funding_per_hom'] = merged_df['total_obligated_funds'].div(merged_df['fatal_shootings'])
merged_df['cvi_funding_per_hom'] = merged_df['cvi_obligated_funds'].div(merged_df['fatal_shootings'])

# Funding per person: the extra division by 5 converts the 5-year funding
# totals to an annual figure.
merged_df['total_funding_per_person'] = merged_df['total_obligated_funds'].div(merged_df['avg_popn']).div(5)
merged_df['cvi_funding_per_person'] = merged_df['cvi_obligated_funds'].div(merged_df['avg_popn']).div(5)

# Finally turn fatal shootings into a rounded annual value. This must stay
# last, because the per-homicide ratios above use the unscaled count.
merged_df['fatal_shootings'] = merged_df['fatal_shootings'].div(5).round()

In [None]:
# Write the city-level table to disk (row index omitted)
merged_df.to_csv(path_or_buf='clean_data/city_summary_table.csv', index=False)

#### Produce a merged dataframe with gun data and spending data, by city/funding dept/grant, combined

In [11]:
# The spending tables key cities by a title-cased 'City, State' string, so the
# gun data must be joined on the same title-cased key. Filtering on the
# title-cased key but merging on the raw one would let a city whose raw name is
# not already title-cased pass the filter and then receive all-NaN gun columns.
gun_df_title_keyed = gun_df.assign(city_state=gun_df['city_state'].str.title())

# Subset to just our city matches, then merge data together
grant_summary_table = grant_summary_table[
    grant_summary_table.city_state.isin(gun_df_title_keyed['city_state'])
]

merged_df = grant_summary_table.merge(gun_df_title_keyed, how='left', on='city_state')

In [None]:
# Derived metrics for the grant-level table.
# Funding per homicide uses the unscaled fatal-shootings count, so it is
# computed before that column is rescaled below.
merged_df['funding_per_hom'] = merged_df['total_obligated_funds'].div(merged_df['fatal_shootings'])
# Funding per person, with the same division by 5 as in the city-level table.
merged_df['total_funding_per_person'] = merged_df['total_obligated_funds'].div(merged_df['avg_popn']).div(5)

# Rescale fatal shootings by 5 and round to a whole number.
merged_df['fatal_shootings'] = merged_df['fatal_shootings'].div(5).round()

In [None]:
# Write the grant-level table to disk (row index omitted)
merged_df.to_csv(path_or_buf='clean_data/grant_summary_table.csv', index=False)

#### Create new dataset for closest 5 cities by 3 metrics

For simplicity and computational efficiency, we use `cdist` from the `scipy.spatial.distance` module for this analysis.

In [13]:
# Function to find the k (default 5) closest cities based on a specific column
# using 1-norm (Manhattan distance)
def find_closest_cities_1norm(df, column, k=5):
    """Return, for every row of `df`, the names of its nearest other cities.

    Distance is the absolute difference of the values in `column` (cityblock
    distance on a single feature).

    Parameters
    ----------
    df : DataFrame with a 'city_state' column and the numeric `column`.
    column : name of the column to measure distance on.
    k : maximum number of neighbours per city (default 5).

    Returns
    -------
    A list with one entry per row of `df`, in row order. Each entry is a list
    of 'city_state' values ordered from nearest to farthest. It holds
    min(k, len(df) - 1) names, so a city is never listed as its own neighbour,
    even when `df` has k or fewer rows.
    """
    values = df[[column]]
    distances = cdist(values, values, metric='cityblock')
    # Push each city's distance to itself to infinity so it sorts last.
    np.fill_diagonal(distances, np.inf)
    # Never take more neighbours than there are *other* cities; otherwise the
    # infinite self-distance would be sliced in and the city would be returned
    # as its own neighbour.
    n_neighbours = max(0, min(k, len(df) - 1))
    # Stable sort: cities at equal distance keep their row order, which makes
    # the result deterministic when there are ties.
    closest_indices = np.argsort(distances, axis=1, kind='stable')[:, :n_neighbours]
    closest_cities = df['city_state'].values[closest_indices]
    return [list(cities) for cities in closest_cities]

# For each metric, store every city's 5 nearest peers in a list-valued column.
# Maps output column name -> metric column the distance is measured on.
for closest_column, metric_column in {
    'closest_5_fatal_shootings': 'fatal_shootings',
    'closest_5_avg_popn': 'avg_popn',
    'closest_5_rate_per_100k': 'rate_per_100k',
}.items():
    gun_df[closest_column] = find_closest_cities_1norm(gun_df, metric_column)

In [14]:
# Pickle rather than CSV so the list-valued closest-city columns keep their list format
gun_df.to_pickle(path='clean_data/gun_df_distances.pkl')