In [1]:
import pandas as pd
pd.options.display.max_columns=200
pd.options.display.max_rows=500
pd.options.display.max_colwidth=500

# from thefuzz import fuzz
# from thefuzz import process

from datetime import datetime

from glob import glob
import re

In [2]:
LIMIT_TO_OPEN_GRANTS = False
LIMIT_TO_RELEVANT_GRANTS = True
LIMIT_TO_NEW_ENTRIES_ONLY = True
REMOVE_STATE_GOVTS = True

def normalize_program_names(_df, title_col = 'cfda_title'):
    df = _df.copy()
    df[title_col] = df[title_col].str.lower()
    df[title_col] = df[title_col].fillna('none')

    return df

In [3]:

def get_year(file_path):

    pattern = r'FY(\d{4})'

    # Search for the pattern in the string
    match = re.search(pattern, file_path)

    if match:
        year = match.group(1)
        return year
    else:
        return 1900


In [4]:
import re

all_data = []
for g in glob('usa_spending_contract_data/FY*/*',recursive=True):
    temp_data = pd.read_csv(g)
    temp_data['fiscal_year'] = get_year(g)
    all_data.append(temp_data)

raw_contracts = pd.concat(all_data)


# raw_contracts = pd.read_csv('usa_spending_contract_data/FY2019/All_Assistance_PrimeTransactions_2024-05-28_H10M16S23_1.csv')
# # raw_contracts = pd.read_csv('usa_spending_contract_data/FY2024/All_Assistance_PrimeTransactions_2024-05-28_H01M15S04_1.csv')


  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
contracts = normalize_program_names(raw_contracts)

In [6]:
contracts['total_outlayed_amount_for_overall_award'] = contracts['total_outlayed_amount_for_overall_award'].fillna(0)

contracts['estimated_remaining_funds'] = contracts['total_obligated_amount'] - contracts['total_outlayed_amount_for_overall_award']

contracts['spent_percent'] = ((contracts['total_outlayed_amount_for_overall_award'] / contracts['total_obligated_amount'])*100).round(0)

contracts['remaining_funds_percent'] = ((contracts['estimated_remaining_funds'] / contracts['total_obligated_amount'])*100).round(0)

contracts['period_of_performance_current_end_date'] = pd.to_datetime(contracts['period_of_performance_current_end_date'])

contracts['period_of_performance_start_date'] = pd.to_datetime(contracts['period_of_performance_start_date'])
contracts['action_date'] = pd.to_datetime(contracts['action_date'])


contracts['grant_is_open'] = contracts['period_of_performance_current_end_date'] >= datetime(2024,9,1)


contracts['program_activities_funding_this_award'] = contracts['program_activities_funding_this_award'].fillna('missing')

# Code program names 

In [7]:
program_map = {
    'comunity violence intervention and prevention intitiative':
        ['0153: COMMUNITY-BASED VIOLENCE PREVENTION INITIATIVES',
        #'public safety partnership and community policing grants',
        'community-based violence prevention program'],
    'victims of crime act (voca)':['0001: CRIME VICTIMS GRANTS AND ASSISTANCE','0006: TRIBAL VICTIMS ASSISTANCE GRANTS'],
    'center for culturally responsive victim services':[],
    'byrne memorial justice assistance grant':['0007: JUSTICE ASSISTANCE GRANTS','edward byrne memorial justice assistance grant program'],
    'byrne criminal justice innovation':['0081: COMMUNITY BASED CRIME REDUCTION PROGRAM (BYRNE CRIMINAL JUSTICE INNOVATION)'],
    'strategies to support children exposed to violence':['0024: CHILDREN EXPOSED TO VIOLENCE','children exposed to violence'],
    'project safe neighborhood':['0043: PROJECT SAFE NEIGHBORHOODS','project safe neighborhoods'],
    'youth violence and reductions':[],
    'school violence prevention program':['0143: STOP SCHOOL VIOLENCE ACT'],
    'second chance act community based re-entry program':['0050: SECOND CHANCE ACT/OFFENDER REENTRY'],
    'community policing development micro-grants':['0001: PUBLIC SAFETY AND COMMUNITY POLICING GRANTS'],
    'smart policing':['0021: STRATEGIES FOR POLICING INNOVATION'],
    'cops hiring program':['0009: COPS HIRING PROGRAM']
}


In [8]:
def match_grant_program_to_map(x):
    if x is None:
        return 'other'

    program_matches = []
    for program_name, program_list in program_map.items():

        for program_code in program_list:
            if x.find(program_code) != -1:
                program_matches.append(program_name)
    
    # if len(program_matches) > 1:
    #     print(f"MULTIPLE MATCHES FOR {x}")
        
    
    if len(program_matches) == 0:
        return "other"

    return program_matches[0]

In [9]:
contracts["program_match__funding_code"] = contracts[
    "program_activities_funding_this_award"
].apply(match_grant_program_to_map)

contracts["program_match__title"] = contracts["cfda_title"].apply(
    match_grant_program_to_map
)
contracts["program_match"] = contracts["program_match__funding_code"].where(
    contracts["program_match__funding_code"] != "other",
    contracts["program_match__title"],
)

In [10]:
contracts.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48276 entries, 0 to 1850
Data columns (total 120 columns):
 #    Column                                                          Dtype         
---   ------                                                          -----         
 0    assistance_transaction_unique_key                               object        
 1    assistance_award_unique_key                                     object        
 2    award_id_fain                                                   object        
 3    modification_number                                             object        
 4    award_id_uri                                                    object        
 5    sai_number                                                      object        
 6    federal_action_obligation                                       float64       
 7    total_obligated_amount                                          float64       
 8    total_outlayed_amount_for_overall_a

## Check matches and non-matches

In [11]:
contracts['program_match'].value_counts(dropna=False)

other                                                        31405
byrne memorial justice assistance grant                       9087
victims of crime act (voca)                                   3310
cops hiring program                                           1016
second chance act community based re-entry program             812
school violence prevention program                             770
community policing development micro-grants                    717
project safe neighborhood                                      708
comunity violence intervention and prevention intitiative      152
byrne criminal justice innovation                              152
smart policing                                                  88
strategies to support children exposed to violence              59
Name: program_match, dtype: int64

In [12]:
contracts[contracts['program_match']=='other']['cfda_title'].value_counts()

public safety partnership and community policing grants                                                                                                          2989
coronavirus emergency supplemental funding program                                                                                                               2287
national institute of justice research, evaluation, and development project grants                                                                               1812
dna backlog reduction program                                                                                                                                    1445
state criminal alien assistance program                                                                                                                          1059
drug court discretionary grant program                                                                                                                           1028
serv

In [13]:
contracts[contracts['cfda_title']=='public safety partnership and community policing grants']['transaction_description'].sample(10).values

array(['COP ENHANCEMENT 2018', 'HOPING TO HIRE ONE POLICE OFFICER', 'CHP',
       'TRGP-HIRE', 'CHP', 'SVPP',
       'SAVANNAH- CHATAM COUNTY OF BOARD OF EDUCATION IS PROPOSING TO PROVIDE CIT TRAINING TO ALL SROS TO PROVIDE AN EFFICIENT RESPONSE TO ALL INCIDENT REQUIRING CRISIS INTERVENTION. THEY ALSO PLAN TO PARTNER WILL LOCAL AGENCIES SUCH AS THE DEPARTMENT OF FAMILY AND CHILDREN SERVICES, ALL LOCAL YOUTH AND FAMILY PROGRAMS, ALL LOCAL LAW ENFORCEMENT AGENCIES IN CHATHAM COUNTY, GEORGIA, AND RAPE CRISIS CENTERS. THE BOEPD WILL ALSO PARTNER WITH THE DEPARTMENT OF JUVENILE JUSTICE AND JUVENILE COURT.',
       'ANACONDA-DEER LODGE COUNTY/ANACONDA SCHOOLS - SCHOOL RESOURCE OFFICER PROJECT',
       'COPS OFFICE ANTI-HEROIN TASK FORCE PROGRAM (AHTF)',
       'PULLMAN, WA LEMHWA PROJECT'], dtype=object)

# Only retain useful columns

In [14]:
keep_cols = [
    'usaspending_permalink',
    'fiscal_year',
    'total_obligated_amount',
    'estimated_remaining_funds',
    'spent_percent',
    'grant_is_open',
    'program_match',
    'business_types_description',
    'prime_award_base_transaction_description',
    'transaction_description',
    'funding_opportunity_goals_text',
    'cfda_title',
    'cfda_number',
    'primary_place_of_performance_scope',
    'primary_place_of_performance_city_name',
    'primary_place_of_performance_state_name',
    'primary_place_of_performance_zip_4',
    'recipient_name',
    'funding_agency_name',
    'period_of_performance_start_date',
    'period_of_performance_current_end_date',
    'assistance_award_unique_key',
    'action_date'
]

# De-duplicate & Final Processing


In [16]:
# Use filters for open and program match grants

if LIMIT_TO_OPEN_GRANTS:
    contracts = contracts[contracts['grant_is_open']==True]

if LIMIT_TO_NEW_ENTRIES_ONLY:
    contracts = contracts[contracts['action_type_description'] == 'NEW']

if REMOVE_STATE_GOVTS:
    contracts = contracts[contracts['business_types_description'] != 'STATE GOVERNMENT']

In [17]:
dup_cols = [
    'usaspending_permalink',
]

contracts.sort_values(by='action_date',ascending=True,inplace=True)

contracts.drop_duplicates(subset=dup_cols,keep='last',inplace=True)

# Write out

In [18]:
contracts[keep_cols].to_parquet('clean_data/clean_doj_all_contracts.parquet')

if LIMIT_TO_RELEVANT_GRANTS:
    contracts = contracts[contracts['program_match'] != 'other']

contracts[keep_cols].to_parquet('clean_data/clean_doj_contracts.parquet')

# QA

In [19]:
contracts[contracts['usaspending_permalink']=='https://www.usaspending.gov/award/ASST_NON_2017GPBX0001_1550/']


Unnamed: 0,assistance_transaction_unique_key,assistance_award_unique_key,award_id_fain,modification_number,award_id_uri,sai_number,federal_action_obligation,total_obligated_amount,total_outlayed_amount_for_overall_award,indirect_cost_federal_share_amount,non_federal_funding_amount,total_non_federal_funding_amount,face_value_of_loan,original_loan_subsidy_cost,total_face_value_of_loan,total_loan_subsidy_cost,generated_pragmatic_obligations,disaster_emergency_fund_codes_for_overall_award,outlayed_amount_from_COVID-19_supplementals_for_overall_award,obligated_amount_from_COVID-19_supplementals_for_overall_award,outlayed_amount_from_IIJA_supplemental_for_overall_award,obligated_amount_from_IIJA_supplemental_for_overall_award,action_date,action_date_fiscal_year,period_of_performance_start_date,period_of_performance_current_end_date,awarding_agency_code,awarding_agency_name,awarding_sub_agency_code,awarding_sub_agency_name,awarding_office_code,awarding_office_name,funding_agency_code,funding_agency_name,funding_sub_agency_code,funding_sub_agency_name,funding_office_code,funding_office_name,treasury_accounts_funding_this_award,federal_accounts_funding_this_award,object_classes_funding_this_award,program_activities_funding_this_award,recipient_uei,recipient_duns,recipient_name,recipient_name_raw,recipient_parent_uei,recipient_parent_duns,recipient_parent_name,recipient_parent_name_raw,recipient_country_code,recipient_country_name,recipient_address_line_1,recipient_address_line_2,recipient_city_code,recipient_city_name,prime_award_transaction_recipient_county_fips_code,recipient_county_name,prime_award_transaction_recipient_state_fips_code,recipient_state_code,recipient_state_name,recipient_zip_code,recipient_zip_last_4_code,prime_award_transaction_recipient_cd_original,prime_award_transaction_recipient_cd_current,recipient_foreign_city_name,recipient_foreign_province_name,recipient_foreign_postal_code,primary_place_of_performance_scope,primary_place_of_performance_country_code,primary_place_of_performance_country_name,primary_place_of_performance_code,primary_place_of_performance_city_name,prime_award_transaction_place_of_performance_county_fips_code,primary_place_of_performance_county_name,prime_award_transaction_place_of_performance_state_fips_code,primary_place_of_performance_state_name,primary_place_of_performance_zip_4,prime_award_transaction_place_of_performance_cd_original,prime_award_transaction_place_of_performance_cd_current,primary_place_of_performance_foreign_location,cfda_number,cfda_title,funding_opportunity_number,funding_opportunity_goals_text,assistance_type_code,assistance_type_description,transaction_description,prime_award_base_transaction_description,business_funds_indicator_code,business_funds_indicator_description,business_types_code,business_types_description,correction_delete_indicator_code,correction_delete_indicator_description,action_type_code,action_type_description,record_type_code,record_type_description,highly_compensated_officer_1_name,highly_compensated_officer_1_amount,highly_compensated_officer_2_name,highly_compensated_officer_2_amount,highly_compensated_officer_3_name,highly_compensated_officer_3_amount,highly_compensated_officer_4_name,highly_compensated_officer_4_amount,highly_compensated_officer_5_name,highly_compensated_officer_5_amount,usaspending_permalink,initial_report_date,last_modified_date,fiscal_year,estimated_remaining_funds,spent_percent,remaining_funds_percent,grant_is_open,program_match__funding_code,program_match__title,program_match
