# Load libraries and configs

In [None]:
%autosave 0

In [None]:
def load_srs_data(year):
    """
    Load the smoothed SRS dataset and add an srs_annual_total_incidents column.

    The file is read from a fixed relative path, ORI_UNIVERSE is renamed to
    ORI, and srs_annual_total_incidents is derived from the "totcrime"
    column alone (a row-wise sum over that single column).

    Monthly column reference kept from the original notes (these columns are
    not read by this function):
        Jan = v95,   Feb = v213,  Mar = v331,  Apr = v449
        May = v567,  Jun = v685,  Jul = v803,  Aug = v921
        Sep = v1039, Oct = v1157, Nov = v1275, Dec = v1393

    Args:
        year: Accepted for symmetry with the other loaders; not used here.

    Returns:
        The loaded DataFrame with the renamed ORI column and the new total column.
    """
    srs_path = "../compute_weights/Data/srs2016_2020_smoothed.csv"
    srs = pd.read_csv(srs_path)
    srs = srs.rename(columns={"ORI_UNIVERSE":"ORI"})

    # Row-wise sum over a one-column frame: copies totcrime into the new column.
    srs['srs_annual_total_incidents'] = srs[["totcrime"]].sum(axis=1)
    return srs

In [None]:
def load_reta_data(year,output_folder):
    """
    Read the reta/srs merge dataset for one year.

    Args:
        year: Year interpolated into the file name missing_months_<year>.csv.
        output_folder: Base folder supporting the "/" path operator; the file
            is looked up in its "artifacts" sub-folder.

    Returns:
        The CSV contents as a DataFrame.
    """
    csv_path = output_folder/"artifacts"/f"missing_months_{year}.csv"
    return pd.read_csv(csv_path)

In [None]:
def get_incident_count_by_agency_id_from_database(year):
    """ Query the total incident count by agency_id from the aws NIBRS database

    Two queries are run through the module-level engine_database connection:
    one listing agency_id / ori / legacy_ori for agencies flagged is_nibrs in
    the given year, and one counting nibrs_incident rows per agency_id whose
    incident_date falls in that year. The counts are left-joined onto the
    agency list, so agencies without incidents keep a missing count.

    Args:
        year: The year to pull from

    Returns:
        DataFrame with agency_id, ori, legacy_ori and
        nibrs_annual_total_incidents columns.
    """
    # January 1st of the following year: upper bound for nibrs_start_date below.
    next_year = datetime(int(year)+1, 1, 1).date()

    # Get list of agency IDs, Current ORIs and Legacy ORIs
    agency_sql = f"""
    SELECT DISTINCT 
     ref_agency.agency_id,
     ref_agency.ori,
     ref_agency.legacy_ori
    FROM ucr_prd.ref_agency_yearly
	   LEFT JOIN ucr_prd.ref_agency USING (agency_id)
	   LEFT JOIN ucr_prd.ref_agency_status USING (agency_id, data_year)
    WHERE ref_agency_status.data_year = {year} 
     AND ref_agency_yearly.is_nibrs IS TRUE
     AND CAST(ref_agency.nibrs_start_date as date) <= CAST('{next_year}' as date);
    """
    agencies = pd.read_sql(agency_sql, engine_database)

    # Get counts of incidents by agency ID
    count_sql = f"""
    SELECT 
     agency_id,
     COUNT(*)
    FROM ucr_prd.nibrs_incident
    WHERE EXTRACT(year FROM nibrs_incident.incident_date) = {year}
    GROUP BY agency_id
    ORDER BY agency_id
    """
    incident_counts = pd.read_sql(count_sql, engine_database)
    incident_counts = incident_counts.rename(columns={'count': 'nibrs_annual_total_incidents'})

    return agencies.merge(incident_counts, how='left', on='agency_id')

In [None]:
def load_datasets(year, output_folder):
    """
    Load and link the RETA, universe, NIBRS and SRS datasets for one year.

    ORI fields used while linking:
        Reta-mm
            ORI_reta
        Universe
            ORI_current_univ, matches ORI_reta
            LEGACY_ORI, matches ORI_reta
            ORI_current_from_legacy_univ

    Steps (row counts are printed along the way):
        1. Load RETA, convert POPULATION to numeric, keep Active/Federal,
           non-covered, non-dormant agencies.
        2. Load the universe file and apply the equivalent filters.
        3. Attach universe REPORTING_TYPE via current ORI and via legacy ORI,
           then keep rows whose reporting type is 'I' by either route.
        4. Attach NIBRS incident counts (database) by three ORI routes and
           collapse them with a row-wise max.
        5. Attach SRS incident counts by the same three routes and collapse
           them with a row-wise max.

    Args:
        year: Year to load; used in input file names and passed to the loaders.
        output_folder: Base folder supporting the "/" path operator.

    Returns:
        Tuple (df_final, df_universe, df_incidents, df_srs).
    """
    print(year)

    # LOAD RETA

    df_reta = load_reta_data(year, output_folder)
    print('df_reta loaded', len(df_reta), 'rows')
    # Convert population "100,000" string to 100000 integer.
    # pd.to_numeric is called once on the whole column; the previous
    # .apply(pd.to_numeric, 1) passed the stray 1 into Series.apply's
    # deprecated convert_dtype slot.
    df_reta['POPULATION'] = pd.to_numeric(df_reta['POPULATION'].replace({',':''},regex=True))
    df_reta = df_reta.rename(columns={"ORI": "ORI_reta"})

    # Keep AGENCY_STATUS = 'Active' or 'Federal'
    # Keep COVERED_FLAG = 'N'
    # Keep DORMANT_FLAG = 'N'
    length_start = len(df_reta)
    print('  ', len(df_reta[(df_reta.AGENCY_STATUS != 'Active') &
                            (df_reta.AGENCY_STATUS != 'Federal')]), 'AGENCY_STATUS not Active or Federal (Likely LEOKA or blank)')
    print('  ', len(df_reta[(df_reta.COVERED_FLAG != 'N')]), 'COVERED_FLAG not N (agency reporting is duplicated by another)')
    print('  ', len(df_reta[(df_reta.DORMANT_FLAG != 'N')]), 'DORMANT_FLAG not N (agency is dormant)')
    df_final = df_reta[((df_reta.AGENCY_STATUS == 'Active') |
                        (df_reta.AGENCY_STATUS == 'Federal')) &
                       (df_reta.COVERED_FLAG == 'N') &
                       (df_reta.DORMANT_FLAG == 'N')
                      ]
    length_change = length_start - len(df_final)
    print('  ', length_change, 'rows dropped due to 1 or more of above conditions')
    print('  ', len(df_final), 'rows kept as df_final')

    # LOAD UNIVERSE
    print()
    df_universe = pd.read_csv(output_folder / "initial_tasks_output" /f"ref_agency_{year}.csv")
    print('df_universe loaded', len(df_universe), 'rows')

    length_start = len(df_universe)
    print('  ', len(df_universe[(df_universe.AGENCY_STATUS != 'A') &
                            (df_universe.AGENCY_STATUS != 'F')]), 'AGENCY_STATUS not Active (A) or Federal (F) (Likely LEOKA or blank)')
    print('  ', len(df_universe[(df_universe.COVERED_BY_LEGACY_ORI.notna())]), 'COVERED_FLAG not empty (agency reporting is duplicated by another)')
    print('  ', len(df_universe[(df_universe.DORMANT_FLAG != 'N')]), 'DORMANT_FLAG not N (agency is dormant)')
    df_universe = df_universe[((df_universe.AGENCY_STATUS == 'A') |
                               (df_universe.AGENCY_STATUS == 'F')) &
                              (df_universe.COVERED_BY_LEGACY_ORI.isna()) &
                              (df_universe.DORMANT_FLAG == 'N')
                             ]
    length_change = length_start - len(df_universe)
    print('  ', length_change, 'rows dropped due to 1 or more of above conditions')
    print('  ', len(df_universe), 'rows kept')
    print()

    # Keep only Reporting_type = 'I'
    print('df_final start with', len(df_final), 'rows')
    # Merge Universe data that matches reta current ORI
    df_final = df_final.merge(df_universe[['ORI',
                                           'REPORTING_TYPE',
                                           'PE_MALE_OFFICER_COUNT',
                                           'PE_FEMALE_OFFICER_COUNT']], how='left', left_on='ORI_reta', right_on='ORI')
    df_final = df_final.rename(columns={"REPORTING_TYPE": "REPORTING_TYPE_current_ori",
                                        "ORI": "ORI_current_univ"})
    df_final['Officer_Count'] = df_final['PE_MALE_OFFICER_COUNT'] + df_final['PE_FEMALE_OFFICER_COUNT']
    print('   Universe officer count merged onto df_final')

    print('  ', len(df_final[df_final['REPORTING_TYPE_current_ori'].notna()]), 'Reta ORI matches Universe Current ORI')
    print('    ', len(df_final[(df_final.REPORTING_TYPE_current_ori.notna()) &
                               (df_final.REPORTING_TYPE_current_ori == 'I')]), 'Type I (NIBRS Reporters)')
    print('    ', len(df_final[(df_final.REPORTING_TYPE_current_ori.notna()) &
                               (df_final.REPORTING_TYPE_current_ori != 'I')]), 'Not Type I (dropped)')

    # Merge Universe data that matches reta legacy ORI
    df_final = df_final.merge(df_universe[['LEGACY_ORI','ORI','REPORTING_TYPE']], how='left', left_on='ORI_reta', right_on='LEGACY_ORI')
    df_final = df_final.rename(columns={"REPORTING_TYPE": "REPORTING_TYPE_legacy_ori",
                                        "ORI": "ORI_current_from_legacy_univ"})

    # NOTE(review): the first count below is every row WITHOUT a current-ORI
    # match, whether or not the legacy merge found something -- confirm the
    # label is what is intended.
    print('  ', len(df_final[df_final['REPORTING_TYPE_current_ori'].isna()]), 'Reta ORI matches Universe Legacy ORI')
    print('    ', len(df_final[(df_final.REPORTING_TYPE_current_ori.isna()) &
                               (df_final.REPORTING_TYPE_legacy_ori == 'I')]), 'Type I (NIBRS Reporters)')
    print('    ', len(df_final[(df_final.REPORTING_TYPE_current_ori.isna()) &
                               (df_final.REPORTING_TYPE_legacy_ori != 'I')]), 'Not Type I (dropped)')

    df_final = df_final[(df_final.REPORTING_TYPE_current_ori == 'I') |
                        ((df_final.REPORTING_TYPE_current_ori != 'I') &
                         (df_final.REPORTING_TYPE_legacy_ori == 'I'))
                       ]
    print('  ', len(df_final), 'rows kept')

    # Useful Code:
    # agencies found by normal current ORI
        # df_final[(df_final.REPORTING_TYPE_current_ori == 'I')]
    # subset of agencies that could only be found by legacy ori
        # df_final[(df_final['REPORTING_TYPE_current_ori'].isna())==True]

    # LOAD INCIDENT COUNTS FROM NIBRS (AWS)
    # agency_id, ori, legacy_ori, nibrs_annual_total_incidents
    df_incidents = get_incident_count_by_agency_id_from_database(year)
    print()
    print('df_incidents loaded', len(df_incidents), 'rows')

    # (df_final key column, alias for merged ori, alias for merged count, log label)
    nibrs_lookups = (
        ('ORI_reta', 'ori_current_from_aws', 'incidents_current_ori',
         'agencies found in AWS using current ORIs'),
        ('LEGACY_ORI', 'ori_legacy_from_aws', 'incidents_legacy_ori',
         'agencies found in AWS using legacy ORIs'),
        ('ORI_current_from_legacy_univ', 'ori_current_from_legacy_aws', 'incidents_current_from_legacy_ori',
         'agencies found in AWS using current ORIs backtracked from legacy ORIs'),
    )
    for left_key, ori_alias, count_alias, label in nibrs_lookups:
        df_final = df_final.merge(df_incidents[['ori','nibrs_annual_total_incidents']],
                                  how='left', left_on=left_key, right_on='ori')
        print('  ', df_final['ori'].count(), label)
        # Rename right away so the next merge can reuse the same source names.
        df_final = df_final.rename(columns={"ori": ori_alias,
                                            "nibrs_annual_total_incidents": count_alias})

    # Collapse all nibrs_annual_total_incidents columns into single final column
    df_final['nibrs_annual_total_incidents'] = df_final[['incidents_current_ori',
                                                       'incidents_legacy_ori',
                                                       'incidents_current_from_legacy_ori']].max(axis = 1, skipna = True)
    print('    ', len(df_final[df_final['nibrs_annual_total_incidents']>0]), 'Total agencies with incidents in AWS')
    print('    ', len(df_final[df_final['nibrs_annual_total_incidents']==0]), 'Total agencies with zero incidents in AWS')
    print('    ', len(df_final[df_final['nibrs_annual_total_incidents'].isna()]), 'Total agencies not in AWS')
    # Sum the column directly instead of positionally indexing a one-item Series.
    print('    ', df_final['nibrs_annual_total_incidents'].sum(skipna = True), 'Incidents linked from AWS')

    # LOAD INCIDENT COUNTS FROM SRS
    df_srs = load_srs_data(year)
    print()
    print('df_srs loaded', len(df_srs), 'rows')

    # Same three lookup routes as above, against the SRS table.
    srs_lookups = (
        ('ORI_reta', 'ori_current_from_srs', 'srs_incidents_current_ori',
         'agencies found in SRS using current ORIs'),
        ('LEGACY_ORI', 'ori_legacy_from_srs', 'srs_incidents_legacy_ori',
         'agencies found in SRS using legacy ORIs'),
        ('ORI_current_from_legacy_univ', 'ori_current_from_legacy_srs', 'srs_incidents_current_from_legacy_ori',
         'agencies found in SRS using current ORIs backtracked from legacy ORIs'),
    )
    for left_key, ori_alias, count_alias, label in srs_lookups:
        df_final = df_final.merge(df_srs[['ORI','srs_annual_total_incidents']],
                                  how='left', left_on=left_key, right_on='ORI')
        print('  ', df_final['ORI'].count(), label)
        df_final = df_final.rename(columns={"ORI": ori_alias,
                                            "srs_annual_total_incidents": count_alias})

    # Collapse all srs_annual_total_incidents columns into single final column
    df_final['srs_annual_total_incidents'] = df_final[['srs_incidents_current_ori',
                                                       'srs_incidents_legacy_ori',
                                                       'srs_incidents_current_from_legacy_ori']].max(axis = 1, skipna = True)
    print('    ', len(df_final[df_final['srs_annual_total_incidents']>0]), 'Total agencies with incidents in SRS')
    print('    ', len(df_final[df_final['srs_annual_total_incidents']==0]), 'Total agencies with zero incidents in SRS')
    print('    ', len(df_final[df_final['srs_annual_total_incidents'].isna()]), 'Total agencies without incidents in SRS')
    print('    ', df_final['srs_annual_total_incidents'].sum(skipna = True), 'Incidents linked from SRS')

    return df_final, df_universe, df_incidents, df_srs

In [None]:
def clean_up_oris(df_final):
    """
    Add an ORI_resolved column and drop the intermediate merge columns.

    ORI_resolved is taken from ORI_current_univ for rows that have a
    REPORTING_TYPE_current_ori value, and from ORI_current_from_legacy_univ
    for rows that do not.

    Note: ORI_resolved is written onto the passed-in frame; the returned
    frame is the result of dropping the leftover columns from it.

    Args:
        df_final: Linked agency table carrying the ORI, reporting-type and
            intermediate incident columns listed below.

    Returns:
        DataFrame without the leftover lookup columns.
    """
    matched_on_current = df_final['REPORTING_TYPE_current_ori'].notna()
    needs_legacy = df_final['REPORTING_TYPE_current_ori'].isna()

    df_final.loc[matched_on_current, 'ORI_resolved'] = df_final['ORI_current_univ']
    df_final.loc[needs_legacy, 'ORI_resolved'] = df_final['ORI_current_from_legacy_univ']

    # Intermediate columns that are no longer needed once the totals
    # and ORI_resolved have been computed.
    leftover_columns = [
        'LEGACY_ORI',
        'ori_current_from_aws',
        'ori_legacy_from_aws',
        'ori_current_from_legacy_aws',
        'ori_current_from_srs',
        'ori_legacy_from_srs',
        'ori_current_from_legacy_srs',
        'incidents_current_ori',
        'incidents_legacy_ori',
        'incidents_current_from_legacy_ori',
        'srs_incidents_current_ori',
        'srs_incidents_legacy_ori',
        'srs_incidents_current_from_legacy_ori',
    ]
    return df_final.drop(columns=leftover_columns)

# Add comparison flag columns to df_nibrs dataframe

In [None]:
# Functions to add flag columns with comparisons of various nibrs/srs statuses
def create_greater_greater_flag(r):
    """Return 1 when both the NIBRS and SRS annual totals of row r are > 0, else 0."""
    nibrs_positive = r['nibrs_annual_total_incidents'] > 0
    srs_positive = r['srs_annual_total_incidents'] > 0
    return 1 if (nibrs_positive & srs_positive) else 0


def create_greater_equal_flag(r):
    """Return 1 when the NIBRS annual total of row r is > 0 and the SRS total is 0, else 0."""
    nibrs_positive = r['nibrs_annual_total_incidents'] > 0
    srs_zero = r['srs_annual_total_incidents'] == 0
    return 1 if (nibrs_positive & srs_zero) else 0


def create_equal_greater_flag(r):
    """Return 1 when the NIBRS annual total of row r is 0 and the SRS total is > 0, else 0."""
    nibrs_zero = r['nibrs_annual_total_incidents'] == 0
    srs_positive = r['srs_annual_total_incidents'] > 0
    return 1 if (nibrs_zero & srs_positive) else 0


def create_equal_equal_flag(r):
    """Return 1 when both the NIBRS and SRS annual totals of row r equal 0, else 0."""
    nibrs_zero = r['nibrs_annual_total_incidents'] == 0
    srs_zero = r['srs_annual_total_incidents'] == 0
    return 1 if (nibrs_zero & srs_zero) else 0


def create_present_present_flag(r):
    """Return 1 when both the NIBRS and SRS annual totals of row r are non-null, else 0."""
    nibrs_present = pd.notnull(r['nibrs_annual_total_incidents'])
    srs_present = pd.notnull(r['srs_annual_total_incidents'])
    return 1 if (nibrs_present & srs_present) else 0


def create_present_missing_flag(r):
    """Return 1 when the NIBRS annual total of row r is non-null and the SRS total is null, else 0."""
    nibrs_present = pd.notnull(r['nibrs_annual_total_incidents'])
    srs_missing = pd.isnull(r['srs_annual_total_incidents'])
    return 1 if (nibrs_present & srs_missing) else 0


def create_missing_present_flag(r):
    """Return 1 when the NIBRS annual total of row r is null and the SRS total is non-null, else 0."""
    nibrs_missing = pd.isnull(r['nibrs_annual_total_incidents'])
    srs_present = pd.notnull(r['srs_annual_total_incidents'])
    return 1 if (nibrs_missing & srs_present) else 0


def create_missing_missing_flag(r):
    """Return 1 when both the NIBRS and SRS annual totals of row r are null, else 0."""
    nibrs_missing = pd.isnull(r['nibrs_annual_total_incidents'])
    srs_missing = pd.isnull(r['srs_annual_total_incidents'])
    return 1 if (nibrs_missing & srs_missing) else 0


def create_greater_missing_flag(r):
    """Return 1 when the NIBRS annual total of row r is > 0 and the SRS total is null, else 0."""
    nibrs_positive = r['nibrs_annual_total_incidents'] > 0
    srs_missing = pd.isnull(r['srs_annual_total_incidents'])
    return 1 if (nibrs_positive & srs_missing) else 0


def create_equal_missing_flag(r):
    """Return 1 when the NIBRS annual total of row r is 0 and the SRS total is null, else 0."""
    nibrs_zero = r['nibrs_annual_total_incidents'] == 0
    srs_missing = pd.isnull(r['srs_annual_total_incidents'])
    return 1 if (nibrs_zero & srs_missing) else 0


def create_missing_greater_flag(r):
    """Return 1 when the NIBRS annual total of row r is null and the SRS total is > 0, else 0."""
    nibrs_missing = pd.isnull(r['nibrs_annual_total_incidents'])
    srs_positive = r['srs_annual_total_incidents'] > 0
    return 1 if (nibrs_missing & srs_positive) else 0


def create_missing_equal_flag(r):
    """Return 1 when the NIBRS annual total of row r is null and the SRS total is 0, else 0."""
    nibrs_missing = pd.isnull(r['nibrs_annual_total_incidents'])
    srs_zero = r['srs_annual_total_incidents'] == 0
    return 1 if (nibrs_missing & srs_zero) else 0

In [None]:
def compare_nibrs_to_srs(df_final):
    """
    Add the twelve NIBRS-vs-SRS comparison flag columns to df_final.

    Each flag column is filled by applying the matching create_*_flag
    function to every row. The columns are added to the passed-in frame,
    which is also returned.
    """
    # (output column, row-level flag function), in the order the columns are added
    flag_builders = (
        ('NIBRS>0_SRS>0', create_greater_greater_flag),
        ('NIBRS>0_SRS=0', create_greater_equal_flag),
        ('NIBRS=0_SRS>0', create_equal_greater_flag),
        ('NIBRS=0_SRS=0', create_equal_equal_flag),
        ('NIBRS_present_SRS_present', create_present_present_flag),
        ('NIBRS_present_SRS_missing', create_present_missing_flag),
        ('NIBRS_missing_SRS_present', create_missing_present_flag),
        ('NIBRS_missing_SRS_missing', create_missing_missing_flag),
        ('NIBRS>0_SRS_missing', create_greater_missing_flag),
        ('NIBRS=0_SRS_missing', create_equal_missing_flag),
        ('NIBRS_missing_SRS>0', create_missing_greater_flag),
        ('NIBRS_missing_SRS=0', create_missing_equal_flag),
    )
    for column, builder in flag_builders:
        df_final[column] = df_final.apply(builder, axis=1)

    return df_final

# Generate df_output

In [None]:
def add_spacer_row(df):
    """
    Append a row of '-----' cells to df and return df.

    The row is written at label len(df) and holds one '-----' per column,
    so it works for any number of columns (previously exactly five cells
    were hard-coded, which fails on frames of any other width).
    """
    df.loc[len(df)] = ['-----'] * len(df.columns)
    return df

def add_blank_row(df):
    """
    Append a row of single-space cells to df and return df.

    The row is written at label len(df) and holds one ' ' per column,
    so it works for any number of columns (previously exactly five cells
    were hard-coded, which fails on frames of any other width).
    """
    df.loc[len(df)] = [' '] * len(df.columns)
    return df

In [None]:
def add_output_row(df, year, row, counts, flag_name, Flag_Definition, Comments):
    """
    Write one summary row into df and return df.

    Sets, at index label `row`, the cells of the columns
    '<year>_Counts', 'Flag_Name', 'Flag_Definition' and 'Comments'.
    """
    cell_values = {
        f'{year}_Counts': counts,
        'Flag_Name': flag_name,
        'Flag_Definition': Flag_Definition,
        'Comments': Comments,
    }
    for column, value in cell_values.items():
        df.at[row, column] = value
    return df

In [None]:
def create_output_df(df, df_final, year):
    """
    Fill rows 0-11 of the summary frame df with per-flag totals from df_final.

    For each comparison flag the column sum of df_final[flag] is written,
    together with the flag name, its definition and a comment, through
    add_output_row. Rows are grouped as: present in both sources (0-4),
    present in NIBRS but missing in SRS (5-7), missing in NIBRS but present
    in SRS (8-10), missing in both (11).
    """
    # (flag column, definition text, comment text); list position is the output row.
    summary_rows = (
        # Records for ORIs present in NIBRS(NIBRS) and SRS
        ('NIBRS_present_SRS_present', '', ''),
        ('NIBRS>0_SRS>0',
         'ORIs with incidents listed in both NIBRS and SRS',
         'Expected result for most agencies'),
        ('NIBRS>0_SRS=0',
         'ORIs with incidents listed in NIBRS but none in SRS',
         'Ok if all NIBRS incidents are type=other'),
        ('NIBRS=0_SRS>0', 'ORIs with incidents listed in SRS but none in NIBRS', ''),
        ('NIBRS=0_SRS=0', 'ORIs with no incidents listed in NIBRS or SRS', ''),
        # Records for ORIs present in NIBRS(NIBRS) but missing in SRS
        ('NIBRS_present_SRS_missing', '', ''),
        ('NIBRS>0_SRS_missing', 'ORI has Incidents listed in NIBRS, but ORI missing from SRS', ''),
        ('NIBRS=0_SRS_missing', 'ORI has no incidents listed in NIBRS, and ORI missing from SRS', ''),
        # Records for ORIs missing in NIBRS(NIBRS) but present in SRS
        ('NIBRS_missing_SRS_present', '', ''),
        ('NIBRS_missing_SRS>0', 'ORI has Incidents listed in SRS, but ORI missing from NIBRS', ''),
        ('NIBRS_missing_SRS=0', 'ORI has no incidents listed in SRS, and ORI missing from NIBRS', ''),
        # Records for ORIs missing in both NIBRS(NIBRS) and SRS
        ('NIBRS_missing_SRS_missing', 'ORI is missing from both NIBRS and SRS', ''),
    )
    for position, (flag, definition, remark) in enumerate(summary_rows):
        total = df_final[flag].sum()
        df = add_output_row(df, year, position, total, flag, definition, remark)

    return df

In [None]:
def create_output_df_high_population(df, df_final, year, limit):
    """
    Fill rows 0-11 of the summary frame df, counting only high-population ORIs.

    Same layout as the unrestricted summary, but each flag is summed only
    over the rows of df_final whose POPULATION is strictly greater than
    `limit`. Rows are written through add_output_row.
    """
    # (flag column, definition text, comment text); list position is the output row.
    summary_rows = (
        # Records for ORIs present in NIBRS(NIBRS) and SRS
        ('NIBRS_present_SRS_present', '', ''),
        ('NIBRS>0_SRS>0',
         'ORIs with incidents listed in both NIBRS and SRS',
         'Expected result for most agencies'),
        ('NIBRS>0_SRS=0',
         'ORIs with incidents listed in NIBRS but none in SRS',
         'Ok if all NIBRS incidents are type=other'),
        ('NIBRS=0_SRS>0', 'ORIs with incidents listed in SRS but none in NIBRS', ''),
        ('NIBRS=0_SRS=0', 'ORIs with no incidents listed in NIBRS or SRS', ''),
        # Records for ORIs present in NIBRS(NIBRS) but missing in SRS
        ('NIBRS_present_SRS_missing', '', ''),
        ('NIBRS>0_SRS_missing', 'ORI has Incidents listed in NIBRS, but ORI missing from SRS', ''),
        ('NIBRS=0_SRS_missing', 'ORI has no incidents listed in NIBRS, and ORI missing from SRS', ''),
        # Records for ORIs missing in NIBRS(NIBRS) but present in SRS
        ('NIBRS_missing_SRS_present', '', ''),
        ('NIBRS_missing_SRS>0', 'ORI has Incidents listed in SRS, but ORI missing from NIBRS', ''),
        ('NIBRS_missing_SRS=0', 'ORI has no incidents listed in SRS, and ORI missing from NIBRS', ''),
        # Records for ORIs missing in both NIBRS(NIBRS) and SRS
        ('NIBRS_missing_SRS_missing', 'ORI is missing from both NIBRS and SRS', ''),
    )
    for position, (flag, definition, remark) in enumerate(summary_rows):
        # Only for ORIs with population > limit
        total = df_final[flag][(df_final['POPULATION']>limit)].sum()
        df = add_output_row(df, year, position, total, flag, definition, remark)

    return df

In [None]:
def create_output_df_officer_count(df, df_final, year, limit):
    """
    Fill rows 0-11 of the summary frame df, counting only ORIs with many officers.

    Same layout as the unrestricted summary, but each flag is summed only
    over the rows of df_final whose Officer_Count is strictly greater than
    `limit`. Rows are written through add_output_row.
    """
    # (flag column, definition text, comment text); list position is the output row.
    summary_rows = (
        # Records for ORIs present in NIBRS(NIBRS) and SRS
        ('NIBRS_present_SRS_present', '', ''),
        ('NIBRS>0_SRS>0',
         'ORIs with incidents listed in both NIBRS and SRS',
         'Expected result for most agencies'),
        ('NIBRS>0_SRS=0',
         'ORIs with incidents listed in NIBRS but none in SRS',
         'Ok if all NIBRS incidents are type=other'),
        ('NIBRS=0_SRS>0', 'ORIs with incidents listed in SRS but none in NIBRS', ''),
        ('NIBRS=0_SRS=0', 'ORIs with no incidents listed in NIBRS or SRS', ''),
        # Records for ORIs present in NIBRS(NIBRS) but missing in SRS
        ('NIBRS_present_SRS_missing', '', ''),
        ('NIBRS>0_SRS_missing', 'ORI has Incidents listed in NIBRS, but ORI missing from SRS', ''),
        ('NIBRS=0_SRS_missing', 'ORI has no incidents listed in NIBRS, and ORI missing from SRS', ''),
        # Records for ORIs missing in NIBRS(NIBRS) but present in SRS
        ('NIBRS_missing_SRS_present', '', ''),
        ('NIBRS_missing_SRS>0', 'ORI has Incidents listed in SRS, but ORI missing from NIBRS', ''),
        ('NIBRS_missing_SRS=0', 'ORI has no incidents listed in SRS, and ORI missing from NIBRS', ''),
        # Records for ORIs missing in both NIBRS(NIBRS) and SRS
        ('NIBRS_missing_SRS_missing', 'ORI is missing from both NIBRS and SRS', ''),
    )
    for position, (flag, definition, remark) in enumerate(summary_rows):
        # Only for ORIs with Officer_Count > limit
        total = df_final[flag][(df_final['Officer_Count']>limit)].sum()
        df = add_output_row(df, year, position, total, flag, definition, remark)

    return df

# Generate CSV Output

In [None]:
def generate_csv_files(year, df):
    """
    Write one CSV per selected comparison flag, holding the rows where the flag is 1.

    Files are named '<year>_<flag>.csv' and written under the module-level
    output_dir; a "Saved:" line is printed after each write.
    """
    exported_flags = (
        'NIBRS>0_SRS=0',
        'NIBRS>0_SRS_missing',
        'NIBRS_missing_SRS>0',
        'NIBRS_missing_SRS=0',
        'NIBRS_missing_SRS_missing',
    )
    for flag in exported_flags:
        target = output_dir / f"{year}_{flag}.csv"
        df[df[flag]==1].to_csv(target)
        print("Saved:", target)
