# Report Part 3: Missing Variables - One State


This report looks at the NIBRS database and reports on the number of missing values for various variables given a year of interest. 

These counts are split into types of offenses. For some counts, this uses the broad "Offense against" types (offense against Person, Property, or Society). For other counts, only crimes against Persons are relevant, so the counts instead split by crime type (Assault, Homicide, Human Trafficking, Kidnapping, Sex, Sex Nonforcible).


In [None]:
from datetime import datetime
import os
# Banner for the run: author, the year/state taken from the environment, and the run date.
print("Author: Automated Pipeline")
year = int(os.environ.get('DATA_YEAR'))
state = os.environ.get('INPUT_STATE')
print("Generating reports for year:", year, " and state", state)
report_date = datetime.now().strftime("%m/%d/%y")
print("Report date:", report_date)

In [None]:
from utils import *
from dictionaries import *
from pathlib import Path
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

# --- Create connection
engine_database = connect_to_database()

# Both working folders live under the pipeline output directory named by the environment.
output_folder = Path(os.getenv("OUTPUT_PIPELINE_DIR"))
output_dir = output_folder.joinpath("QC_output_files")
raw_query_folder = output_folder.joinpath("QC_query_outputs_files")

# Create the folders up front; existing folders are left untouched.
output_dir.mkdir(parents=True, exist_ok=True)
raw_query_folder.mkdir(parents=True, exist_ok=True)

# Reverse lookup of us_state_abbrev: its values become the keys.
state_name_to_abbrev = dict(zip(us_state_abbrev.values(), us_state_abbrev.keys()))
  
# Offense codes grouped by the broad "offense against" category. These lists
# are hard-coded here rather than read from the database.
# QUESTION why are these different from just pulling from the database. 
# NOTE: in person, 09C (Justifiable Homicide) is missing from this list
person_offense_codes = ['09A','09B','100','11A','11B','11C','11D','36A','36B','13A','13B','13C','64A','64B']

property_offense_codes = ['23A','23B','23C','23D','23E','23F','23G','23H','240','200','510','220','250',\
                      '290','270','210','26A','26B','26C','26D','26E','26F','26G','120','280']
society_offense_codes = ['35A','35B','39A','39B','39C','39D','370','40A','40B','520','720','40C']
# Lookup used by get_counts_offense: category name -> list of offense codes.
offense_code_dict = {"Person":person_offense_codes,"Property":property_offense_codes,"Society":society_offense_codes}
    
# "Type A" is the union of the three categories above.
offense_code_dict["Type A"] = offense_code_dict["Person"] + offense_code_dict["Property"] + offense_code_dict["Society"]

# Offense codes for which a given data element is expected. The cells below
# use them to decide when a blank value counts as "Missing" (or "Unknown").
#QUESTION: how did we come up with these codes? Is there any way to derive these codes?
cargo_theft_offense_codes = ['120','210','220','23D','23F','23H','240','26A','26B','26C','26E','26F','26G','270','510']
property_loss_offense_codes = ['100', '200', '510', '220', '250', '290', '270', '210', '26A', '26B', '26C','26D',\
                               '26E', '23A', '23B', '23C', '23D', '23E', '23F', '23G', '23H', '240', '120', '280',\
                               '35A', '35B', '39A', '39B', '39C', '39D','26F','26G']
relevant_offenses_weapons = ['09A','09B','09C','100','11A','11B','11C','11D','120','13A','13B','210','520','64A','64B']
relevant_offenses_criminal = ['720','250','280','35A','35B','39C','370','520']
relevant_offenses_hour = ['09A','13A','13B','13C']

# Data-element values that are counted as a "Valid Response".
relevant_criminal_activities = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'J', 'O', 'P', 'S', 'T', 'U', 'W']
relevant_weapons = ['01', '11', '11A', '12', '12A', '13', '13A', '14', '14A', '15', '15A', '16', '17', '20', '30', '35', '40', '50', '55', '60', '65', '70', '75', '80', '85', '90']

# Crime types reported separately by get_counts_crime_types: crime type name
# -> offense codes belonging to it.
specific_crime_codes = {
    "Assault Offenses":["13A", "13B"],
    "Homicide Offenses":["09A", "09B"], #WHY not 09C?
    "Kidnapping/Abduction":["100"],
    "Human Trafficking Offenses":["64A", "64B"],
    "Sex Offenses":["11A", "11B", "11C", "11D"],
    "Sex Offenses Nonforcible":["36A", "36B"]
}



In [56]:
def get_counts_crime_types(df, valid_col):
    """Tabulate the levels of ``valid_col`` for crimes against persons.

    A "Freq"/"%" pair of columns is produced for all crimes against persons
    combined, followed by one pair for each crime type in
    ``specific_crime_codes`` (alphabetical order). Rows are assigned to a
    group through their ``offense_code``.

    Args:
        df: dataframe with an ``offense_code`` column and the ``valid_col`` column
        valid_col: name of the column whose values are counted

    Returns:
        A dataframe with one row per level of ``valid_col`` and a two-level
        column index of (group, "Freq" or "%").
    """
    # (column label, offense codes) for every group, overall group first.
    groups = [("Crimes against persons", offense_code_dict["Person"])]
    groups.extend((crime_type, specific_crime_codes[crime_type])
                  for crime_type in sorted(specific_crime_codes))

    header = pd.MultiIndex.from_tuples(
        [(label, stat) for label, _ in groups for stat in ("Freq", "%")],
        names=["", ""],
    )

    records = []
    for _, codes in groups:
        responses = df.loc[df["offense_code"].isin(codes), valid_col]
        # Raw counts first, then the same breakdown as percentages.
        records.append(responses.value_counts().to_dict())
        shares = responses.value_counts(normalize=True).to_dict()
        records.append({level: share * 100 for level, share in shares.items()})

    return pd.DataFrame(records, index=header).transpose()
    
    
def get_counts_offense(df, valid_col):
    """Tabulate the levels of ``valid_col`` per offense category.

    A "Freq"/"%" pair of columns is produced for "Type A" and for each of the
    "Person", "Property" and "Society" categories of ``offense_code_dict``.
    Rows are assigned to a category through their ``offense_code``.

    Args:
        df: dataframe with an ``offense_code`` column and the ``valid_col`` column
        valid_col: name of the column whose values are counted

    Returns:
        A dataframe with one row per level of ``valid_col`` and a two-level
        column index of (category, "Freq" or "%").
    """
    categories = ("Type A", "Person", "Property", "Society")
    header = pd.MultiIndex.from_tuples(
        [(category, stat) for category in categories for stat in ("Freq", "%")],
        names=["", ""],
    )

    records = []
    for category in categories:
        in_category = df["offense_code"].isin(offense_code_dict[category])
        responses = df.loc[in_category, valid_col]
        # Raw counts first, then the same breakdown as percentages.
        records.append(responses.value_counts().to_dict())
        shares = responses.value_counts(normalize=True).to_dict()
        records.append({level: share * 100 for level, share in shares.items()})

    return pd.DataFrame(records, index=header).transpose()

def _count_validity_levels(dem_frame, code_col, label_col, valid_codes,
                           unknown_codes, missing_codes, count_function):
    """Label rows by the validity of ``code_col`` and count the labels.

    Rows whose code is in none of the three code lists are left out of the
    counts. The labelled rows are stacked valid, unknown, missing, which is
    the order the counting function receives them in.
    """
    labelled = [
        dem_frame[dem_frame[code_col].isin(codes)].assign(**{label_col: label})
        for label, codes in (("Valid Response", valid_codes),
                             ("Unknown", unknown_codes),
                             ("Missing", missing_codes))
    ]
    combined = pd.concat(labelled, sort=False).dropna(subset=[label_col])
    return count_function(combined, label_col)


def get_demographic_counts(dem_frame, count_function):
    """Plenty of the counts need age, gender, race, and ethnicity using the same 
    business logic. This function gets the counts for each category.

    Null values in the four demographic columns are replaced by the sentinel
    "999" directly on ``dem_frame`` (the caller's frame is modified), and that
    sentinel is what is counted as "Missing".
    
    Args: 
        dem_frame: a dataframe with the relevant demographic columns
            (age_code, sex_code, race_code, ethnicity_code)
        count_function: either get_counts_offense or get_counts_crime_types

    Returns:
        A tuple (age counts, sex counts, race counts, ethnicity counts), each
        being whatever ``count_function`` returns.
    """
    demographic_cols = ["age_code", "sex_code", "race_code", "ethnicity_code"]
    dem_frame[demographic_cols] = dem_frame[demographic_cols].fillna("999")

    valid_age_counts = _count_validity_levels(
        dem_frame, "age_code", "valid_age_code",
        valid_codes=["NN", "NB", "BB", "AG", "99"],
        unknown_codes=["00"],
        missing_codes=["999"],
        count_function=count_function)

    # Nulls were replaced by "999" above, so "999" has to be in the missing
    # list; checking only -99 and " " left null sex codes uncounted.
    valid_sex_counts = _count_validity_levels(
        dem_frame, "sex_code", "valid_sex",
        valid_codes=["M", "F"],
        unknown_codes=["U"],
        missing_codes=["999", " ", -99],
        count_function=count_function)

    valid_race_counts = _count_validity_levels(
        dem_frame, "race_code", "valid_race",
        valid_codes=["W", "B", "I", "A", "AP", "C", "J", "P", "O", "M"],
        unknown_codes=["U", "99"],
        missing_codes=["999"],
        count_function=count_function)

    valid_ethnicity_counts = _count_validity_levels(
        dem_frame, "ethnicity_code", "valid_ethnicity",
        valid_codes=["H", "N", "M"],
        unknown_codes=["U"],
        missing_codes=["999"],
        count_function=count_function)
    
    return valid_age_counts, valid_sex_counts, valid_race_counts, valid_ethnicity_counts

## Offense

In [None]:
# Read the string-valued code columns as text. Left to type inference, a
# numeric-looking code ("200", "01") can load as a number and then never
# matches the string code lists it is compared against.
offense_frame = pd.read_csv(
    raw_query_folder / f"qc_missingness_offense_frame_{state}.csv.gz",
    dtype={"offense_code": str, "criminal_act_code": str, "weapon_code": str},
    low_memory=False,
)

In [None]:
####################################
#LOCATION TYPE
####################################

# Work with integer location codes (nulls become 99) so the valid range is a numeric test.
offense_frame["location_code"] = offense_frame["location_code"].fillna(99).astype(int)

# Recode: codes 1 through 58 are valid responses, anything else counts as missing.
location_in_range = offense_frame["location_code"].isin(range(1, 59))
offense_frame["valid_location"] = location_in_range.map({True: "Valid Response", False: "Missing"})

# Frequencies per offense category.
valid_location_counts = get_counts_offense(offense_frame, "valid_location")

print("Getting location counts")
valid_location_counts

In [None]:
####################################
#BIAS MOTIVATION
####################################
def _bias_response_level(code):
    """Classify one bias code: 1-88 valid, 99 unknown, anything else missing."""
    code = int(code)
    if 1 <= code <= 88:
        return "Valid Response"
    if code == 99:
        return "Bias Unknown"
    return "Missing"


# Work with integer bias codes; nulls become -99 and therefore fall in "Missing".
offense_frame["bias_code"] = offense_frame["bias_code"].fillna(-99).astype(int)

# Recode every row.
offense_frame["valid_bias"] = offense_frame["bias_code"].map(_bias_response_level)

# Frequencies per offense category.
print("Getting bias counts")
valid_bias_counts = get_counts_offense(offense_frame, "valid_bias")
valid_bias_counts

In [None]:
####################################
#CRIMINAL ACTIVITY/GANG INFORMATION
####################################

# Nulls get the sentinel "99" so they can be selected like any other level.
offense_frame["criminal_act_code"] = offense_frame["criminal_act_code"].fillna("99")
criminal_act = offense_frame["criminal_act_code"]

# "N" is reported as unknown.
unknown = offense_frame.loc[criminal_act == "N"].assign(valid_criminal="Unknown")

# Any recognised activity code other than "N" is a valid response.
is_recognised_activity = criminal_act.isin(relevant_criminal_activities) & (criminal_act != "N")
valid = offense_frame.loc[is_recognised_activity].assign(valid_criminal="Valid Response")

# A blank only counts as missing for the offenses where the item is expected.
activity_expected = offense_frame["offense_code"].isin(relevant_offenses_criminal)
missing = offense_frame.loc[(criminal_act == "99") & activity_expected].assign(valid_criminal="Missing")

# Frequencies per offense category.
print("Getting criminal act counts")
criminal_labelled = pd.concat([unknown, valid, missing]).dropna(subset=["valid_criminal"])
valid_criminal_counts = get_counts_offense(criminal_labelled, "valid_criminal")
valid_criminal_counts

In [None]:
####################################
#WEAPON INVOLVEMENT
####################################
# Nulls get the sentinel "99" so they can be selected like any other level.
offense_frame["weapon_code"] = offense_frame["weapon_code"].fillna("99")
weapon = offense_frame["weapon_code"]

# Code "95" is reported as unknown.
unknown = offense_frame.loc[weapon == "95"].assign(valid_weapon="Unknown")

# Any recognised weapon code other than "95" is a valid response.
is_recognised_weapon = weapon.isin(relevant_weapons) & (weapon != "95")
valid = offense_frame.loc[is_recognised_weapon].assign(valid_weapon="Valid Response")

# A blank only counts as missing for the offenses where a weapon is expected.
weapon_expected = offense_frame["offense_code"].isin(relevant_offenses_weapons)
missing = offense_frame.loc[(weapon == "99") & weapon_expected].assign(valid_weapon="Missing")

print("Getting weapon ID counts")
weapon_labelled = pd.concat([unknown, valid, missing]).dropna(subset=["valid_weapon"])
valid_weapon_counts = get_counts_offense(weapon_labelled, "valid_weapon")
valid_weapon_counts

In [None]:
print("Finished with offense counts")
# One section per data element; a level absent from a section is shown as 0.
offense_sections = {
    "Location type": valid_location_counts,
    "Bias motivation": valid_bias_counts,
    "Type of criminal activity/gang information": valid_criminal_counts,
    "Weapon involevement": valid_weapon_counts,
}
offense_results = pd.concat(offense_sections).fillna(0)
offense_results

## Admin

In [None]:
# Read offense_code as text. Left to type inference, a numeric-looking code
# such as "200" can load as a number and then never matches the string
# offense-code lists it is compared against.
admin_frame = pd.read_csv(
    raw_query_folder / f"qc_missingness_admin_frame_{state}.csv.gz",
    dtype={"offense_code": str},
    low_memory=False,
)

In [None]:
####################################
#CARGO THEFT
####################################

# Keep rows where the flag was answered, or where it is blank on an offense
# for which the flag is expected.
cargo_flag = admin_frame["cargo_theft_flag"]
flag_answered = cargo_flag.isin(["Y", "N"])
flag_blank_but_expected = (cargo_flag == " ") & admin_frame["offense_code"].isin(cargo_theft_offense_codes)
relevant_frame = admin_frame.loc[flag_answered | flag_blank_but_expected].copy()

valid = relevant_frame.loc[relevant_frame["cargo_theft_flag"].isin(["Y", "N"])].assign(
    valid_cargo="Valid Response")

blank_on_cargo_offense = (relevant_frame["cargo_theft_flag"] == " ") \
    & relevant_frame["offense_code"].isin(cargo_theft_offense_codes)
missing = relevant_frame.loc[blank_on_cargo_offense].assign(valid_cargo="Missing")

# for this section, the codes are further subset
valid_prop_codes = ['510','220','26B','270','210','26A','26G','26F','26C','240','23H','120','23D','23F','26E']

both = pd.concat([valid, missing], sort=False)
both = both.loc[both["offense_code"].isin(valid_prop_codes)]

valid_cargo_counts = get_counts_offense(both.dropna(subset=["valid_cargo"]), "valid_cargo")
print("Getting cargo theft counts")
valid_cargo_counts

In [None]:
####################################
#INCIDENT HOUR(REPORT DATE=NO)
####################################

#replace N/As with a valid level
admin_frame["incident_hour"] = admin_frame["incident_hour"].fillna(99)

# This section covers rows whose report date flag is blank. pd.read_csv loads
# empty CSV fields as NaN by default, so comparing with "" alone never matches
# on a freshly loaded frame; NaN, "" and whitespace-only flags are all blank.
report_date_flag_blank = admin_frame["report_date_flag"].fillna("").astype(str).str.strip() == ""
admin_frame_hour = admin_frame.loc[report_date_flag_blank].copy()

valid = admin_frame_hour[(admin_frame_hour.incident_hour >= 0) &\
                         (admin_frame_hour.incident_hour <= 24)].copy()
valid["valid_hour1"] = "Valid Response"

# A blank hour (sentinel 99) only counts as missing for the offenses and
# victim type where the hour is expected.
missing = admin_frame_hour[(admin_frame_hour.incident_hour == 99) & \
                           (admin_frame_hour.offense_code.isin(relevant_offenses_hour)) & \
                           (admin_frame_hour.victim_type_code == "L")].copy()
missing["valid_hour1"] = "Missing"
print("Getting incident hour counts for report date flag = ''")

valid_hour1_counts = get_counts_offense(pd.concat([valid,missing],sort=False).dropna(subset=["valid_hour1"]), "valid_hour1")
valid_hour1_counts

In [None]:
####################################
#INCIDENT HOUR(REPORT DATE=YES)
####################################

# Nulls get the sentinel 99 so they can be selected like any other value.
admin_frame["incident_hour"] = admin_frame["incident_hour"].fillna(99)

# Only rows flagged "R" belong to this section.
flagged_as_report_date = admin_frame["report_date_flag"] == "R"
admin_frame_hour2 = admin_frame.loc[flagged_as_report_date].copy()

hour = admin_frame_hour2["incident_hour"]
valid = admin_frame_hour2.loc[hour.between(0, 24)].assign(valid_hour2="Valid Response")

# A blank hour only counts as missing for the offenses and victim type where it is expected.
hour_expected = admin_frame_hour2["offense_code"].isin(relevant_offenses_hour) \
    & (admin_frame_hour2["victim_type_code"] == "L")
missing = admin_frame_hour2.loc[(hour == 99) & hour_expected].assign(valid_hour2="Missing")

print("Getting incident hour counts for report date flag = 'R'")
hour2_labelled = pd.concat([valid, missing], sort=False).dropna(subset=["valid_hour2"])
valid_hour2_counts = get_counts_offense(hour2_labelled, "valid_hour2")
valid_hour2_counts

In [None]:
####################################
#CLEARED EXCEPTIONALLY
####################################
# Nulls get the sentinel "99" so they can be selected like any other level.
admin_frame["cleared_except_code"] = admin_frame["cleared_except_code"].fillna("99")
cleared_code = admin_frame["cleared_except_code"]

# NOTE: an earlier version first filtered on cleared_except_id in 1..6; that
# filter is disabled (open question: it looked like it dropped the blanks).
valid = admin_frame.loc[cleared_code.isin(['A', 'B', 'C', 'D', 'E', 'N'])].assign(
    valid_clear="Valid Response")
missing = admin_frame.loc[cleared_code == "99"].assign(valid_clear="Missing")

print("Getting cleared exceptionally counts")
cleared_labelled = pd.concat([valid, missing], sort=False).dropna(subset=["valid_clear"])
valid_clear_counts = get_counts_offense(cleared_labelled, "valid_clear")
valid_clear_counts

In [None]:
# One section per data element; a level absent from a section is shown as 0.
admin_sections = {
    "Cargo theft": valid_cargo_counts,
    "Report date indicator = No": valid_hour1_counts,
    "Report date indicator = Yes": valid_hour2_counts,
    "Cleared exceptionally": valid_clear_counts,
}
admin_results = pd.concat(admin_sections).fillna(0)
print("Finished with admin counts")
admin_results

## Property

In [None]:
# Read offense_code as text. Left to type inference, a numeric-looking code
# such as "200" can load as a number and then never matches the string
# offense-code lists it is compared against.
property_frame = pd.read_csv(
    raw_query_folder / f"qc_missingness_property_frame_{state}.csv.gz",
    dtype={"offense_code": str},
    low_memory=False,
)

In [None]:
####################################
#PROPERTY LOSS 
####################################

# Nulls get the sentinel 99 so they can be selected like any other level.
property_frame["prop_loss_code"] = property_frame["prop_loss_code"].fillna(99)
loss_code = property_frame["prop_loss_code"]

# Codes 8 and 99 only count for the offenses where a property loss type is expected.
loss_expected = property_frame["offense_code"].isin(property_loss_offense_codes)

# Codes 1 through 7 are valid responses.
valid = property_frame.loc[loss_code.isin(range(1, 8))].assign(valid_prop_loss="Valid Response")
unknown = property_frame.loc[(loss_code == 8) & loss_expected].assign(valid_prop_loss="Unknown")
missing = property_frame.loc[(loss_code == 99) & loss_expected].assign(valid_prop_loss="Missing")

print("Getting property loss counts")

prop_loss_labelled = pd.concat([valid, unknown, missing], sort=False).dropna(subset=["valid_prop_loss"])
valid_prop_loss_counts = get_counts_offense(prop_loss_labelled, "valid_prop_loss")
valid_prop_loss_counts

In [None]:
####################################
#PROPERTY VALUE
####################################


# Work with integer values; nulls become the sentinel -99.
property_frame["property_value"] = property_frame["property_value"].fillna(-99).astype(int)
value = property_frame["property_value"]

# A value of 1 or -99 only counts when the property loss code is 2 through 7.
value_expected = property_frame["prop_loss_code"].isin(range(2, 8))

# Values from 2 up to 999999999 are valid responses.
valid = property_frame.loc[value.between(2, 999999999)].assign(valid_prop_value="Valid Response")
unknown = property_frame.loc[(value == 1) & value_expected].assign(valid_prop_value="Unknown")
missing = property_frame.loc[(value == -99) & value_expected].assign(valid_prop_value="Missing")

print("Getting property value counts")
prop_value_labelled = pd.concat([valid, unknown, missing], sort=False).dropna(subset=["valid_prop_value"])
valid_prop_value_counts = get_counts_offense(prop_value_labelled, "valid_prop_value")
valid_prop_value_counts

In [None]:
# Stack both property sections, order the columns, then keep only the
# "Property" columns by dropping the other three categories.
prop_sections = {
    "Type of property loss": valid_prop_loss_counts,
    "Value of property": valid_prop_value_counts,
}
prop_results = pd.concat(prop_sections).sort_index(axis=1)
prop_results = prop_results.drop(columns=["Type A","Person","Society"])
print("Finished property counts")

prop_results

## Offender

In [None]:
# Read the string-valued code columns as text. Left to type inference, a
# numeric-looking code ("200", "00", "99") can load as a number and then never
# matches the string code lists it is compared against.
offender_frame = pd.read_csv(
    raw_query_folder / f"qc_missingness_offender_frame_{state}.csv.gz",
    dtype={"offense_code": str, "age_code": str, "sex_code": str,
           "race_code": str, "ethnicity_code": str},
    low_memory=False,
)

In [None]:
####################################
#OFFENDER NUMBER
####################################

# Compare sequence numbers as floats.
offender_frame["offender_seq_num"] = offender_frame["offender_seq_num"].astype(float)
offender_seq = offender_frame["offender_seq_num"]

# 1 through 99 is valid, 0 is unknown, anything outside 0..99 is missing.
valid = offender_frame.loc[offender_seq.between(1, 99)].assign(valid_offender_seq_value="Valid Response")
unknown = offender_frame.loc[offender_seq == 0].assign(valid_offender_seq_value="Unknown")
missing = offender_frame.loc[(offender_seq < 0) | (offender_seq > 99)].assign(valid_offender_seq_value="Missing")

print("Getting offender sequence counts")
offender_seq_labelled = pd.concat([valid, unknown, missing], sort=False).dropna(subset=["valid_offender_seq_value"])
valid_offender_seq_counts = get_counts_offense(offender_seq_labelled, "valid_offender_seq_value")
valid_offender_seq_counts

In [None]:
####################################
# DEMOGRAPHICS
####################################

print("Getting offender demographics")

# Age, sex, race and ethnicity breakdowns, split by offense category.
(
    valid_offender_age_counts,
    valid_offender_sex_counts,
    valid_offender_race_counts,
    valid_offender_ethnicity_counts,
) = get_demographic_counts(offender_frame, get_counts_offense)

In [None]:
print("Finished offender counts")

# One section per offender data element, stacked in this order.
offender_sections = {
    "Number of offenders": valid_offender_seq_counts,
    "Age of offender id": valid_offender_age_counts,
    "Sex of offender": valid_offender_sex_counts,
    "Race of offender": valid_offender_race_counts,
    "Ethnicity of offender": valid_offender_ethnicity_counts,
}
offender_results = pd.concat(offender_sections)
offender_results

## Victim

### Part 1: Demographics

In [None]:
# Read the string-valued code columns as text. Left to type inference, a
# numeric-looking code ("100", "00", "99") can load as a number and then never
# matches the string code lists it is compared against.
victim_dem_frame = pd.read_csv(
    raw_query_folder / f"qc_missingness_victim_dem_frame_{state}.csv.gz",
    dtype={"offense_code": str, "age_code": str, "sex_code": str,
           "race_code": str, "ethnicity_code": str},
    low_memory=False,
)

In [None]:
####################################
#VICTIM NUMBER
####################################

# Compare sequence numbers as floats.
victim_dem_frame["victim_seq_num"] = victim_dem_frame["victim_seq_num"].astype(float)
victim_seq = victim_dem_frame["victim_seq_num"]

# 1 up to (but excluding) 999 is valid, exactly 999 is unknown, and anything
# at or below 0 or above 999 is missing.
valid = victim_dem_frame.loc[(victim_seq >= 1) & (victim_seq < 999)].assign(
    valid_victim_seq_value="Valid Response")
unknown = victim_dem_frame.loc[victim_seq == 999].assign(valid_victim_seq_value="Unknown")
missing = victim_dem_frame.loc[(victim_seq <= 0) | (victim_seq > 999)].assign(
    valid_victim_seq_value="Missing")

print("Getting victim sequence number counts")

victim_seq_labelled = pd.concat([valid, unknown, missing], sort=False).dropna(subset=["valid_victim_seq_value"])
valid_victim_seq_counts = get_counts_crime_types(victim_seq_labelled, "valid_victim_seq_value")
valid_victim_seq_counts

In [None]:
####################################
# DEMOGRAPHICS
####################################
print("Getting victim demographics counts")

# Age, sex, race and ethnicity breakdowns, split by crime type.
(
    valid_victim_age_counts,
    valid_victim_sex_counts,
    valid_victim_race_counts,
    valid_victim_ethnicity_counts,
) = get_demographic_counts(victim_dem_frame, get_counts_crime_types)

### Part 2: Injury

In [None]:
# Read offense_code as text. Left to type inference, a numeric-looking code
# such as "100" can load as a number and then never matches the string
# offense-code lists it is compared against.
victim_injury_frame = pd.read_csv(
    raw_query_folder / f"qc_missingness_victim_injury_frame_{state}.csv.gz",
    dtype={"offense_code": str},
    low_memory=False,
)

In [None]:
####################################
#VICTIM INJURY
####################################

valid = victim_injury_frame[victim_injury_frame.injury_code.isin(["B","I","L","M","N","O","T","U"])].copy()
valid["valid_victim_injury"] = "Valid Response"

# there are no "unknown" injury codes, so only valid and missing rows are
# counted (no "Unknown" frame is built, since it was never used)
missing = victim_injury_frame[victim_injury_frame.injury_code.isna()].copy()
missing["valid_victim_injury"] = "Missing"

print("Getting victim injury counts")
valid_victim_injury_counts = get_counts_crime_types(pd.concat([valid,missing],sort=False).dropna(subset=["valid_victim_injury"]), "valid_victim_injury")
valid_victim_injury_counts

### Part 3: Circumstances

In [None]:
# Read offense_code as text. Left to type inference, a numeric-looking code
# such as "100" can load as a number and then never matches the string
# offense-code lists it is compared against.
victim_circumstances_frame = pd.read_csv(
    raw_query_folder / f"qc_missingness_victim_circumstances_frame_{state}.csv.gz",
    dtype={"offense_code": str},
    low_memory=False,
)

In [None]:
####################################
#VICTIM CIRCUMSTANCES
####################################
# Compare circumstance codes as floats.
victim_circumstances_frame["circumstance_code"] = victim_circumstances_frame["circumstance_code"].astype(float)
circumstance = victim_circumstances_frame["circumstance_code"]


# Codes 1 through 34 are valid, except code 10 which is reported as unknown;
# anything outside 1..34 is missing.
valid = victim_circumstances_frame.loc[circumstance.between(1, 34) & (circumstance != 10)].assign(
    valid_victim_circumstances="Valid Response")
unknown = victim_circumstances_frame.loc[circumstance == 10].assign(
    valid_victim_circumstances="Unknown")
missing = victim_circumstances_frame.loc[(circumstance < 1) | (circumstance > 34)].assign(
    valid_victim_circumstances="Missing")
print("Getting victim circumstances counts")

circumstances_labelled = pd.concat([valid, unknown, missing], sort=False).dropna(subset=["valid_victim_circumstances"])
valid_victim_circumstances_counts = get_counts_crime_types(circumstances_labelled, "valid_victim_circumstances")
valid_victim_circumstances_counts

### Part 4: Victim-offender relationship

In [None]:
# Read offense_code as text. Left to type inference, a numeric-looking code
# such as "100" can load as a number and then never matches the string
# offense-code lists it is compared against.
victim_relationship_frame = pd.read_csv(
    raw_query_folder / f"qc_missingness_victim_relationship_frame_{state}.csv.gz",
    dtype={"offense_code": str},
    low_memory=False,
)

In [None]:
####################################
#VICTIM RELATIONSHIP
####################################
# Relationship codes counted as a valid response; "RU" is handled separately as unknown.
known_relationship_codes = ["AQ", "BE", "BG", "CF", "CH", "CO", "CS", "EE", "ER", "FC",
                            "FP", "FR", "GC", "GP", "HR", "IL", "NE", "OF", "OK", "PA",
                            "SB", "SC", "SE", "SP", "SS", "ST", "VO", "XR", "XS"]
relationship = victim_relationship_frame["relationship_code"]

valid = victim_relationship_frame.loc[relationship.isin(known_relationship_codes) & (relationship != "RU")].assign(
    valid_victim_relationship="Valid Response")
unknown = victim_relationship_frame.loc[relationship == "RU"].assign(
    valid_victim_relationship="Unknown")

# Anything that is neither a known code nor "RU" counts as missing.
missing = victim_relationship_frame.loc[~relationship.isin(known_relationship_codes + ["RU"])].assign(
    valid_victim_relationship="Missing")

print("Getting victim offender relationship counts")
relationship_labelled = pd.concat([valid, unknown, missing], sort=False).dropna(subset=["valid_victim_relationship"])
valid_victim_relationship_counts = get_counts_crime_types(relationship_labelled, "valid_victim_relationship")
valid_victim_relationship_counts

In [None]:
print("Finished with victim counts")
# One section per victim data element; a level absent from a section is shown as 0.
victim_sections = {
    "Number of victims": valid_victim_seq_counts,
    "Age of victim id": valid_victim_age_counts,
    "Sex of victim": valid_victim_sex_counts,
    "Race of victim": valid_victim_race_counts,
    "Ethnicity of victim": valid_victim_ethnicity_counts,
    "Injury": valid_victim_injury_counts,
    "Victim-offender relationship": valid_victim_relationship_counts,
    "Circumstances": valid_victim_circumstances_counts,
}
victim_results = pd.concat(victim_sections).fillna(0)
victim_results

## Arrestee

In [None]:
# Read the string-valued code columns as text. Left to type inference, a
# numeric-looking code ("200", "00", "99") can load as a number and then never
# matches the string code lists it is compared against.
arrestee_dem_frame = pd.read_csv(
    raw_query_folder / f"qc_missingness_arrestee_dem_frame_{state}.csv.gz",
    dtype={"offense_code": str, "age_code": str, "sex_code": str,
           "race_code": str, "ethnicity_code": str},
    low_memory=False,
)

In [None]:
####################################
# DEMOGRAPHICS
####################################
print("Getting arrestee demographics counts")

# Age, sex, race and ethnicity breakdowns, split by offense category.
(
    valid_arrestee_age_counts,
    valid_arrestee_sex_counts,
    valid_arrestee_race_counts,
    valid_arrestee_ethnicity_counts,
) = get_demographic_counts(arrestee_dem_frame, get_counts_offense)

In [None]:
####################################
# TYPE OF ARREST
####################################

# "O", "S" and "T" are the valid arrest types; every other value (nulls included) is missing.
has_arrest_type = arrestee_dem_frame["arrest_type_code"].isin(["O", "S", "T"])

valid = arrestee_dem_frame.loc[has_arrest_type].assign(valid_arrest_type="Valid Response")
missing = arrestee_dem_frame.loc[~has_arrest_type].assign(valid_arrest_type="Missing")

print("Getting arrest type counts")

arrest_type_labelled = pd.concat([valid, missing], sort=False).dropna(subset=["valid_arrest_type"])
valid_arrest_type_counts = get_counts_offense(arrest_type_labelled, "valid_arrest_type")
valid_arrest_type_counts

In [None]:
####################################
# MULTIPLE ARREST INDICATOR
####################################

multiple_indicator = arrestee_dem_frame["multiple_indicator"]

# "C" and "M" are valid responses, "N" is reported as not applicable, and
# every other value (nulls included) is missing.
valid = arrestee_dem_frame.loc[multiple_indicator.isin(["C", "M"])].assign(
    valid_arrestee_multiple="Valid Response")
unknown = arrestee_dem_frame.loc[multiple_indicator == "N"].assign(
    valid_arrestee_multiple="Not Applicable")
missing = arrestee_dem_frame.loc[~multiple_indicator.isin(["C", "M", "N"])].assign(
    valid_arrestee_multiple="Missing")

print("Getting multiple arrest indicator counts")

multiple_labelled = pd.concat([valid, unknown, missing], sort=False).dropna(subset=["valid_arrestee_multiple"])
valid_arrestee_multiple_counts = get_counts_offense(multiple_labelled, "valid_arrestee_multiple")
valid_arrestee_multiple_counts

In [None]:
####################################
# JUVENILE DISPOSITION
####################################

# A disposition is on record when the code is "H" or "R".
arrestee_dem_frame["valid_id_under_18"] = arrestee_dem_frame["under_18_disposition_code"].isin(["H","R"])

# An arrestee counts as under 18 when age_code is "AG" and age_num lies in
# [0, 18). The test is vectorised instead of applied row by row: it is
# faster, works on an empty frame, and a non-numeric or null age_num simply
# evaluates to False.
arrestee_age_num = pd.to_numeric(arrestee_dem_frame["age_num"], errors="coerce")
arrestee_dem_frame["valid_age_under_18"] = (
    (arrestee_dem_frame["age_code"] == "AG")
    & (arrestee_age_num >= 0)
    & (arrestee_age_num < 18)
)


valid = arrestee_dem_frame[(arrestee_dem_frame.valid_age_under_18 & arrestee_dem_frame.valid_id_under_18)].copy()
valid["valid_arrestee_juvenile"] = "Valid Response"

# Everyone not identified as under 18 (including arrestees without a usable
# age) is reported under the "Not Applicable (Over 18)" label.
unknown = arrestee_dem_frame[~arrestee_dem_frame.valid_age_under_18].copy()
unknown["valid_arrestee_juvenile"] = "Not Applicable (Over 18)"

# Under 18 but without a recorded disposition.
missing = arrestee_dem_frame[(~arrestee_dem_frame.valid_id_under_18)&(arrestee_dem_frame.valid_age_under_18)].copy()
missing["valid_arrestee_juvenile"] = "Missing"

print("Getting juvenile disposition counts")

valid_arrestee_juvenile_counts = get_counts_offense(pd.concat([valid,unknown,missing],sort=False).dropna(subset=["valid_arrestee_juvenile"]), "valid_arrestee_juvenile")
valid_arrestee_juvenile_counts


In [None]:
# Read the string-valued code columns as text. Left to type inference, a
# numeric-looking code ("200", "01", "95") can load as a number and then never
# matches the string code lists it is compared against.
arrestee_weapon_frame = pd.read_csv(
    raw_query_folder / f"qc_missingness_arrestee_weapon_frame_{state}.csv.gz",
    dtype={"offense_code": str, "weapon_code": str},
    low_memory=False,
)

In [None]:
####################################
# WEAPON
####################################
# Nulls get the sentinel "999" so they can be selected like any other level.
arrestee_weapon_frame["weapon_code"] = arrestee_weapon_frame["weapon_code"].fillna("999")
arrestee_weapon = arrestee_weapon_frame["weapon_code"]

# Recognised weapon codes are valid, "95" is unknown and the sentinel is missing.
valid = arrestee_weapon_frame.loc[arrestee_weapon.isin(relevant_weapons)].assign(
    valid_weapon_id="Valid Response")
unknown = arrestee_weapon_frame.loc[arrestee_weapon == "95"].assign(valid_weapon_id="Unknown")
missing = arrestee_weapon_frame.loc[arrestee_weapon == "999"].assign(valid_weapon_id="Missing")

print("Getting arrestee weapon id counts")
arrestee_weapon_labelled = pd.concat([valid, unknown, missing], sort=False).dropna(subset=["valid_weapon_id"])
valid_weapon_code_counts = get_counts_offense(arrestee_weapon_labelled, "valid_weapon_id")
valid_weapon_code_counts

In [None]:
print("Finished arrestee counts")
# One section per arrestee data element; a level absent from a section is shown as 0.
arrestee_sections = {
    "Age of arrestee id": valid_arrestee_age_counts,
    "Sex of arrestee": valid_arrestee_sex_counts,
    "Race of arrestee": valid_arrestee_race_counts,
    "Ethnicity of arrestee": valid_arrestee_ethnicity_counts,
    "Type of arrest": valid_arrest_type_counts,
    "Arrestee armed with": valid_weapon_code_counts,
    "Multiple arrest indicator": valid_arrestee_multiple_counts,
    "Juvenile disposition": valid_arrestee_juvenile_counts,
}
arrestee_results = pd.concat(arrestee_sections).fillna(0)
arrestee_results

In [None]:
print("Finished generating all counts.")
output_file = output_dir / f"missing_variables_{state}_{year}.xlsx"
print("Results outputted to",output_file)

# One worksheet per result table, written in this order; each sheet name
# carries the report year.
result_sheets = [
    ("Offense", offense_results),
    ("Admin", admin_results),
    ("Property", prop_results),
    ("Victim", victim_results),
    ("Offender", offender_results),
    ("Arrestee", arrestee_results),
]

with pd.ExcelWriter(output_file) as writer:
    for sheet_label, results in result_sheets:
        results.to_excel(writer, sheet_name=f"{sheet_label} {year}")