Merge Murder Accountability Project (MAP) FBI Return A text files

In [1]:
import pandas as pd
import struct
import sys
import glob
import numpy as np

Step 1: Parse data from fixed-width files
The code in this step is largely adapted from Donald Braman's Return A conversion scripts — specifically schemagenerating.py and fwf2psv.py.

The cells below describe the layout of the fixed-width records and then convert the FBI's textual representation of numbers into integers. The schema for the fixed-width files is documented in `Ret A Rec Descrip.pdf`.

In [2]:
# Widths of the fields in the one-off header that opens every record,
# grouped by the part of the header they describe.
file_header_widths = (
    [1, 2, 7, 2, 1, 2, 5, 2, 1, 7, 1]   # id ... covered_by_group
    + [6, 4, 2, 1]                      # last_update ... agency_count
    + [9, 3, 3] * 3                     # (pop, county, msa) x 3
    + [9, 9, 9]                         # county populations
    + [1, 1, 1, 1]                      # pop_source ... special_mail_addr
    + [24, 6]                           # agency name and state
    + [30] * 4                          # four address lines
    + [5, 1, 29]                        # zip, old_pop_group, unused filler
)

# Widths of the header that precedes each month's cards:
# a 2-wide and a 6-wide field followed by nine single-character fields.
monthly_header_widths = [2, 6] + [1] * 9

# Cards 0-3 share one layout: 28 fields of width 5.
cards_0123_widths = [5] * 28

# Card 4 has three fields.
card_4_widths = [3, 3, 7]

In [3]:
# Layout of a single month: the monthly header, the shared crime-card layout
# repeated four times, then card 4.
mw =  monthly_header_widths + cards_0123_widths * 4 + card_4_widths
# A full record is the file header followed by twelve monthly layouts.
field_widths = file_header_widths + mw * 12

In [4]:
# Sanity check: all field widths together must add up to 7385.
assert sum(field_widths) == 7385

In [5]:
# Names of the file-header fields, listed in record order. The order matters:
# names are later paired with widths purely by position.
file_header_names = [
    "id", "state", "agency_ori_7", "ori_group", "division", 
    "year", "sequence", "juv_age", "core_ci", "covered_by", "covered_by_group", 
    "last_update", "field_office", "num_months", "agency_count",
    "pop1", "county1", "msa1", 
    "pop2", "county2", "msa2", 
    "pop3", "county3", "msa3",
    "county1_pop", "county2_pop", "county3_pop", 
    "pop_source", 
    "follow_up", "special_mail_group", "special_mail_addr",
    "agency_name", "agency_state", 
    "address1", "address2", "address3", "address4", "zip",
    "old_pop_group",
    "unused_header"
]

# Names of the fields in the header that precedes each month's cards.
monthly_header_names = [
    "month_in", "date_last_update", 
    "card_0_type", "card_1_type", "card_2_type", "card_3_type", "card_4_type", 
    "card_0_pt", "card_1_pt", "card_2_pt", "card_3_pt"
]

# One prefix per crime card; each is combined with every offense name below
# to form that card's field names.
crime_card_prefixes = [
    "unfounded_",
    "actual_",
    "cleared_",
    "cleared_u18_"
]

# Offense field names shared by the four crime cards, in record order.
cards_0123_names = [
    "murder",
    "manslaughter",
    "rape_total", "rape_by_force",  "rape_attempt",
    "robbery_total", "robbery_gun", "robbery_knife", "robbery_othweap",  "robbery_strong_arm",
    "assault_total", "assault_gun", "assault_knife", "assault_othweap", "assault_hands", "assault_simple", 
    "burglary_total", "burglary_forcible_entry", "burglary_no_forcible_entry", "burglary_attempt",
    "larceny", "motor_vehicle_theft", "auto_theft", "truck_bus_theft", "other_vehicle_theft", 
    "total_all_fields", "larceny_under_50_dollars", "unused"
]

In [6]:
# Every crime-card field name: each card prefix joined to each offense name,
# in prefix-major order.
cards = [
    card_prefix + offense_name
    for card_prefix in crime_card_prefixes
    for offense_name in cards_0123_names
]

# Field names of card 4.
card_4_names = ["officers_killed_felony", "officers_killed_accident", "officers_assaulted"]

# Field names for a single month, before the month prefix is applied.
card_names = [*monthly_header_names, *cards, *card_4_names]

month_prefixes = [
    "jan_", "feb_", "mar_", "apr_",
    "may_", "jun_", "jul_", "aug_",
    "sep_", "oct_", "nov_", "dec_",
]

# The monthly field names repeated for all twelve months, month-major.
twelve_cards = [
    month_prefix + monthly_name
    for month_prefix in month_prefixes
    for monthly_name in card_names
]

# Full list of field names for one record: file header, then the twelve months.
field_names = [*file_header_names, *twelve_cards]

In [7]:
# Names and widths are paired by position, so there must be exactly one name per width.
assert len(field_names) == len(field_widths)

In [8]:
def _is_wanted_column(name):
    """Return True for the identifying columns and the murder/assault counts.

    A count column qualifies when its name contains "_actual_" or "_cleared_",
    mentions "murder" or "assault", and does not contain "18".
    """
    if name in ("year", "agency_ori_7", "agency_name", "ori_group", "pop1", "pop2", "pop3"):
        return True
    is_count = "_actual_" in name or "_cleared_" in name
    is_offense_of_interest = "murder" in name or "assault" in name
    return is_count and is_offense_of_interest and "18" not in name


# Keep the wanted columns in their original record order.
cols_trimmed = list(filter(_is_wanted_column, field_names))

len(cols_trimmed)

175

In [9]:
# Peek at the first three retained column names.
cols_trimmed[:3]

['agency_ori_7', 'ori_group', 'year']

In [10]:
# Position of each retained column within the full record layout
# (list.index returns the first matching position).
trimmed_indexes = [ field_names.index(c) for c in cols_trimmed ]
# Peek at the first three positions.
trimmed_indexes[:3]

[2, 3, 5]

In [11]:
# The retained positions must never step backwards, i.e. sorting them is a no-op.
assert trimmed_indexes == sorted(trimmed_indexes)

In [12]:
def build_parser(widths=None, keep_indexes=None):
    """Build a function that splits one fixed-width record into selected fields.

    Parameters
    ----------
    widths : sequence of int, optional
        Width in bytes of every field, in record order. Defaults to the
        module-level ``field_widths``.
    keep_indexes : iterable of int, optional
        Positions (into ``widths``) of the fields to return. Defaults to the
        module-level ``trimmed_indexes``. Duplicates and out-of-range
        positions are ignored; output always follows record order.

    Returns
    -------
    callable
        ``parse_line(line)`` taking one raw record as ``bytes`` and returning
        the selected fields as a list of stripped, latin-1 decoded strings.

    Notes
    -----
    Both arguments are captured when the parser is built, so rebuild the
    parser after changing the layout or the column selection.
    """
    if widths is None:
        widths = field_widths
    if keep_indexes is None:
        keep_indexes = trimmed_indexes

    widths = list(widths)
    total_width = sum(widths)
    fieldstruct = struct.Struct(" ".join("{}s".format(fw) for fw in widths))
    parse_bytes = fieldstruct.unpack_from

    # Resolve the wanted positions once, instead of scanning the selection
    # list for every field of every line.
    keep = sorted(i for i in set(keep_indexes) if 0 <= i < len(widths))

    def parse_line(line):
        # Drop surrounding whitespace (including the newline) and NUL bytes.
        # NOTE(review): removing NUL bytes shifts later fields to the left —
        # confirm that is the intended handling for the source files.
        line = line.strip().replace(b"\x00", b"")
        # Pad short records so the struct always has enough bytes to unpack.
        line = line + b" " * (total_width - len(line))
        fields = parse_bytes(line)
        return [ fields[i].strip().decode("latin-1") for i in keep ]

    return parse_line

# Build the line parser once; every file is parsed with this same function.
parser = build_parser()

In [13]:
def convert_year(year_str):
    """Expand a two-digit year into a four-digit one.

    Values below 60 are placed in the 2000s; everything else in the 1900s.
    """
    two_digit = int(year_str)
    century = 2000 if two_digit < 60 else 1900
    return century + two_digit

In [14]:
# Trailing characters that mark a negative number. A character's position in
# this string is the final digit it stands for ("}" -> 0, "J" -> 1, ... "R" -> 9).
NEGATIVE_SYMBOLS = "}JKLMNOPQR"

In [15]:
def parse_reta_num_string(num_string):
    """Convert a Return A count field into an integer.

    Plain digit strings are parsed with ``int``. Otherwise the value is
    treated as negative: the leading characters give the tens and the final
    character's position in ``NEGATIVE_SYMBOLS`` gives the last digit, so
    ``"00001J"`` becomes ``-11``.

    Raises
    ------
    ValueError
        If the leading characters are not digits or the final character is
        not one of ``NEGATIVE_SYMBOLS``.
    """
    try:
        return int(num_string)
    except ValueError:
        # Catch only the parse failure; a bare ``except`` would also swallow
        # KeyboardInterrupt and SystemExit raised while parsing.
        tens = int(num_string[:-1])
        last_digit = NEGATIVE_SYMBOLS.index(num_string[-1])
        return -10 * tens - last_digit

In [16]:
# Worked example: a value ending in one of the negative symbols.
parse_reta_num_string("00001J")

-11

In [17]:
def parse_file(path):
    """Parse one Return A text file into a DataFrame of the retained columns.

    A progress line is written to stderr. Each line is split with the
    module-level ``parser``. Empty strings become NaN, and rows lacking
    ``agency_ori_7`` or ``year`` are dropped. ``year`` is expanded with
    ``convert_year``, columns starting with "pop" are cast to int, and
    columns containing "_actual_" or "_cleared_" are decoded with
    ``parse_reta_num_string``.

    Parameters
    ----------
    path : str
        Location of the fixed-width text file.

    Returns
    -------
    pandas.DataFrame
        One row per retained record, with columns ``cols_trimmed``.
    """
    sys.stderr.write(f"Parsing {path}\n")
    sys.stderr.flush()

    # Use a context manager so the file handle is closed as soon as the
    # lines have been read, instead of whenever it is garbage collected.
    with open(path, "rb") as reta_file:
        rows = [ parser(line) for line in reta_file ]

    df = (
        pd.DataFrame(rows, columns = cols_trimmed)
        .replace("", np.nan)
        .dropna(subset = [ "agency_ori_7", "year" ])
    )
    df["year"] = df["year"].apply(convert_year)

    for col in df.columns:
        if col.startswith("pop"):
            df[col] = df[col].astype(int)
        elif "_actual_" in col or "_cleared_" in col:
            df[col] = df[col].apply(parse_reta_num_string).astype(int)

    return df

In [18]:
# Parse every yearly file (1960 through 2017) and stack the results in a
# single concat. Calling pd.concat inside the loop would re-copy all
# previously parsed rows on every iteration.
parsed = pd.concat([
    parse_file(f"../Data/text_files/RETA{year}.TXT")
    for year in range(1960, 2018)
])

Parsing ../Data/text_files/RETA1960.TXT
Parsing ../Data/text_files/RETA1961.TXT
Parsing ../Data/text_files/RETA1962.TXT
Parsing ../Data/text_files/RETA1963.TXT
Parsing ../Data/text_files/RETA1964.TXT
Parsing ../Data/text_files/RETA1965.TXT
Parsing ../Data/text_files/RETA1966.TXT
Parsing ../Data/text_files/RETA1967.TXT
Parsing ../Data/text_files/RETA1968.TXT
Parsing ../Data/text_files/RETA1969.TXT
Parsing ../Data/text_files/RETA1970.TXT
Parsing ../Data/text_files/RETA1971.TXT
Parsing ../Data/text_files/RETA1972.TXT
Parsing ../Data/text_files/RETA1973.TXT
Parsing ../Data/text_files/RETA1974.TXT
Parsing ../Data/text_files/RETA1975.TXT
Parsing ../Data/text_files/RETA1976.TXT
Parsing ../Data/text_files/RETA1977.TXT
Parsing ../Data/text_files/RETA1978.TXT
Parsing ../Data/text_files/RETA1979.TXT
Parsing ../Data/text_files/RETA1980.TXT
Parsing ../Data/text_files/RETA1981.TXT
Parsing ../Data/text_files/RETA1982.TXT
Parsing ../Data/text_files/RETA1983.TXT
Parsing ../Data/text_files/RETA1984.TXT


In [19]:
# Preview the first rows of the combined frame.
parsed.head()

Unnamed: 0,agency_ori_7,ori_group,year,pop1,pop2,pop3,agency_name,jan_actual_murder,jan_actual_assault_total,jan_actual_assault_gun,...,dec_actual_assault_othweap,dec_actual_assault_hands,dec_actual_assault_simple,dec_cleared_murder,dec_cleared_assault_total,dec_cleared_assault_gun,dec_cleared_assault_knife,dec_cleared_assault_othweap,dec_cleared_assault_hands,dec_cleared_assault_simple
0,AL00100,9A,1960,165354,0,0,JEFFERSON,0,7,0,...,0,0,0,0,0,0,0,0,0,0
1,AL00101,4,1960,33054,0,0,BESSEMER,0,6,0,...,0,0,0,0,0,0,0,0,0,0
2,AL00102,1C,1960,340887,0,0,BIRMINGHAM,2,82,0,...,0,0,0,0,0,0,0,0,0,0
3,AL00103,5,1960,12680,0,0,MOUNTAIN BROOK,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AL00104,5,1960,15816,0,0,FAIRFIELD,0,4,0,...,0,0,0,0,0,0,0,0,0,0


Step 2: Fix apparent ORI errors

In this step, we fix a few rows where an agency's ORI code appears to be mislabeled, based on duplicate entries for the same ORI.

In [20]:
def fix_errors(df):
    """Relabel rows whose ORI code appears to be wrong.

    The frame is modified in place and also returned. Two corrections apply:

    * "VA02101" rows from 1967 or 1970 in group "9D" become "VA021SP".
    * "SC02901" rows from 1972 in group "9A" become "VA02901".

    NOTE(review): the second rule turns an SC-prefixed code into a
    VA-prefixed one — confirm that this is the intended target.
    """
    ori = df["agency_ori_7"]
    year = df["year"]
    group = df["ori_group"]

    # Both selections are worked out before any value is rewritten; they
    # target different original codes, so neither affects the other.
    va_rows = (ori == "VA02101") & year.isin([ 1967, 1970 ]) & (group == "9D")
    sc_rows = (ori == "SC02901") & (year == 1972) & (group == "9A")

    df.loc[va_rows, "agency_ori_7"] = "VA021SP"
    df.loc[sc_rows, "agency_ori_7"] = "VA02901"

    return df

In [21]:
# Apply the ORI corrections to the combined frame.
parsed = fix_errors(parsed)

In [22]:
# Count rows whose (year, agency_ori_7) pair repeats an earlier row.
parsed.duplicated(subset = [ "year", "agency_ori_7" ]).sum()

32

Step 3: Extract and save agency-year metadata

In [23]:
# One row per distinct combination of year, agency identifiers and populations,
# with the three population columns stored as integers.
agency_metadata = (
    parsed
    .loc[:, [ "year", "agency_ori_7", "agency_name", "ori_group", "pop1", "pop2", "pop3" ]]
    .drop_duplicates()
    .astype({ pop_column: int for pop_column in ("pop1", "pop2", "pop3") })
)

In [24]:
# Preview the first rows of the agency-year metadata.
agency_metadata.head()

Unnamed: 0,year,agency_ori_7,agency_name,ori_group,pop1,pop2,pop3
0,1960,AL00100,JEFFERSON,9A,165354,0,0
1,1960,AL00101,BESSEMER,4,33054,0,0
2,1960,AL00102,BIRMINGHAM,1C,340887,0,0
3,1960,AL00103,MOUNTAIN BROOK,5,12680,0,0
4,1960,AL00104,FAIRFIELD,5,15816,0,0


In [25]:
# Save the agency-year metadata, leaving the DataFrame index out of the file.
agency_metadata.to_csv("reta-agency-metadata.csv", index = False)

In [None]:
# Write the full `parsed` frame to a CSV file, without the index.
# NOTE(review): the final cell of the notebook writes this same file again —
# confirm that both writes are needed.
parsed.to_csv("reta-all.csv",index = False)

Step 4: Summarize into annual counts

Note: In this step, we convert the "wide" structure of the data into an easier-to-analyze structure.

In [28]:
# Reshape to one row per agency, year and monthly count column, then label
# each row with its offense and count type.
#
# Column names look like "<month>_<count type>_<offense>". The labels are
# worked out once per distinct column name and mapped onto the rows. Splitting
# the name separately for every row runs Python code, and allocates fresh
# strings, for each row of the long frame; the recorded run of this cell ended
# in a MemoryError.
# NOTE(review): the long frame still has len(parsed) rows for every count
# column, so it may not fit in memory on small machines.
tidy_counts = (
    parsed
    .melt(
        id_vars = [ "agency_ori_7", "year" ],
        value_vars = [ c for c in parsed.columns
            if c.count("_") > 0 and c.split("_")[1] in [ "actual", "cleared" ] ]
    )

    .assign(
        offense = lambda x: x["variable"].map({
            name: name.split("_", 2)[-1] for name in x["variable"].unique()
        }),

        count_type = lambda x: x["variable"].map({
            name: name.split("_", 2)[-2] for name in x["variable"].unique()
        }),
    )

)

MemoryError: Unable to allocate 2.40 GiB for an array with shape (2, 161364168) and data type object

In [None]:
# Preview the first rows of the long-format counts.
tidy_counts.head()

In [None]:
# Number of rows per count type, ordered by label.
tidy_counts["count_type"].value_counts().sort_index()

In [None]:
# Number of rows per offense, ordered by label.
tidy_counts["offense"].value_counts().sort_index()

In [None]:
# How often each negative value occurs, listed from the value closest to zero
# downwards.
# NOTE(review): if this expression shares a cell with the statement that
# follows it, its result is not displayed — confirm the cell boundary.
(
    tidy_counts
    .loc[tidy_counts["value"] < 0, "value"]
    .value_counts()
    .sort_index(ascending = False)
)
# Total the monthly values per agency, year and offense, with one column per
# count type.
annual_counts = (
    tidy_counts
    .groupby(by = [ "agency_ori_7", "year", "offense", "count_type" ])
    ["value"]
    .sum()
    .unstack(level = "count_type")
    .reset_index()
)

In [None]:
# Preview the first rows of the annual totals.
annual_counts.head()

Step 5: Save counts for subsequent analysis

In [None]:
# Save the annual totals, leaving the DataFrame index out of the file.
annual_counts.to_csv("reta-annual-counts.csv", index = False)

In [None]:
# Step 6: Save entire dataframe into a single file

In [None]:
# Write the full `parsed` frame to a single CSV file, without the index.
parsed.to_csv("reta-all.csv",index = False)