# What This Is

This notebook consolidates crime data curated from the FBI and city-data.com into a single table (`crime_data.pkl`).

# Imports and Definitions

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Full state name -> two-letter abbreviation.
# Used further down to suffix city names (e.g. "Boise, ID").
abbreviations = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}

In [3]:
# Place names grouped by full state name (one entry per state, in the same
# order as `abbreviations`).
places = {
    "Alabama": ["Birmingham", "Montgomery"],
    "Alaska": ["Anchorage", "Juneau"],
    "Arizona": ["Phoenix", "Tucson"],
    "Arkansas": ["Little Rock", "Fort Smith"],
    "California": ["San Diego", "Los Angeles", "Sacramento"],
    "Colorado": ["Denver", "Colorado Springs", "Aurora"],
    "Connecticut": ["Bridgeport", "Hartford"],
    "Delaware": ["Dover", "Wilmington"],
    "Florida": ["Jacksonville", "Miami", "Tallahassee"],
    "Georgia": ["Atlanta", "Savannah"],
    "Hawaii": ["Honolulu", "Kauai", "Maui"],
    "Idaho": ["Boise", "Meridian"],
    "Illinois": ["Chicago", "Springfield"],
    "Indiana": ["Fort Wayne", "Indianapolis"],
    "Iowa": ["Cedar Rapids", "Des Moines"],
    "Kansas": ["Topeka", "Wichita"],
    "Kentucky": ["Frankfort", "Louisville"],
    "Louisiana": ["Baton Rouge", "New Orleans"],
    "Maine": ["Augusta", "Portland"],
    "Maryland": ["Baltimore", "Annapolis"],
    "Massachusetts": ["Boston", "Worcester"],
    "Michigan": ["Detroit", "Lansing"],
    "Minnesota": ["Minneapolis", "Saint Paul"],
    "Mississippi": ["Jackson", "Gulfport"],
    "Missouri": ["Kansas City", "Jefferson City"],
    "Montana": ["Billings", "Helena"],
    "Nebraska": ["Lincoln", "Omaha"],
    "Nevada": ["Carson City", "Las Vegas", "Reno"],
    "New Hampshire": ["Concord", "Manchester"],
    "New Jersey": ["Newark", "Trenton"],
    "New Mexico": ["Albuquerque", "Santa Fe"],
    "New York": ["Albany", "New York City"],
    "North Carolina": ["Charlotte", "Raleigh"],
    "North Dakota": ["Bismarck", "Fargo"],
    "Ohio": ["Columbus", "Cleveland"],
    "Oklahoma": ["Oklahoma City", "Tulsa"],
    "Oregon": ["Portland", "Salem"],
    "Pennsylvania": ["Harrisburg", "Philadelphia"],
    "Rhode Island": ["Providence", "Warwick"],
    "South Carolina": ["Charleston", "Columbia"],
    "South Dakota": ["Pierre", "Sioux Falls"],
    "Tennessee": ["Nashville", "Memphis"],
    "Texas": ["Austin", "Houston"],
    "Utah": ["Salt Lake City", "West Valley City"],
    "Vermont": ["Burlington", "Montpelier"],
    "Virginia": ["Richmond", "Virginia Beach"],
    "Washington": ["Olympia", "Seattle"],
    "West Virginia": ["Charleston", "Huntington"],
    "Wisconsin": ["Madison", "Milwaukee"],
    "Wyoming": ["Casper", "Cheyenne"],
}

In [4]:
# Years 2010 through 2016, both ends included.
years = [year for year in range(2010, 2016 + 1)]

In [5]:
# places missing from citydata, grouped by state
missing_places = {
    "Hawaii": ["Honolulu", "Kauai", "Maui"],
    # not actually missing, but has like no data
    "Kentucky": ["Louisville"],
    "Nevada": ["Carson City"],
    "Tennessee": ["Nashville"],
    "Utah": ["West Valley City"],
}

In [6]:
# alternative names in fbi: city name used in this notebook -> name to look up
# in `fbi` (already lower-case)
corrections = {
    "Charlotte":        "charlotte-mecklenburg",
    "Las Vegas":        "las vegas metropolitan police department",
    "Louisville":       "louisville metro",
    "Savannah":         "savannah-chatham metropolitan",
    "West Valley City": "west valley",
    "New York City":    "new york",
}

In [7]:
# Load the pickled crime table (presumably the city-data.com scrape, going by
# the file name). The processing cells read its index levels 'state', 'city'
# and 'year'.
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
citydata = pd.read_pickle("citydata_crime_data.pkl")

In [8]:
# Load the pickled crime table (presumably the FBI data, going by the file
# name). The processing cells look rows up by lower-cased state name,
# lower-cased city name and year.
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
fbi = pd.read_pickle("fbi_crime_data.pkl")

# Processing

In [9]:
# Start the consolidated table from a copy of `citydata`, so the cells below
# modify `data` without touching the frame that was loaded.
data = citydata.copy()

In [10]:
# add populations
# For every (state, city, year) row of `data`, copy the 'population' value of
# the matching `fbi` row. Rows without a match in `fbi` keep NaN.
data['population'] = np.nan
for state, city, year in zip(data.index.get_level_values('state'),
                             data.index.get_level_values('city'),
                             data.index.get_level_values('year')):
    # `fbi` is looked up with lower-cased names; a few cities are listed there
    # under the alternative name given in `corrections`.
    fbi_key = (state.lower(), corrections.get(city, city).lower(), year)
    try:
        fbi_row = fbi.loc[fbi_key]
    except KeyError:
        continue  # no record in `fbi` for this place and year
    # Assign with a single .loc call on `data` itself. The chained form
    # `data.loc[state, city, year].loc['population'] = ...` writes into a
    # temporary row object, so the value may never reach `data`.
    data.loc[(state, city, year), 'population'] = fbi_row.loc['population']

In [11]:
# set `fbi` column names to match `data`
fbi = fbi.rename(columns={
    "aggravated assault": "assault",
    "forcible rape": "rape",
    "larceny or theft": "theft",
    "motor vehicle theft": "vehicle theft",
})
# rearrange `fbi` columns to match `data` (same columns, same order)
fbi = fbi.loc[:, data.columns]

In [12]:
# unfold multiindex to work with
# Remember the column layout first; it is used to restore the column order
# once the index is rebuilt.
col_order = data.columns
data = data.reset_index()

In [13]:
# add missing cities
# Collect, for every place listed in `missing_places`, the rows `fbi` holds for
# it, tagged with the notebook's own state/city spelling.
new_data = []
for state, cities in missing_places.items():
    for city in cities:
        fbi_city = corrections.get(city) or city  # name under which `fbi` lists the city
        try:
            city_rows = fbi.loc[state.lower(), fbi_city.lower()]
        except KeyError:
            # `fbi` has no rows for this place: report it and move on
            print("fbi does not contain {city}, {state}"
                  .format(city=city, state=state))
            continue
        new_data.append(city_rows.reset_index().assign(state=state, city=city))

fbi does not contain Kauai, Hawaii
fbi does not contain Maui, Hawaii
fbi does not contain Carson City, Nevada


In [14]:
# Merge the rows collected from `fbi` into `data`. They are concatenated after
# the existing rows, so keep='last' lets a row from `fbi` replace an existing
# row with the same (state, city, year). Duplicates are dropped BEFORE sorting
# so that this precedence comes from the concatenation order and not from how
# the sort happens to order rows with equal keys.
data = pd.concat([data, *new_data]) \
           .drop_duplicates(['state', 'city', 'year'], keep='last') \
           .sort_values(['state', 'city', 'year'])

In [15]:
# add state abbreviations to city names ("<city>, <abbreviation>"), then drop
# the now redundant 'state' column
data['city'] = data['city'] + ", " + data['state'].map(lambda name: abbreviations[name])
data = data.drop(columns='state')

In [16]:
# refold multiindex: index by (city, year) and restore the saved column order
data = data.set_index(['city', 'year'])
data = data[col_order]

In [17]:
# Write the consolidated table (indexed by city and year) to disk.
data.to_pickle("crime_data.pkl")