# What Dis

2010–2016 crime data for various cities throughout the United States, as curated by some random website.

[source](http://www.city-data.com/crime/)

# Imports and Definitions

In [1]:
import pandas as pd
import numpy as np

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
def to_val(cell):
    """Convert a scraped table cell to a float.

    The cell text is cut at the first '(' (dropping any parenthesised
    suffix), stripped of surrounding whitespace, and has ',' thousands
    separators removed before conversion.

    Parameters
    ----------
    cell : object exposing ``get_text()`` that returns a string
        (for example a BeautifulSoup tag).

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when the cell reads 'N/A' or is
        empty after cleaning.

    Raises
    ------
    ValueError
        If the cleaned text is neither a placeholder nor a valid number.
    """
    val = cell.get_text().split('(')[0].strip().replace(',', '')
    # float('') raises ValueError; treat an empty cell as missing, like 'N/A'.
    if val in ('', 'N/A'):
        return np.nan
    return float(val)

In [4]:
# Places to scrape: state name -> list of place names within that state.
places = {
    "Alabama": ["Birmingham", "Montgomery"],
    "Alaska": ["Anchorage", "Juneau"],
    "Arizona": ["Phoenix", "Tucson"],
    "Arkansas": ["Little Rock", "Fort Smith"],
    "California": ["San Diego", "Los Angeles", "Sacramento"],
    "Colorado": ["Denver", "Colorado Springs", "Aurora"],
    "Connecticut": ["Bridgeport", "Hartford"],
    "Delaware": ["Dover", "Wilmington"],
    "Florida": ["Jacksonville", "Miami", "Tallahassee"],
    "Georgia": ["Atlanta", "Savannah"],
    "Hawaii": ["Honolulu", "Kauai", "Maui"],
    "Idaho": ["Boise", "Meridian"],
    "Illinois": ["Chicago", "Springfield"],
    "Indiana": ["Fort Wayne", "Indianapolis"],
    "Iowa": ["Cedar Rapids", "Des Moines"],
    "Kansas": ["Topeka", "Wichita"],
    "Kentucky": ["Frankfort", "Louisville"],
    "Louisiana": ["Baton Rouge", "New Orleans"],
    "Maine": ["Augusta", "Portland"],
    "Maryland": ["Baltimore", "Annapolis"],
    "Massachusetts": ["Boston", "Worcester"],
    "Michigan": ["Detroit", "Lansing"],
    "Minnesota": ["Minneapolis", "St. Paul"],
    "Mississippi": ["Jackson", "Gulfport"],
    "Missouri": ["Kansas City", "Jefferson City"],
    "Montana": ["Billings", "Helena"],
    "Nebraska": ["Lincoln", "Omaha"],
    "Nevada": ["Carson City", "Las Vegas", "Reno"],
    "New Hampshire": ["Concord", "Manchester"],
    "New Jersey": ["Newark", "Trenton"],
    "New Mexico": ["Albuquerque", "Santa Fe"],
    "New York": ["Albany", "New York City"],
    "North Carolina": ["Charlotte", "Raleigh"],
    "North Dakota": ["Bismarck", "Fargo"],
    "Ohio": ["Columbus", "Cleveland"],
    "Oklahoma": ["Oklahoma City", "Tulsa"],
    "Oregon": ["Portland", "Salem"],
    "Pennsylvania": ["Harrisburg", "Philadelphia"],
    "Rhode Island": ["Providence", "Warwick"],
    "South Carolina": ["Charleston", "Columbia"],
    "South Dakota": ["Pierre", "Sioux Falls"],
    "Tennessee": ["Nashville", "Memphis"],
    "Texas": ["Austin", "Houston"],
    "Utah": ["Salt Lake City", "West Valley City"],
    "Vermont": ["Burlington", "Montpelier"],
    "Virginia": ["Richmond", "Virginia Beach"],
    "Washington": ["Olympia", "Seattle"],
    "West Virginia": ["Charleston", "Huntington"],
    "Wisconsin": ["Madison", "Milwaukee"],
    "Wyoming": ["Casper", "Cheyenne"],
}

In [5]:
# Little quirks: place names that must be swapped for a different name
# before being used to build the page URL.
corrections = dict([
    ("Boise", "Boise City"),
    ("New York City", "New York"),
])

# Scraping

In [6]:
# Scrape the crime table of every (state, city) pair in `places` and collect
# one DataFrame per city in `data`; `df` is the concatenation of all of them,
# indexed by (state, city, year).

# Seconds to wait on the server; without a timeout a stalled connection would
# hang the whole scrape indefinitely.
request_timeout = 30

# Crime categories, in the order their rows appear in the table body.
crime_columns = ['murder', 'rape', 'robbery', 'assault',
                 'burglary', 'theft', 'vehicle theft', 'arson']

data = []
with requests.Session() as session:
    for state, cities in places.items():
        for city in cities:
            # Apply the name correction (if any) before building the URL.
            city_url = "http://www.city-data.com/crime/crime-{city}-{state}.html" \
                       .format(city=corrections.get(city, city).replace(' ', '-'),
                               state=state.replace(' ', '-'))
            response = session.get(city_url, timeout=request_timeout)
            city_soup = BeautifulSoup(response.content, 'html.parser')
            if city_soup.find(id="errormsg"):
                print("unable to find {city}, {state}; skipping"
                      .format(city=city,
                              state=state))
                continue

            table = city_soup.find(id="crimeTab")
            head = table.find('thead') if table is not None else None
            body = table.find('tbody') if table is not None else None
            header_rows = head.find_all('tr') if head is not None else []
            # The last body row is a footer, not a crime category.
            crime_rows = body.find_all('tr')[:-1] if body is not None else []
            if not header_rows or len(crime_rows) < len(crime_columns):
                # Skip instead of crashing the whole run on one odd page.
                print("unexpected page layout for {city}, {state}; skipping"
                      .format(city=city,
                              state=state))
                continue

            # Year labels come from the last header row; the first cell of
            # that row is dropped.
            years = [cell.find('h4').get_text()
                     for cell
                     in header_rows[-1].find_all('th')][1:]

            city_columns = {'state': state, 'city': city, 'year': years}
            for column, row in zip(crime_columns, crime_rows):
                # The first cell of each body row is dropped; the remaining
                # cells hold the values.
                city_columns[column] = [to_val(cell)
                                        for cell
                                        in row.find_all('td')[1:]]
            data.append(pd.DataFrame(city_columns))

df = pd.concat(data, ignore_index=True).set_index(['state', 'city', 'year']).sort_index()

unable to find Honolulu, Hawaii; skipping
unable to find Kauai, Hawaii; skipping
unable to find Maui, Hawaii; skipping
unable to find Carson City, Nevada; skipping
unable to find Nashville, Tennessee; skipping
unable to find West Valley City, Utah; skipping


In [7]:
# Save the scraped frame as a pickle file (the path is relative to the
# current working directory).
df.to_pickle("citydata_crime_data.pkl")

In [8]:
# Preview only the first rows; rendering the whole frame bloats the notebook.
df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,arson,assault,burglary,murder,rape,robbery,theft,vehicle theft
state,city,year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alabama,Birmingham,2002,211.0,1697.0,4389.0,65.0,239.0,1186.0,11640.0,2049.0
Alabama,Birmingham,2003,175.0,1706.0,4831.0,85.0,204.0,1352.0,11934.0,2809.0
Alabama,Birmingham,2004,142.0,1593.0,5156.0,59.0,240.0,1369.0,11970.0,2351.0
Alabama,Birmingham,2005,136.0,1675.0,4933.0,104.0,241.0,1429.0,11962.0,2028.0
Alabama,Birmingham,2006,228.0,1422.0,4813.0,104.0,220.0,1429.0,12113.0,2081.0
Alabama,Birmingham,2007,221.0,1396.0,4864.0,86.0,229.0,1609.0,12528.0,2246.0
Alabama,Birmingham,2008,134.0,1456.0,5153.0,82.0,212.0,1499.0,12761.0,2140.0
Alabama,Birmingham,2009,135.0,1399.0,5019.0,65.0,198.0,1150.0,11546.0,1594.0
Alabama,Birmingham,2011,123.0,1916.0,5806.0,54.0,182.0,1011.0,10522.0,1513.0
Alabama,Birmingham,2012,117.0,2035.0,4704.0,67.0,152.0,983.0,9042.0,1042.0
