In [2]:
import requests
import glob
import pandas as pd
from bs4 import BeautifulSoup

## 2019

In [9]:
root = "https://ucr.fbi.gov/crime-in-the-u.s/2019/crime-in-the-u.s.-2019/tables/table-10/table-10-state-cuts"
div_class = "contenttype-excelfile state-published url"

res = requests.get(root)
soup = BeautifulSoup(res.text, "html.parser")
layer1 = soup.find_all("a", div_class)

with open("data/crime/2019/links.txt", "w") as f:
    for state_link in layer1:
        f.write(state_link["href"]+"/output.xls\n")

In [10]:
data = list()
for state_file in glob.glob("data/crime/2019/states/*.xls"):
    df = pd.read_excel(state_file, skiprows=5, skipfooter=2).iloc[:, 1:]
    df["State"] = state_file.split("/")[-1].replace(".xls", "")
    data.append(df)

df = pd.concat(data, axis=0)

df["Rape"] = df.filter(like="Rape", axis=1).fillna(0).sum(axis=1)
df["Arson"] = df.filter(like="Arson", axis=1).fillna(0).sum(axis=1)

for col in df.filter(regex="Rape\d").columns.tolist() + df.filter(regex="Arson\d").columns.tolist():
    del df[col]

df

Unnamed: 0,County,Violent\ncrime,Murder and\nnonnegligent\nmanslaughter,Robbery,Aggravated\nassault,Property\ncrime,Burglary,Larceny-\ntheft,Motor\nvehicle\ntheft,Arson,State,Rape
0,Butler,49.0,0.0,1.0,36.0,301.0,61.0,201.0,39.0,3.0,kansas,12.0
1,Doniphan,1.0,0.0,0.0,1.0,8.0,3.0,4.0,1.0,0.0,kansas,0.0
2,Geary,15.0,1.0,0.0,13.0,32.0,8.0,19.0,5.0,0.0,kansas,1.0
3,Harvey,11.0,0.0,1.0,10.0,24.0,12.0,11.0,1.0,0.0,kansas,0.0
4,Jackson,15.0,0.0,0.0,12.0,122.0,17.0,96.0,9.0,1.0,kansas,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...
48,Stanley,0.0,0.0,0.0,0.0,10.0,3.0,6.0,1.0,0.0,south_dakota,0.0
49,Sully,4.0,0.0,0.0,4.0,9.0,7.0,0.0,2.0,2.0,south_dakota,0.0
50,Tripp,2.0,0.0,0.0,1.0,4.0,0.0,3.0,1.0,0.0,south_dakota,1.0
51,Walworth,0.0,0.0,0.0,0.0,5.0,0.0,5.0,0.0,0.0,south_dakota,0.0


In [11]:
df.to_csv("data/crime/2019/crime2019.csv")

## 2013

In [5]:
root = "https://ucr.fbi.gov/crime-in-the-u.s/2013/crime-in-the-u.s.-2013/tables/table-10/table_10_offenses_known_to_law_enforcement_by_state_by_metropolitan_and_nonmetropolitan_counties_2013.xls/view"
div_class = "arrow-left-large"

res = requests.get(root)
soup = BeautifulSoup(res.text, "html.parser")
layer1 = soup.find_all("a", div_class)

with open("data/crime/2013/links.txt", "w") as f:
    for state_link in layer1:
        f.write(state_link["href"]+"/output.xls\n")

In [6]:
data = list()
for state_file in glob.glob("data/crime/2013/states/*.xls"):
    df = pd.read_excel(state_file, skiprows=5, skipfooter=2).iloc[:, 1:]
    df["State"] = (
        state_file
            .split("/")[-1]
            .replace("_by", "")
            .replace("_metropolitan", "")
            .replace(".xls", "")
    )
    data.append(df)

df = pd.concat(data, axis=0)

df["Arson"] = df.filter(like="Arson", axis=1).fillna(0).sum(axis=1)

for col in df.filter(regex="Arson\d").columns.tolist():
    del df[col]
    
del df["Rape\n(legacy definition)2"]

df

Unnamed: 0,County,Violent \ncrime,Murder and \nnonnegligent \nmanslaughter,Rape\n(revised definition)1,Robbery,Aggravated \nassault,Property \ncrime,Burglary,Larceny-\ntheft,Motor \nvehicle \ntheft,Arson,State
0,Asotin,25.0,0.0,2.0,3.0,20.0,187.0,46.0,131.0,10.0,0.0,washington
1,Benton,53.0,2.0,5.0,4.0,42.0,490.0,153.0,290.0,47.0,2.0,washington
2,Chelan,42.0,0.0,4.0,3.0,35.0,486.0,147.0,307.0,32.0,0.0,washington
3,Clark,245.0,1.0,26.0,39.0,179.0,3390.0,910.0,2040.0,440.0,33.0,washington
4,Columbia,3.0,0.0,1.0,0.0,2.0,213.0,54.0,156.0,3.0,0.0,washington
...,...,...,...,...,...,...,...,...,...,...,...,...
42,Stanley,0.0,0.0,0.0,0.0,0.0,9.0,9.0,0.0,0.0,0.0,south_dakota
43,Sully,0.0,0.0,0.0,0.0,0.0,4.0,2.0,1.0,1.0,0.0,south_dakota
44,Walworth,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,south_dakota
45,Yankton,0.0,0.0,0.0,0.0,0.0,69.0,29.0,37.0,3.0,0.0,south_dakota


In [7]:
df.to_csv("data/crime/2013/crime2013.csv")

## 2016

In [9]:
root = "https://ucr.fbi.gov/crime-in-the-u.s/2016/crime-in-the-u.s.-2016/tables/table-8/table-8.xls/view"
div_class = "arrow-left-large"

res = requests.get(root)
soup = BeautifulSoup(res.text, "html.parser")
layer1 = soup.find_all("a", div_class)

with open("data/crime/2016/links.txt", "w") as f:
    for state_link in layer1:
        f.write(state_link["href"]+"/output.xls\n")

In [16]:
data = list()
for state_file in glob.glob("data/crime/2016/states/*.xls"):
    df = pd.read_excel(state_file, skiprows=5, skipfooter=2).iloc[:, 1:]
    df["State"] = (
        state_file
            .split("/")[-1]
            .replace("_by", "")
            .replace("_metropolitan", "")
            .replace(".xls", "")
    )
    data.append(df)

df = pd.concat(data, axis=0)

df["Arson"] = df.filter(like="Arson", axis=1).fillna(0).sum(axis=1)

for col in df.filter(regex="Arson\d").columns.tolist():
    del df[col]
    
del df["Rape\n(legacy\ndefinition)2"]

df

Unnamed: 0,County,Violent\ncrime,Murder and\nnonnegligent\nmanslaughter,Rape\n(revised\ndefinition)1,Robbery,Aggravated\nassault,Property\ncrime,Burglary,Larceny-\ntheft,Motor\nvehicle\ntheft,State,Arson
0,Autauga,73.0,1.0,12.0,8.0,52.0,429.0,146.0,233.0,50.0,alabama,0.0
1,Baldwin,127.0,1.0,5.0,23.0,98.0,613.0,229.0,342.0,42.0,alabama,0.0
2,Bibb,0.0,0.0,0.0,0.0,0.0,37.0,20.0,14.0,3.0,alabama,0.0
3,Blount,394.0,1.0,17.0,9.0,367.0,867.0,261.0,501.0,105.0,alabama,0.0
4,Calhoun,23.0,0.0,7.0,5.0,11.0,319.0,137.0,181.0,1.0,alabama,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
35,Pacific,25.0,1.0,4.0,1.0,19.0,201.0,74.0,117.0,10.0,washington,0.0
36,San Juan,10.0,1.0,1.0,1.0,7.0,177.0,35.0,136.0,6.0,washington,3.0
37,Wahkiakum,9.0,0.0,2.0,0.0,7.0,42.0,14.0,26.0,2.0,washington,0.0
38,Whitman,19.0,0.0,8.0,1.0,10.0,94.0,55.0,28.0,11.0,washington,1.0


In [17]:
df.to_csv("data/crime/2016/crime2016.csv")