# Data Collection
Data collected from Stanford's Open Policing database.  
(https://openpolicing.stanford.edu/data/)

In [26]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from io import BytesIO
from zipfile import ZipFile
import requests
from urllib.request import urlopen

## Getting file links
First, I'll scrape the URLs of all zipped CSV files from the website.

In [27]:
# Fetch the data index page. The timeout keeps the notebook from hanging
# indefinitely if the server stops responding.
html_page = requests.get('https://openpolicing.stanford.edu/data/', timeout=60)
# Fail loudly on an HTTP error instead of silently parsing an error page
# (which would just produce an empty list of links).
html_page.raise_for_status()
soup = BeautifulSoup(html_page.content, 'html.parser')
# Select the anchors whose title marks them as CSV downloads.
# find_all is the current spelling of the legacy findAll alias.
file_urls = soup.find_all('a', title='Download data as CSV')
# Skip anchors without an href so no None value reaches the download step.
links = [link.get('href') for link in file_urls if link.get('href')]

## Download and compile
I'll iterate through each zip file: download it, unzip it, open the CSV, and keep only the data from 2015, sampling at most 5,000 rows per file (to keep the data at a manageable size).

In [30]:
# Per-file frames are collected here and concatenated once after the loop;
# concatenating inside the loop would re-copy every accumulated row on each
# iteration.
frames = []

for link in links:
    # download the whole archive into memory; the context manager closes the
    # HTTP response even if the read fails
    with urlopen(link) as resp:
        archive_bytes = BytesIO(resp.read())
    # the archive and its member are closed even if parsing raises
    with ZipFile(archive_bytes) as zipfile:
        # the first archive member is taken to be the csv file
        fname = zipfile.namelist()[0]
        with zipfile.open(fname) as csv_file:
            # convert to pandas dataframe; dtype=object keeps raw values as-is
            df = pd.read_csv(csv_file, dtype=object)
    # convert date string to datetime object
    df['date'] = pd.to_datetime(df['date'])
    # keep calendar year 2015 only. The lower bound is inclusive so rows dated
    # exactly 2015-01-01 are kept; .copy() makes the result an independent
    # frame so the column assignment below does not write into a slice.
    in_2015 = (df['date'] >= '2015-01-01') & (df['date'] < '2016-01-01')
    df = df[in_2015].copy()
    # then only sample 5000 (random, unseeded) rows from large files
    if len(df) >= 5000:
        df = df.sample(5000)
    # add fname to use later to grab location info
    df['fname'] = fname
    frames.append(df)
    # let me know where you are
    print(f'{fname} Completed')

# pd.concat raises on an empty list, so fall back to an empty frame when no
# links were found.
if frames:
    full_df = pd.concat(frames, ignore_index=True, sort=False)
else:
    full_df = pd.DataFrame()

ar_little_rock_2020_04_01.csv Completed
az_gilbert_2020_04_01.csv Completed
az_mesa_2020_04_01.csv Completed
az_statewide_2020_04_01.csv Completed
ca_anaheim_2020_04_01.csv Completed
ca_bakersfield_2020_04_01.csv Completed
ca_long_beach_2020_04_01.csv Completed
ca_los_angeles_2020_04_01.csv Completed
ca_oakland_2020_04_01.csv Completed
ca_san_bernardino_2020_04_01.csv Completed
ca_san_diego_2020_04_01.csv Completed
ca_san_francisco_2020_04_01.csv Completed
ca_san_jose_2020_04_01.csv Completed
ca_santa_ana_2020_04_01.csv Completed
ca_statewide_2020_04_01.csv Completed
ca_stockton_2020_04_01.csv Completed
co_aurora_2020_04_01.csv Completed
co_denver_2020_04_01.csv Completed
co_statewide_2020_04_01.csv Completed
ct_hartford_2020_04_01.csv Completed
ct_statewide_2020_04_01.csv Completed
fl_saint_petersburg_2020_04_01.csv Completed
fl_statewide_2020_04_01.csv Completed
fl_tampa_2020_04_01.csv Completed
ga_statewide_2020_04_01.csv Completed
ia_statewide_2020_04_01.csv Completed
id_idaho_fall

## Export Dataset

In [None]:
#full_df.to_csv('DATA/full_df.csv')