In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib, time, string, re

In [None]:
# Setup of Webdriver and Selenium
#
# Starts a Chrome session through a locally installed chromedriver binary.
# The resulting `driver` object is what the scraping cells use to load pages.

options = Options()
#options.headless = True
# Raw string literal: the Windows path contains "\P" and "\c", which are not
# valid escape sequences and raise a SyntaxWarning in an ordinary string.
service = Service(executable_path=r"C:\Program Files (x86)\chromedriver.exe")
driver = webdriver.Chrome(options=options, service=service)

In [None]:
# The City of Madison states that the data in the incident reports are not necessarily reflective of
#    total crime statistics, but are instead chosen at the Officer in Charge's discretion based on
#    which cases may be of public interest.

## TODO: write this to a file stored on my PC, and then comment out the scraper.
# Here we populate the dataset by scraping the incident table on each results page.
url = "https://www.cityofmadison.com/police/newsroom/incidentreports/"
# Number of result pages to scrape. Set to None to keep following "Next »" until the
#    link is no longer present (i.e. to collect the whole dataset).
MAX_PAGES = 5
driver.get(url)
data = []

pages_scraped = 0
while MAX_PAGES is None or pages_scraped < MAX_PAGES:
    main_content = driver.find_element("id", "main-content")
    #headers = main_content.find_elements("css selector", ".row.title.incident-reports") # Used for reference
    # Rows carry two classes ("row" and "incident-reports"). A compound class is a CSS
    #    selector, not a single class name, so it is matched with the CSS strategy explicitly.
    rows = main_content.find_elements("css selector", ".row.incident-reports")[1:] # Slicing off header row
    for row in rows:
        date = row.find_element("class name", "date")
        incident = row.find_element("class name", "agency")
        casenumber = row.find_element("class name", "casenumber")
        address = row.find_element("class name", "address")
        officer = row.find_element("class name", "releasedby")
        updated = row.find_element("class name", "updated")
        # Link to the full report; the description is pulled from it in a later cell.
        link = row.find_element("tag name", "a")
        # Field order here must match the `columns` list passed to pd.DataFrame below.
        data.append([date.text, incident.text, address.text, officer.text,
                     casenumber.text, updated.text, link.get_attribute("href")])
    pages_scraped += 1
    try:
        next_button = main_content.find_element("link text", "Next »")
        next_button.click()
    except NoSuchElementException:
        # No "Next »" link on this page: stop paginating.
        break


df = pd.DataFrame(data, columns=["date", "incident","address","officer","casenumber","updated","link"])

# NOTE(review): "/data.csv" is an absolute path at the filesystem (or drive) root, and this
#    snapshot is written before the "description" column is added -- confirm both are intended.
df.to_csv("/data.csv")

In [None]:
# Here we visit each of the linked full incident reports in order to
#      pull the description and add it to the dataframe.

description_list = []
for link in df["link"]:
    driver.get(link)
    # The description is read from the last element with class "span5" on the report page.
    span5_elements = driver.find_elements("class name", "span5")
    # A page without any "span5" element would make [-1] raise IndexError and abort the
    #    whole crawl; record a missing value for that row instead so it shows up in the
    #    missing-data check.
    description_list.append(span5_elements[-1].text if span5_elements else None)
df["description"] = description_list

In [None]:
# First, let us take a quick look at our dataset, keyed by case number.
# Only the last expression of a cell is displayed, so head() has to come last.
# set_index returns a new frame, so `df` itself keeps its original index and columns.
df.set_index("casenumber").head()

In [None]:
df.iloc[4, :] # inspect every column of the fifth row (position 4)

In [None]:
df.isna().sum() # per-column count of missing values

In [None]:
# Plotting the most common incidents using Seaborn
# TODO: could we exclude the "information" category from this graph?
incident_counts = df["incident"].value_counts().head(10) # value_counts sorts descending, so these are the top 10
fig, ax = plt.subplots()
sns.barplot(y=incident_counts.index, x=incident_counts.values,
            width=0.95, alpha=0.75, orient="h", ax=ax)
# Label through the Axes object so the text is tied to this figure, not pyplot's "current" one.
ax.set_title("Top 10 Most Common Incidents")
ax.set_xlabel("Number of Occurrences")