# Gun Violence Archive: Mass shooting database

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import altair as alt
import datetime as dt
import numpy as np
import altair as alt
import altair_stiles as altstiles
import requests
from bs4 import BeautifulSoup

In [3]:
# Register the custom "stiles" chart theme supplied by the altair_stiles package,
# then make it the active theme for every chart rendered in this notebook.
# NOTE(review): newer Altair releases may expose themes under a different
# namespace than `alt.themes` — confirm against the pinned Altair version.
alt.themes.register("stiles", altstiles.theme)
alt.themes.enable("stiles")

ThemeRegistry.enable('stiles')

In [4]:
# Loosen the pandas display limits so wide frames and long address strings
# are shown in full when a cell echoes a DataFrame.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)  # None = never truncate cell text

In [5]:
# Today's date as an ISO "YYYY-MM-DD" string.
today = dt.date.today().isoformat()

---

#### Get the history provided by Gun Violence Archive

In [6]:
# Load the previously saved historical export. Incident IDs are read as
# strings and the incident date is parsed into datetimes.
archive = pd.read_csv(
    "data/raw/gun-violence-archive_mass_shootings_2022_05_26.csv",
    parse_dates=["Incident Date"],
    dtype={"Incident ID": str},
)

In [7]:
# Echo the historical archive frame to check its columns and the range of rows loaded.
archive

Unnamed: 0,Incident ID,Incident Date,State,City Or County,Address,# Killed,# Injured,total
0,946496,2017-10-01,Nevada,Las Vegas,3950 Las Vegas Blvd S,59,441,500
1,577157,2016-06-12,Florida,Orlando,1912 S Orange Ave,50,53,103
2,980577,2017-11-05,Texas,Sutherland Springs,216 4th St,27,20,47
3,1466705,2019-08-03,Texas,El Paso,7101 Gateway Blvd,23,23,46
4,456893,2015-12-02,California,San Bernardino,1365 South Waterman Ave,16,19,35
...,...,...,...,...,...,...,...,...
3856,491674,2013-01-23,Tennessee,Chattanooga,1501 Dodds Ave,1,3,4
3857,479389,2013-01-21,California,Brentwood,1100 block of Breton Drive,0,4,4
3858,478948,2013-01-07,Oklahoma,Tulsa,6000 block of South Owasso,4,0,4
3859,460726,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,4


#### Download the latest export from the [Gun Violence Archive mass shooting page](https://www.gunviolencearchive.org/mass-shooting) and save it in `data/raw/` with the download date in the filename

In [8]:
# Load the freshly downloaded export with the same parsing rules as the
# archive, then discard the "Operations" column.
latest = pd.read_csv(
    "data/raw/gun-violence-archive_2022-07-05.csv",
    parse_dates=["Incident Date"],
    dtype={"Incident ID": str},
).drop(columns="Operations")

In [9]:
# Total victims per incident = killed + injured (the archive file already has this column).
latest["total"] = latest["# Killed"].add(latest["# Injured"])

In [10]:
# Stack the archive and the latest download. When an incident ID appears in
# both, keep="last" retains the row from `latest` (it is concatenated second).
src = pd.concat([archive, latest]).drop_duplicates(subset=["Incident ID"], keep="last")
# Newest incidents first.
src = src.sort_values(by="Incident Date", ascending=False)

#### Clean up the column headers

In [11]:
# Normalise headers to snake_case: spaces -> underscores, "#" -> "no", lower-cased
# (e.g. "# Killed" becomes "no_killed").
src.columns = [
    header.replace(" ", "_").replace("#", "no").lower() for header in src.columns
]

In [12]:
# Derive the calendar year of each incident from its parsed date.
src = src.assign(year=src["incident_date"].dt.year)

### AP states

In [13]:
# Lookup from full state name to its AP-style abbreviation.
# States that AP never abbreviates (Alaska, Hawaii, Idaho, Iowa, Maine, Ohio,
# Texas, Utah) map to their full names.
#
# Fixes relative to the previous version of this table:
#   * Maine / Maryland / Massachusetts had their values rotated
#     ("Md.", "Mass.", "Maine"); they are now "Maine", "Md.", "Mass.".
#   * Wyoming was missing its trailing period ("Wyo" -> "Wyo.").
#   * District of Columbia is included so DC rows no longer map to NaN
#     (the postal lookup used alongside this table already covers DC).
ap_states = {
    "Alabama": "Ala.",
    "Alaska": "Alaska",
    "Arizona": "Ariz.",
    "Arkansas": "Ark.",
    "California": "Calif.",
    "Colorado": "Colo.",
    "Connecticut": "Conn.",
    "Delaware": "Del.",
    "District of Columbia": "D.C.",
    "Florida": "Fla.",
    "Georgia": "Ga.",
    "Hawaii": "Hawaii",
    "Idaho": "Idaho",
    "Illinois": "Ill.",
    "Indiana": "Ind.",
    "Iowa": "Iowa",
    "Kansas": "Kan.",
    "Kentucky": "Ky.",
    "Louisiana": "La.",
    "Maine": "Maine",
    "Maryland": "Md.",
    "Massachusetts": "Mass.",
    "Michigan": "Mich.",
    "Minnesota": "Minn.",
    "Mississippi": "Miss.",
    "Missouri": "Mo.",
    "Montana": "Mont.",
    "Nebraska": "Neb.",
    "Nevada": "Nev.",
    "New Hampshire": "N.H.",
    "New Jersey": "N.J.",
    "New Mexico": "N.M.",
    "New York": "N.Y.",
    "North Carolina": "N.C.",
    "North Dakota": "N.D.",
    "Ohio": "Ohio",
    "Oklahoma": "Okla.",
    "Oregon": "Ore.",
    "Pennsylvania": "Pa.",
    "Rhode Island": "R.I.",
    "South Carolina": "S.C.",
    "South Dakota": "S.D.",
    "Tennessee": "Tenn.",
    "Texas": "Texas",
    "Utah": "Utah",
    "Vermont": "Vt.",
    "Virginia": "Va.",
    "Washington": "Wash.",
    "West Virginia": "W.Va.",
    "Wisconsin": "Wis.",
    "Wyoming": "Wyo.",
}

In [14]:
# Attach the AP-style abbreviation for each row's state; names absent from
# the lookup come through as NaN.
src = src.assign(ap_state=src["state"].map(ap_states))

----

In [15]:
# Working copy of the combined data, newest incidents first, so later
# column additions do not touch `src`.
df = src.copy().sort_values(by="incident_date", ascending=False)

In [16]:
# Lookup from full state/territory name to its two-letter postal code,
# built from (name, code) pairs in listing order.
names_to_postal = dict(
    [
        # The 50 states
        ("Alabama", "AL"),
        ("Alaska", "AK"),
        ("Arizona", "AZ"),
        ("Arkansas", "AR"),
        ("California", "CA"),
        ("Colorado", "CO"),
        ("Connecticut", "CT"),
        ("Delaware", "DE"),
        ("Florida", "FL"),
        ("Georgia", "GA"),
        ("Hawaii", "HI"),
        ("Idaho", "ID"),
        ("Illinois", "IL"),
        ("Indiana", "IN"),
        ("Iowa", "IA"),
        ("Kansas", "KS"),
        ("Kentucky", "KY"),
        ("Louisiana", "LA"),
        ("Maine", "ME"),
        ("Maryland", "MD"),
        ("Massachusetts", "MA"),
        ("Michigan", "MI"),
        ("Minnesota", "MN"),
        ("Mississippi", "MS"),
        ("Missouri", "MO"),
        ("Montana", "MT"),
        ("Nebraska", "NE"),
        ("Nevada", "NV"),
        ("New Hampshire", "NH"),
        ("New Jersey", "NJ"),
        ("New Mexico", "NM"),
        ("New York", "NY"),
        ("North Carolina", "NC"),
        ("North Dakota", "ND"),
        ("Ohio", "OH"),
        ("Oklahoma", "OK"),
        ("Oregon", "OR"),
        ("Pennsylvania", "PA"),
        ("Rhode Island", "RI"),
        ("South Carolina", "SC"),
        ("South Dakota", "SD"),
        ("Tennessee", "TN"),
        ("Texas", "TX"),
        ("Utah", "UT"),
        ("Vermont", "VT"),
        ("Virginia", "VA"),
        ("Washington", "WA"),
        ("West Virginia", "WV"),
        ("Wisconsin", "WI"),
        ("Wyoming", "WY"),
        # Territories and the federal district
        ("American Samoa", "AS"),
        ("Guam", "GU"),
        ("Northern Mariana Islands", "MP"),
        ("Puerto Rico", "PR"),
        ("Virgin Islands", "VI"),
        ("District of Columbia", "DC"),
        # NOTE(review): presumably carried over from a historical list —
        # verify whether these entries are ever needed for this dataset.
        ("Dakota", "DK"),
        ("Orleans", "OL"),
        ("Philippine Islands", "PI"),
    ]
)

In [17]:
# Attach the two-letter postal code for each row's state; names absent from
# the lookup come through as NaN.
df = df.assign(postal_state=df["state"].map(names_to_postal))

In [18]:
# Build one "street, city, ST" string per incident, then drop the
# " block of " wording (e.g. "4600 block of Colorado Ave" -> "4600 Colorado Ave").
# A missing value in any component leaves the combined address missing.
df["full_address"] = (
    df["address"] + ", " + df["city_or_county"] + ", " + df["postal_state"]
).str.replace(" block of ", " ", regex=False)

---

In [30]:
# Number of incidents per calendar year, most frequent year first
# (`year` is derived from `incident_date`).
# NOTE(review): the saved output lists a handful of 2012 rows — confirm
# whether those dates are expected in the source data.
df["year"].value_counts()

2021    692
2020    610
2019    417
2016    382
2017    347
2018    336
2015    336
2022    314
2014    272
2013    254
2012      4
Name: year, dtype: int64

In [29]:
# How many incidents are recorded for Colorado?
int((df["state"] == "Colorado").sum())

56

In [24]:
# Show the Colorado incidents themselves.
df[df["state"] == "Colorado"]

Unnamed: 0,incident_id,incident_date,state,city_or_county,address,no_killed,no_injured,total,year,ap_state,postal_state,full_address
53,2328342,2022-06-12,Colorado,Denver,4600 block of Colorado Ave,2,4,6,2022,Colo.,CO,"4600 Colorado Ave, Denver, CO"
95,2315190,2022-05-28,Colorado,Colorado Springs,3800 block of E Pikes Pike Ave,1,3,4,2022,Colo.,CO,"3800 E Pikes Pike Ave, Colorado Springs, CO"
199,2269059,2022-04-02,Colorado,Colorado Springs,2400 block of Barkman Dr,0,4,4,2022,Colo.,CO,"2400 Barkman Dr, Colorado Springs, CO"
204,2263608,2022-03-25,Colorado,Colorado Springs,820 Citadel Dr E,2,2,4,2022,Colo.,CO,"820 Citadel Dr E, Colorado Springs, CO"
231,2250540,2022-03-09,Colorado,Aurora,3300 N Ouray St,1,4,5,2022,Colo.,CO,"3300 N Ouray St, Aurora, CO"
302,2207406,2022-01-09,Colorado,Colorado Springs,1960 Chelton Rd,2,3,5,2022,Colo.,CO,"1960 Chelton Rd, Colorado Springs, CO"
313,2201677,2022-01-01,Colorado,Denver,1919 Blake St,2,2,4,2022,Colo.,CO,"1919 Blake St, Denver, CO"
319,2198521,2021-12-27,Colorado,Denver,56 Broadway,6,2,8,2021,Colo.,CO,"56 Broadway, Denver, CO"
359,2176653,2021-11-28,Colorado,Aurora,1500 block of Dayton St,0,5,5,2021,Colo.,CO,"1500 Dayton St, Aurora, CO"
377,2167297,2021-11-15,Colorado,Aurora,1200 Nome St,0,6,6,2021,Colo.,CO,"1200 Nome St, Aurora, CO"


---

#### Only the cases where there was at least one death

In [None]:
# Keep only incidents with at least one person killed; copy so the subset
# is independent of `df`.
killed_df = df.loc[df["no_killed"].gt(0)].copy()

In [None]:
# Preview the first ten fatal incidents.
killed_df.iloc[:10]

#### Mass killings (as defined by the [FBI](https://www.fbi.gov/about/partnerships/office-of-partner-engagement/active-shooter-resources); filtered below as three or more people killed)

In [None]:
# Count incidents in which three or more people were killed.
int(killed_df["no_killed"].ge(3).sum())

In [None]:
# Subset of fatal incidents with three or more people killed, then a
# five-row preview.
mass_killings = killed_df.loc[killed_df["no_killed"].ge(3)]
mass_killings.iloc[:5]

---

## Export

In [None]:
# Write every incident with at least one death; the index is not exported.
killed_df.to_csv(
    path_or_buf="data/processed/mass_shootings_involving_deaths.csv",
    index=False,
)

In [None]:
# Write a 20-row sample of the fatal-incident table; the index is not exported.
killed_df.iloc[:20].to_csv(
    path_or_buf="data/processed/mass_shootings_involving_deaths_sample.csv",
    index=False,
)

In [None]:
# Write the incidents with three or more people killed; the index is not exported.
mass_killings.to_csv(
    path_or_buf="data/processed/mass_shootings_mass_killings.csv",
    index=False,
)