# Gun Violence Archive: Mass shooting database

### Import Python tools and Jupyter configuration

In [1]:
# Load the lab_black code-formatting extension for this notebook session.
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import altair as alt
import datetime as dt
import numpy as np
import altair as alt
import altair_stiles as altstiles
import requests
from bs4 import BeautifulSoup



In [3]:
# Register the altair_stiles theme under the name "stiles".
alt.themes.register("stiles", altstiles.theme)
# NOTE(review): the theme enabled here is "grid", not the "stiles" name
# registered on the line above — confirm which theme is intended.
alt.themes.enable("grid")

ThemeRegistry.enable('grid')

In [4]:
# Loosen pandas' display limits so wide frames and long text cells are not
# truncated when rendered in the notebook.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

In [5]:
# Today's date as an ISO-8601 string (YYYY-MM-DD).
today = dt.date.today().isoformat()

---

#### Get the history provided by Gun Violence Archive

In [6]:
# Previously saved archive of mass-shooting incidents. Incident IDs are read
# as strings and the incident date is parsed to datetime.
archive_path = "data/raw/gun-violence-archive_mass_shootings_2022_05_26.csv"
archive = pd.read_csv(
    archive_path,
    parse_dates=["Incident Date"],
    dtype={"Incident ID": str},
)

In [7]:
# Display the archive frame (pandas truncates the rendering for long frames).
archive

Unnamed: 0,Incident ID,Incident Date,State,City Or County,Address,# Killed,# Injured,total
0,946496,2017-10-01,Nevada,Las Vegas,3950 Las Vegas Blvd S,59,441,500
1,577157,2016-06-12,Florida,Orlando,1912 S Orange Ave,50,53,103
2,980577,2017-11-05,Texas,Sutherland Springs,216 4th St,27,20,47
3,1466705,2019-08-03,Texas,El Paso,7101 Gateway Blvd,23,23,46
4,456893,2015-12-02,California,San Bernardino,1365 South Waterman Ave,16,19,35
...,...,...,...,...,...,...,...,...
3856,491674,2013-01-23,Tennessee,Chattanooga,1501 Dodds Ave,1,3,4
3857,479389,2013-01-21,California,Brentwood,1100 block of Breton Drive,0,4,4
3858,478948,2013-01-07,Oklahoma,Tulsa,6000 block of South Owasso,4,0,4
3859,460726,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,4


#### Download the latest export from the [Gun Violence Archive](https://www.gunviolencearchive.org/mass-shooting) and name the file by its download date.

In [8]:
# Most recent download; same parsing as the archive, minus the "Operations"
# column, which is dropped right after loading.
latest_path = "data/raw/gun-violence-archive_2022-07-05.csv"
latest_raw = pd.read_csv(
    latest_path,
    parse_dates=["Incident Date"],
    dtype={"Incident ID": str},
)
latest = latest_raw.drop(columns=["Operations"])

In [9]:
# Total victims per incident = killed + injured.
latest["total"] = latest["# Killed"].add(latest["# Injured"])

In [10]:
# Stack the archive and the latest download, keep one row per incident
# (the later row wins, i.e. the one from `latest` when both have it),
# then order newest first.
combined = pd.concat([archive, latest])
deduped = combined.drop_duplicates(subset="Incident ID", keep="last")
src = deduped.sort_values("Incident Date", ascending=False)

#### Clean up the column headers

In [11]:
# Normalize headers to snake_case: spaces -> underscores, "#" -> "no",
# everything lower-cased (e.g. "# Killed" -> "no_killed").
src.columns = [
    header.replace(" ", "_").replace("#", "no").lower() for header in src.columns
]

In [12]:
# Calendar year of each incident, taken from the parsed incident date.
src = src.assign(year=src["incident_date"].dt.year)

### AP states

In [13]:
# Full state name -> AP style abbreviation.
# Per AP style, eight states are never abbreviated (Alaska, Hawaii, Idaho,
# Iowa, Maine, Ohio, Texas, Utah) and map to themselves.
ap_states = {
    "Alabama": "Ala.",
    "Alaska": "Alaska",
    "Arizona": "Ariz.",
    "Arkansas": "Ark.",
    "California": "Calif.",
    "Colorado": "Colo.",
    "Connecticut": "Conn.",
    "Delaware": "Del.",
    # Included so District of Columbia rows do not map to NaN; the postal
    # lookup used elsewhere in this notebook covers D.C. as well.
    "District of Columbia": "D.C.",
    "Florida": "Fla.",
    "Georgia": "Ga.",
    "Hawaii": "Hawaii",
    "Idaho": "Idaho",
    "Illinois": "Ill.",
    "Indiana": "Ind.",
    "Iowa": "Iowa",
    "Kansas": "Kan.",
    "Kentucky": "Ky.",
    "Louisiana": "La.",
    "Maine": "Maine",
    "Maryland": "Md.",
    "Massachusetts": "Mass.",
    "Michigan": "Mich.",
    "Minnesota": "Minn.",
    "Mississippi": "Miss.",
    "Missouri": "Mo.",
    "Montana": "Mont.",
    "Nebraska": "Neb.",
    "Nevada": "Nev.",
    "New Hampshire": "N.H.",
    "New Jersey": "N.J.",
    "New Mexico": "N.M.",
    "New York": "N.Y.",
    "North Carolina": "N.C.",
    "North Dakota": "N.D.",
    "Ohio": "Ohio",
    "Oklahoma": "Okla.",
    "Oregon": "Ore.",
    "Pennsylvania": "Pa.",
    "Rhode Island": "R.I.",
    "South Carolina": "S.C.",
    "South Dakota": "S.D.",
    "Tennessee": "Tenn.",
    "Texas": "Texas",
    "Utah": "Utah",
    "Vermont": "Vt.",
    "Virginia": "Va.",
    "Washington": "Wash.",
    "West Virginia": "W.Va.",
    "Wisconsin": "Wis.",
    "Wyoming": "Wyo.",
}

In [14]:
# Look up the AP abbreviation for each row's state; names missing from
# `ap_states` come back as NaN.
src = src.assign(ap_state=src["state"].map(ap_states))

----

In [15]:
# Working copy for the analysis below, newest incidents first.
newest_first = src.sort_values(by="incident_date", ascending=False)
df = newest_first.copy()

In [16]:
# Full state/territory name -> two-letter code, used to build `postal_state`
# and the geocodable `full_address` column.
names_to_postal = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    # U.S. territories and the District of Columbia.
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "Virgin Islands": "VI",
    "District of Columbia": "DC",
    # NOTE(review): the next three are not current states or territories —
    # presumably carried over from a generic lookup table; confirm they are needed.
    "Dakota": "DK",
    "Orleans": "OL",
    "Philippine Islands": "PI",
}

In [17]:
# Two-letter code for each row's state; names missing from
# `names_to_postal` come back as NaN.
df = df.assign(postal_state=df["state"].map(names_to_postal))

In [18]:
# Build "address, city, ST" and strip the " block of " phrasing so that
# "6300 block of 25th Ave" reads as "6300 25th Ave". A missing value in any
# of the three parts makes the whole address NaN.
street_city_state = (
    df["address"] + ", " + df["city_or_county"] + ", " + df["postal_state"]
)
df["full_address"] = street_city_state.str.replace(" block of ", " ", regex=False)

---

## July 4th weekend? 

#### How many mass shootings? 

In [19]:
# Number of incidents dated on or after July 2, 2022.
# NOTE(review): the filter has no upper bound, so any rows dated after
# July 4 are counted too — confirm that is intended.
on_or_after_july_2 = df["incident_date"] >= "2022-07-02"
df.loc[on_or_after_july_2].shape[0]

14

#### Killed in mass shootings? 

In [20]:
# People killed in incidents dated on or after July 2, 2022.
# NOTE(review): the filter has no upper bound — confirm that is intended.
df.loc[df["incident_date"] >= "2022-07-02", "no_killed"].sum()

15

#### Injured? 

In [21]:
# People injured in incidents dated on or after July 2, 2022.
# NOTE(review): the filter has no upper bound — confirm that is intended.
df.loc[df["incident_date"] >= "2022-07-02", "no_injured"].sum()

87

---

## Every July 4th (since 2013)

#### How many years of data? 

In [40]:
# Number of distinct years that have at least one July 4 incident.
# Computed straight from `df`: the `fourth` frame is only created in a later
# cell, so referencing it here fails on a fresh Restart & Run All.
is_july_fourth = df["incident_date"].dt.strftime("%m-%d") == "07-04"
len(df.loc[is_july_fourth, "year"].unique())

10

#### Just the fourth

In [22]:
# Incidents that happened on July 4 of any year.
incident_dates = df["incident_date"].dt
fourth = df[(incident_dates.month == 7) & (incident_dates.day == 4)]

#### How many mass shootings?

In [23]:
# Row count of the July 4 subset.
fourth.shape[0]

48

#### Killed

In [24]:
# Total people killed across all July 4 incidents.
fourth["no_killed"].sum()

37

#### Injured

In [25]:
# Total people injured across all July 4 incidents.
fourth["no_injured"].sum()

228

---

#### Just in cases where there was a death

In [26]:
# Independent copy of the incidents with at least one death.
killed_df = df.query("no_killed > 0").copy()

In [27]:
# Preview the ten most recent incidents with at least one death
# (df was sorted newest-first before filtering).
killed_df.head(10)

Unnamed: 0,incident_id,incident_date,state,city_or_county,address,no_killed,no_injured,total,year,ap_state,postal_state,full_address
0,2349746,2022-07-05,Wisconsin,Kenosha,6300 block of 25th Ave,1,4,5,2022,Wis.,WI,"6300 25th Ave, Kenosha, WI"
1,2348384,2022-07-04,Illinois,Highland Park,Central Ave and 2nd St,6,31,37,2022,Ill.,IL,"Central Ave and 2nd St, Highland Park, IL"
3,2348230,2022-07-04,California,Sacramento,1525 L St,1,4,5,2022,Calif.,CA,"1525 L St, Sacramento, CA"
10,2348303,2022-07-03,Arizona,Surprise,W Carlin Dr,3,4,7,2022,Ariz.,AZ,"W Carlin Dr, Surprise, AZ"
12,2347060,2022-07-02,Texas,Haltom City,5700 block of Diamond Oaks Dr N,3,4,7,2022,Texas,TX,"5700 Diamond Oaks Dr N, Haltom City, TX"
13,2347228,2022-07-02,New York,Corona (Queens),129-09 89th Ave,1,3,4,2022,N.Y.,NY,"129-09 89th Ave, Corona (Queens), NY"
15,2346406,2022-07-01,Mississippi,Greenwood,Johnson St and Main St,1,7,8,2022,Miss.,MS,"Johnson St and Main St, Greenwood, MS"
16,2345410,2022-07-01,Illinois,Chicago,408 S Wells St,2,3,5,2022,Ill.,IL,"408 S Wells St, Chicago, IL"
17,2345144,2022-06-30,Kentucky,Allen,49 N Railroad St,3,5,8,2022,Ky.,KY,"49 N Railroad St, Allen, KY"
22,2340458,2022-06-26,Georgia,Blakely,N Church St and Washington Ave,1,6,7,2022,Ga.,GA,"N Church St and Washington Ave, Blakely, GA"


#### Mass killings (as defined by [FBI](https://www.fbi.gov/about/partnerships/office-of-partner-engagement/active-shooter-resources))

In [28]:
# Count incidents with three or more people killed.
killed_df.loc[killed_df["no_killed"] >= 3].shape[0]

479

In [29]:
# Incidents with three or more deaths, then a quick preview.
at_least_three_killed = killed_df["no_killed"].ge(3)
mass_killings = killed_df.loc[at_least_three_killed]
mass_killings.head()

Unnamed: 0,incident_id,incident_date,state,city_or_county,address,no_killed,no_injured,total,year,ap_state,postal_state,full_address
1,2348384,2022-07-04,Illinois,Highland Park,Central Ave and 2nd St,6,31,37,2022,Ill.,IL,"Central Ave and 2nd St, Highland Park, IL"
10,2348303,2022-07-03,Arizona,Surprise,W Carlin Dr,3,4,7,2022,Ariz.,AZ,"W Carlin Dr, Surprise, AZ"
12,2347060,2022-07-02,Texas,Haltom City,5700 block of Diamond Oaks Dr N,3,4,7,2022,Texas,TX,"5700 Diamond Oaks Dr N, Haltom City, TX"
17,2345144,2022-06-30,Kentucky,Allen,49 N Railroad St,3,5,8,2022,Ky.,KY,"49 N Railroad St, Allen, KY"
48,2328374,2022-06-12,California,Los Angeles,3300 block of E 14th St,3,4,7,2022,Calif.,CA,"3300 E 14th St, Los Angeles, CA"


---

## Export

In [30]:
# Write every incident with at least one death, without the index column.
killed_df.to_csv(
    path_or_buf="data/processed/mass_shootings_involving_deaths.csv",
    index=False,
)

In [31]:
# Write the first 20 rows of killed_df as a small sample file.
killed_sample = killed_df.iloc[:20]
killed_sample.to_csv(
    path_or_buf="data/processed/mass_shootings_involving_deaths_sample.csv",
    index=False,
)

In [32]:
# Write the incidents with three or more deaths, without the index column.
mass_killings.to_csv(
    path_or_buf="data/processed/mass_shootings_mass_killings.csv",
    index=False,
)