## Mass shooting in the US from 2014 to 2022

Scraping the website https://www.gunviolencearchive.org/ to get details of mass shootings in the US since 2014. The purpose of this project is to determine the age distribution of the shooters in the past decade. 

In [1]:
# make all the imports

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import re
from playwright.async_api import async_playwright
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless = False)
page = await browser.new_page()

In [2]:
# go to the website

url = 'https://www.gunviolencearchive.org/'
await page.goto(url)

<Response url='https://www.gunviolencearchive.org/' request=<Request url='https://www.gunviolencearchive.org/' method='GET'>>

In [3]:
# click the 'reports' button

await page.locator('//*[@id="block-gva-general-gva-navigation"]/ul/li[4]/a').click()

In [4]:
# try clicking the first report

await page.locator('//*[@id="block-system-main"]/div/div/div/div/div/ul[2]/li[1]/a').click()

In [14]:
# print all the href in 'a' tags that contain 'View Incident'

html = await page.content()
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all('a', text='View Incident')
for link in links:
    href = link.get('href')
    print(href)

/incident/2521112
/incident/2521084
/incident/2521423
/incident/2518860
/incident/2518281
/incident/2518292
/incident/2518344
/incident/2518456
/incident/2517797
/incident/2516494
/incident/2516309
/incident/2514902
/incident/2514220
/incident/2514564
/incident/2513512
/incident/2513575
/incident/2513517
/incident/2513496
/incident/2513044
/incident/2513527
/incident/2512985
/incident/2512343
/incident/2512617
/incident/2510365
/incident/2510063


In [15]:
# do the same thing but in a function

async def extract_links(page):
    html = await page.content()
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a', text='View Incident')
    for link in links:
        href = link.get('href')
        print(href)

await extract_links(page)

/incident/2521112
/incident/2521084
/incident/2521423
/incident/2518860
/incident/2518281
/incident/2518292
/incident/2518344
/incident/2518456
/incident/2517797
/incident/2516494
/incident/2516309
/incident/2514902
/incident/2514220
/incident/2514564
/incident/2513512
/incident/2513575
/incident/2513517
/incident/2513496
/incident/2513044
/incident/2513527
/incident/2512985
/incident/2512343
/incident/2512617
/incident/2510365
/incident/2510063


### Replicate through every page of the first report

In [7]:
# click the 'next' button

await page.locator('//*[@id="block-system-main"]/div/ul/li[11]/a').click()

In [8]:
# click the button in a loop (till the end of the results)

while True:
    try:
        await page.locator('//*[@id="block-system-main"]/div/ul/li[11]/a').click()
    except: #Exception as e:
        # raise(e)
        break

In [5]:
# click the button in a loop (till the end of the results) and print the links

while True:
    try:
        # locate where the href is nested 
        html = await page.content()
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', text='View Incident')
        for link in links:
            href = link.get('href')
            print(href)
            
        # let the page load 
        async with page.expect_navigation():
            # locate the next button
            await page.locator('//*[@id="block-system-main"]/div/ul/li[11]/a').click()
        print("Pressing the button")
        print("---------")
    except:
        # break out of  the loop
        break

/incident/2521112
/incident/2521084
/incident/2521423
/incident/2518860
/incident/2518281
/incident/2518292
/incident/2518344
/incident/2518456
/incident/2517797
/incident/2516494
/incident/2516309
/incident/2514902
/incident/2514220
/incident/2514564
/incident/2513512
/incident/2513575
/incident/2513517
/incident/2513496
/incident/2513044
/incident/2513527
/incident/2512985
/incident/2512343
/incident/2512617
/incident/2510365
/incident/2510063
Pressing the button
---------
/incident/2509362
/incident/2509663
/incident/2508683
/incident/2508370
/incident/2508522
/incident/2507836
/incident/2504645
/incident/2503649
/incident/2503345
/incident/2503245
/incident/2503058
/incident/2502680
/incident/2503178
/incident/2503197
/incident/2502550
/incident/2501869
/incident/2499119
/incident/2499381
/incident/2498792
/incident/2498013
/incident/2499425
/incident/2497948
/incident/2497502
/incident/2496714
/incident/2497164
Pressing the button
---------
/incident/2428456
/incident/2426627
/inc

In [5]:
# put the result in a list

mass_shootings = []

while True:
    try:
        # locate where the href is nested 
        html = await page.content()
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', text='View Incident')
        for link in links:
            href = link.get('href')
            mass_shootings.append(href)
            
        # let the page load 
        async with page.expect_navigation():
            # locate the next button
            await page.locator('//*[@id="block-system-main"]/div/ul/li[11]/a').click()
        print("Pressing the button")
        print("---------")
    except:
        # break out of  the loop
        break

mass_shootings


Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------
Pressing the button
---------


['/incident/2521112',
 '/incident/2521084',
 '/incident/2521423',
 '/incident/2518860',
 '/incident/2518281',
 '/incident/2518292',
 '/incident/2518344',
 '/incident/2518456',
 '/incident/2517797',
 '/incident/2516494',
 '/incident/2516309',
 '/incident/2514902',
 '/incident/2514220',
 '/incident/2514564',
 '/incident/2513512',
 '/incident/2513575',
 '/incident/2513517',
 '/incident/2513496',
 '/incident/2513044',
 '/incident/2513527',
 '/incident/2512985',
 '/incident/2512343',
 '/incident/2512617',
 '/incident/2510365',
 '/incident/2510063',
 '/incident/2509362',
 '/incident/2509663',
 '/incident/2508683',
 '/incident/2508370',
 '/incident/2508522',
 '/incident/2507836',
 '/incident/2504645',
 '/incident/2503649',
 '/incident/2503345',
 '/incident/2503245',
 '/incident/2503058',
 '/incident/2502680',
 '/incident/2503178',
 '/incident/2503197',
 '/incident/2502550',
 '/incident/2501869',
 '/incident/2499119',
 '/incident/2499381',
 '/incident/2498792',
 '/incident/2498013',
 '/inciden

In [7]:
# add https://www.gunviolencearchive.org/ to every link in mass_shootings

mass_shootings_urls = []

for link in mass_shootings:
    link = 'https://www.gunviolencearchive.org/' + link
    mass_shootings_urls.append(link)

mass_shootings_urls 


['https://www.gunviolencearchive.org//incident/2521112',
 'https://www.gunviolencearchive.org//incident/2521084',
 'https://www.gunviolencearchive.org//incident/2521423',
 'https://www.gunviolencearchive.org//incident/2518860',
 'https://www.gunviolencearchive.org//incident/2518281',
 'https://www.gunviolencearchive.org//incident/2518292',
 'https://www.gunviolencearchive.org//incident/2518344',
 'https://www.gunviolencearchive.org//incident/2518456',
 'https://www.gunviolencearchive.org//incident/2517797',
 'https://www.gunviolencearchive.org//incident/2516494',
 'https://www.gunviolencearchive.org//incident/2516309',
 'https://www.gunviolencearchive.org//incident/2514902',
 'https://www.gunviolencearchive.org//incident/2514220',
 'https://www.gunviolencearchive.org//incident/2514564',
 'https://www.gunviolencearchive.org//incident/2513512',
 'https://www.gunviolencearchive.org//incident/2513575',
 'https://www.gunviolencearchive.org//incident/2513517',
 'https://www.gunviolencearchiv

In [None]:
# save the list as a csv file

import pandas as pd

df = pd.DataFrame(mass_shootings_urls)
df.to_csv('mass_shootings.csv', index=False)
