In [5]:
!pip install requests beautifulsoup4 pandas



In [31]:
from bs4 import BeautifulSoup as bs
import requests
from urllib.request import urlopen
import pandas as pd
import datetime
import re

url = "https://en.wikipedia.org/wiki/List_of_mass_shootings_in_the_United_States"

# Fetch the page inside a context manager so the HTTP connection is closed
# as soon as the body has been read, even if read() raises.
with urlopen(url) as data:  # data is an http response object
    data_html = data.read()  # the whole html of that page, via read()

# Parse the markup and collect every <table> carrying the "wikitable" class.
soup = bs(data_html, "html.parser")
tables = soup.find_all('table', {'class': 'wikitable'})
print("Number of Tables: {}".format(len(tables)))

Number of Tables: 34


In [11]:
# Show the indented HTML of the wikitable at index 3 for inspection.
pretty_markup = tables[3].prettify()
print(pretty_markup)

<table class="wikitable sortable mw-datatable">
 <tbody>
  <tr>
   <th width="12%">
    Date
   </th>
   <th width="16%">
    Location
   </th>
   <th width="5%">
    Dead
   </th>
   <th width="5%">
    Injured
   </th>
   <th width="5%">
    Total
   </th>
   <th width="60%">
    Description
   </th>
  </tr>
  <tr>
   <td>
    November 22, 2022
   </td>
   <td>
    <a href="/wiki/Chesapeake,_Virginia" title="Chesapeake, Virginia">
     Chesapeake, Virginia
    </a>
   </td>
   <td>
    7
    <sup class="reference" id="cite_ref-:0_10-29">
     <a href="#cite_note-:0-10">
      <span class="cite-bracket">
       [
      </span>
      n 1
      <span class="cite-bracket">
       ]
      </span>
     </a>
    </sup>
   </td>
   <td>
    4
   </td>
   <td>
    <b>
     11
    </b>
   </td>
   <td>
    <a href="/wiki/2022_Chesapeake_shooting" title="2022 Chesapeake shooting">
     2022 Chesapeake shooting
    </a>
    : Six people were killed, and four others were injured, when a night-shi

In [23]:
# Collect every <h1> element of the page, then print the matches followed by
# how many were found (one item per line).
head = soup.find_all('h1', attrs={})
print(head, len(head), sep="\n")

[<h1 class="firstHeading mw-first-heading" id="firstHeading"><span class="mw-page-title-main">List of mass shootings in the United States</span></h1>]
1


In [24]:
# Work with the wikitable at index 3 and take the column names from the
# whitespace-trimmed text of its <th> cells.
table_2022 = tables[3]
headers = table_2022.find_all('th')
col_titles = []
for header_cell in headers:
    col_titles.append(header_cell.text.strip())
print(col_titles)

['Date', 'Location', 'Dead', 'Injured', 'Total', 'Description']


In [25]:
# Every <tr> element of the selected table.
rows_data = table_2022.find_all('tr')
row_count = len(rows_data)
print("Total number of rows: {}".format(row_count))

Total number of rows: 24


In [36]:
# Build one list of cell strings per table row. Commas inside a cell are
# replaced by spaces and surrounding whitespace is trimmed.
expected_width = len(col_titles)
table_data = []
for tr in rows_data[1:]:  # the row at index 0 is skipped
    cells = [td.text.replace(",", " ").strip() for td in tr.find_all('td', {})]
    # Rows whose <td> count differs from the number of column titles are
    # left out of the result.
    if len(cells) == expected_width:
        table_data.append(cells)

print(table_data[0])

['November 19–20  2022', 'Colorado Springs  Colorado', '5', '26[n 1][n 6]', '31', 'Colorado Springs nightclub shooting: A gunman killed five after entering a local gay bar and 26 others  including the gunman  were wounded during the attack  19 of them by gunfire. A suspect was later taken into custody.']


In [46]:
def remove_wiki_refs(text):
    """Return ``text`` without bracketed reference markers.

    Every substring made of ``[``, one or more word characters, an optional
    single space, optional digits and ``]`` (for example ``[n 1]`` or
    ``[12]``) is deleted; leading and trailing whitespace is then trimmed.
    """
    ref_pattern = r'\[\w+ ?\d*\]'
    without_refs = re.sub(ref_pattern, '', text)
    return without_refs.strip()
    
# Build the frame directly from the scraped rows instead of reading `df`:
# `df` is only created in a later cell, so referencing it here raises a
# NameError when the notebook is run top to bottom on a fresh kernel.
df_scraped = pd.DataFrame(table_data, columns=col_titles)

# Apply the cleaning function to all cells that are strings
df_cleaned = df_scraped.map(lambda x: remove_wiki_refs(x) if isinstance(x, str) else x)

# Save to a new cleaned CSV file
df_cleaned.to_csv("Mass Shootings 2022 Cleaned.csv", index=False)

In [37]:
# Put the scraped rows into a DataFrame labelled with the column titles,
# then write it to disk without the index column.
df = pd.DataFrame(data=table_data, columns=col_titles)
raw_csv_name = "Mass Shootings 2022.csv"
df.to_csv(raw_csv_name, index=False)

In [48]:
import os

# Sanity check: confirm the raw CSV exists in the working directory and, if
# it does, read it back and preview the first rows.
csv_name = "Mass Shootings 2022.csv"
print("Current working directory:", os.getcwd())
if os.path.exists(csv_name):
    print("CSV file created successfully!")
    # Read the file back only once it is known to exist; previously the read
    # ran unconditionally and raised FileNotFoundError right after the
    # "not found" message was printed.
    df_check = pd.read_csv(csv_name)
    print(df_check.head())
else:
    print("CSV file not found.")

Current working directory: C:\Users\shali
CSV file created successfully!
                   Date                    Location    Dead       Injured  \
0     November 22  2022        Chesapeake  Virginia  7[n 1]             4   
1  November 19–20  2022  Colorado Springs  Colorado       5  26[n 1][n 6]   
2     November 13  2022   Charlottesville  Virginia       3             2   
3      October 24  2022         St. Louis  Missouri  3[n 1]        7[n 7]   
4      October 13  2022     Raleigh  North Carolina       5             2   

   Total                                        Description  
0     11  2022 Chesapeake shooting: Six people were kill...  
1     31  Colorado Springs nightclub shooting: A gunman ...  
2      5  2022 University of Virginia shooting: A studen...  
3     10  2022 Central Visual and Performing Arts High S...  
4      7  2022 Raleigh shootings: The shooter  a teenage...  


In [49]:
# Resolve the raw CSV's name against the current working directory and
# report the resulting absolute path.
filename = "Mass Shootings 2022.csv"
filepath = os.path.abspath(filename)
print("File saved at:", filepath)

File saved at: C:\Users\shali\Mass Shootings 2022.csv


In [50]:
# Report the absolute path corresponding to the cleaned CSV's file name.
cleaned_csv_path = os.path.abspath("Mass Shootings 2022 Cleaned.csv")
print("Cleaned CSV saved at:", cleaned_csv_path)

Cleaned CSV saved at: C:\Users\shali\Mass Shootings 2022 Cleaned.csv
