# HTML Only Article Extraction

- person name
- event date
- publication date
- publisher
- paragraph text
- paragraph index

### Dependencies

In [1]:
import articleDateExtractor
from bs4 import BeautifulSoup, Comment
import requests
import polars as pl
import json
from datetime import datetime

In [2]:
# these are all non-fatal incidents from our own database
# `articles` is the sample the extraction demos in this notebook loop over.
articles = [
    "https://www.theguardian.com/us-news/2020/aug/27/kenosha-police-officer-who-shot-jacob-blake-named-as-rusten-sheskey", 
    "https://www.nbcnews.com/news/us-news/isaiah-brown-black-man-shot-virginia-deputy-who-gave-him-n1265373", 
    "https://www.cnn.com/2023/11/17/us/charlotte-north-carolina-officer-struck-woman/index.html", 
    "https://www.bbc.com/news/articles/c05z7pm9llpo",
    "https://abcnews.go.com/amp/US/mad-time-watch-nba-player-video-showing-police/story?id=55407317"]

# Second sample batch (NYT, CBC and Washington Post URLs), kept in its own list.
more_articles = ["https://www.nytimes.com/2021/06/19/world/canada/montreal-police-video-teenager.html",
    "https://www.nytimes.com/2024/10/18/us/tyron-mcalpin-charges-dropped-maricopa-arizona.html",
    "https://www.nytimes.com/2020/06/24/world/canada/canada-allan-adam-indigenous.html", 
    "https://www.cbc.ca/news/canada/british-columbia/rcmp-mona-wang-lacy-browning-police-violence-kelowna-1.6952794", 
    "https://www.washingtonpost.com/nation/2021/04/20/karen-garner-video-loveland-criminal-probe/",
]

### Webhose

In [3]:
# Print the publication date articleDateExtractor reports for each sample URL.
for a in articles:
    date = articleDateExtractor.extractArticlePublishedDate(a)
    print(date)

Extracting date from https://www.theguardian.com/us-news/2020/aug/27/kenosha-police-officer-who-shot-jacob-blake-named-as-rusten-sheskey
2020-08-27 12:13:16+00:00
Extracting date from https://www.nbcnews.com/news/us-news/isaiah-brown-black-man-shot-virginia-deputy-who-gave-him-n1265373
2021-04-26 17:16:00+00:00
Extracting date from https://www.cnn.com/2023/11/17/us/charlotte-north-carolina-officer-struck-woman/index.html
2023-11-18 02:00:06.957000+00:00
Extracting date from https://www.bbc.com/news/articles/c05z7pm9llpo
2024-11-04 10:13:41.709000+00:00
Extracting date from https://abcnews.go.com/amp/US/mad-time-watch-nba-player-video-showing-police/story?id=55407317
2018-05-24 22:25:00+00:00


#### Notes
Seems to struggle with the URLs in `more_articles`: NYT extracts a date but throws an exception first, while CBC and the Washington Post don't load at all.

# Beautiful Soup

### Notes
- simple solution, but won't work for all websites (those that require auth or load their content dynamically)
- some common publishers it doesn't work for: NYT, CBC

In [4]:
def getHTML(url, timeout=30):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        str: The response body as decoded by requests.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    # Without a timeout requests waits indefinitely on an unresponsive host,
    # which would stall a whole batch of URLs on a single bad server.
    res = requests.get(url, timeout=timeout)
    # HTTP error statuses are not raised here: the body of an error page is
    # returned like any other page.
    return res.text

### Publisher Info

In [5]:
# the meta data functions likely need more robust error handling.
# works for non JS sites

def getMetaData(soup):
    """Find the news-article record in a page's JSON-LD metadata.

    Every ``<script type="application/ld+json">`` tag is examined in document
    order. A tag may hold a single object, a list of objects, or an object
    whose ``@graph`` key holds a list of objects. The first object whose
    ``@type`` is (or includes) ``NewsArticle`` or ``ReportageNewsArticle`` is
    returned.

    Args:
        soup: Parsed page exposing ``find_all`` (a BeautifulSoup tree).

    Returns:
        dict | None: The matching JSON-LD object, or ``None`` when no script
        tag on the page describes a news article.
    """
    article_types = ('NewsArticle', 'ReportageNewsArticle')

    def is_article(item):
        # Lists may contain strings or numbers; only objects carry an @type.
        if not isinstance(item, dict):
            return False
        declared = item.get('@type')
        # JSON-LD permits @type to be one string or a list of strings.
        if not isinstance(declared, list):
            declared = [declared]
        return any(name in article_types for name in declared)

    for script in soup.find_all('script', {'type': 'application/ld+json'}):
        if not script.string:
            continue
        try:
            ld_data = json.loads(script.string)
        except json.JSONDecodeError:
            # One malformed tag must not hide a valid tag later in the page.
            continue

        # Normalise the three layouts (object, list, @graph) into one list.
        candidates = ld_data if isinstance(ld_data, list) else [ld_data]
        if isinstance(ld_data, dict) and isinstance(ld_data.get('@graph'), list):
            candidates = candidates + ld_data['@graph']

        for item in candidates:
            if is_article(item):
                return item

    # Give up only after every script tag has been inspected.
    return None
        
# these two might need more robust error handling
def getPublisher(news_article):
    """Return the publisher's name from a JSON-LD article record.

    Args:
        news_article: The dict produced by ``getMetaData``, or ``None`` when
            the page had no usable metadata.

    Returns:
        str | None: The publisher name, or ``None`` when it is not available.
    """
    # getMetaData returns None for pages without article metadata.
    if not isinstance(news_article, dict):
        return None
    pub = news_article.get("publisher")
    # A list-valued publisher is reduced to its first entry.
    if isinstance(pub, list):
        pub = pub[0] if pub else None
    if isinstance(pub, dict):
        return pub.get("name")
    # A bare string is taken to be the name itself.
    if isinstance(pub, str):
        return pub
    return None
    
def getPublicationDate(news_article):
    """Return the calendar date an article was published.

    Args:
        news_article: The dict produced by ``getMetaData``, or ``None`` when
            the page had no usable metadata.

    Returns:
        datetime.date | None: The date part of ``datePublished`` exactly as
        written in the metadata (no timezone conversion), or ``None`` when the
        field is missing or unreadable.
    """
    if not isinstance(news_article, dict):
        return None
    pub_date = news_article.get("datePublished")
    if not isinstance(pub_date, str) or not pub_date.strip():
        return None
    pub_date = pub_date.strip()
    try:
        # fromisoformat on older Pythons rejects a trailing "Z".
        parsed = datetime.fromisoformat(pub_date.replace("Z", "+00:00"))
    except ValueError:
        # Only the date is needed, so fall back to the leading YYYY-MM-DD
        # when the time or offset part is in a form fromisoformat rejects.
        try:
            parsed = datetime.strptime(pub_date[:10], "%Y-%m-%d")
        except ValueError:
            return None
    return parsed.date()

In [6]:
# Smoke-test the JSON-LD helpers: print publisher and publication date for each sample URL.
for a in articles:
    meta = getMetaData(BeautifulSoup(getHTML(a)))
    publisher = getPublisher(meta)
    print("published by: ", publisher)
    publication_date = getPublicationDate(meta)
    print("originally published on: ", publication_date)

published by:  The Guardian
originally published on:  2020-08-27
published by:  NBC News
originally published on:  2021-04-26
published by:  CNN
originally published on:  2023-11-18
published by:  BBC News
originally published on:  2024-11-04
published by:  ABC News
originally published on:  2018-05-24


## Canadian Only

### Global

In [7]:
# Fetch the Global News article and parse it into a soup tree.
# NOTE(review): no parser is named, so BeautifulSoup uses whichever one is installed — consider pinning one so results match across machines.
global_soup = BeautifulSoup(getHTML("https://globalnews.ca/news/3731838/asirt-investigating-after-alleged-shots-fired-near-northwest-edmonton-shopping-centre/"))

In [8]:
# Page title first, then one line per <p> found inside the <article> element.
paragraphs = global_soup.article.find_all('p')
global_text = global_soup.title.get_text()
for p in paragraphs:
    global_text += "\n" + p.get_text()

In [9]:
file_name = "global_article.txt"
# Save the assembled Global News text as UTF-8; 'w' mode overwrites any earlier copy.
with open(file_name, 'w', encoding='utf-8') as file:
    file.write(global_text)

print(f"Article saved to {file_name}")

Article saved to global_article.txt


### Popular Publishers

#### Guardian

In [10]:
# Fetch the Guardian article and parse it into a soup tree.
guardian_soup = BeautifulSoup(getHTML("https://www.theguardian.com/us-news/2020/aug/27/kenosha-police-officer-who-shot-jacob-blake-named-as-rusten-sheskey"))

In [11]:
# Page title first, then one line per <p> found inside the <article> element.
paragraphs = guardian_soup.article.find_all('p')
guardian_text = guardian_soup.title.get_text()
for p in paragraphs:
    guardian_text += "\n" + p.get_text()

In [12]:
# times = guardian_soup.find_all('time')
# for t in times:
#     print(t)

In [13]:
file_name = "guardian_article.txt"
# Save the assembled Guardian text as UTF-8; 'w' mode overwrites any earlier copy.
with open(file_name, 'w', encoding='utf-8') as file:
    file.write(guardian_text)

print(f"Article saved to {file_name}")

Article saved to guardian_article.txt


##### Notes
- not sure if every publisher keeps their content in p tags but this does well at isolating text at least for the guardian
- docs say you should be able to get the source line for each tag, but that wasn't working on this specific article
- might be able to pull publication date using this too if webhose doesn't work out

#### BBC

In [14]:
# Fetch the BBC article and parse it into a soup tree.
bbc_soup = BeautifulSoup(getHTML("https://www.bbc.com/news/articles/c05z7pm9llpo"))

In [15]:
# Page title first, then one line per <p> found inside the <article> element.
paragraphs = bbc_soup.article.find_all('p')
bbc_text = bbc_soup.title.get_text()
for p in paragraphs:
    bbc_text += "\n" + p.get_text()

In [16]:
file_name = "bbc_article.txt"
# Save the assembled BBC text as UTF-8; 'w' mode overwrites any earlier copy.
with open(file_name, 'w', encoding='utf-8') as file:
    file.write(bbc_text)

print(f"Article saved to {file_name}")

Article saved to bbc_article.txt


#### CNN

In [17]:
# Fetch the CNN article and parse it into a soup tree.
cnn_soup = BeautifulSoup(getHTML("https://www.cnn.com/2023/11/17/us/charlotte-north-carolina-officer-struck-woman/index.html"))

In [18]:
# Page title first, then one line per <p> found inside the <article> element.
paragraphs = cnn_soup.article.find_all('p')
cnn_text = cnn_soup.title.get_text()
for p in paragraphs:
    cnn_text += "\n" + p.get_text()

In [19]:
file_name = "cnn_article.txt"
# Save the assembled CNN text as UTF-8; 'w' mode overwrites any earlier copy.
with open(file_name, 'w', encoding='utf-8') as file:
    file.write(cnn_text)

print(f"Article saved to {file_name}")

Article saved to cnn_article.txt


#### NBC

In [20]:
# Fetch the NBC News article and parse it into a soup tree.
nbc_soup = BeautifulSoup(getHTML( "https://www.nbcnews.com/news/us-news/isaiah-brown-black-man-shot-virginia-deputy-who-gave-him-n1265373"))

In [21]:
# Page title first, then one line per <p> found inside the <article> element.
paragraphs = nbc_soup.article.find_all('p')
nbc_text = nbc_soup.title.get_text()
for p in paragraphs:
    nbc_text += "\n" + p.get_text()

In [22]:
file_name = "nbc_article.txt"
# Save the assembled NBC News text as UTF-8; 'w' mode overwrites any earlier copy.
with open(file_name, 'w', encoding='utf-8') as file:
    file.write(nbc_text)

print(f"Article saved to {file_name}")

Article saved to nbc_article.txt


#### ABC

In [23]:
# Fetch the ABC News (AMP) article and parse it into a soup tree.
abc_soup = BeautifulSoup(getHTML("https://abcnews.go.com/amp/US/mad-time-watch-nba-player-video-showing-police/story?id=55407317"))

In [24]:
# Page title first, then one line per <p> found inside the <article> element.
paragraphs = abc_soup.article.find_all('p')
abc_text = abc_soup.title.get_text()
for p in paragraphs:
    abc_text += "\n" + p.get_text()

In [25]:
file_name = "abc_article.txt"
# Save the assembled ABC News text as UTF-8; 'w' mode overwrites any earlier copy.
with open(file_name, 'w', encoding='utf-8') as file:
    file.write(abc_text)

print(f"Article saved to {file_name}")

Article saved to abc_article.txt


### Function

In [26]:
# Module-level accumulator: starts empty and is extended in place by addData,
# one row per non-blank line of article text.
df = pl.DataFrame(
    data=[], 
    schema={
        # "Person Name" and "Incident Date" are filled with a placeholder by addData.
        "Person Name": pl.Utf8,
        "Incident Date": pl.Utf8,
        "URL": pl.Utf8,
        "Publisher": pl.Utf8,
        "Publication Date": pl.Date,
        "Paragraph": pl.Utf8,
        # Position of the line within the extracted text.
        "Paragraph Index": pl.Int64
    }
)

def makeDataEntries(url):
    """Fetch one article and append its lines of text to the shared ``df``.

    The page is downloaded and parsed, its JSON-LD metadata supplies the
    publisher and publication date, and the extracted text is handed to
    ``addData`` together with those values.

    Args:
        url: Address of the article to process.
    """
    # NOTE(review): no parser is named, so BeautifulSoup uses whichever one is installed.
    page = BeautifulSoup(getHTML(url))
    article_meta = getMetaData(page)
    addData(
        getPublisher(article_meta),
        url,
        getPublicationDate(article_meta),
        extractAllText(page),
    )
    
# the meta data functions likely need more robust error handling.
# works for non JS sites

def getMetaData(soup):
    """Find the news-article record in a page's JSON-LD metadata.

    Every ``<script type="application/ld+json">`` tag is examined in document
    order. A tag may hold a single object, a list of objects, or an object
    whose ``@graph`` key holds a list of objects. The first object whose
    ``@type`` is (or includes) ``NewsArticle`` or ``ReportageNewsArticle`` is
    returned.

    Args:
        soup: Parsed page exposing ``find_all`` (a BeautifulSoup tree).

    Returns:
        dict | None: The matching JSON-LD object, or ``None`` when no script
        tag on the page describes a news article.
    """
    article_types = ('NewsArticle', 'ReportageNewsArticle')

    def is_article(item):
        # Lists may contain strings or numbers; only objects carry an @type.
        if not isinstance(item, dict):
            return False
        declared = item.get('@type')
        # JSON-LD permits @type to be one string or a list of strings.
        if not isinstance(declared, list):
            declared = [declared]
        return any(name in article_types for name in declared)

    for script in soup.find_all('script', {'type': 'application/ld+json'}):
        if not script.string:
            continue
        try:
            ld_data = json.loads(script.string)
        except json.JSONDecodeError:
            # One malformed tag must not hide a valid tag later in the page.
            continue

        # Normalise the three layouts (object, list, @graph) into one list.
        candidates = ld_data if isinstance(ld_data, list) else [ld_data]
        if isinstance(ld_data, dict) and isinstance(ld_data.get('@graph'), list):
            candidates = candidates + ld_data['@graph']

        for item in candidates:
            if is_article(item):
                return item

    # Give up only after every script tag has been inspected.
    return None
        
# these two might need more robust error handling
def getPublisher(news_article):
    """Return the publisher's name from a JSON-LD article record.

    Args:
        news_article: The dict produced by ``getMetaData``, or ``None`` when
            the page had no usable metadata.

    Returns:
        str | None: The publisher name, or ``None`` when it is not available.
    """
    # getMetaData returns None for pages without article metadata.
    if not isinstance(news_article, dict):
        return None
    pub = news_article.get("publisher")
    # A list-valued publisher is reduced to its first entry.
    if isinstance(pub, list):
        pub = pub[0] if pub else None
    if isinstance(pub, dict):
        return pub.get("name")
    # A bare string is taken to be the name itself.
    if isinstance(pub, str):
        return pub
    return None
    
def getPublicationDate(news_article):
    """Return the calendar date an article was published.

    Args:
        news_article: The dict produced by ``getMetaData``, or ``None`` when
            the page had no usable metadata.

    Returns:
        datetime.date | None: The date part of ``datePublished`` exactly as
        written in the metadata (no timezone conversion), or ``None`` when the
        field is missing or unreadable.
    """
    if not isinstance(news_article, dict):
        return None
    pub_date = news_article.get("datePublished")
    if not isinstance(pub_date, str) or not pub_date.strip():
        return None
    pub_date = pub_date.strip()
    try:
        # fromisoformat on older Pythons rejects a trailing "Z".
        parsed = datetime.fromisoformat(pub_date.replace("Z", "+00:00"))
    except ValueError:
        # Only the date is needed, so fall back to the leading YYYY-MM-DD
        # when the time or offset part is in a form fromisoformat rejects.
        try:
            parsed = datetime.strptime(pub_date[:10], "%Y-%m-%d")
        except ValueError:
            return None
    return parsed.date()


# returns text with line breaks
def extractAllText(soup):
    """Collect the page title and paragraph text as newline-separated lines.

    The first line is the ``<title>`` text (empty when the page has none);
    each following line is the text of one ``<p>`` element.

    Args:
        soup: Parsed page (a BeautifulSoup tree).

    Returns:
        str: Title and paragraphs joined with ``"\\n"``.
    """
    # Pages without an <article> element expose soup.article as None; search
    # the whole document for paragraphs in that case instead of raising.
    container = soup.article if soup.article is not None else soup
    # A missing <title> becomes an empty first line, so paragraph positions
    # stay the same as on pages that do have a title.
    title = soup.title.get_text() if soup.title is not None else ""
    return "\n".join([title] + [p.get_text() for p in container.find_all('p')])


def addData(publisher, url, publication_date, text):
    """Append one row per non-blank line of ``text`` to the module-level ``df``.

    Args:
        publisher: Publisher name, or ``None`` when unknown.
        url: Address the text was extracted from.
        publication_date: ``datetime.date`` of publication, or ``None``.
        text: Newline-separated article text; each line's position in this
            string becomes its "Paragraph Index", blank lines included in the
            count but not stored.

    Side effects:
        Extends ``df`` in place.
    """
    rows = [
        {
            "Person Name": "Manual entry",
            # Must match the column name declared in df's schema.
            "Incident Date": "Manual entry",
            "URL": url,
            "Publisher": publisher,
            "Publication Date": publication_date,
            "Paragraph": line,
            "Paragraph Index": index,
        }
        for index, line in enumerate(text.split("\n"))
        if line.strip()
    ]
    if rows:
        # Building the batch against df's own schema keeps column names, order
        # and dtypes aligned even when a value (e.g. the date) is None.
        df.extend(pl.DataFrame(rows, schema=df.schema))

In [32]:
import time
# Time one end-to-end makeDataEntries call on a single CTV News article.
start = time.time()
makeDataEntries("https://www.ctvnews.ca/montreal/article/man-fatally-shot-by-police-in-gay-village-bei-investigating/")
end = time.time()
print("returned in ", end - start , " seconds")

returned in  0.251446008682251  seconds


In [28]:
import pathlib
# for later in python script
# dirpath = pathlib.Path(__file__).parent 
dirpath = pathlib.Path.cwd()
path = dirpath / "ctv_file.csv"

# Write every row accumulated in df so far to ctv_file.csv in the current working directory.
df.write_csv(path, separator=",")

In [29]:
# doesn't include nyt, cbc, wapo
errors_from_notion = [
    "https://www.independent.co.uk/news/world/americas/man-leon-ford-police-shot-paralysed-four-times-pittsburgh-pennsylvania-a7904576.html",
    "https://vancouversun.com/news/kelowna-woman-sues-rcmp-officer-alleging-police-brutality-during-wellness-check",
    "https://www.propublica.org/article/new-york-city-paid-an-nba-star-millions-after-an-nypd-officer-broke-his-leg.-the-officer-paid-little-price#:~:text=Five years ago%2C NBA guard,his leg with a baton.",
    "https://www.mirror.co.uk/news/uk-news/met-police-pay-out-millions-32605921",
    "https://nypost.com/2024/11/05/sports/bodycam-footage-shows-what-caused-wild-scene-with-police-fans-at-georgia-florida-game/",
    "https://kfor.com/news/local/bodycam-reveals-officer-slamming-elderly-man-to-the-ground/amp/",
    "https://www.latimes.com/sports/nfl/la-sp-seahawks-bennett-profiled-20170906-story.html",
    "https://www.seattletimes.com/sports/seahawks/las-vegas-sheriff-police-who-detained-michael-bennett-acted-appropriately-and-professionally/",
    "https://abcnews.go.com/US/deputy-charged-sonya-massey-killing-feared-life-new/story?id=112604944",
    "https://abcnews.go.com/US/former-deputy-held-bond-fatal-shooting-airman-roger/story?id=113177397",
    "https://abcnews.go.com/US/new-details-emerge-police-shooting-13-year-new/story?id=111584303",
    "https://abcnews.go.com/News/officer-fired-video-allegedly-choking-nfl-player-desmond/story?id=55096504",
    "https://www.12news.com/article/news/local/valley/black-teen-accuses-officers-of-burning-her-during-arrest/75-398972fb-631e-4b9d-9dad-dfcdc43decf3",
    "https://kansascitydefender.com/justice/independence-police-kill-mother-infant/",
    "https://apnews.com/article/utica-police-shooting-nyah-mway-myanmar-835e5c29eed93dc109108f668323d9f4",
    "https://abc7.com/post/las-vegas-police-kill-victim-of-home-invasion-who-called-911-for-help/15549861/",
    "https://apnews.com/article/shooting-chicago-police-investigation-3d075a6dc4bc8fa3535d8dd2340446f7",
    "https://abc7chicago.com/dexter-reed-shooting-medical-examiner-rules-mans-death-during-chicago-police-stop-a-homicide-says-he-was-shot-13-times/14735691/",
    "https://www.sfgate.com/news/bayarea/article/man-killed-by-police-was-armed-with-what-turned-19881990.php",
    "https://www.wkyc.com/article/news/local/akron/armed-suspect-shot-killed-akron-officer-gunfire-heard-nearby-according-to-police/95-168a0ea9-f4e0-4013-a052-24b86868315f",
    "https://www.dailymail.co.uk/news/article-4251310/Jean-Charles-Menezes-family-slam-Met-police-chief.html",
    "https://www.wkyc.com/article/news/investigations/3news-investigates-8-akron-police-fired-90-plus-shots-during-confrontation-fleeing-motorist/95-7d4a4603-8a2e-41a6-90eb-51b034863073",
    # One URL per line, each followed by a comma: adjacent string literals
    # with no comma between them are silently joined into a single bogus URL.
    "https://www.npr.org/2024/08/17/nx-s1-5079593/police-shooting-victoria-lee-fort-lee-new-jersey",
    "https://www.aljazeera.com/news/2024/10/21/london-police-officer-who-fatally-shot-chris-kaba-acquitted-of-murder",
]
# Article URLs that the batch cell further down passes to makeDataEntries one by one.
notion_table = [
    "https://www.fox5atlanta.com/news/father-son-identified-in-moreland-avenue-police-shooting",
    "https://www.theguardian.com/us-news/2020/aug/27/kenosha-police-officer-who-shot-jacob-blake-named-as-rusten-sheskey",
    "https://www.cnn.com/2021/04/24/us/virginia-deputy-shooting-isaiah-brown/index.html",
    "https://www.nbcnews.com/news/us-news/isaiah-brown-black-man-shot-virginia-deputy-who-gave-him-n1265373",
    "https://www.nbcphiladelphia.com/news/local/leon-ford-pittsburgh-police-shooting/75915/",
    "https://www.usatoday.com/story/news/nation/2024/06/06/seattle-police-officers-beat-man-bus-stop-video/73995371007/",
    "https://www.nbcnews.com/news/us-news/2-police-officers-investigation-recorded-beating-man-batons-bus-stop-rcna155817",
    "https://www.bbc.com/news/articles/c5y3vllzm7yo",
    "https://www.cbsnews.com/news/joseph-harris-arkansas-police-officer-handcuffed-inmate-billy-lee-coram-back-patrol-car/",
    "https://www.nbcnews.com/news/us-news/video-shows-north-carolina-police-officer-beating-woman-ground-rcna125386",
    "https://www.cnn.com/2023/11/17/us/charlotte-north-carolina-officer-struck-woman/index.html",
    "https://globalnews.ca/news/2670078/caught-on-camera-florida-officer-fired-charged-for-beating-handcuffed-woman/",
    "https://www.cnn.com/2016/04/29/us/jacksonville-rookie-officer-fired/index.html",
    "https://www.theguardian.com/us-news/2021/apr/27/colorado-walmart-karen-garner-loveland-police-arrest",
    "https://abcnews.go.com/amp/US/mad-time-watch-nba-player-video-showing-police/story?id=55407317",
    "https://www.theguardian.com/uk-news/2024/oct/25/black-youth-worker-tasered-by-city-of-london-police-wins-appeal-for-damages",
    "https://www.bbc.com/news/articles/c05z7pm9llpo",
    "https://therealnews.com/new-body-cam-footage-exposes-police-tasing-wisconsin-man-for-minor-traffic-infraction",
    "https://www.yahoo.com/news/doordash-driver-tased-during-superior-035900843.html",
    "https://globalnews.ca/news/7962850/investigation-video-montreal-police-arresting-black-youth/",
    "https://www.bbc.com/news/articles/ce9x7l4v6e6o",
    "https://www.nbcnews.com/news/us-news/officers-hit-fans-florida-georgia-game-policy-sheriff-says-rcna178721",
    "https://www.cnn.com/2024/10/16/us/phoenix-police-tyron-mcalpin-bodycam/index.html",
    "https://www.cbsnews.com/newyork/news/nypd-shooting-sutter-ave-subway-station-brooklyn/",
    "https://www.bbc.com/news/articles/c93y74xl1wvo",
    "https://www.npr.org/2024/11/14/nx-s1-5191723/oklahoma-city-police-investigation-70-year-old-man-bodycam",
    "https://www.theglobeandmail.com/politics/article-indigenous-chief-says-rcmp-beat-him-up-and-manhandled-his-wife-over/",
    "https://www.foxnews.com/sports/dolphins-tyreek-hill-not-blameless-incident-police-officers-espns-stephen-smith-says",
    "https://www.cnn.com/2020/10/06/us/desmond-marrow-choking-police-aftermath/index.html",
    "https://www.nbcnews.com/news/us-news/former-tennis-star-james-blake-calls-nyc-police-change-after-n426211",
    "https://www.npr.org/sections/thetwo-way/2015/09/11/439563982/nypd-releases-video-of-officer-throwing-tennis-star-james-blake-to-the-ground",
    "https://www.abc15.com/news/local-news/investigations/man-suffers-third-degree-burns-while-being-held-on-hot-pavement-by-phoenix-police",
    "https://www.theguardian.com/us-news/2024/oct/29/phoenix-police-michael-kenyon-burn-video",
    "https://www.npr.org/2024/06/21/nx-s1-5015030/linda-tirado-journalist-shot-police-2020-george-floyd-protests-hospice-care",
    "https://www.foxnews.com/media/journalist-shot-minneapolis-police-2020-enters-hospice-care",
    "https://www.azcentral.com/story/news/local/phoenix/2020/09/02/phoenix-police-brutalized-black-teen-burned-arrest-attorney-heather-hamel/5698116002/",
    "https://www.kmbc.com/article/independence-missouri-ipd-mother-baby-shooting-identification-investigation/62883130",
    "https://www.npr.org/2024/08/23/nx-s1-5088020/roger-fortson-death-florida-sheriff-deputy-charged-manslaughter",
    "https://www.jsonline.com/story/news/local/milwaukee/2024/08/03/sam-sharpe-jr-s-faith-shined-at-memorial-service/74659400007/",
    "https://www.cnn.com/2024/07/16/us/police-fatally-shoot-man-near-republican-national-convention/index.html",
    "https://www.theguardian.com/us-news/article/2024/jul/22/sonya-massey-illinois-shooting-video",
    "https://globalnews.ca/news/10863747/siu-police-shooting-hamilton/amp/",
    "https://www.nbcnews.com/news/amp/rcna180528",
    "https://www.ksbw.com/article/da-releases-body-camera-video-armed-man-killed-carmel-police/62787490",
    "https://www.theguardian.com/uk-news/2024/nov/28/two-police-officers-served-misconduct-notices-death-girl-17-m5",
    "https://www.theguardian.com/uk-news/2024/nov/20/met-officer-under-criminal-investigation-over-road-death-of-pregnant-woman",
    "https://www.theguardian.com/uk-news/2024/nov/08/met-officer-who-killed-jean-charles-de-menezes-on-tube-defends-his-actions",
    "https://www.bbc.com/news/articles/ce8d8grkzyyo",
    ]


In [30]:
# Run the extraction pipeline on every URL in notion_table; rows accumulate in the shared df.
# NOTE(review): there is no per-URL error handling, so the first article that raises stops the loop.
for article in notion_table:
    makeDataEntries(article)

In [31]:
# Export everything accumulated in df to output.csv (path is relative to the working directory).
df.write_csv("output.csv")

### 2024-01-07
- Works for basic news articles that do not rely on JS or require a login
- Leaves some major publishers still to do: NYT, CBC, AP News
    - headless browser (Playwright?). Will likely need a wrapper function that decides whether to call the Beautiful Soup functions or the headless-browser functions
- Another challenge: smaller publications have more inconsistent formatting so those are also unaccounted for in this code. 