# Article Extraction

- person name
- event date
- publication date
- paragraph text
- paragraph index

### Dependencies

In [None]:
# pip install articleDateExtractor
# pip install python-dateutil
# pip install lxml
# pip install beautifulsoup4
# pip install requests (I think this is slow, so will look into a different library)

In [None]:
import articleDateExtractor
from bs4 import BeautifulSoup, Comment
import requests

In [None]:
# Sample article URLs used throughout this notebook.
# All of these are non-fatal incidents from our own database.
articles = [
    "https://www.theguardian.com/us-news/2020/aug/27/kenosha-police-officer-who-shot-jacob-blake-named-as-rusten-sheskey",
    "https://www.nbcnews.com/news/us-news/isaiah-brown-black-man-shot-virginia-deputy-who-gave-him-n1265373",
    "https://www.cnn.com/2023/11/17/us/charlotte-north-carolina-officer-struck-woman/index.html",
    "https://www.bbc.com/news/articles/c05z7pm9llpo",
    "https://abcnews.go.com/amp/US/mad-time-watch-nba-player-video-showing-police/story?id=55407317",
]

# A second batch of sample URLs (NYT, CBC and Washington Post pages).
more_articles = [
    "https://www.nytimes.com/2021/06/19/world/canada/montreal-police-video-teenager.html",
    "https://www.nytimes.com/2024/10/18/us/tyron-mcalpin-charges-dropped-maricopa-arizona.html",
    "https://www.nytimes.com/2020/06/24/world/canada/canada-allan-adam-indigenous.html",
    "https://www.cbc.ca/news/canada/british-columbia/rcmp-mona-wang-lacy-browning-police-violence-kelowna-1.6952794",
    "https://www.washingtonpost.com/nation/2021/04/20/karen-garner-video-loveland-criminal-probe/",
]

### Webhose

In [None]:
# Run articleDateExtractor's published-date lookup on every URL in `articles`
# and print whatever it returns, one result per line.
for a in articles:
    date = articleDateExtractor.extractArticlePublishedDate(a)
    print(date)

#### Notes
Seems to struggle with the URLs in `more_articles`: the NYT articles extract but throw an exception first, and the CBC and Washington Post articles don't load.

### Beautiful Soup

In [None]:
# soup = BeautifulSoup(getHTML("https://www.theguardian.com/us-news/2020/aug/27/kenosha-police-officer-who-shot-jacob-blake-named-as-rusten-sheskey"))
# [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
# visible_text = soup.getText()
# print(visible_text)

In [None]:
def getHTML(url, timeout=10):
    """Download ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive
            host, which stalls the whole notebook.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.

    Note:
        HTTP error statuses are not raised here; the body of an error page is
        returned as-is, exactly as it would be for a successful response.
    """
    res = requests.get(url, timeout=timeout)
    return res.text

def tag_visible(element):
    """Decide whether a text node counts as visible page text.

    Args:
        element: A text node whose ``parent`` exposes a ``name`` attribute.

    Returns:
        False when the node's parent tag is one of the non-rendered
        containers listed below, or when the node is a ``Comment``;
        True otherwise.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # The parent-tag check is evaluated first; the comment check only runs
    # when the parent is not one of the hidden containers.
    return not (element.parent.name in hidden_parents or isinstance(element, Comment))
def text_from_html(body):
    """Extract the visible text from an HTML document as one string.

    Args:
        body: HTML markup to parse.

    Returns:
        The stripped contents of every text node accepted by ``tag_visible``,
        joined with single spaces. Nodes that are empty after stripping are
        dropped so the result does not contain long runs of spaces.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all is the current method name; findAll is only a legacy alias.
    texts = soup.find_all(string=True)
    stripped = (t.strip() for t in texts if tag_visible(t))
    # Skip whitespace-only nodes (indentation/newlines between tags).
    return " ".join(s for s in stripped if s)
# using the above
# text_from_html(getHTML("https://www.theguardian.com/us-news/2020/aug/27/kenosha-police-officer-who-shot-jacob-blake-named-as-rusten-sheskey"))

In [None]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed and emits a warning, so results differ by machine.
# html.parser also records Tag.sourceline, which the lxml builder does not.
soup = BeautifulSoup(
    getHTML("https://www.theguardian.com/us-news/2020/aug/27/kenosha-police-officer-who-shot-jacob-blake-named-as-rusten-sheskey"),
    'html.parser',
)

In [None]:
# Show the parsed document's <title> element as the cell output.
soup.title

In [None]:
# Print the text of every <p> element in the parsed article.
for tag in soup.find_all('p'):
    # print(tag.sourceline)  # only populated by the html.parser / html5lib builders
    # get_text() concatenates all nested text. tag.string is None whenever a
    # paragraph has more than one child (e.g. it contains a link), which
    # made those paragraphs print as "None".
    print(tag.get_text())

#### Notes
- not sure if every publisher keeps their content in `<p>` tags, but this does well at isolating the text, at least for The Guardian
- docs say you should be able to get the source line for each tag, but that wasn't working on this specific article
- might be able to pull publication date using this too if webhose doesn't work out