In [None]:
# get_all_documents.py
# Roderick Li and Jasmijn Cnossen
# Language as Data
# December 2021

"""
This script extracts all English and Dutch articles using the MediaStack API and, for additional Dutch articles, by scraping the search pages of Dutch newspaper websites.
It removes empty texts & duplicates and saves the cleaned datasets as tsv files.

First, we will define some functions.
After that, the code starts.
It will take approximately 10 minutes to run this script
"""

# import all packages
import requests
import re
from bs4 import BeautifulSoup
import html5lib
import http.client, urllib.parse, json
import pandas as pd
import time

# We keep track of the time it takes to run the code.
# The time module is also bound to the name `timer`; every timing call in this
# script goes through `timer`, so it does not depend on what the name `time`
# refers to later on.
timer = time
start = timer.perf_counter()

########################
### DEFINE FUNCTIONS ###
########################

def url_to_string(url):
    """
    Return the plain text of the web page at the given URL.

    The page is downloaded and parsed by url_to_html and the parsed
    document is flattened to a string by html_to_string.
    """
    return html_to_string(url_to_html(url))
    
def html_to_string(parser_content):
    """
    Return the text of a parsed html document as a single line.

    The script, style and aside elements are removed from parser_content
    itself before the text is read, so the object that is passed in is modified.
    Every run of newlines and tabs in the text is replaced by one space.
    """
    # Drop the elements whose content is not part of the readable text
    for unwanted in parser_content(["script", "style", "aside"]):
        unwanted.extract()

    raw_text = parser_content.get_text()
    pieces = re.split(r'[\n\t]+', raw_text)
    return " ".join(pieces)
    
def url_to_html(url, timeout=None):
    """
    Scrapes the html content from a web page.

    url: address of the page to download
    timeout: number of seconds to wait for the server, passed on to requests.get.
        The default None sets no limit, which is how this function behaved
        before the parameter existed.

    returns the page as an html object (BeautifulSoup, parsed with 'html5lib')
    """
    # A User-Agent header is sent explicitly
    # (presumably because some sites refuse the library default - TODO confirm)
    res = requests.get(url, headers={"User-Agent": "XY"}, timeout=timeout)
    return BeautifulSoup(res.text, 'html5lib')

# We are looking for the author information at places where it can often be found.
# If we do not find it, it does not mean that it is not there.
def parse_author(html_content):
    """
    Look for the author name in the places where it is commonly stored.

    html_content: parsed html document (BeautifulSoup object)

    The places are tried in this order:
    1. a <meta> tag whose 'name' attribute contains "author"
    2. a <meta> tag whose 'property' attribute contains "author"
    3. an element with itemprop="author"
    4. an element with class "byline"

    returns the cleaned name, or an empty string if no name was found
    """
    search_query = re.compile('author', re.IGNORECASE)
    name = ""

    # The author information might be encoded as a value of the attribute name, or as a property
    meta_by_name = html_content.find('meta', attrs={'name': search_query})
    meta_by_property = html_content.find('meta', property=search_query)
    meta_tag = meta_by_name or meta_by_property

    if meta_tag:
        # .get() instead of ['content']: a matching tag without a content attribute must not raise a KeyError
        name = meta_tag.get('content') or ""

    # If the metadata gives no usable name, we might find it as an attribute of the text
    if not name.strip():
        itemprop = html_content.find(attrs={'itemprop': 'author'})
        byline = html_content.find(attrs={'class': 'byline'})
        text_tag = itemprop or byline

        if text_tag:
            name = text_tag.text

    # Only a leading "by"/"By" is removed; replacing "by " everywhere would
    # damage names that contain it, e.g. "Abby Smith"
    name = re.sub(r'^\s*by\s+', '', name, flags=re.IGNORECASE)
    name = name.replace("\n", "")
    return name.strip()

#This function requires the HTML content of the result as an input parameter
#It returns the actual text content
def parse_news_text(html_content):
    """
    Return the body text of a news article as one string.

    html_content: parsed html document (BeautifulSoup object)

    The article body is the first <article> element. If there is none, it is
    the element with the most <p> elements inside it among all elements whose
    class contains "body", "article" or "main" (case-insensitive).
    Only paragraphs that contain at least one of the characters . , ! ? are kept.
    The kept paragraphs are joined with spaces and every run of whitespace is
    reduced to one space. Returns an empty string if no article body is found.
    """
    container = html_content.find('article')

    if not container:
        candidates = html_content.find_all(class_=re.compile('(body|article|main)', re.IGNORECASE))
        if candidates:
            # max() returns the first candidate with the highest paragraph count
            container = max(candidates, key=lambda tag: len(tag.find_all('p')))

    if not container:
        return ""

    kept_paragraphs = []
    for paragraph_text in (tag.text for tag in container.find_all('p')):
        if re.search("[.,!?]", paragraph_text):
            kept_paragraphs.append(paragraph_text)

    return re.sub(r"\s+", " ", " ".join(kept_paragraphs))

# Function to extract metadata from an article
def extract_metadata(article):
    """
    Unpack the metadata fields of one article from a search query result.

    article: dictionary with the keys 'published_at', 'url', 'title',
        'category', 'country' and 'source'

    returns the tuple (date, time, title, url, category, country, source)
    The value of 'published_at' is split at the first "T" into date and time.
    Date and time are both empty strings if 'published_at' is empty; time is
    an empty string if the value contains no "T".
    """
    # Extract the publication date
    published_at = article['published_at']
    if published_at:
        # partition() never raises, whereas unpacking split("T") fails with a
        # ValueError when the value does not contain exactly one "T"
        date, _, time_of_day = published_at.partition("T")
    else:
        date = ""
        time_of_day = ""

    # Extract meta data
    url = article['url']
    title = article['title']

    category = article['category']  # category associated with the given news article
    country = article['country']  # country code associated with given article
    source = article['source']  # news source

    return date, time_of_day, title, url, category, country, source

# Function for the search query
def search_articles(language, keywords, date, access_key=' ', limit=100):
    """
    Send one search request to the MediaStack news API and return the decoded answer.

    language: 'en' or 'nl'
    keywords: 'abortion' or 'abortus'
    date: e.g. '2021-01-01,2021-11-16'
    access_key: MediaStack access key; the default ' ' is only a placeholder, enter your own key
    limit: value of the 'limit' request parameter (100 by default)

    returns the JSON body of the response, parsed with json.loads
    """
    params = urllib.parse.urlencode({
        'access_key': access_key,
        'keywords': keywords,
        'sort': 'published_desc',
        'languages': language,
        'limit': limit,
        'date': date
        })

    # NOTE(review): the request uses plain HTTP, so the access key is sent unencrypted
    conn = http.client.HTTPConnection('api.mediastack.com')
    try:
        conn.request('GET', '/v1/news?{}'.format(params))
        res = conn.getresponse()
        data = res.read()
    finally:
        # Always release the socket, also when the request fails
        conn.close()

    return json.loads(data.decode('utf-8'))

# Save search query results as tsv
def save_query_as_tsv(queries, outfile):
    """
    Scrape every article of the given search queries and write them to a tsv file.

    queries: list of one or multiple queries for same language; each query is
        a dictionary whose "data" entry lists the articles
    outfile: tsv file to save the results in (opened in 'w' mode, utf-8 encoded)

    The file starts with a header line, followed by one line per article.
    The text field is empty if the text cannot be extracted.
    """

    with open(outfile, 'w', encoding="utf-8") as f:
        f.write("Publication Date\tTime\tAuthor\tSource\tTitle\tURL\tText\n")

        for query in queries:
            for article in query["data"]:
                # Extract metadata (category and country are not written to the file)
                date, time_of_day, title, article_url, _category, _country, source = extract_metadata(article)

                # Extract content
                article_content = url_to_html(article_url)
                author = parse_author(article_content)
                try:
                    content = parse_news_text(article_content)  # try to extract the text
                except Exception:
                    # Exception instead of a bare except, so that Ctrl-C still stops a long run
                    content = ""  # no text if text cannot be extracted

                # Remove the newlines and tabulars from the content, title and author to avoid problems when saving as tsv file
                content = content.replace("\n", "").replace("\t", "")
                title = title.replace("\n", "").replace("\t", "")
                author = author.replace("\t", "").replace("By ", "")

                # Separate fields by tabulators (\t)
                output = "\t".join([date, time_of_day, author, source, title, article_url, content])
                f.write(output + "\n")

# Put info from query in a dataframe
def query_to_dataframe(queries):
    """
    Scrape every article of the given search queries and collect them in a dataframe.

    queries: list of one or multiple queries for same language; each query is
        a dictionary whose "data" entry lists the articles

    returns a dataframe with the columns "Publication Date", "Time", "Author",
    "Source", "Title", "URL" and "Text", with one row per article.
    "Text" is an empty string if the text cannot be extracted.
    """
    columns = ["Publication Date","Time","Author","Source","Title","URL","Text"]
    rows = []

    for query in queries:
        for article in query["data"]:
            # Extract metadata (category and country are not stored)
            date, time_of_day, title, article_url, _category, _country, source = extract_metadata(article)

            # Extract content
            article_content = url_to_html(article_url)
            author = parse_author(article_content)
            try:
                content = parse_news_text(article_content)  # try to extract the text
            except Exception:
                # Exception instead of a bare except, so that Ctrl-C still stops a long run
                content = ""  # no text if text cannot be extracted

            # Remove the newlines and tabulars from the content, title and author to avoid problems when saving as tsv file
            content = content.replace("\n", "").replace("\t", "")
            title = title.replace("\n", "").replace("\t", "")
            author = author.replace("\t", "").replace("By ", "")

            rows.append([date, time_of_day, author, source, title, article_url, content])

    # The frame is built once from all rows: DataFrame.append no longer exists
    # in pandas 2.0 and copied the whole frame for every added article
    return pd.DataFrame(rows, columns=columns)

# Function that removes duplicate and empty entries in a column of a dataframe
def remove_duplicates_and_empty(dataframe, column_name):
    """
    Remove the rows with a duplicate or empty entry in one column.

    dataframe: e.g. content_eng or content_nld
    column_name: name of column for which to remove duplicates and empty entries, e.g. "Text"

    returns a new dataframe without rows containing duplicate or empty entries in column_name;
    of every group of duplicates the first row is kept. The index of the result is renumbered
    from 0. The dataframe that is passed in is not changed.
    """
    column = dataframe[column_name]

    # One boolean mask instead of writing "" into the frame cell by cell:
    # this works for any index and does not modify the caller's data
    is_first_occurrence = ~column.duplicated(keep="first")
    is_not_empty = column != ""

    return dataframe[is_first_occurrence & is_not_empty].reset_index(drop=True)

# Function that removes rows from dataframe where column_name is too short
def remove_short(dataframe, column_name, min_char):
    """
    Remove the rows for which the entry in one column is too short.

    dataframe: e.g. content_eng or content_nld
    column_name: name of column for which to remove short entries, e.g. "Text"
    min_char: the minimum number of characters an entry should have

    returns a new dataframe without rows for which the entry in column_name is smaller than
    'min_char' characters. Empty entries are always removed, also when min_char is 0.
    The index of the result is renumbered from 0. The dataframe that is passed in is not changed.
    """
    column = dataframe[column_name]

    # One boolean mask instead of writing "" into the frame cell by cell:
    # this works for any index and does not modify the caller's data
    is_long_enough = column.map(len) >= min_char
    is_not_empty = column != ""

    return dataframe[is_long_enough & is_not_empty].reset_index(drop=True)

# Function to remove junk sentences from Dutch texts
def remove_junk_nld(dataframe):
    """
    Remove specific junk sentences from the Dutch texts.

    dataframe: content_nld_clean / content_nld (needs a "Text" column)

    returns a new dataframe where specific junk sentences from the Dutch articles are removed.
    Articles that contain "Heeft u al een account? " are removed completely, and so are
    articles of which no text is left. The index of the result is renumbered from 0.
    The dataframe that is passed in is not changed.
    """
    junk_text = ["GFC NIEUWSREDACTIE –",
                 "GFC NIEUWSREDACTIE-", 
                 "Je gebruikt een adblocker. Wij kunnen onze artikelen alleen gratis toegankelijk voor je maken dankzij advertenties. Wil je jouw adblocker voor ons pauzeren?", 
                 "Zoek dan via de zogenaamde ISIN code. Elk instrument, aandeel etc. heeft een unieke code. Kies vervolgens - wanneer er meerdere resultaten zijn - de notering op de beurs van uw keuze. Google de naam van het instrument, aandeel etc. met de toevoeging 'ISIN'. Als zoeken op ISIN code geen resultaten oplevert hebben wij het instrument of aandeel niet in onze koersendatabase.",
                 "1. wanneer u op de PDF-button rechtsonder klikt, krijgt u het printvenster 2. u maakt dan de afdrukkeuze PDF 3. u kiest vervolgens save 4. het wordt nu opgeslagen als PDF 5. Wij wensen u veel leesplezier",
                 "N.B. Het kan zijn dat elementen ontbreken aan deze printversie."]

    def _strip_junk(content):
        # for this text, remove article completely (also if it contains junk sentences):
        if "Heeft u al een account? " in content:
            return ""
        # for junk texts, remove text and leave rest of article in.
        # The result of every replacement is kept, so an article with
        # several different junk sentences loses all of them.
        for text in junk_text:
            content = content.replace(text, "")
        return content

    cleaned = dataframe.assign(Text=dataframe["Text"].map(_strip_junk))

    # only keep rows where the text is not empty string
    cleaned = cleaned[cleaned["Text"] != ""]

    return cleaned.reset_index(drop=True)


In [None]:
############################
### SCRAPE THE DOCUMENTS ###
############################

### ENGLISH ###
# We do 3 search queries, one for each period:
english_periods = (
    '2021-11-01,2021-11-10',  # November 2021
    '2021-10-01,2021-10-31',  # October 2021
    '2021-09-01,2021-09-30',  # September 2021
)
query_en1, query_en2, query_en3 = [
    search_articles('en', 'abortion', period) for period in english_periods
]

### DUTCH ###
# For Dutch, we could only do 1 query because MediaStack didn't have more than 94 articles from this year
query_nl = search_articles('nl', 'abortus', '2021-01-01,2021-11-16')

# The results of the queries go into one dataframe per language.
# query_to_dataframe extracts the metadata from each query and strips tabulars (\t) and new lines (\n) from the text
english_queries = [query_en1, query_en2, query_en3]
dutch_queries = [query_nl]
content_eng = query_to_dataframe(english_queries)
content_nld = query_to_dataframe(dutch_queries)

# We scrape the rest of the Dutch articles directly from the search pages of Dutch newspaper websites and add them to content_nld

# De Volkskrant
# Take the articles from the search results for "abortus", skip those dated
# after 10 November, and stop after 12 articles have been collected.
search_request = "https://www.volkskrant.nl/search?query=abortus"
search_results = url_to_html(search_request)
urls = search_results.select("a.teaser__link")
count = 0

# Dutch month names mapped to zero-padded month numbers,
# so that every date is written as YYYY-MM-DD
volkskrant_month_numbers = {
    "januari": "01", "februari": "02", "maart": "03", "april": "04",
    "mei": "05", "juni": "06", "juli": "07", "augustus": "08",
    "september": "09", "oktober": "10", "november": "11", "december": "12",
}
volkskrant_columns = ["Publication Date","Time","Author","Source","Title","URL","Text"]
volkskrant_rows = []

for url in urls:
    # Getting each url; urljoin avoids a double slash when the link starts with "/"
    article_url = urllib.parse.urljoin("https://www.volkskrant.nl/", url["href"])

    # Getting each article
    article_content = url_to_html(article_url)

    # Extract metadata
    date = article_content.select("span.artstyle__byline__date")[0].string
    # Named pub_time so that the imported time module is not overwritten
    pub_time = article_content.select("span.artstyle__byline__time")[0].string
    day, month, year = date.strip().split()
    day = int(day)
    # A month name that is not in the table is left as it is
    month = volkskrant_month_numbers.get(month.lower(), month)

    # Skip if the article is later than 10 Nov
    # NOTE(review): the year is not compared here
    if (day > 10 and month == "11") or month == "12":
        continue

    date = f"{year}-{month}-{day:02d}"
    title = article_content.select("h1.artstyle__header-title")[0].string.replace("\n", "")
    source = "De Volkskrant"

    author = parse_author(article_content)

    # Extract content
    content = parse_news_text(article_content)
    content = content.replace("\n", "")

    volkskrant_rows.append([date, pub_time, author, source, title, article_url, content])

    count += 1
    if count == 12:
        break

# All rows are added in one step; DataFrame.append no longer exists in pandas 2.0
if volkskrant_rows:
    content_nld = pd.concat(
        [content_nld, pd.DataFrame(volkskrant_rows, columns=volkskrant_columns)],
        ignore_index=True,
    )

# Metro nieuws
# Take the articles from the search results for "abortus", skip those dated
# after 10 November, and stop after 12 articles have been collected.
search_request = "https://www.metronieuws.nl/?s=abortus"
search_results = url_to_html(search_request)
urls = search_results.select("a.list__link")
count = 0

# Abbreviated Dutch month names mapped to zero-padded month numbers,
# so that every date is written as YYYY-MM-DD.
# The abbreviations for aug-dec are the ones this script already matched;
# jan-jul are assumed to follow the usual Dutch abbreviations - TODO confirm against the site
metro_month_numbers = {
    "jan": "01", "feb": "02", "mrt": "03", "apr": "04",
    "mei": "05", "jun": "06", "jul": "07", "aug": "08",
    "sep": "09", "okt": "10", "nov": "11", "dec": "12",
}
metro_columns = ["Publication Date","Time","Author","Source","Title","URL","Text"]
metro_rows = []

for url in urls:
    # Getting each url
    article_url = url["href"]

    # Getting each article
    article_content = url_to_html(article_url)

    # Extract metadata; date and time are in one element, separated by "/"
    date_and_time = article_content.select("span.meta__date")[0].string
    # Named pub_time so that the imported time module is not overwritten
    date, pub_time = date_and_time.split("/")
    pub_time = pub_time.strip()
    day, month, year = date.strip().split()
    day = int(day)
    # A month abbreviation that is not in the table is left as it is
    month = metro_month_numbers.get(month.lower(), month)

    # Skip if the article is later than 10 Nov
    # NOTE(review): the year is not compared here
    if (day > 10 and month == "11") or month == "12":
        continue

    date = f"{year}-{month}-{day:02d}"

    title = article_content.select("h1.article__title")[0].string
    source = "metronieuws.nl"

    author = parse_author(article_content)

    # Extract content
    content = parse_news_text(article_content)
    content = content.replace("\n", "")

    metro_rows.append([date, pub_time, author, source, title, article_url, content])

    count += 1
    if count == 12:
        break

# All rows are added in one step; DataFrame.append no longer exists in pandas 2.0
if metro_rows:
    content_nld = pd.concat(
        [content_nld, pd.DataFrame(metro_rows, columns=metro_columns)],
        ignore_index=True,
    )

# Het Parool
# Take the articles from the search results for "abortus", skip those dated
# after 10 November, and stop after 19 articles have been collected.
search_request = "https://www.parool.nl/search?query=abortus"
search_results = url_to_html(search_request)
urls = search_results.select("a.teaser__link")
count = 0

# Dutch month names in calendar order, and the same names mapped to
# zero-padded month numbers ("januari" -> "01", ..., "december" -> "12")
dutch_month = ["januari", "februari", "maart", "april", "mei", "juni", "juli", "augustus", "september", "oktober", "november", "december"]
parool_month_numbers = {name: f"{number:02d}" for number, name in enumerate(dutch_month, start=1)}
parool_columns = ["Publication Date","Time","Author","Source","Title","URL","Text"]
parool_rows = []

for url in urls:
    # Getting each url
    article_url = "https://www.parool.nl"+url["href"]

    # Getting each article
    article_content = url_to_html(article_url)

    # Extract metadata
    # Named pub_time so that the imported time module is not overwritten
    pub_time = article_content.select("span.artstyle__byline__time")[0].string

    date = article_content.select("span.artstyle__byline__date")[0].string
    day, month, year = date.split()
    day = int(day)
    # A month name that is not in the table is left as it is
    month = parool_month_numbers.get(month, month)

    # Skip if the article is later than 10 Nov
    # NOTE(review): the year is not compared here
    if (day > 10 and month == "11") or month == "12":
        continue

    date = f"{year}-{month}-{day:02d}"

    title = article_content.select("h1.artstyle__header-title ")[0].string.replace("\n","")
    source = "Het Parool"

    author = parse_author(article_content)

    # Extract content
    content = parse_news_text(article_content)
    content = content.replace("\n", "")

    parool_rows.append([date, pub_time, author, source, title, article_url, content])

    count += 1
    if count == 19:
        break

# All rows are added in one step; DataFrame.append no longer exists in pandas 2.0
if parool_rows:
    content_nld = pd.concat(
        [content_nld, pd.DataFrame(parool_rows, columns=parool_columns)],
        ignore_index=True,
    )

# Trouw
# Take the articles from the search results for "abortus", skip those dated
# after 10 November, and stop after 20 articles have been collected.
search_request = "https://www.trouw.nl/search?query=abortus"
search_results = url_to_html(search_request)
urls = search_results.select("a.teaser__link")
count = 0

# Dutch month names in calendar order, and the same names mapped to
# zero-padded month numbers ("januari" -> "01", ..., "december" -> "12")
dutch_month = ["januari", "februari", "maart", "april", "mei", "juni", "juli", "augustus", "september", "oktober", "november", "december"]
trouw_month_numbers = {name: f"{number:02d}" for number, name in enumerate(dutch_month, start=1)}
trouw_columns = ["Publication Date","Time","Author","Source","Title","URL","Text"]
trouw_rows = []

# Cookie notice that is removed from the extracted article text
trouw_cookie_notice = ' Om u deze content te kunnen laten zien, hebben wij uw toestemming nodig om cookies te plaatsen. Open uw cookie-instellingen om te kiezen welke cookies u wilt accepteren. Voor een optimale gebruikservaring van onze site selecteert u "Accepteer alles". U kunt ook alleen de sociale content aanzetten: vink hiervoor "Cookies accepteren van sociale media" aan.'

for url in urls:
    # Getting each url
    article_url = "https://www.trouw.nl"+url["href"]

    # Getting each article
    article_content = url_to_html(article_url)

    # Extract metadata
    # Named pub_time so that the imported time module is not overwritten
    pub_time = article_content.select("span.artstyle__byline__time")[0].string

    date = article_content.select("span.artstyle__byline__date")[0].string
    day, month, year = date.split()
    day = int(day)
    # A month name that is not in the table is left as it is
    month = trouw_month_numbers.get(month, month)

    # Skip if the article is later than 10 Nov
    # NOTE(review): the year is not compared here
    if (day > 10 and month == "11") or month == "12":
        continue

    date = f"{year}-{month}-{day:02d}"

    title = article_content.select("h1.artstyle__header-title ")[0].string.replace("\n","")
    source = "Trouw"

    author = parse_author(article_content)

    # Extract content
    content = parse_news_text(article_content)
    content = content.replace("\n", "")
    content = content.replace(trouw_cookie_notice, "")

    trouw_rows.append([date, pub_time, author, source, title, article_url, content])

    count += 1
    if count == 20:
        break

# All rows are added in one step; DataFrame.append no longer exists in pandas 2.0
if trouw_rows:
    content_nld = pd.concat(
        [content_nld, pd.DataFrame(trouw_rows, columns=trouw_columns)],
        ignore_index=True,
    )


#################################
### CLEAN DATAFRAMES AND TEXT ###
#################################

# Replace missing values: a missing text becomes an empty string, a missing author becomes "Unknown"
for frame in (content_eng, content_nld):
    frame["Text"] = frame["Text"].fillna("")
    frame["Author"] = frame["Author"].fillna("Unknown")

# Remove rows for which "Text" and/or "Title" is a duplicate or empty (first "Text", then "Title")
content_eng_clean = remove_duplicates_and_empty(
    remove_duplicates_and_empty(content_eng, "Text"), "Title")

content_nld_clean = remove_duplicates_and_empty(
    remove_duplicates_and_empty(content_nld, "Text"), "Title")

# Remove junk sentences from Dutch articles
content_nld_clean = remove_junk_nld(content_nld_clean)

# Remove entries where Text is shorter than 500 characters.
# Here, extraction probably went wrong and the text is either a junk message
# (e.g. 'page not found' messages) or just too short to be useful.
content_eng_clean = remove_short(content_eng_clean, "Text", 500)
content_nld_clean = remove_short(content_nld_clean, "Text", 500)

print("nr_articles eng:", len(content_eng_clean))
print("nr_articles nld:", len(content_nld_clean))


# Save cleaned dataset as tsv: first English, then Dutch
tsv_file = "data/eng/abortion_overview_clean2.tsv"
content_eng_clean.to_csv(tsv_file, sep="\t", index=False)

tsv_file = "data/nld/abortus_overview_clean2.tsv"
content_nld_clean.to_csv(tsv_file, sep="\t", index=False)


### CALCULATE THE TIME IT TOOK TO RUN ALL CODE ###
# Difference with the start value taken at the top of the script, in whole minutes
stop = timer.perf_counter()
the_time = stop - start
minutes = int(the_time / 60)
print(f"Time it took to run the code: {minutes} minutes")
