# AllSides article crawler

This notebook crawls all the stories listed at https://www.allsides.com/story/admin and exports them in CSV format. For each story we extract:

- Page nr.
- Index on page
- AllSides URL
- Subject
- Topic
- Date

_and for each of the 2–3 articles bundled under that subject_

- Title
- Source
- Label (Left/Right/Center/etc.)
- Original article URL

In [1]:
import csv
import logging
import re
import urllib.parse as urlparse

import bs4
import requests

In [2]:
# Listing-page URL template; "{}" is filled with the page number. It is also
# used as the base URL when resolving relative links found on the site.
url_tpl = "https://www.allsides.com/story/admin?page={}"
# Parser name handed to bs4.BeautifulSoup when building a document tree.
html_parser = "html5lib"
# Captures the text following "Rating:" (word characters and whitespace);
# applied to the `alt` attribute of an article's bias image.
bias_regex = re.compile(r"Rating:\s+([\w\s]+)")


def get_soup(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Absolute or site-relative URL. Relative values are resolved
            against ``url_tpl``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block forever on a stalled connection,
            which would freeze the whole crawl.

    Returns:
        The ``bs4.BeautifulSoup`` document parsed with ``html_parser``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    abs_url = urlparse.urljoin(url_tpl, url)
    resp = requests.get(abs_url, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as results.
    resp.raise_for_status()
    return bs4.BeautifulSoup(resp.text, html_parser)


def parse_bundle(soup):
    """Retrieve the 2/3 articles listed within the same subject page.

    Args:
        soup: Parsed subject page (a BeautifulSoup document).

    Returns:
        A list with one dict per ``div.quicktabs-views-group`` element, each
        holding the keys ``title``, ``source``, ``label`` and ``url``.
        ``source`` and ``label`` are ``None`` (with a warning logged) when
        the corresponding markup is missing or cannot be parsed.

    Raises:
        AttributeError: If an article block has no ``news-title`` div or the
            title contains no link.
    """
    articles = []
    for article_div in soup.find_all("div", class_="quicktabs-views-group"):
        title_div = article_div.find("div", class_="news-title")
        title = title_div.text
        url = title_div.find("a").get("href")

        # The source area may be absent altogether; treat that the same way
        # as a missing source/label instead of crashing on ``None.find``.
        src_area = article_div.find("div", class_="source-area")

        src_div = None
        if src_area is not None:
            src_div = src_area.find("div", class_="news-source")
        if src_div is not None:
            source = src_div.text
        else:
            source = None
            logging.warning("Article source not available.")

        bias_img = None
        if src_area is not None:
            bias_img = src_area.find("img", typeof="foaf:Image")
        bias_match = None
        if bias_img is not None:
            # ``alt`` can be missing, or not follow the "Rating: <label>"
            # pattern; in both cases there is simply no label to report.
            bias_match = bias_regex.search(bias_img.get("alt") or "")
        if bias_match is not None:
            label = bias_match.group(1)
        else:
            label = None
            logging.warning("Article label not available.")

        articles.append({
            "title": title,
            "source": source,
            "label": label,
            "url": url,
        })

    logging.debug("Retrieved %d more articles.", len(articles))
    return articles


def get_articles(soup):
    """Collect every article reachable from one listing page.

    Each ``<tr>`` of the page's ``<tbody>`` describes one subject; its bundle
    page is downloaded and parsed, and every article found there is annotated
    with the row-level fields (``index_on_page``, ``allsides_url``,
    ``subject``, ``topic``, ``date``).

    Returns:
        A flat list of article dicts, empty when the page has no ``<tbody>``.
    """
    collected = []
    table_body = soup.find("tbody")
    if not table_body:
        logging.warning("Reached empty page of results.")
        return collected

    for row_idx, row in enumerate(table_body.find_all("tr")):
        logging.debug("Getting row subject with index: %d.", row_idx)
        name_cell = row.find("td", class_="views-field-name")
        subject = name_cell.text
        bundle_url = name_cell.find("a").get("href")

        topic_cell = row.find("td", class_="views-field-field-story-topic")
        topic = topic_cell.text
        date_cell = row.find("td", class_="views-field-field-story-date")
        date = date_cell.text

        bundle = parse_bundle(get_soup(bundle_url))

        # Fields shared by every article of this row.
        row_fields = {
            "index_on_page": row_idx,
            "allsides_url": urlparse.urljoin(url_tpl, bundle_url),
            "subject": subject,
            "topic": topic,
            "date": date,
        }
        for article in bundle:
            article.update(row_fields)
        collected += bundle

    return collected

Iterate over the listing pages and process each table row as a BeautifulSoup object.

In [5]:
verbose = True  # set to False to hide debugging messages (INFO and above only)
level = logging.DEBUG if verbose else logging.INFO
# Drop handlers left over from a previous run of this cell; basicConfig()
# does nothing when the root logger already has handlers.
logging.root.handlers.clear()
logging.basicConfig(
    format="%(levelname)s - %(name)s - %(asctime)s - %(message)s",
    level=level
)

# Output file and the text encoding it is written with.
dump_path = "allsides.csv"
encoding = "utf-8"
# CSV columns, in output order. The names are passed to csv.DictWriter as
# fieldnames, so they must match the keys of the article dicts being written.
csv_header = [
    "page_nr",
    "index_on_page",
    "allsides_url",
    "subject",
    "topic",
    "date",
    "title",
    "source",
    "label",
    "url",
]

# Upper bound on the number of listing pages requested (pages 0..max_page-1).
max_page = 100


def postprocess_fields(article):
    """Clean up the string fields of ``article`` in place.

    Every string value is stripped of surrounding whitespace. Values that
    contain non-ASCII characters are additionally treated as possible
    mojibake (UTF-8 bytes that were decoded as Windows-1252) and are
    re-encoded/decoded up to two times, so doubly mangled text is repaired as
    well. A value that cannot be converted is left as it was after stripping.

    Args:
        article: Dict of article fields; non-string values are left untouched.

    Returns:
        None. ``article`` is modified in place.
    """
    # Only existing keys are reassigned, so mutating during iteration is safe.
    for key, value in article.items():
        if not isinstance(value, str):
            continue

        value = value.strip()
        if any(ord(char) > 127 for char in value):
            logging.warning("Attempting to fix bad value %r.", value)
            repaired = False
            for _ in range(2):
                try:
                    value = value.encode("windows-1252").decode("utf-8")
                except UnicodeError:
                    # Either the text was fine to begin with or the previous
                    # pass already produced the final result.
                    break
                repaired = True
            # Report a failure only when no pass succeeded; a failing second
            # pass after a successful first one is the expected outcome.
            if not repaired:
                logging.error("Couldn't fix string value, leaving like it is.")
        article[key] = value


# Crawl the listing pages in order and stream every article into the CSV file.
with open(dump_path, "w", newline="", encoding=encoding) as csvfile:
    article_writer = csv.DictWriter(csvfile, fieldnames=csv_header)
    article_writer.writeheader()

    for page in range(max_page):
        logging.info("Crawling page %d...", page)
        soup = get_soup(url_tpl.format(page))
        articles = get_articles(soup)
        # Stop at the first listing page that yields no articles.
        if not articles:
            break

        logging.info("Dumping page %d with a total of %d articles.", page, len(articles))
        for article in articles:
            article["page_nr"] = page
            # Strip/repair the string fields in place before writing the row.
            postprocess_fields(article)
            article_writer.writerow(article)

        # Push this page's rows to the file before fetching the next page.
        csvfile.flush()

logging.info("Parsing AllSides just finished!")

RuntimeError: No active exception to reraise