# AllSides sources & bias crawler

Fetch the list of news sources rated by AllSides and save each one together with its bias label, ranging from Left through Center to Right.

A CSV file will be created with the following columns:

- Source
- Label
- Agree
- Disagree
- Publisher (URL of the publisher's website)
- Site (network location, i.e. host name, of the publisher URL)

In [1]:
import asyncio
import csv
import logging
import re
import urllib.parse as urlparse

import aiohttp

import bs4
import requests

In [2]:
# --- Crawler settings -------------------------------------------------------
# Listing of rated sources on AllSides; the trailing `{}` is filled in with the
# zero-based page number via `url_tpl.format(page)`.
url_tpl = (
    "https://www.allsides.com/media-bias/media-bias-ratings"
    "?field_featured_bias_rating_value=All"
    "&field_news_source_type_tid%5B1%5D=1"
    "&field_news_source_type_tid%5B2%5D=2"
    "&field_news_source_type_tid%5B3%5D=3"
    "&field_news_bias_nid_1%5B1%5D=1"
    "&field_news_bias_nid_1%5B2%5D=2"
    "&field_news_bias_nid_1%5B3%5D=3"
    "&title="
    "&customFilter=1"
    "&page={}"
)
html_parser = "html5lib"  # parser name handed to bs4.BeautifulSoup
csv_header = ["source", "label", "agree", "disagree", "publisher", "site"]
dump_path = "media-bias.csv"  # where the CSV is written (and later read back)
encoding = "utf-8"
# When true, sources for which no publisher details were found are left out of
# the CSV; otherwise they are written with empty publisher/site cells.
skip_blocked_sites = True

# --- Logging ----------------------------------------------------------------
verbose = True  # make it True to see debugging messages
if verbose:
    level = logging.DEBUG
else:
    level = logging.INFO
# basicConfig() does nothing when the root logger already has handlers, so
# drop any existing ones first (e.g. when this cell is re-run).
logging.root.handlers.clear()
logging.basicConfig(
    level=level,
    format="%(levelname)s - %(name)s - %(asctime)s - %(message)s",
)


async def get_soup(session, url):
    """Download a page and parse it into a BeautifulSoup document.

    Args:
        session: Open ``aiohttp.ClientSession`` used to perform the request.
        url: Absolute or relative URL; it is resolved against ``url_tpl``
            with ``urljoin``, so site-relative links work as well.

    Returns:
        The ``bs4.BeautifulSoup`` tree built from the response text using the
        parser named by ``html_parser``.
    """
    abs_url = urlparse.urljoin(url_tpl, url)
    # Use the response as a context manager so the connection is released even
    # when reading or decoding the body raises.
    async with session.get(abs_url) as resp:
        # The HTTP status is not checked (as before), so an error page is
        # parsed like any other page.
        text = await resp.text()
    return bs4.BeautifulSoup(text, html_parser)


async def get_publisher_url(session, src_url, source_name):
    """Find the publisher's website linked from a source's AllSides page.

    Args:
        session: Open ``aiohttp.ClientSession`` used to fetch the page.
        src_url: URL (possibly relative) of the source's page.
        source_name: Name of the source; only used in the debug log message.

    Returns:
        A ``(url, netloc)`` tuple with the publisher link and its network
        location, or ``None`` when the page offers no usable link (missing
        wrapper, link or ``href``, or a link without a network location).
    """
    logging.debug("Getting publisher's URL for %r.", source_name)
    soup = await get_soup(session, src_url)

    # Any of these lookups may come back empty; treat that as "no publisher"
    # instead of raising, so one odd page cannot abort a whole gathered batch.
    wrapper = soup.find("div", class_="source-image-wrapper")
    link = wrapper.find("a") if wrapper is not None else None
    href = link.get("href") if link is not None else None
    if not href:
        return None

    url = href.strip()
    parsed = urlparse.urlparse(url)
    if not parsed.netloc:
        return None
    return url, parsed.netloc


def _parse_source_row(row):
    """Return ``(src_url, [source_name, label, agree, disagree])`` for one table row."""
    src_a = row.find("td", class_="source-title").find("a")
    label_alt = row.find("td", class_="views-field-field-bias-image").find("img").get("alt")
    feedback = row.find("td", class_="community-feedback")
    return src_a.get("href"), [
        # Link texts may carry stray leading/trailing whitespace; strip it so
        # the CSV holds clean names.
        src_a.text.strip(),
        # Only the part after the last colon of the image's alt text is kept.
        label_alt.split(":")[-1].strip(),
        int(feedback.find("span", class_="agree").text),
        int(feedback.find("span", class_="disagree").text),
    ]


async def save_pages(bias_writer, csvfile):
    """Crawl the ratings listing page by page and write one CSV row per source.

    Pages are requested starting from page 0 until a page without a
    ``<table>`` is returned. For every row of a page the publisher details are
    fetched concurrently; sources without details are skipped when
    ``skip_blocked_sites`` is true, otherwise written with empty cells.

    Args:
        bias_writer: ``csv.writer``-like object receiving the rows
            ``[source, label, agree, disagree, publisher, site]``.
        csvfile: The underlying file object; flushed after every page.
    """
    async with aiohttp.ClientSession() as session:
        page = 0
        while True:
            logging.info("Crawling page %d...", page)
            soup = await get_soup(session, url_tpl.format(page))

            table = soup.find("table")
            if not table:
                logging.info("Reached empty table -> end of results/pages.")
                break

            extras = []
            pub_coros = []
            for row in table.find("tbody").find_all("tr"):
                src_url, extra = _parse_source_row(row)
                extras.append(extra)
                pub_coros.append(get_publisher_url(session, src_url, extra[0]))

            # gather() returns results in the order of the awaitables passed
            # in, so they line up with `extras`.
            publisher_details_list = await asyncio.gather(*pub_coros)
            for extra, publisher_details in zip(extras, publisher_details_list):
                if not publisher_details:
                    if skip_blocked_sites:
                        continue
                    publisher_details = ("", "")
                bias_writer.writerow(extra + list(publisher_details))

            # Persist what has been collected so far before the next page.
            csvfile.flush()
            page += 1


async def main():
    """Create the CSV dump, write its header row and crawl all pages into it.

    The file at ``dump_path`` is overwritten. It is opened with ``newline=""``
    as the ``csv`` module expects, and with the configured ``encoding``.
    """
    with open(dump_path, mode="w", encoding=encoding, newline="") as out_file:
        writer = csv.writer(out_file)
        writer.writerow(csv_header)
        await save_pages(writer, out_file)
        
        
# Top-level `await`: presumably executed as an IPython/Jupyter cell (see the
# `In [n]:` markers), where awaiting at top level is supported. A plain script
# would need `asyncio.run(main())` instead.
await main()

INFO - root - 2019-05-31 17:45:26,266 - Crawling page 0...
DEBUG - root - 2019-05-31 17:45:27,977 - Getting publisher's URL for ' The Texas Observer'.
DEBUG - root - 2019-05-31 17:45:27,978 - Getting publisher's URL for 'AARP'.
DEBUG - root - 2019-05-31 17:45:27,979 - Getting publisher's URL for 'ABC News'.
DEBUG - root - 2019-05-31 17:45:27,980 - Getting publisher's URL for 'Accuracy in Media '.
DEBUG - root - 2019-05-31 17:45:27,982 - Getting publisher's URL for 'ACLU'.
DEBUG - root - 2019-05-31 17:45:27,983 - Getting publisher's URL for 'Al Cardenas'.
DEBUG - root - 2019-05-31 17:45:27,985 - Getting publisher's URL for 'Al Jazeera'.
DEBUG - root - 2019-05-31 17:45:27,986 - Getting publisher's URL for 'AllSides'.
DEBUG - root - 2019-05-31 17:45:27,988 - Getting publisher's URL for 'AllSides Community'.
DEBUG - root - 2019-05-31 17:45:27,989 - Getting publisher's URL for 'Allysia Finley (Wall Street Journal)'.
DEBUG - root - 2019-05-31 17:45:27,991 - Getting publisher's URL for 'Alter

DEBUG - root - 2019-05-31 17:45:28,229 - Getting publisher's URL for 'CNN - Editorial'.
DEBUG - root - 2019-05-31 17:45:28,231 - Getting publisher's URL for 'CNS News'.
DEBUG - root - 2019-05-31 17:45:28,232 - Getting publisher's URL for 'CNSNews.com'.
DEBUG - root - 2019-05-31 17:45:28,234 - Getting publisher's URL for 'Columbia Journalism Review'.
DEBUG - root - 2019-05-31 17:45:28,235 - Getting publisher's URL for 'Commentary Magazine'.
DEBUG - root - 2019-05-31 17:45:28,237 - Getting publisher's URL for 'Conservative HQ'.
DEBUG - root - 2019-05-31 17:45:28,238 - Getting publisher's URL for 'Cook Report'.
DEBUG - root - 2019-05-31 17:45:28,240 - Getting publisher's URL for 'Council on Foreign Relations'.
DEBUG - root - 2019-05-31 17:45:28,243 - Getting publisher's URL for 'Countable'.
INFO - root - 2019-05-31 17:45:40,001 - Crawling page 1...
DEBUG - root - 2019-05-31 17:45:46,943 - Getting publisher's URL for 'Counter Currents'.
DEBUG - root - 2019-05-31 17:45:46,944 - Getting publ

DEBUG - root - 2019-05-31 17:45:47,140 - Getting publisher's URL for 'Heather Mac Donald'.
DEBUG - root - 2019-05-31 17:45:47,142 - Getting publisher's URL for 'Henry Payne (cartoonist)'.
DEBUG - root - 2019-05-31 17:45:47,143 - Getting publisher's URL for 'Herald Democrat'.
DEBUG - root - 2019-05-31 17:45:47,145 - Getting publisher's URL for 'Honolulu Civil Beat'.
DEBUG - root - 2019-05-31 17:45:47,146 - Getting publisher's URL for 'HotAir'.
DEBUG - root - 2019-05-31 17:45:47,147 - Getting publisher's URL for 'How Do We Fix It?'.
DEBUG - root - 2019-05-31 17:45:47,149 - Getting publisher's URL for 'Howard Kurtz'.
DEBUG - root - 2019-05-31 17:45:47,150 - Getting publisher's URL for 'Howard Kurtz'.
DEBUG - root - 2019-05-31 17:45:47,151 - Getting publisher's URL for 'HuffPost'.
DEBUG - root - 2019-05-31 17:45:47,152 - Getting publisher's URL for 'Inacow'.
DEBUG - root - 2019-05-31 17:45:47,153 - Getting publisher's URL for 'Independent Institute'.
DEBUG - root - 2019-05-31 17:45:47,155 

DEBUG - root - 2019-05-31 17:46:00,298 - Getting publisher's URL for 'Matt Towery'.
DEBUG - root - 2019-05-31 17:46:00,299 - Getting publisher's URL for 'Matt Welch'.
DEBUG - root - 2019-05-31 17:46:00,301 - Getting publisher's URL for 'Matt Wuerker (cartoonist)'.
DEBUG - root - 2019-05-31 17:46:00,304 - Getting publisher's URL for 'Maureen Dowd'.
DEBUG - root - 2019-05-31 17:46:00,305 - Getting publisher's URL for 'Media Matters'.
DEBUG - root - 2019-05-31 17:46:00,307 - Getting publisher's URL for 'Media Research Center'.
DEBUG - root - 2019-05-31 17:46:00,308 - Getting publisher's URL for 'Mediaite'.
DEBUG - root - 2019-05-31 17:46:00,310 - Getting publisher's URL for 'Miami Herald'.
DEBUG - root - 2019-05-31 17:46:00,312 - Getting publisher's URL for 'Michael Barone'.
DEBUG - root - 2019-05-31 17:46:00,314 - Getting publisher's URL for 'Michael Brendan Dougherty'.
DEBUG - root - 2019-05-31 17:46:00,315 - Getting publisher's URL for 'Michael Gerson'.
DEBUG - root - 2019-05-31 17:46:

DEBUG - root - 2019-05-31 17:46:17,022 - Getting publisher's URL for 'RealClearPolitics'.
DEBUG - root - 2019-05-31 17:46:17,025 - Getting publisher's URL for 'Reason'.
DEBUG - root - 2019-05-31 17:46:17,027 - Getting publisher's URL for 'Reason Foundation'.
DEBUG - root - 2019-05-31 17:46:17,029 - Getting publisher's URL for 'Record Journal'.
DEBUG - root - 2019-05-31 17:46:17,031 - Getting publisher's URL for 'Red State'.
DEBUG - root - 2019-05-31 17:46:17,033 - Getting publisher's URL for 'redefinED'.
DEBUG - root - 2019-05-31 17:46:17,035 - Getting publisher's URL for 'Rem Reider'.
DEBUG - root - 2019-05-31 17:46:17,036 - Getting publisher's URL for 'Reuters'.
DEBUG - root - 2019-05-31 17:46:17,037 - Getting publisher's URL for 'Rich Lowry'.
DEBUG - root - 2019-05-31 17:46:17,039 - Getting publisher's URL for 'Rich Tafel'.
DEBUG - root - 2019-05-31 17:46:17,040 - Getting publisher's URL for 'Rich Zeoli'.
DEBUG - root - 2019-05-31 17:46:17,041 - Getting publisher's URL for 'Richard 

DEBUG - root - 2019-05-31 17:46:34,892 - Getting publisher's URL for 'The Philadelphia Inquirer'.
DEBUG - root - 2019-05-31 17:46:34,894 - Getting publisher's URL for 'The Red and Black'.
DEBUG - root - 2019-05-31 17:46:34,895 - Getting publisher's URL for 'The Reliable Bias'.
DEBUG - root - 2019-05-31 17:46:34,898 - Getting publisher's URL for 'The Republican'.
DEBUG - root - 2019-05-31 17:46:34,900 - Getting publisher's URL for 'The Resurgent'.
DEBUG - root - 2019-05-31 17:46:34,903 - Getting publisher's URL for 'The Root'.
DEBUG - root - 2019-05-31 17:46:34,905 - Getting publisher's URL for 'The Sacramento Bee'.
DEBUG - root - 2019-05-31 17:46:34,908 - Getting publisher's URL for 'The Saturday Evening Post'.
DEBUG - root - 2019-05-31 17:46:34,909 - Getting publisher's URL for 'The Telegraph - UK'.
DEBUG - root - 2019-05-31 17:46:34,912 - Getting publisher's URL for 'The Verge'.
DEBUG - root - 2019-05-31 17:46:34,914 - Getting publisher's URL for 'The Week'.
DEBUG - root - 2019-05-31

Some publishers are blocked (AllSides offers no website for them), so the CSV file contains fewer rows than the number of sources crawled.

Now let's find a good way of associating a side with a website in case multiple candidates are available.

In [8]:
# Group the crawled sources by publisher site (last CSV column) so that sites
# shared by several rated sources can be inspected.
side_dict = {}

# Read the dump with the same encoding it was written with; otherwise the
# platform default is used and non-ASCII source names may fail to decode.
with open(dump_path, newline="", encoding=encoding) as stream:
    reader = csv.reader(stream)
    print(next(reader))  # header row

    for row in reader:
        # site -> list of (source, label, agree)
        side_dict.setdefault(row[5], []).append((row[0], row[1], row[2]))

# Show only the ambiguous sites, i.e. those associated with more than one
# rated source.
for site, sides in side_dict.items():
    if len(sides) > 1:
        print(site, sides)

['source', 'label', 'agree', 'disagree', 'publisher', 'site']
www.thedailybeast.com [('Daily Beast', 'Left', '5836'), ('Newsweek', 'Lean Left', '1861')]
theweek.com [('Damon Linker', 'Lean Left', '198'), ('James Poulos', 'Right', '43'), ('Michael Brendan Dougherty', 'Lean Right', '56'), ('The Week', 'Center', '2776')]
topics.nytimes.com [('David Brooks', 'Lean Right', '213'), ('Paul Krugman', 'Left', '250')]
www.washingtonpost.com [('David Ignatius', 'Center', '91'), ('E J Dionne', 'Left', '426'), ('Ed Rogers', 'Right', '95'), ('George Will', 'Right', '367'), ('Marc A. Thiessen', 'Right', '155'), ('Washington Post', 'Lean Left', '22977')]
www.deseretnews.com [('Deseret News', 'Lean Right', '1007'), ('Morgan Jacobsen', 'Center', '51')]
fair.org [('FAIR', 'Lean Left', '81'), ('Fair.org', 'Center', '399')]
www.forbes.com [('Forbes', 'Center', '2780'), ('Ralph Benko', 'Right', '38'), ('Rick Ungar', 'Lean Left', '121')]
www.foxnews.com [('Fox News', 'Lean Right', '20073'), ('Fox News Editor