In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re
import datetime

In [2]:
# Listing URL template: first placeholder is the page number, second the ruling slug.
BASE_URL = 'https://www.politifact.com/factchecks/list/?page=%d&ruling=%s'

# Captures (month name, day, year) from text such as "stated on March 5, 2021".
# Raw string so that \w and \d reach the regex engine without triggering
# Python's invalid-escape-sequence warning.
pat = re.compile(r'stated on (\w+) (\d+), (\d+)')

# Lower-cased English month name -> month number.
months = {
    'january': 1, 'february': 2, 'march': 3, 'april': 4,
    'may': 5, 'june': 6, 'july': 7, 'august': 8,
    'september': 9, 'october': 10, 'november': 11, 'december': 12,
}

In [5]:
def get_data(ruling, start_page=1, till_page=2, verbose=True):
    """Scrape fact-check statements from the listing pages for one ruling.

    Parameters
    ----------
    ruling : str
        Value substituted into the ``ruling`` query parameter of ``BASE_URL``
        (e.g. 'true', 'mostly-true', 'false', 'pants-fire').
    start_page : int, optional
        First listing page to request (inclusive).
    till_page : int, optional
        Last listing page to request (inclusive).
    verbose : bool, optional
        When true, print a banner per page and one line per statement.

    Returns
    -------
    list of dict
        One dict per ``<li>`` of the listing, with keys ``author``, ``desc``,
        ``body``, ``url`` and ``date``. ``date`` is a ``datetime.date``, or
        ``None`` when the description holds no recognisable
        "stated on <Month> <day>, <year>" phrase.

    Raises
    ------
    requests.HTTPError
        If a listing page responds with an error status.
    """
    data_list = []
    for page_number in range(start_page, till_page + 1):
        url = BASE_URL % (page_number, ruling)
        # A timeout keeps one stalled connection from hanging the whole run.
        page = requests.get(url, timeout=30)
        page.raise_for_status()

        if verbose:
            print(f"{'-'*40} Extracting page #{page_number} {'-'*40}")
        soup = BeautifulSoup(page.text, 'html.parser')
        listing = soup.find('ul', class_='o-listicle__list')
        if listing is None:
            # No statement list on this page (presumably we ran past the last
            # page for this ruling -- TODO confirm); keep what was collected
            # instead of crashing and losing it.
            if verbose:
                print(f".... No statements found on page #{page_number}, stopping")
            break

        for content in listing.find_all('li'):
            # The quote anchor supplies both the statement text and its link.
            quote_link = content.find("div", class_="m-statement__quote").find("a")
            data = {
                'author': content.find("a", class_="m-statement__name").text.strip(),
                'desc': content.find("div", class_="m-statement__desc").text.strip(),
                'body': quote_link.text.strip(),
                'url': quote_link.get('href'),
            }
            data['date'] = _parse_stated_date(data['desc'])

            if verbose:
                print(f".... Content of {data['date']}")

            data_list.append(data)

        if verbose:
            print()
    return data_list


def _parse_stated_date(desc):
    """Return the date named in a 'stated on <Month> <day>, <year>' phrase, or None."""
    match = pat.search(desc)
    if match is None:
        return None
    month = months.get(match.group(1).lower())
    if month is None:
        # Month token is not a full English month name; leave the date unset.
        return None
    return datetime.date(int(match.group(3)), month, int(match.group(2)))

In [8]:
# Request listing pages up to page 15, passing 'true' as the ruling, with progress output off.
true_data = get_data(ruling='true', till_page=15, verbose=False)

In [9]:
# Request listing pages up to page 15, passing 'mostly-true' as the ruling, with progress output off.
mostly_true_data = get_data(ruling='mostly-true', till_page=15, verbose=False)

In [15]:
# Request listing pages up to page 15, passing 'false' as the ruling, with progress output off.
false_data = get_data(ruling='false', till_page=15, verbose=False)

In [14]:
# Request listing pages up to page 15, passing 'pants-fire' as the ruling, with progress output off.
pants_on_fire_data = get_data(ruling='pants-fire', till_page=15, verbose=False)

In [16]:
# Combined number of records held in the four scraped lists.
sum(map(len, (true_data, mostly_true_data, false_data, pants_on_fire_data)))

1800

In [32]:
class DateEncoder(json.JSONEncoder):
    """JSON encoder that writes ``datetime.date`` values as their string form."""

    def default(self, obj):
        """Return a JSON-serialisable replacement for ``obj``.

        ``datetime.date`` instances (including subclasses) are converted with
        ``str``. Any other unsupported object is passed to the base class,
        which raises ``TypeError``.
        """
        if isinstance(obj, datetime.date):
            return str(obj)
        # Without this, the method would return None and every unsupported
        # object would be written silently as JSON null.
        return super().default(obj)

def save(obj, file_name):
    """Write ``obj`` as JSON to ``file_name``, encoding dates via ``DateEncoder``.

    The '.json' suffix is appended when ``file_name`` does not already end
    with it. The file is opened for writing in UTF-8.
    """
    target = file_name if file_name.endswith('.json') else file_name + '.json'

    with open(target, 'w', encoding='utf-8') as handle:
        json.dump(obj, handle, cls=DateEncoder)


In [33]:
# Write the true_data list to true.json.
save(true_data, file_name="true.json")

In [34]:
# Write the mostly_true_data list to mostly_true.json.
save(mostly_true_data, file_name="mostly_true.json")

In [35]:
# Write the false_data list to false.json.
save(false_data, file_name="false.json")

In [36]:
# Write the pants_on_fire_data list to pants_on_fire.json.
save(pants_on_fire_data, file_name="pants_on_fire.json")