In [1]:
# Resource: https://www.youtube.com/watch?v=Nz1zPkiHcbg

# Description: This program scrapes fact-checked statements and their ratings from the politifact.com website, then labels each statement as REAL or FAKE

In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import urllib.request
import time

In [3]:
# Module-level accumulators for the scraped data (filled in by scrape_function):
# one independent, initially empty list per output column.
authors, dates, statements, sources, targets = [], [], [], [], []

In [4]:
# Function to scrape the website
def _parse_footer(footer_text):
    """Split a statement footer's text into ``(author, date)``.

    The footer is expected to read like ``By <author> • <month> <day>, <year>``
    (assumption inferred from the word positions the parser has always used:
    word 0 and word 3 are skipped -- TODO confirm against the live markup).

    When the '•' separator is present the author is everything between the
    leading word and the bullet, so names that are not exactly two words are
    kept intact. Without the bullet the fixed word positions are used instead.

    Raises:
        IndexError: if the bullet is absent and the footer has fewer than
            seven whitespace-separated words.
    """
    if '•' in footer_text:
        byline, _, date_part = footer_text.partition('•')
        # Drop the first word of the byline (the "By" prefix) and keep the rest.
        author = ' '.join(byline.split()[1:])
        # Month, day and year are the first three words after the bullet.
        date = ' '.join(date_part.split()[:3])
        return author, date

    # Fallback: fixed word positions.
    words = footer_text.split()
    author = words[1] + " " + words[2]
    date = words[4] + ' ' + words[5] + ' ' + words[6]  # month date year
    return author, date


def scrape_function(page_number):
    """Scrape one page of the PolitiFact fact-check list into the global lists.

    Downloads ``https://www.politifact.com/factchecks/list/?page=<page_number>``
    and extends the module-level ``authors``, ``dates``, ``statements``,
    ``sources`` and ``targets`` lists with one entry per statement found.

    The page is parsed into local lists first and the globals are only
    extended once every field has been extracted and the counts agree, so a
    failure part-way through a page never leaves the global lists misaligned.

    Args:
        page_number: Page index appended to the list URL (converted with str()).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within 30 seconds.
        ValueError: if the page yields a different number of footers, quotes,
            metas and meters (the rows could not be lined up).
    """
    URL = 'https://www.politifact.com/factchecks/list/?page=' + str(page_number)
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    webpage = requests.get(URL, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page into zero rows.
    webpage.raise_for_status()
    soup = BeautifulSoup(webpage.text, 'html.parser')

    # Location of the information
    statement_footers = soup.find_all('footer', attrs={'class': 'm-statement__footer'})
    statement_quotes = soup.find_all('div', attrs={'class': 'm-statement__quote'})
    statement_metas = soup.find_all('div', attrs={'class': 'm-statement__meta'})
    statement_targets = soup.find_all('div', attrs={'class': 'm-statement__meter'})

    # Looping through the statement_footer
    page_authors = []
    page_dates = []
    for footer in statement_footers:
        full_name, date = _parse_footer(footer.text.strip())
        page_dates.append(date)
        page_authors.append(full_name)
    print("Done with footer")

    # Looping through the statement_quote: the statement is the first link's text
    page_statements = [quote.find_all('a')[0].text.strip() for quote in statement_quotes]
    print("Done with quotes")

    # Looping through the statement_meta: the source is the first link's text
    page_sources = [meta.find_all('a')[0].text.strip() for meta in statement_metas]
    print("Done with metas")

    # Looping through the targets: the rating is the alt text of the meter image
    page_targets = [
        target.find('div', attrs={'class': 'c-image'}).find('img').get('alt')
        for target in statement_targets
    ]
    print("Done with targets")

    # Every column must have the same number of entries, otherwise the rows of
    # the resulting table would pair values from different statements.
    counts = {
        'footers': len(page_authors),
        'quotes': len(page_statements),
        'metas': len(page_sources),
        'meters': len(page_targets),
    }
    if len(set(counts.values())) > 1:
        raise ValueError(
            'Page ' + str(page_number) + ' produced mismatched element counts: ' + str(counts)
        )

    authors.extend(page_authors)
    dates.extend(page_dates)
    statements.extend(page_statements)
    sources.extend(page_sources)
    targets.extend(page_targets)

In [5]:
# Scrape pages 1 through n-1 of the fact-check list (n itself is excluded)
n = 10

# Start from empty accumulators so that re-running this cell does not append
# the same pages a second time and duplicate rows in the dataframe.
for scraped_values in (authors, dates, statements, sources, targets):
    scraped_values.clear()

for i in range(1, n):
    if i > 1:
        time.sleep(1)  # brief pause between requests to avoid hammering the site
    scrape_function(i)

Done with footer
Done with quotes
Done with metas
Done with targets
Done with footer
Done with quotes
Done with metas
Done with targets
Done with footer
Done with quotes
Done with metas
Done with targets
Done with footer
Done with quotes
Done with metas
Done with targets
Done with footer
Done with quotes
Done with metas
Done with targets
Done with footer
Done with quotes
Done with metas
Done with targets
Done with footer
Done with quotes
Done with metas
Done with targets
Done with footer
Done with quotes
Done with metas
Done with targets
Done with footer
Done with quotes
Done with metas
Done with targets


In [6]:
# Build the dataframe in a single step: one column per scraped list,
# in the order author, statement, source, date, target
data = pd.DataFrame({
    'author': authors,
    'statement': statements,
    'source': sources,
    'date': dates,
    'target': targets,
})

data

Unnamed: 0,author,statement,source,date,target
0,Bill McCarthy,The AstraZeneca COVID-19 vaccine contains a Bl...,TikTok posts,"May 21, 2021",false
1,Madison Czopek,An electromagnetic radiation detector can dete...,Facebook posts,"May 21, 2021",false
2,Ciara O'Rourke,Businesses aren't allowed to ask customers if ...,Viral image,"May 21, 2021",false
3,Ciara O'Rourke,Says Dr. Anthony Fauci’s wife is the NIH offic...,Viral image,"May 21, 2021",false
4,Tom Kertscher,“COVID cases in India plummet after government...,The Gateway Pundit,"May 21, 2021",barely-true
...,...,...,...,...,...
265,Bill McCarthy,“Toxicology report was made public by the MN p...,Facebook posts,"March 30, 2021",false
266,Gabrielle Settles,"Says Jimmy Kimmel, Jimmy Fallon and Stephen Co...",Facebook posts,"March 30, 2021",false
267,Samantha Putterman,Says Google Earth blocked people from seeing t...,Facebook posts,"March 29, 2021",false
268,Miriam Valverde,“We spent billions on a border wall and quit w...,Facebook posts,"March 29, 2021",barely-true


In [7]:
# Numeric label derived from the target rating
def getBinaryNumTarget(text):
    """Return 1 when the rating is exactly 'true', otherwise 0.

    The comparison is an exact match, so any other rating (for example
    'barely-true' or 'false') maps to 0.
    """
    return 1 if text == 'true' else 0

In [8]:
# Text label derived from the target rating: 'REAL' for true, 'FAKE' for everything else
def getBinaryTarget(text):
    """Return 'REAL' when the rating is exactly 'true', otherwise 'FAKE'.

    The comparison is an exact match, so any other rating (for example
    'barely-true' or 'false') maps to 'FAKE'.
    """
    label = 'REAL' if text == 'true' else 'FAKE'
    return label

In [9]:
# Derive both label columns (text and numeric) from the scraped rating column
target_ratings = data['target']
data['BinaryTarget'] = target_ratings.apply(getBinaryTarget)
data['BinaryNumTarget'] = target_ratings.apply(getBinaryNumTarget)

In [10]:
# Display the dataframe, now including the BinaryTarget and BinaryNumTarget columns
data

Unnamed: 0,author,statement,source,date,target,BinaryTarget,BinaryNumTarget
0,Bill McCarthy,The AstraZeneca COVID-19 vaccine contains a Bl...,TikTok posts,"May 21, 2021",false,FAKE,0
1,Madison Czopek,An electromagnetic radiation detector can dete...,Facebook posts,"May 21, 2021",false,FAKE,0
2,Ciara O'Rourke,Businesses aren't allowed to ask customers if ...,Viral image,"May 21, 2021",false,FAKE,0
3,Ciara O'Rourke,Says Dr. Anthony Fauci’s wife is the NIH offic...,Viral image,"May 21, 2021",false,FAKE,0
4,Tom Kertscher,“COVID cases in India plummet after government...,The Gateway Pundit,"May 21, 2021",barely-true,FAKE,0
...,...,...,...,...,...,...,...
265,Bill McCarthy,“Toxicology report was made public by the MN p...,Facebook posts,"March 30, 2021",false,FAKE,0
266,Gabrielle Settles,"Says Jimmy Kimmel, Jimmy Fallon and Stephen Co...",Facebook posts,"March 30, 2021",false,FAKE,0
267,Samantha Putterman,Says Google Earth blocked people from seeing t...,Facebook posts,"March 29, 2021",false,FAKE,0
268,Miriam Valverde,“We spent billions on a border wall and quit w...,Facebook posts,"March 29, 2021",barely-true,FAKE,0


In [11]:
# Show only the statements whose numeric label is 1 (rating was 'true')
data.loc[data['BinaryNumTarget'] == 1]

Unnamed: 0,author,statement,source,date,target,BinaryTarget,BinaryNumTarget
5,Brandon Mulder,“We now have more job openings than we do peop...,Greg Abbott,"May 21, 2021",True,REAL,1
51,Lyle Muller,“There are more job openings than there are pe...,Lindsay James,"May 12, 2021",True,REAL,1
71,Paul Specht,Says he was “one of the 2 (state Senators) who...,Jeff Jackson,"May 7, 2021",True,REAL,1
103,Chris Nichols,“San Francisco had twice as many drug overdose...,Kevin Kiley,"April 29, 2021",True,REAL,1
120,Warren Fiske,On Virginia’s new marijuana law: “You can have...,Kirk Cox,"April 28, 2021",True,REAL,1
134,Haley BeMiller,Foxconn “is the largest taxpayer in Racine Cou...,Van Wanggaard,"April 27, 2021",True,REAL,1
155,Brian Grace,“The federal American Rescue Plan will purchas...,Zach Wahls,"April 22, 2021",True,REAL,1
184,Louis Jacobson,“There is racism physically built into some of...,Pete Buttigieg,"April 15, 2021",True,REAL,1
224,Brandon Mulder,“Twice as many children are in Border Patrol c...,Greg Abbott,"April 6, 2021",True,REAL,1
249,Amy Sherman,U.S. Rep. Matt Gaetz was the lone vote in the ...,Facebook posts,"March 31, 2021",True,REAL,1
