# Fox news

## Data collection

In [1]:
import requests 
import time

from scrapy.selector import Selector
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service as ChromeService 
from webdriver_manager.chrome import ChromeDriverManager 
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


import numpy as np
import pandas as pd

### I. From Fox news' `U.S`/`World`/`Science` categories, scrape <ins>climate-related</ins> news published in <ins>2018-2022</ins>

#### a) Gather data from Fox news' *dynamic* website <br><br> b) Filter the results to only those that fall within our 5-year time window

In [2]:
# Create a function that generates the URLs of 3 sections where climate related news can be found

def construct_URL(section):
    base_URL = "https://www.foxnews.com/category/"
    
    if section == 'us' or section == 'world':
        URL = base_URL + section + '/environment/climate-change'
    elif section == 'science':
        URL = base_URL + section + '/planet-earth/climate'
    
    return URL

    
# Define a function that extracts the title/abstract/date/section/URL of each article
def extract_metadata(xpath_expression):
    return response.xpath(xpath_expression).getall()

In [3]:
# Store the list of URLs where climate change-related news are posted
foxnews_URLs = [construct_URL("us"), construct_URL("world"), construct_URL("science")]

In [4]:
#### NOTE: Web scraping may be temporarily unsuccessful if 'breaking news' occasionally pops up on the website ###
#### In case of error 'element click intercepted', you may want to add an if else statement to close the pop up table ###
#### Alternatively, please try again once the pop up is taken down from the website ###

### 1) Review & Access the webpage

# Install webdriver
service = ChromeService(ChromeDriverManager().install())

# Create an instance of Chrome web driver
driver = webdriver.Chrome(service = service)

In [5]:
# Initialize dictionaries to store search results for each category
search_results = {
    'us': [],
    'world': [],
    'science': []}

# Navigate to/load the website (i.e. each climate category of Fox News)

for URL in foxnews_URLs:
    
    driver.get(URL)
    
    # Allow 10 seconds of rest time   
    wait = WebDriverWait(driver, 10)
        
    # Click 'Show More' multiple times, to expand all search results
    i = 0

    while i < 3000:
        try:
            time.sleep(1)
            element = wait.until(EC.visibility_of_element_located(
                (By.XPATH, "(//div[@class='button load-more js-load-more'])[1]/a")))
            element.click()
            i += 1
        except TimeoutException:
            break



    # Retrieve & store the html source code
    response = Selector(text = driver.page_source)
    
    
    
    ### 2) Parse the html source code

    

    # Extract the title/abstract/date/section/URL of each article
    # Use the custom function:
    titles = extract_metadata('//h4[@class="title"]/a/text()')
    abstracts = extract_metadata('//p[@class="dek"]/a/text()')
    dates = extract_metadata("//span[@class='time']/text()")
    sections = extract_metadata("//span[@class='eyebrow']/a/text()")
    urls = extract_metadata('//h4[@class="title"]/a/@href')

    # Make sure that the URL is in *full* html format.
    urls = ['https://www.foxnews.com' + url for url in urls]



    # Store search results from the *current page*
    data_structure = zip(titles, abstracts, dates, sections, urls)

    # for document, title, abstract, date, section, url in data_structure:
    for title, abstract, date, section, url in data_structure:

        # Split the URL by '/'.
        # Then extract the article category (for indicating which dataframe the stored output refers to)
        article_category = URL.split('/')[4]
        
        # Apply filtering conditions:
        # a) Remove opinion pieces, videos, tweet posts and articles that fall outside of the date range
        # b) Keep only news articles published between 2018 - 2022

        if section not in ['OPINION', 'VIDEO', 'Live Coverage', 'Books'] and \
        date.endswith(('2018', '2019', '2020', '2021', '2022')):
        
            extracted_from_current_page = {
                'title': title,
                'abstract': abstract,
                'date': date,
                'section': section,
                'url': url
            }


            # 3) Store metadata (results from the current page) as a data as a list of dictionary
    
            search_results[article_category].append(extracted_from_current_page)
            
    print("Saved Successfully!")
    print(f"Number of articles under Fox News' {article_category} category: {len(search_results[article_category])}\n")   


Saved Successfully!
Number of articles under Fox News' us category: 619

Saved Successfully!
Number of articles under Fox News' world category: 442

Saved Successfully!
Number of articles under Fox News' science category: 609



In [6]:
# Close the chrome window, once we've successfully saved all data
driver.quit()

In [7]:
# Display metadata of the first 2 articles under the 'U.S.' climate category
search_results['us'][:2]

[{'title': 'Eco activists Prince Harry, Meghan pictured leaving private jet on way to gala giving sustainability award',
  'abstract': 'Royal couple Prince Harry and Meghan Markle, two outspoken environmental activists, were photographed departing a private jet in New York City this week.',
  'date': 'December 7, 2022',
  'section': 'Climate Change',
  'url': 'https://www.foxnews.com/politics/eco-activists-prince-harry-meghan-pictured-leaving-private-jet-way-gala-giving-sustainability-award'},
 {'title': 'Alaska region sees record December heat, beating temperatures from late October to April',
  'abstract': 'A community in Alaska is experiencing unusually warm weather in December. This month saw a new record by six degrees, which also beat temperatures between October and April.',
  'date': 'December 6, 2022',
  'section': 'Alaska',
  'url': 'https://www.foxnews.com/us/alaska-region-sees-record-december-heat-beating-temperatures-late-october-april'}]

### II. Compile all search results from Fox news' three categories (`U.S`/`World`/`Science`)

In [8]:
# Combine all metadata about climate-related articles from all 3 URLs
metadata_from_all_news_categories = []

# Loop through each key (news category, i.e. US/World/Science) in search_results
for news_category, metadata_in_list in search_results.items():
    
    # Convert the metadata scraped from the given news category (-> a list of dictionary) to a dataframe
    temp_df = pd.DataFrame(metadata_in_list)
    
    # Add a new column indicating the section (i.e. news category)
    temp_df['section'] = news_category
    
    # Append metadata scraped from the given news category
    metadata_from_all_news_categories.append(temp_df)

# Concatenate all metadata collected from the 3 URLS
# Remove any redundant news articles across the 3 URLs
foxnews = pd.concat(metadata_from_all_news_categories, ignore_index = True).drop_duplicates().reset_index(drop = True)


print(f"There are {foxnews.shape[0]} climate-related articles that were published throughout 2018-2022.")

foxnews.head()

There are 1644 climate-related articles that were published throughout 2018-2022.


Unnamed: 0,title,abstract,date,section,url
0,"Eco activists Prince Harry, Meghan pictured le...","Royal couple Prince Harry and Meghan Markle, t...","December 7, 2022",us,https://www.foxnews.com/politics/eco-activists...
1,"Alaska region sees record December heat, beati...",A community in Alaska is experiencing unusuall...,"December 6, 2022",us,https://www.foxnews.com/us/alaska-region-sees-...
2,"Rashida Tlaib, Ro Khanna join protests against...",Democratic Reps. Rashida Tlaib and Ro Khanna j...,"December 6, 2022",us,https://www.foxnews.com/politics/rashida-tlaib...
3,Semafor climate editor exits after complaining...,"Bill Spindle, climate editor of the recently-l...","December 6, 2022",us,https://www.foxnews.com/media/semafor-climate-...
4,Left ripped for pushing nightmarish climate na...,'The Five' co-hosts discuss how the left's cli...,"December 6, 2022",us,https://www.foxnews.com/media/left-ripped-push...


### III. Store the data as a csv file

In [9]:
foxnews.to_csv("./1_Data/Foxnews.csv", index = False)