# [Dynamic Webscraping]
## Climate-related articles from Foxnews

In [1]:
import requests 
import time

from scrapy.selector import Selector
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service as ChromeService 
from webdriver_manager.chrome import ChromeDriverManager 
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


import numpy as np
import pandas as pd

In [2]:
#### NOTE: Web scraping may be temporarily unsuccessful if 'breaking news' occasionally pops up on the website ###
#### In case of error 'element click intercepted', you may want to add an if else statement to close the pop up table ###
#### Alternatively, please try again once the pop up is taken down from the website ###

### 1) Review & Access the webpage

# Install webdriver
service = ChromeService(ChromeDriverManager().install())

# Create an instance of Chrome web driver
driver = webdriver.Chrome(service = service)

# Define the 3 websites to scrape
foxnews_URLs = [
    "https://www.foxnews.com/category/us/environment/climate-change",
    "https://www.foxnews.com/category/world/environment/climate-change",
    "https://www.foxnews.com/category/science/planet-earth/climate"
]

In [3]:
# Navigate to/load the website (i.e. each climate category of Fox News)
for URL in foxnews_URLs:
    
    driver.get(URL)
    
    wait = WebDriverWait(driver, 10)
        
    # first, click 'Show More' many times
    i = 0

    while i < 3000:
        try:
            time.sleep(1)
            element = wait.until(EC.visibility_of_element_located(
                (By.XPATH, "(//div[@class='button load-more js-load-more'])[1]/a")))
            element.click()
            i += 1
        except TimeoutException:
            break



    # Retrieve & store the html source code
    response = Selector(text = driver.page_source)
    
    
    
    ### 2) Parse the html source code

    # Initialize an empty list for storing search results for every page
    search_results = []

    # Extract the titles of each article
    titles = response.xpath('//h4[@class="title"]/a/text()').getall()

    # Extract the abstracts of each article
    abstracts = response.xpath('//p[@class="dek"]/a/text()').getall()

    # Extract how long ago the article was posted
    dates = response.xpath("//span[@class='time']/text()").getall()

    # Extract the sections that each article falls under
    sections = response.xpath("//span[@class='eyebrow']/a/text()").getall()

    # Extract the URLs of each article. Make sure that the URL is in full html format.
    urls = response.xpath('//h4[@class="title"]/a/@href').getall()
    urls = ['https://www.foxnews.com' + url for url in urls]



    # Store search results from the *current page*
    data_structure = zip(titles, abstracts, dates, sections, urls)

    # for document, title, abstract, date, section, url in data_structure:
    for title, abstract, date, section, url in data_structure:

        # Apply filtering conditions.
        # Remove opinion pieces, videos, tweet posts and articles that fall outside of the date range
        if section not in ['OPINION', 'VIDEO', 'Live Coverage', 'Books'] and not date.endswith(('2015', '2016', '2017')):

            extracted_from_current_page = {
                'title': title,
                'abstract': abstract,
                'date': date,
                'section': section,
                'url': url
            }


            # Append results from the current page
            search_results.append(extracted_from_current_page)


    ### 3) Store metadata as a data frame
    foxnews_data = pd.DataFrame(search_results)
    
    # Then display the number of articles:
    
    # a. Split the URL by '/'
    URL_components = URL.split('/')
    
    # b. Extract the article category (for indicating which dataframe the stored output refers to)
    article_category = URL_components[4]
    
    # c. Print out the number of articles as well as the first 3 rows
    print(f"Number of articles under Fox News' {article_category} category: {foxnews_data.shape[0]}")
    
    # Save the data as a csv file
    filename = f"Foxnews_{article_category}.csv"
    foxnews_data.to_csv(filename, index = False)
    
    print("Saved Successfully!\n")

Number of articles under Fox News' us category: 1067
Saved Successfully!

Number of articles under Fox News' world category: 682
Saved Successfully!

Number of articles under Fox News' science category: 759
Saved Successfully!



In [4]:
# Close the chrome window, once we've successfully saved all data
driver.quit()

In [5]:
# List of categories where we can find climate-related news on Foxnews
categories = ['us', 'world', 'science']

# Create an empty dictionary for storing articles from each of the 3 categories
dataframes = {}


for category in categories:
    # Read in each csv file and store in a dictionary
    dataframes[category] = pd.read_csv(f"Foxnews_{category}.csv")

# Merge climate change related articles in the US, World, Science sections. Remove duplicate documents/articles
foxnews_climate = pd.concat(dataframes.values()).drop_duplicates().reset_index(drop = True)
foxnews_climate.head()

Unnamed: 0,title,abstract,date,section,url
0,Climate activist smears red paint on Washingto...,A climate activist with a radical protest grou...,9:36,Climate Change,https://www.foxnews.com/politics/climate-activ...
1,Obama-appointed judge upholds major oil projec...,A federal court turned down a legal challenge ...,2 days ago,Federal Courts,https://www.foxnews.com/politics/obama-appoint...
2,Biden admin roasted for offering to pay Americ...,The Biden administration was widely roasted th...,5 days ago,Climate Change,https://www.foxnews.com/politics/biden-admin-r...
3,Meet the little-known group funded by left-win...,The Boston-based climate nonprofit Ceres is sp...,5 days ago,Climate Change,https://www.foxnews.com/politics/meet-little-k...
4,Texas voters overwhelmingly approve measure gr...,Texans voted overwhelmingly in favor of a ball...,5 days ago,Texas,https://www.foxnews.com/politics/texas-voters-...


In [6]:
foxnews_climate.shape

(2333, 5)

In [7]:
foxnews_climate.to_csv("Foxnews.csv", index = False)

In [8]:
# for section in foxnews_climate['section'].unique():
#    print(section)