In [13]:
import os
import warnings
import requests
import string 
import numpy as np
import nltk
import time
from numpy import linalg as LA
import regex as re
import pandas as pd
from lda import LDA
from datetime import datetime
from bs4 import BeautifulSoup
from alive_progress import alive_bar
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.feature_extraction.text import CountVectorizer
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.utils import ChromeType
from selenium.webdriver.chrome.options import Options
from alive_progress import alive_bar

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)


In [14]:
ROOT_DIR = os.path.dirname(os.path.abspath("__file__"))
PARENT_DIR = os.path.dirname(ROOT_DIR)
GUARDIAN_DIR = os.path.join(ROOT_DIR, "data", "Guardian.csv")
REUTERS_DIR = os.path.join(ROOT_DIR, "data", "Reuters.csv")
CNN_DIR = os.path.join(PARENT_DIR, "data", "CNN.csv")
DAILYMAIL_DIR = os.path.join(PARENT_DIR, "data", "DailyMail.csv")
AP_DIR = os.path.join(PARENT_DIR, "data", "AssociatedPress.csv")
GLOVE_DIR = os.path.join(ROOT_DIR, "glove_data", "results", "vectors.txt")
FOX_DIR = os.path.join(PARENT_DIR, "data", "Fox.csv")

# Fox

In [15]:

class Fox:
    def __init__(self) -> None:
        self.source = "FOX"
        self.dir = FOX_DIR

    def seleniumParams(self):
        s = Service(ChromeDriverManager(chrome_type=ChromeType.BRAVE, path=ROOT_DIR).install())
        o = webdriver.ChromeOptions()
        o.add_argument("headless")
        self.driver = webdriver.Chrome(service=s, options=o)
        ignored_exceptions = (NoSuchElementException, StaleElementReferenceException)
        self.wait = WebDriverWait(self.driver, 10, ignored_exceptions=ignored_exceptions)

    def fromScratch(self):
        if not os.path.exists(self.dir):
            self.old_data = pd.DataFrame(columns=["Date", "URL", "Title", "Text"])
            print(f"{self.source}: No CSV file found. Creating...")
            self.from_scratch = True
        else:
            self.old_data = pd.read_csv(self.dir)
            self.from_scratch = False

    def concatData(self):
        result = pd.concat([self.old_data, self.new_data])
        result = result.drop_duplicates(subset=["Text"])
        result = result.set_index("Date")
        result = result.sort_index(ascending=False)
        return result

    def URLFetcher(self):
        self.urls = []
        self.dates = []

        if self.from_scratch == False:
            last_url = self.old_data.iloc[0, 1]
        elif self.from_scratch == True:
            last_url = "https://www.foxnews.com/world/crimea-effect-russia-vladimir-putin-hitting-seven-year-hitch"

        self.seleniumParams()
        with alive_bar(title=f"→ {self.source}: Fetching URLs in pages", bar=None, spinner="dots", force_tty=True) as bar:
            url = "https://www.foxnews.com/category/world/conflicts/ukraine"
            title_tag = "//article//header//h4//a"
            button_tag = "//section[@class='collection collection-article-list has-load-more']//div[@class='button load-more js-load-more']"
            exc_list = ["/v/"]
            inc_list = ["foxnews"]
            self.driver.get(url)
            for i in range(0,2):
                bar()
                titles = self.wait.until(EC.presence_of_all_elements_located((By.XPATH, title_tag)))
                for title in titles[-10:]:
                    url = title.get_attribute("href")
                    if not any(s in url for s in exc_list) and any(s in url for s in inc_list):
                        self.urls.append(url)
                    if last_url == url:
                        break
                if last_url == url:
                    break
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                self.wait.until(EC.element_to_be_clickable((By.XPATH, button_tag))).click()   
            self.driver.quit()
        self.unique_urls = list(dict.fromkeys(self.urls))

    def articleScraper(self):
        bodies = []
        titles = []
        dates = []
        urls = []
        rep = {"CLICK HERE TO GET THE FOX NEWS APP": ""}

        def replaceAll(text, dic):
            for i, j in dic.items():
                text = text.replace(i, j)
            return text

        with alive_bar(len(self.unique_urls), title=f"→ {self.source}: Article scraper", spinner="dots_waves", bar="smooth", force_tty=True) as bar:
            for url in self.unique_urls:
                try:
                    article_tag = ["article-body"]
                    title_tags = ["headline"]
                    text_tags = ["zn-body__paragraph"]
                    opener_tag = ["zn-body__paragraph speakable"]
                    html_text = requests.get(url).text
                    soup = BeautifulSoup(html_text, "lxml")
                    title = soup.find("h1", class_=title_tags).text
                    date = soup.find("time").text
                    date = str(datetime.strptime(date[1:-6], "%B %d, %Y %H:%M"))[:-9]
                    print(date)
                    article = soup.find("div", class_=article_tag)
                    paragraphs = article.find_all("p")
                    body = ""
                    for _ in paragraphs:
                        body += " " + _.text
                    body = replaceAll(body, rep)
                    body = " ".join(body.split())
                    print(body)
                    bodies.append(" ".join(body.split()))  # Local tag
                    titles.append(title)
                    urls.append(url)
                    dates.append(url[20:][:10].replace("/", "-"))
                    bar()
                except Exception as e:
                    print(f"URL couldn't be scraped: {url} because {e}")
                    pass
        data = pd.DataFrame({"URL": urls, "Date": dates, "Title": titles, "Text": bodies})
        self.new_data = data

    def scraper(self):
        self.fromScratch()
        self.URLFetcher()
        self.articleScraper()
        # data = self.concatData()
        # lenAfter = len(data) - len(self.old_data)
        # if lenAfter == 0:
        #     print(f"→ No new articles found. Total articles: {len(data)}")
        # else:
        #     print(f"→ {lenAfter} new articles saved to {self.source}.csv! Total articles: {len(data)}")
        # print("")
        # data.to_csv(self.dir, index=True)

        # return data

In [16]:
fox = Fox()
fox.scraper()

FOX: No CSV file found. Creating...

→ FOX: Fetching URLs in pages (!) 2 in 2.4s (0.84/s) 
on 0: 2022-08-27                                                                                                        
on 0: Fox News correspondent Lucas Tomlinson reports from Kyiv, Ukraine on the aftermath of Russia attacking Odessa on 'Special Report.' A Russian convoy transporting heavy military equipment is headed for Crimea, reports said Saturday, in an apparent effort by Moscow to beef up its offensive in Ukraine. The equipment was reportedly seen being transported near Kerch Bridge, also known as the Crimean bridge, which connects the Crimean Peninsula with Russia for rail and vehicle transport. Railway lories were reportedly loaded up with machinery like howitzers, heavy armored vehicles, tanks, infantry fighting vehicles, trucks and fuel tanks, according to Radio Free Europe. FILE - A vehicle runs down the road-and-rail Crimean Bridge passing over the Kerch Strait and linking southern

In [17]:
print(fox.unique_urls[-328])

IndexError: list index out of range