In [23]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException,  StaleElementReferenceException, WebDriverException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.firefox import GeckoDriverManager
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.utils import ChromeType
from alive_progress import alive_bar
from IPython.display import clear_output
import time
import pandas as pd
import numpy as np
import os
import csv
import threading

In [24]:
# Definitions
# "__file__" is deliberately a quoted string, not the __file__ variable: the
# literal name is made absolute against the current working directory, so
# ROOT_DIR ends up being the directory the kernel was started in.
ROOT_DIR = os.path.split(os.path.abspath("__file__"))[0]
# One level above ROOT_DIR; the data/ folder used by later cells lives here.
PARENT_DIR = os.path.split(ROOT_DIR)[0]

def check_exists(tag):
    """Report whether the current page contains an element matching an XPath.

    Uses the module-level ``driver``; the lookup is a single immediate
    ``find_element`` call, with no waiting or retrying.

    Args:
        tag: XPath expression to look up.

    Returns:
        True if ``find_element`` succeeds, False if it raises
        ``NoSuchElementException``.
    """
    try:
        driver.find_element(By.XPATH, tag)
        return True
    except NoSuchElementException:
        return False

def replaceAll(text, dic):
    """Apply every ``old -> new`` substitution in ``dic`` to ``text``.

    Substitutions are applied one after another in the dictionary's iteration
    order, so a later entry can act on the output of an earlier one.

    Args:
        text: String to clean up.
        dic: Mapping of substring to its replacement.

    Returns:
        The string after all replacements have been applied.
    """
    cleaned = text
    for old, new in dic.items():
        cleaned = cleaned.replace(old, new)
    return cleaned

# Reuters Login information
# Credentials are read from the environment so real values never have to be
# saved in the notebook. Set REUTERS_EMAIL / REUTERS_PASSWORD before starting
# the kernel; the literals below are only the original placeholder fallbacks.
email = os.environ.get("REUTERS_EMAIL", "@gmail.com")
password = os.environ.get("REUTERS_PASSWORD", "@")

In [3]:
# Driver config
s = Service(GeckoDriverManager(path=ROOT_DIR).install())
clear_output()
driver = webdriver.Firefox(service=s)
WAIT = WebDriverWait(driver, 10, ignored_exceptions=(NoSuchElementException, StaleElementReferenceException))

# XPath Tags
title_tag = "//article[@class='story ']//div[@class='story-content']//child::a"

# Scraper
urls = []
pgs_ukr = 5 #879
pgs_rus = 5 #668

# One (URL template, page count) pair per archive. The two archives are
# requested with slightly different query strings, which is kept as-is.
archives = [
    ("https://www.reuters.com/news/archive/ukraine?view=page&page={page}&pageSize=10", pgs_ukr),
    ("https://www.reuters.com/news/archive/russia?view=page&page={page}", pgs_rus),
]

# Make sure the output folder exists before the CSV is opened; otherwise a
# fresh checkout without data/ fails with FileNotFoundError.
urls_csv = PARENT_DIR + "/data/ReutersURLs.csv"
os.makedirs(os.path.dirname(urls_csv), exist_ok=True)

print("-> Launching Browser...")

try:
    with alive_bar(pgs_ukr+pgs_rus, title="-> Scraper", spinner="dots_waves", bar="smooth", force_tty=True) as bar:
        # Mode "w" truncates any previous run; the file is opened once for the
        # whole scrape instead of once per row.
        with open(urls_csv, "w", newline='') as f:
            writer = csv.writer(f)

            for template, pages in archives:
                for page in range(1, pages+1):

                    reuters = template.format(page=page)
                    driver.get(reuters)

                    titles = WAIT.until(EC.presence_of_all_elements_located((By.XPATH, title_tag)))
                    # Read each href once and drop anchors that have none, so
                    # no None/empty entries reach the list or the CSV.
                    hrefs = [href for href in (title.get_attribute("href") for title in titles) if href]

                    urls.extend(hrefs)
                    writer.writerows([href] for href in hrefs)
                    # Flush per page so an interrupted run keeps what it fetched.
                    f.flush()

                    bar()
finally:
    # Always close the browser, even when a page load or wait fails.
    driver.quit()

# dict.fromkeys removes duplicates while keeping first-seen order.
unique_urls = list(dict.fromkeys(urls))
print(f"-> {len(unique_urls)} URLs fetched successfully!")

-> Launching Browser...
-> Scraper |████████████████████████████████████████| 10/10 [100%] in 13.1s (0.76/s)                                    
-> 79 URLs fetched successfully!


In [54]:
# Reload the URL list saved by the scraper cell so the article scraper can run
# without re-crawling the archive pages.
# The file was written with csv.writer, so it is parsed with csv.reader:
# quoted fields (e.g. a URL containing a comma) are unquoted correctly, and
# blank rows / empty fields are skipped instead of becoming bogus URLs.
with open(PARENT_DIR + "/data/ReutersURLs.csv", newline='') as f:
    data = [row[0] for row in csv.reader(f) if row and row[0]]

# dict.fromkeys removes duplicates while keeping first-seen order.
unique_urls = list(dict.fromkeys(data))

In [None]:
# Driver config
s = Service(GeckoDriverManager(path=ROOT_DIR).install())
clear_output()
driver = webdriver.Firefox(service=s)

# Logging in (XPath tags)
email_tag = "//form//input[@type='email']"
password_tag = "//form//input[@type='password']"
sign_in_tag = "//form//div[@class='button__container__3sgvk']"
main_page_check = "//nav[@aria-label='Main navigation']"

# Parsing news (XPath tags)
title_tag = "//header//h1"
text_tag = "//div[@class='article-body__content__17Yit paywall-article']//p[@data-testid!='Body']"
date_tag = "//header//time//span[1]"

# Undesirable substrings
rep = {
    "read more": "",
}

# Scraper results: one entry per successfully parsed article, index-aligned.
texts = []
titles = []
dates = []
urls = []
# URLs whose page could not be loaded or parsed; they are skipped, not fatal.
failed_urls = []

try:
    driver.get("https://www.reuters.com/signin/")

    # Wait for the sign-in form to be present before typing into it, instead
    # of failing immediately when it has not been rendered yet.
    form_wait = WebDriverWait(driver, 10)
    form_wait.until(EC.presence_of_element_located((By.XPATH, email_tag))).send_keys(email)
    driver.find_element(By.XPATH, password_tag).send_keys(password)
    driver.find_element(By.XPATH, sign_in_tag).click()
    # Poll once per second, with no timeout, until the main navigation shows up.
    while not check_exists(main_page_check): time.sleep(1)

    print("-> Launching Browser...")

    with alive_bar(len(unique_urls), title="-> Scraper", spinner="dots_waves", bar="smooth", force_tty=True) as bar:
        for url in unique_urls:

            try:
                driver.get(url)

                title = driver.find_element(By.XPATH, title_tag).text
                tempDate = driver.find_element(By.XPATH, date_tag).text

                # Each paragraph is prefixed with a single space, as before.
                phrases = driver.find_elements(By.XPATH, text_tag)
                text = "".join(" " + phrase.text for phrase in phrases)
                text = replaceAll(text, rep)
            except WebDriverException:
                # A page without the expected header/date (or one that fails
                # to load) must not discard everything scraped so far.
                failed_urls.append(url)
            else:
                # Append only complete records so the four lists stay aligned.
                texts.append(text)
                titles.append(title)
                dates.append(tempDate)
                urls.append(url)

            bar()
finally:
    # Always close the browser, even when login or scraping raises.
    driver.quit()

if failed_urls:
    print(f"-> {len(failed_urls)} URLs could not be parsed and were skipped.")

# Three roughly equal chunks of the URL list; not used further in this cell.
splits = np.array_split(unique_urls, 3)

dict_data = {"URL":urls,"Date":dates,"Title": titles, "Text": texts}
data = pd.DataFrame(dict_data)

print("Database generated successfully!")


In [42]:
# Write the scraped articles to data/Reuters.csv (row index and header
# included), creating the folder if needed, then preview the first rows.
os.makedirs(name=PARENT_DIR + "/data", exist_ok=True)
reuters_csv = PARENT_DIR + "/data/Reuters.csv"
data.to_csv(reuters_csv, header=True, index=True)
data.head()

Unnamed: 0,URL,Date,Title,Text
0,https://www.reuters.com/article/us-ukraine-cri...,"July 9, 2022",Russia threatens broad Ukraine offensive as U....,"KYIV, July 9 (Reuters) - Ukrainian defenders ..."
1,https://www.reuters.com/article/g20-usa-china/...,"July 9, 2022","Blinken, China's Wang Yi hold talks covering U...","NUSA DUA, Indonesia, July 9 (Reuters) - U.S. ..."
2,https://www.reuters.com/article/us-britain-pol...,"July 9, 2022",Two more minsters join lengthening list of can...,"LONDON, July 9 (Reuters) - Two cabinet minist..."
3,https://www.reuters.com/article/us-ukraine-cri...,"July 9, 2022","Zelenskiy sacks Ukraine's envoy to Germany, ot...","KYIV, July 9 (Reuters) - Ukrainian President ..."
4,https://www.reuters.com/article/us-ukraine-cri...,"July 9, 2022",Russian forces unlikely to leave southern Ukra...,"LONDON, July 9 (Reuters) - Russia is unlikely..."
