In [396]:
import os
import warnings
import requests
import string 
import numpy as np
import nltk
import time
from numpy import linalg as LA
import regex as re
import pandas as pd
from lda import LDA
from datetime import datetime
from bs4 import BeautifulSoup
from alive_progress import alive_bar
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.feature_extraction.text import CountVectorizer
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.utils import ChromeType
from selenium.webdriver.chrome.options import Options
from alive_progress import alive_bar

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)


In [397]:
ROOT_DIR = os.path.dirname(os.path.abspath("__file__"))
PARENT_DIR = os.path.dirname(ROOT_DIR)
GUARDIAN_DIR = os.path.join(ROOT_DIR, "data", "Guardian.csv")
REUTERS_DIR = os.path.join(ROOT_DIR, "data", "Reuters.csv")
CNN_DIR = os.path.join(PARENT_DIR, "data", "CNN.csv")
GLOVE_DIR = os.path.join(ROOT_DIR, "glove_data", "results", "vectors.txt")

# CNN

In [398]:
s = Service(ChromeDriverManager(chrome_type=ChromeType.BRAVE, path=ROOT_DIR).install())
o = webdriver.ChromeOptions()
o.add_argument('headless')
driver = webdriver.Chrome(service=s,options=o)
ignored_exceptions = (
    NoSuchElementException,
    StaleElementReferenceException,
)
wait2 = WebDriverWait(driver, 10, ignored_exceptions=ignored_exceptions)




In [399]:
class CNN: 
    def __init__(self) -> None:
        self.source = "CNN"

    def fromScratch(self):
            if not os.path.exists(CNN_DIR):
                self.old_data = pd.DataFrame(columns=["Date", "URL", "Title", "Text"])
                self.from_scratch = True
            else:
                self.old_data = pd.read_csv(CNN_DIR)
                self.from_scratch = False

    def concatData(self):
        result = pd.concat([self.old_data, self.new_data])
        result = result.drop_duplicates(subset=["Text"])
        result = result.set_index("Date")
        result = result.sort_index(ascending=False)
        return result

    def URLFetcher(self):
            self.urls = []
            self.dates = []

            if self.from_scratch == False:
                last_url = self.old_data.iloc[0,1]
            elif self.from_scratch == True:
                last_url = "https://www.cnn.com/2021/05/11/politics/romania-nato-exercises-russia/index.html"

            with alive_bar(title=f"→ {self.source}: Fetching URLs in pages", bar=None, spinner="dots", force_tty=True) as bar:
                for page in range(0,95): # 95
                    url = "https://edition.cnn.com/search?q=ukraine+russia&from="+str(page*50)+"&size=50&page=1&sort=newest&types=article&section="
                    title_tag="//div//a[@class='container__link __link']"
                    exc_list=['/tennis/','/live-news/','/opinions/','/tech/','/sport/','/us/','/football/','/china/','/style/','/business-food/','/americas/','/travel/','/business/']
                    inc_list=['/2022/','/2021/']
                    driver.get(url)
                    try:
                        titles = wait2.until(EC.presence_of_all_elements_located((By.XPATH, title_tag))) 
                        for title in titles:
                            url = title.get_attribute("href")
                            if not any(s in url for s in exc_list) and any(s in url for s in inc_list):
                                self.urls.append(url)
                            if last_url == url:
                                break
                        if last_url == url:
                            break
                    except Exception as e:
                        print(f"Error in page {page}: {e}")
                        pass
                    bar()
                print(f"latest page: {page}")
                driver.quit()
            self.unique_urls = list(dict.fromkeys(self.urls))
            print(f"-> {len(self.unique_urls)} URLs fetched successfully!")

    def articleParser(self):
        bodies = []
        titles = []
        dates = []
        urls = []
        rep = {"This story has been updated with additional information.": ""}

        def replaceAll(text, dic):
            for i, j in dic.items():
                text = text.replace(i, j)
            return text

        with alive_bar(len(self.unique_urls), title=f"→ {self.source}: Article scraper", spinner="dots_waves", bar="smooth", force_tty=True) as bar:
            for url in self.unique_urls:
                try:
                    title_tags = ["pg-headline"]
                    text_tags = ["zn-body__paragraph"]
                    opener_tag = ["zn-body__paragraph speakable"]
                    html_text = requests.get(url).text
                    soup = BeautifulSoup(html_text, "lxml")
                    title = soup.find("h1", class_=title_tags).text
                    opener = soup.find("p", class_=opener_tag).text
                    paragraphs = soup.find_all("div", class_=text_tags)
                    body = opener + ""
                    for _ in paragraphs:
                        body += " " + _.text
                    body = replaceAll(body, rep)
                    bodies.append(re.sub(r"^[^\)]*\)", "", " ".join(body.split()))) # Local tag
                    titles.append(title)
                    urls.append(url)
                    dates.append(url[20:][:10].replace("/","-"))
                    bar()
                except Exception as e:
                    print(f"URL couldn't be scraped: {url} because {e}")
                    pass
        data = pd.DataFrame({"URL": urls, "Date": dates, "Title": titles, "Text": bodies})
        self.new_data = data

    def scraper(self):
        self.fromScratch()
        self.URLFetcher()
        self.articleParser()
        data = self.concatData()
        lenAfter = len(data) - len(self.old_data)
        if lenAfter == 0:
            print(f"→ No new articles found. Total articles: {len(data)}")
        else:
            print(f"→ {lenAfter} new articles saved to {self.source}.csv! Total articles: {len(data)}")
        data.to_csv(CNN_DIR, index=True)

        return data

In [400]:
cnn= CNN()
cnn.scraper()

on 60: latest page: 60                                                                                                  
→ CNN: Fetching URLs in pages (!) 60 in 1:33.6 (0.64/s) 
-> 1953 URLs fetched successfully!
on 531: URL couldn't be parsed: https://www.cnn.com/2022/05/05/perspectives/eu-oil-russia-embargo/index.html because 'NoneType' object has no attribute 'text'
on 572: URL couldn't be parsed: https://www.cnn.com/2022/04/28/europe/mariupol-azovstal-steel-plant-intl-cmd/index.html because 'NoneType' object has no attribute 'text'
on 705: URL couldn't be parsed: https://www.cnn.com/2022/04/12/europe/olena-zelenska-ukraine-first-lady-amanpour-cmd-intl/index.html because 'NoneType' object has no attribute 'text'
on 751: URL couldn't be parsed: https://www.cnn.com/2022/04/07/europe/ukraine-mariupol-russia-deportation-cmd-intl/index.html because 'NoneType' object has no attribute 'text'
on 1055: URL couldn't be parsed: https://www.cnn.com/2022/03/16/politics/ukraine-zelensky-congress-

Unnamed: 0_level_0,URL,Title,Text
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-08-27,https://www.cnn.com/2022/08/27/europe/russia-u...,Donetsk People's Republic says it is ready to ...,The Donetsk People's Republic (DPR) says it is...
2022-08-27,https://www.cnn.com/2022/08/27/europe/darya-du...,Darya Dugina's death provides a glimpse into R...,When a car bomb exploded on the outskirts of M...
2022-08-27,https://www.cnn.com/2022/08/27/politics/russia...,How Ukraine is using resistance warfare develo...,As the war in Ukraine has passed the six-month...
2022-08-27,https://www.cnn.com/2022/08/27/europe/russia-b...,Russia blocks final draft of nuclear disarmame...,A month-long meeting on nuclear disarmament en...
2022-08-26,https://www.cnn.com/2022/08/26/europe/ukraine-...,At least 21 ​​'filtration​​​' sites identified...,Researchers at Yale University ​say they have ...
...,...,...,...
2021-05-24,https://www.cnn.com/2021/05/24/politics/gordon...,Key impeachment witness sues Pompeo over $1.8 ...,President Donald Trump's former ambassador to ...
2021-05-19,https://www.cnn.com/2021/05/19/politics/blinke...,Blinken and Lavrov hold first high-level meeti...,US Secretary of State Antony Blinken met with ...
2021-05-18,https://www.cnn.com/2021/05/18/world/meanwhile...,Why the United Nations is stuck on the Israeli...,The deadly ongoing Israeli-Palestinian conflic...
2021-05-18,https://www.cnn.com/2021/05/18/politics/us-nor...,Biden administration decides not to sanction c...,The Biden administration has decided against s...
