In [85]:
import os
import warnings
import requests
import string
import numpy as np
import nltk
import time
from numpy import linalg as LA
import regex as re
import pandas as pd
from lda import LDA
from datetime import datetime
from bs4 import BeautifulSoup
from alive_progress import alive_bar
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.feature_extraction.text import CountVectorizer
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.utils import ChromeType
from selenium.webdriver.chrome.options import Options
from alive_progress import alive_bar

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)


In [86]:
ROOT_DIR = os.path.dirname(os.path.abspath("__file__"))
PARENT_DIR = os.path.dirname(ROOT_DIR)
GUARDIAN_DIR = os.path.join(ROOT_DIR, "data", "Guardian.csv")
REUTERS_DIR = os.path.join(ROOT_DIR, "data", "Reuters.csv")
CNN_DIR = os.path.join(PARENT_DIR, "data", "CNN.csv")
DAILYMAIL_DIR = os.path.join(PARENT_DIR, "data", "DailyMail.csv")
AP_DIR = os.path.join(PARENT_DIR, "data", "AssociatedPress.csv")
GLOVE_DIR = os.path.join(ROOT_DIR, "glove_data", "results", "vectors.txt")
FOX_DIR = os.path.join(PARENT_DIR, "data", "Fox.csv")


# Fox

In [87]:

class AssociatedPress:
    def __init__(self) -> None:
        self.source = "AssociatedPress"
        self.dir = AP_DIR

    def fromScratch(self):
        if not os.path.exists(self.dir):
            self.old_data = pd.DataFrame(columns=["Date", "URL", "Title", "Text"])
            self.from_scratch = True
        else:
            self.old_data = pd.read_csv(self.dir)
            self.from_scratch = False

    def concatData(self):
        result = pd.concat([self.old_data, self.new_data])
        result = result.drop_duplicates(subset=["Text"])
        result = result.set_index("Date")
        result = result.sort_index(ascending=False)
        return result

    def URLFetcher(self):
        self.urls = []
        self.dates = []

        if self.from_scratch == False:
            last_url = self.old_data.iloc[0, 1]
        elif self.from_scratch == True:
            last_url = "https://www.dailymail.co.uk/wires/ap/article-9373269/Irans-final-report-Ukraine-jet-crash-blames-human-error.html"

        with alive_bar(title=f"→ {self.source}: Fetching URLs in pages", bar=None, spinner="dots", force_tty=True) as bar:
            for page in range(0, 12):  # 95
                leading_url = "https://www.dailymail.co.uk"
                url = "https://www.dailymail.co.uk/home/search.html?offset=" + str(page * 50) + "&size=50&sel=site&searchPhrase=ukraine+russia&sort=recent&channel=ap&type=article&days=all"
                title_tag = "sch-res-title"
                exc_list = ["AP-News-Brief", "Roundup", "Results", "WTA", "Standings", "AP-Week", "Highlights"]
                inc_list = ["/ap/"]
                try:
                    html_text = requests.get(url).text
                    soup = BeautifulSoup(html_text, "lxml")
                    headlines = soup.find_all("h3", class_=title_tag)
                    for headline in headlines:
                        _ = headline.find("a", href=True)
                        url = leading_url + _["href"]
                        if not any(s in url for s in exc_list) and any(s in url for s in inc_list):
                            self.urls.append(url)
                            print(url)
                        if last_url == url:
                            break
                    if last_url == url:
                        break
                except Exception as e:
                    print(f"Error in page {page}: {e}")
                    pass
                bar()
        self.unique_urls = list(dict.fromkeys(self.urls))
        print(f"→ {len(self.unique_urls)} URLs fetched successfully!")

    def articleScraper(self):
        bodies = []
        titles = []
        dates = []
        urls = []
        rep = {"The Mail on Sunday can reveal:": "", "RELATED ARTICLES": "", "Share this article": ""}

        def replaceAll(text, dic):
            for i, j in dic.items():
                text = text.replace(i, j)
            return text

        with alive_bar(len(self.unique_urls), title=f"→ {self.source}: Article scraper", spinner="dots_waves", bar="smooth", force_tty=True) as bar:
            print(f"LENGTH:{len(self.unique_urls)}")
            for url in self.unique_urls:
                try:
                    title_tags = ["pg-headline"]
                    text_tags = ["mol-para-with-font"]
                    date_box_tag = ["article-timestamp article-timestamp-published"]
                    html_text = requests.get(url).text
                    soup = BeautifulSoup(html_text, "lxml")
                    title = soup.find("h2").text
                    date_box = soup.find("span", class_=date_box_tag)
                    date = date_box.find("time")
                    paragraphs = soup.find_all("p", class_=text_tags)
                    body = ""
                    for _ in paragraphs:
                        body += " " + _.text
                    body = replaceAll(body, rep)
                    bodies.append(re.sub(r".+?(?=\) -)\) - ", "", " ".join(body.split())))
                    titles.append(title)
                    print(title)
                    urls.append(url)
                    print(url)
                    dates.append(date.get("datetime")[:10])
                    bar()
                except Exception as e:
                    print(f"URL couldn't be scraped: {url} because {e}")
                    pass
        data = pd.DataFrame({"URL": urls, "Date": dates, "Title": titles, "Text": bodies})
        self.new_data = data

    def scraper(self):
        # self.fromScratch()
        # self.URLFetcher()
        # self.articleScraper()
        data = self.concatData()
        lenAfter = len(data) - len(self.old_data)
        if lenAfter == 0:
            print(f"→ No new articles found. Total articles: {len(data)}")
        else:
            print(f"→ {lenAfter} new articles saved to {self.source}.csv! Total articles: {len(data)}")
        data.to_csv(self.dir, index=True)

        return data


In [88]:
ap = AssociatedPress()
ap.scraper()

on 0: https://www.dailymail.co.uk/wires/ap/article-11161013/US-asks-farmers-Can-plant-2-crops-instead-1.html            
on 0: https://www.dailymail.co.uk/wires/ap/article-11160875/Serbia-populist-leader-Vucic-says-pride-ban-enforced.html   
on 0: https://www.dailymail.co.uk/wires/ap/article-11160677/Russian-liberal-sentenced-comparing-Stalin-Hitler.html      
on 0: https://www.dailymail.co.uk/wires/ap/article-11160655/Russian-prosecutors-ask-24-year-sentence-ex-reporter.html   
on 0: https://www.dailymail.co.uk/wires/ap/article-11160599/UN-First-grain-shipment-departs-Ukraine-war-torn-Yemen.html 
on 0: https://www.dailymail.co.uk/wires/ap/article-11160567/German-inflation-near-half-century-record-dip.html          
on 0: https://www.dailymail.co.uk/wires/ap/article-11160265/Germany-upbeat-energy-security-Russia-cuts-gas-France.html  
on 0: https://www.dailymail.co.uk/wires/ap/article-11160063/1st-Ukraine-grain-ship-Horn-Africa-reaches-Djibouti.html    
on 0: https://www.dailymail.co.u

In [93]:
print(len(ap.urls))

521


In [95]:
print(len(set(ap.urls)))

521


In [90]:
data = ap.concatData()

In [97]:
ap.new_data.to_csv(ROOT_DIR + "asd.csv", index=False)

In [92]:
data.describe()

Unnamed: 0,URL,Title,Text
count,308,308,308
unique,308,308,308
top,https://www.dailymail.co.uk/wires/ap/article-1...,US asks farmers: Can you plant 2 crops instead...,There is only so much farmland in the United S...
freq,1,1,1
