In [14]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import os
import re
import unicodedata

In [34]:
# Notebook-wide settings, looked up by key in the cells below.
config = dict(
    # Listing-page URL prefix; a page number is appended to it when crawling.
    newsgpt_articles_url="https://newsgpt.ai/ai-news/page/",
    # Directory that receives the downloaded article images.
    saved_images_path="./newsgpt_images",
    # Destination of the CSV dataset written at the end of the notebook.
    csv_file_path="./newsgpt_dataset.csv",
)

In [28]:
def sanitize_filename(filename: str, replace=' ', max_length=255) -> str:
    """Turn an arbitrary string into a lowercase, ASCII-only file name.

    Args:
        filename: Text to clean up (e.g. an article title).
        replace: String substituted for each run of forbidden characters and
            for each run of whitespace; it is also trimmed from both ends.
        max_length: Maximum number of characters kept in the result.

    Returns:
        The sanitized, lowercased name. It may be empty if nothing in
        ``filename`` survives the cleanup.
    """
    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII round-trip then drops the marks and any other non-ASCII character.
    filename = unicodedata.normalize('NFKD', filename).encode('ascii', 'ignore').decode('ascii')

    # Replace characters that are path separators or reserved in file names.
    # The backslash is written as `\\`: a lone `\` before `/` merely escapes
    # the slash and would leave literal backslashes in the name.
    filename = re.sub(r'[\\/:*?"<>|\r\n\t]+', replace, filename)

    # Collapse every whitespace run into a single `replace` and trim the ends.
    filename = re.sub(r'\s+', replace, filename).strip(replace)

    # Truncate, then trim again in case the cut landed right after a `replace`.
    filename = filename[:max_length].rstrip(replace)

    return filename.lower()

In [36]:
def get_article_urls(num_pages: int = 1, timeout: float = 10.0) -> list[str]:
    """Collect article links from the first ``num_pages`` listing pages.

    Each listing page is fetched from ``config["newsgpt_articles_url"]`` with
    the page number appended (numbering starts at 1).

    Args:
        num_pages: Number of listing pages to fetch.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        The ``href`` value of every matching anchor, in the order encountered.

    Raises:
        requests.RequestException: If a listing page cannot be fetched.
    """
    article_urls = []
    base_url = config["newsgpt_articles_url"]
    for page in range(1, num_pages + 1):
        page_url = base_url + str(page)

        # Without a timeout, requests waits indefinitely on a stalled server.
        res = requests.get(page_url, timeout=timeout)
        soup = bs(res.content, features='html.parser')

        # Article links are the anchors carrying exactly this class string.
        links = soup.find_all('a', class_="awb-custom-text-color awb-custom-text-hover-color")

        for link in links:
            # An anchor without href would raise KeyError on link['href'].
            href = link.get('href')
            if href:
                article_urls.append(href)

    return article_urls

In [46]:
def parse_article(url: str) -> "tuple[str, str, str] | None":
    """Scrape one article page and store its lead image on disk.

    The image is looked up as the ``<img>`` whose ``alt`` equals the article
    title and is saved under ``config["saved_images_path"]`` using the
    sanitized title plus ``.jpg`` as file name. An image file that already
    exists is neither downloaded again nor overwritten.

    Args:
        url: Address of the article page.

    Returns:
        ``(title, text, img_filename)`` on success, where ``text`` is the
        concatenation of all ``<p>`` texts in the content section (empty if
        the section is missing). ``None`` if the title or image cannot be
        found, or if the image cannot be downloaded.

    Raises:
        requests.RequestException: If the article page itself cannot be fetched.
    """
    res = requests.get(url, timeout=10)
    soup = bs(res.content, features='html.parser')

    title_tag = soup.find('h1', class_="fusion-title-heading title-heading-left")
    if title_tag is None:
        # Without a title there is neither a file name nor an image lookup key.
        print(f"no title found for article url: {url}")
        return None
    title = title_tag.text

    text_section = soup.find('div', class_="fusion-content-tb fusion-content-tb-1")
    paragraphs = text_section.find_all('p') if text_section else []
    text = ''.join(p_tag.text for p_tag in paragraphs)

    img_tag = soup.find('img', alt=title)
    img_url = img_tag.get('src') if img_tag is not None else None
    if not img_url:
        print(f"no img found for article: {title}, url: {url}")
        return None

    img_filename = f"{sanitize_filename(title)}.jpg"

    os.makedirs(config["saved_images_path"], exist_ok=True)

    file_path = os.path.join(config["saved_images_path"], img_filename)

    # Check the disk first so an already-saved image costs no network request.
    if os.path.exists(file_path):
        print(f"File {img_filename} already exists.")
        return title, text, img_filename

    try:
        img_res = requests.get(img_url, timeout=10)
        # Treat 4xx/5xx as failures so an error page is never saved as a .jpg.
        img_res.raise_for_status()
    except requests.RequestException as exc:
        # Report the exception itself: no response object exists when the
        # request failed before the server answered.
        print(f"error retrieving img for article: {title}, url: {img_url}, error: {exc}")
        return None

    with open(file_path, "wb") as img_file:
        img_file.write(img_res.content)

    return title, text, img_filename

In [None]:
# Scrape every article on the listing page(s) into three parallel lists
# (one entry per successfully parsed article).
article_urls = get_article_urls()

titles = []
texts = []
img_filenames = []
for url in article_urls:
    parsed = parse_article(url)
    # parse_article returns None on failure; unpacking None would raise
    # TypeError, so skip that article and keep the three lists aligned.
    if parsed is None:
        continue
    title, text, img_filename = parsed
    titles.append(title)
    # Append this article's text (not the `texts` list itself).
    texts.append(text)
    img_filenames.append(img_filename)

File dominos pizza announces new meal deal, loyalty program, and expansion plans.jpg already exists.
Following a sluggish start to the year, Domino’s Pizza has announced plans to launch a new meal deal and a customer loyalty program. The company’s chief executive, Andrew Rennie, stated that the slow start was partly due to a strategic decision to limit marketing efforts during January, traditionally a quieter period for the business. The company is now set to ramp up its promotional activities with the introduction of a £4 lunch offer. This deal, expected to be launched in the coming weeks, will include smaller items from the menu such as pizzas, wraps, and cookies. In addition to the new meal deal, the pizza chain is also planning to roll out a loyalty program for regular customers later this year. While details of the program remain under wraps due to market sensitivity, Rennie expressed confidence in the initiative, stating that the data-driven approach to its development indicates 

In [35]:
# Assemble the scraped columns into one table. Every row gets the constant
# Label 1 (presumably marking the row as AI-generated news - TODO confirm).
newsgpt_df = pd.DataFrame(
    dict(Title=titles, Text=texts, Image=img_filenames, Label=1)
)

# Persist the dataset without the positional index column.
newsgpt_df.to_csv(path_or_buf=config["csv_file_path"], index=False)