In [13]:
import requests
from bs4 import BeautifulSoup as bs
import unicodedata
import pandas as pd
import os
import re

In [14]:
import unicodedata

def sanitize_filename(filename):
    """Return ``filename`` with accents stripped and path-unsafe characters masked.

    The text is decomposed with Unicode NFD, which splits an accented letter
    into its base letter followed by combining marks; every character whose
    category is ``Mn`` (non-spacing mark) is then dropped. Afterwards each of
    ``< > : " / \\ | ? *`` is replaced by an underscore. Any other character,
    including typographic quotes, passes through unchanged.
    """
    decomposed = unicodedata.normalize('NFD', filename)

    kept_chars = []
    for char in decomposed:
        # 'Mn' characters are the accents split off by the NFD decomposition.
        if unicodedata.category(char) == 'Mn':
            continue
        kept_chars.append(char)

    without_accents = ''.join(kept_chars)
    return re.sub(r'[<>:"/\\|?*]', '_', without_accents)


In [15]:
def parse_news_article(link, timeout=10):
    """Download one article page and extract its title, text and lead image.

    Parameters
    ----------
    link : str
        URL of the article page.
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up, so that an
        unresponsive server cannot stall the notebook indefinitely.

    Returns
    -------
    tuple
        ``(title, paragraphs, img_filename)`` where ``title`` is the text of
        the article's ``<h1>``, ``paragraphs`` is a list with the text of
        every ``<p>`` inside the content container (empty when the container
        is absent) and ``img_filename`` is the base name of the image file
        under ``images/``. ``img_filename`` is ``None`` when the page has no
        ``<img>`` whose ``alt`` equals the title (or that tag has no ``src``).
        When the image request answers with a status other than 200 nothing
        is written, but the intended file name is still returned.

    Raises
    ------
    ValueError
        If the page contains no title heading.
    """
    response = requests.get(link, timeout=timeout)
    soup = bs(response.content, features='html.parser')

    # get title
    title_tag = soup.find('h1', class_="fusion-title-heading title-heading-left")
    if title_tag is None:
        raise ValueError(f"No article title found at {link}")
    title = title_tag.text

    # get text content
    p_section = soup.find('div', class_="fusion-content-tb fusion-content-tb-1")
    p = [x.text for x in p_section.find_all('p')] if p_section else []

    # image: the article's picture is located by an alt text equal to the title
    img_tag = soup.find('img', alt=title)
    img_url = img_tag.get('src') if img_tag is not None else None
    if not img_url:
        print(f"No image found for article {link}")
        return title, p, None

    img_response = requests.get(img_url, timeout=timeout)

    os.makedirs('images', exist_ok=True)

    # The sanitized title contains no '/', so it is safe as a file's base name.
    img_basename = f"{sanitize_filename(title)}.jpg"
    img_path = f"images/{img_basename}"

    # Save image to the file
    if img_response.status_code == 200:
        with open(img_path, 'wb') as img_file:
            img_file.write(img_response.content)
    else:
        print(f"Failed to fetch image from {img_url}. Status code: {img_response.status_code}")

    # return parsed article
    return title, p, img_basename

In [16]:
def get_articles_links(homepage_url, news_links, timeout=10):
    """Append the article URLs found on one listing page to ``news_links``.

    Parameters
    ----------
    homepage_url : str
        URL of the listing page to fetch.
    news_links : list
        List extended in place with the ``href`` of every matching anchor;
        nothing is returned.
    timeout : float, optional
        Seconds to wait on the HTTP request before giving up, so that an
        unresponsive server cannot stall the notebook indefinitely.
    """
    response = requests.get(homepage_url, timeout=timeout)
    soup = bs(response.content, features='html.parser')

    # parse through main news page; href=True skips anchors that carry the
    # class but no link target, which would otherwise raise KeyError
    news_a_tags = soup.find_all(
        'a',
        class_="awb-custom-text-color awb-custom-text-hover-color",
        href=True,
    )

    news_links.extend(tag['href'] for tag in news_a_tags)

In [17]:
# Collect the article links from listing pages 1 and 2
# (range(1, 3) stops before its upper bound).
base_url = 'https://newsgpt.ai/ai-news/page/'
news_links = []

for page_num in range(1, 3):
    updated_url = f"{base_url}{page_num}"
    get_articles_links(updated_url, news_links)

In [18]:
# Column data for the final table: one entry per scraped article.
titles = []
paragraphs_df = []
images = []

print(len(news_links))
os.makedirs('AI_articles', exist_ok=True)

# Sanitized titles seen so far. A repeated title is only reported (with its
# running number); the article is still recorded like any other.
duplicate_number = 1
seen_titles = set()
for link in news_links:
    title, paragraphs, image_name = parse_news_article(link)
    sanitized_title = sanitize_filename(title)

    if sanitized_title in seen_titles:
        print(f"ALREADY DONE {duplicate_number}")
        duplicate_number += 1
    seen_titles.add(sanitized_title)

    titles.append(sanitized_title)
    # Store the article body as one space-separated string.
    paragraphs_df.append(' '.join(paragraphs))
    images.append(image_name)

    # Disabled: writing each article to its own text file.
    # # Concatenate all paragraphs into a single string
    # full_text = '\n\n'.join(paragraphs)

    # # Save the concatenated text into a single file in 'AI_articles' folder
    # file_path = os.path.join("./AI_articles", f"{sanitized_title}.txt")
    # with open(file_path, 'w', encoding='utf-8') as file:
    #     file.write(full_text)
    
    

60


In [19]:
# Assemble the three scraped lists into one DataFrame (one row per article)
df = pd.DataFrame(dict(Title=titles, Paragraph=paragraphs_df, Image=images))

In [20]:
# Preview the first rows rather than rendering the whole frame
df.head(10)

Unnamed: 0,Title,Paragraph,Image
0,Tesla Berlin Factory Resumes Production After ...,Tesla’s factory near Berlin is gradually resum...,Tesla Berlin Factory Resumes Production After ...
1,"Tragic Death of Itai Chen, U.S.-Israeli Hostag...","The tragic news of the death of Itai Chen, a U...","Tragic Death of Itai Chen, U.S.-Israeli Hostag..."
2,Felipe Massa Sues Bernie Ecclestone and FIA fo...,"Former Ferrari driver, Felipe Massa, has initi...",Felipe Massa Sues Bernie Ecclestone and FIA fo...
3,Tragedy Strikes Iditarod_ Two Dogs Die in Grue...,"Over the weekend, the annual Iditarod sled dog...",Tragedy Strikes Iditarod_ Two Dogs Die in Grue...
4,England Faces Measles Surge_ Over 1 Million Un...,In the wake of a significant increase in measl...,England Faces Measles Surge_ Over 1 Million Un...
5,Necture Secures $7M Series A Funding for Fleet...,"Austrian start-up Necture, which has developed...",Necture Secures $7M Series A Funding for Fleet...
6,Doriane Pin’s Exceptional Talent Shines in F1 ...,"Doriane Pin, a rising star in the world of For...",Doriane Pin’s Exceptional Talent Shines in F1 ...
7,VAST Data Unveils Revolutionary AI Cloud Archi...,VAST Data has unveiled a novel AI cloud archit...,VAST Data Unveils Revolutionary AI Cloud Archi...
8,iPad Pro 11-Inch Model Faces Supply Constraint...,"The upcoming iPad Pro, particularly the 11-inc...",iPad Pro 11-Inch Model Faces Supply Constraint...
9,LATAM Airlines Boeing 787-9 Dreamliner Inciden...,The Transport Accident Investigation Commissio...,LATAM Airlines Boeing 787-9 Dreamliner Inciden...


In [21]:
# Write the table to AI_data.csv, leaving out the index column
df.to_csv(path_or_buf='AI_data.csv', index=False)

In [22]:
def count_items_in_folder(folder_path):
    """Return how many files sit under ``folder_path``, subfolders included.

    Only files are counted, not the directories themselves. A path that
    ``os.walk`` yields nothing for (for example one that does not exist)
    gives 0.
    """
    return sum(len(files) for _, _, files in os.walk(folder_path))


# Report how many files are under the AI_articles folder
folder_path = "./AI_articles"
num_items = count_items_in_folder(folder_path)
print(num_items)

0
