In [12]:
# pip3 install newspaper4k
import newspaper, requests, json

def snake_case(string):
    '''Lower-cases the string and joins its space-separated pieces with underscores.'''
    lowered = string.lower()
    pieces = lowered.split(" ")
    return "_".join(pieces)

def press_image_filepath(file_name):
    '''Returns the relative path of the .jpg called file_name inside the press_images folder.'''
    image_directory = "press_images"
    return f"{image_directory}/{file_name}.jpg"

def save_article_image(article):
    '''Saves the article's top image to the press_images folder.

    Args:
        article: object exposing `top_image` (the image URL) and `title`.

    Returns:
        Relative path to the saved image. If the image cannot be fetched
        (request error, timeout, or non-200 response), returns the relative
        path to the default image instead.
    '''
    # press_image_filepath appends ".jpg" itself, so pass the bare name
    default_image_filepath = press_image_filepath("default_press_image")

    # An empty/malformed URL or a network failure raises rather than returning
    # a response; treat that the same as "no image found".
    try:
        image_response = requests.get(article.top_image, timeout=10)
    except requests.exceptions.RequestException:
        return default_image_filepath

    # Returns default image if no image is found
    if image_response.status_code != 200:
        return default_image_filepath

    # Path separators in a title would make the file name point outside
    # press_images, so neutralise them before building the path.
    safe_file_name = snake_case(article.title).replace("/", "_").replace("\\", "_")

    # Saves image and returns file path (the with-block closes the file)
    save_image_filepath = press_image_filepath(safe_file_name)
    with open(save_image_filepath, "wb") as image_file:
        image_file.write(image_response.content)
    return save_image_filepath

def make_article_dictionary(article):
    '''Builds the record stored for one article in the press JSON.

    Copies the title, source URL, original URL and text from the article, and
    adds the relative image path returned by save_article_image.
    '''
    return dict(
        title=article.title,
        publication_source_url=article.source_url,
        original_url=article.original_url,
        text=article.text,
        top_image_relative_path=save_article_image(article),
    )

def update_press_articles_in_press_json():
    '''Adds every article listed in articles.txt that is not yet in press.json.

    Reads one URL per line from "articles.txt", skips URLs whose
    "original_url" is already recorded in "press.json", downloads the rest
    and rewrites "press.json" with the combined list.

    A missing, empty or invalid "press.json" is treated as an empty list.
    URLs that fail to download are reported with a printed message and skipped.
    '''
    # Blank lines are not URLs; surrounding whitespace would defeat the
    # duplicate check below.
    with open("articles.txt", "r") as article_file:
        article_urls = [line.strip() for line in article_file if line.strip()]

    try:
        with open("press.json", "r") as press_json_file:
            press_json = json.load(press_json_file)
    except (FileNotFoundError, json.decoder.JSONDecodeError):
        press_json = []

    # Set gives constant-time membership checks in the loop
    existing_urls = {entry["original_url"] for entry in press_json}

    for url in article_urls:
        # Check if article has already been saved
        if url in existing_urls:
            continue

        # Downloads article and notifies user if download fails.
        # `Exception` (not a bare except) so Ctrl-C still interrupts the run.
        try:
            article = newspaper.article(url)
        except Exception:
            print(f"The url {url} was not downloaded properly.\n Please edit 'press.json' with its information manually.")
            continue

        # Adds new article to running list
        press_json.append(make_article_dictionary(article))
        # Remember it so a URL repeated in articles.txt is not stored twice
        existing_urls.add(url)

    # saves list to press JSON file
    with open("press.json", "w") as press_json_file:
        json.dump(press_json, press_json_file, indent=4)

In [15]:
# Bring press.json up to date with the URLs listed in articles.txt
update_press_articles_in_press_json()

The url https://venturebeat.com/ai/uh-oh-fine-tuning-llms-compromises-their-safety-study-finds/ was not downloaded properly.
 Please edit 'press.json' with its information manually.
