In [1]:
import os
import requests
import shutil
from bs4 import BeautifulSoup

In [None]:

# Fetch the Carbon Brief "Daily Brief" landing page, keep a raw copy on disk,
# and write a quick title/link/excerpt summary of every <article> element.

# URL to fetch
url = "https://www.carbonbrief.org/daily-brief/"

# Send an HTTP GET request to the URL.
# Requests applies no timeout unless told to, so a stalled server would block
# the kernel indefinitely. Bound both phases: (connect seconds, read seconds).
response = requests.get(url, timeout=(10, 30))
if response.status_code != 200:
    raise Exception(f"Failed to fetch URL: {url} (status code: {response.status_code})")

# Get the HTML content from the response
html_content = response.text

# Ensure the 'temp' directory exists
os.makedirs("temp", exist_ok=True)

# Save the raw HTML content to a file so later cells can re-read it
# without hitting the network again.
html_file_path = os.path.join("temp", "daily_brief.html")
with open(html_file_path, "w", encoding="utf-8") as html_file:
    html_file.write(html_content)
print(f"HTML content saved to: {html_file_path}")

# Parse the HTML content using BeautifulSoup for article extraction
soup = BeautifulSoup(html_content, "html.parser")

# Find all article elements on the page.
# (This selector may need to be updated if the website structure changes.)
articles = soup.find_all("article")

output_lines = []

if articles:
    for article in articles:
        # Extract the title from the first header tag found (h1, h2, or h3)
        title_element = article.find(["h1", "h2", "h3"])
        title = title_element.get_text(strip=True) if title_element else "No Title Found"

        # Try to get the URL from the title link, if available
        link = ""
        if title_element:
            a_tag = title_element.find("a")
            if a_tag and a_tag.has_attr("href"):
                link = a_tag["href"]

        # Extract a brief excerpt if available (the first <p> inside the article)
        excerpt_element = article.find("p")
        excerpt = excerpt_element.get_text(strip=True) if excerpt_element else "No excerpt available."

        # Build the output for this article; the link line is omitted when
        # no href was found.
        output_lines.append(f"Title: {title}")
        if link:
            output_lines.append(f"Link: {link}")
        output_lines.append(f"Excerpt: {excerpt}")
        output_lines.append("-" * 80)
else:
    # If no article elements are found, fall back to saving the entire page text.
    output_lines.append("No specific article blocks were found. Here is the entire page text:")
    output_lines.append(soup.get_text(separator="\n", strip=True))

# Join the lines into a single string
output_text = "\n".join(output_lines)

# Save the extracted article information to a text file.
# NOTE: the final "combine" cell of this notebook writes to the same path,
# so this summary is replaced once the whole notebook has run.
text_file_path = os.path.join("temp", "daily_brief.txt")
with open(text_file_path, "w", encoding="utf-8") as text_file:
    text_file.write(output_text)

print(f"Article information saved to: {text_file_path}")


In [None]:
# Reduce the saved page to just the story blocks and the "Daily Briefing"
# breadcrumb blocks, and store the result as a small standalone HTML file.

# Load the raw page written by the fetch step.
input_file_path = os.path.join("temp", "daily_brief.html")
with open(input_file_path, "r", encoding="utf-8") as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, "html.parser")

# Story blocks: every <div class="dailystory">.
dailystory_divs = soup.find_all("div", class_="dailystory")

# Breadcrumb blocks: only the <div class="breadCrumbNormal"> elements whose
# text mentions "Daily Briefing".
breadCrumb_divs = [
    candidate
    for candidate in soup.find_all("div", class_="breadCrumbNormal")
    if "Daily Briefing" in candidate.get_text()
]

# Fixed document preamble.
html_parts = [
    "<html>\n",
    "<head>\n",
    "  <meta charset='utf-8'>\n",
    "  <title>Filtered Daily Brief</title>\n",
    "</head>\n",
    "<body>\n",
]

# Breadcrumbs come first, then the stories; each serialized div is followed
# by a newline.
html_parts.extend(
    str(kept_div) + "\n"
    for group in (breadCrumb_divs, dailystory_divs)
    for kept_div in group
)

html_parts.append("</body>\n</html>")
filtered_html = "".join(html_parts)

# Persist the trimmed document.
output_file_path = os.path.join("temp", "filtered_daily_brief.html")
with open(output_file_path, "w", encoding="utf-8") as outfile:
    outfile.write(filtered_html)

print(f"Filtered HTML saved to: {output_file_path}")


In [None]:
# Flatten link and span markup in the filtered page, rewriting the file in place.

# The same path is used for reading and for writing.
file_path = os.path.join("temp", "filtered_daily_brief.html")

with open(file_path, "r", encoding="utf-8") as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, "html.parser")

# Unwrapping drops the tag itself but leaves its children (and therefore its
# text) where the tag used to be. All <a> tags are handled first, then all
# <span> tags.
for tag_name in ("a", "span"):
    for wrapper in soup.find_all(tag_name):
        wrapper.unwrap()

# Overwrite the input file with the flattened markup.
with open(file_path, "w", encoding="utf-8") as file:
    file.write(str(soup))

print(f"All <a> tags have been removed and <span> tags have been unwrapped in {file_path}.")


In [None]:
# Write the opening line of the brief (component 001.txt), which announces
# the publication date found in the filtered page.

input_file_path = os.path.join("temp", "filtered_daily_brief.html")
with open(input_file_path, "r", encoding="utf-8") as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, "html.parser")

# The date is taken from the first <div class="dateCat">; when the page has
# none, a placeholder phrase is used instead.
date_cat_div = soup.find("div", class_="dateCat")
if date_cat_div:
    date_text = date_cat_div.get_text(strip=True)
else:
    date_text = "an unknown date"

output_text = "This is your Daily Carbon Brief, published on " + date_text

# Start from an empty components directory so files left over from an
# earlier run cannot end up in the combined brief.
output_dir = os.path.join("temp", "daily_brief_components")
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.makedirs(output_dir, exist_ok=True)

# The date line is always the first component.
output_file_path = os.path.join(output_dir, "001.txt")
with open(output_file_path, "w", encoding="utf-8") as outfile:
    outfile.write(output_text)

print(f"Output saved to: {output_file_path}")


In [None]:
# Turn every <div class="dailystory"> of the filtered page into its own
# plain-text component file (002.txt, 003.txt, ...), containing the heading,
# the credits line and the story paragraphs.


def _clean_text(tag):
    """Return the text inside ``tag`` with whitespace runs collapsed to one space.

    ``get_text(strip=True)`` strips each text fragment and then joins the
    fragments with an empty separator, so the words on either side of an
    inline tag (``<em>``, ``<strong>``, ...) end up glued together. Reading
    the text unstripped and normalizing whitespace afterwards keeps the
    spacing that is present in the markup.
    """
    return " ".join(tag.get_text().split())


# Define the input HTML file path
input_file_path = os.path.join("temp", "filtered_daily_brief.html")

# Read the HTML content from the file
with open(input_file_path, "r", encoding="utf-8") as file:
    html_content = file.read()

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")

# Find all <div> elements with class "dailystory"
dailystory_divs = soup.find_all("div", class_="dailystory")

# Define the output directory and ensure it exists
output_dir = os.path.join("temp", "daily_brief_components")
os.makedirs(output_dir, exist_ok=True)

# File numbering starts at 2 because 001.txt was used for the date component.
file_number = 2

# Process each dailystory div
for story_div in dailystory_divs:
    output_lines = []

    # 1. Extract and add the story heading from <strong class="storyheading">
    heading_tag = story_div.find("strong", class_="storyheading")
    if heading_tag:
        heading_text = _clean_text(heading_tag)
        # Close the heading with a full stop unless it already ends in
        # terminal punctuation (avoids "?." and "..").
        if not heading_text.endswith((".", "!", "?")):
            heading_text += "."
    else:
        heading_text = "No Heading Found"
    output_lines.append(heading_text)

    # 2. Extract and process the story credits from <div class="storycredits">
    credits_tag = story_div.find("div", class_="storycredits")
    if credits_tag:
        # Drop the "Read Article" label, then re-normalize the whitespace
        # that its removal may leave behind.
        credits_text = " ".join(_clean_text(credits_tag).replace("Read Article", "").split())
        if credits_text:
            # Format the credits as "Published by {credits}."
            output_lines.append(f"Published by {credits_text}.")

    # 3. Extract the story content from <div class="storycont">
    story_cont_div = story_div.find("div", class_="storycont")
    if story_cont_div:
        # One line per non-empty <p>; empty paragraphs would only add
        # blank lines to the component.
        paragraph_texts = [
            text for text in map(_clean_text, story_cont_div.find_all("p")) if text
        ]
        # Skip the section entirely when there is nothing to add, so the
        # component does not end with a dangling separator.
        if paragraph_texts:
            output_lines.append("\n".join(paragraph_texts))

    # Combine all parts with double newlines separating sections
    output_text = "\n\n".join(output_lines)

    # Create a sequential filename (e.g., "002.txt", "003.txt", etc.)
    file_name = f"{file_number:03d}.txt"
    output_file_path = os.path.join(output_dir, file_name)

    # Write the output text to the file
    with open(output_file_path, "w", encoding="utf-8") as out_file:
        out_file.write(output_text)

    print(f"Story saved to: {output_file_path}")

    file_number += 1


In [None]:
# Stitch the numbered component files back together into one text document.

components_dir = os.path.join("temp", "daily_brief_components")
output_file_path = os.path.join("temp", "daily_brief.txt")

# Only .txt files take part. Plain lexicographic order matches numeric order
# as long as the file names are zero-padded numbers.
component_files = sorted(
    entry for entry in os.listdir(components_dir) if entry.endswith(".txt")
)

# Read the components in order, trimming surrounding whitespace from each.
all_components = []
for file_name in component_files:
    file_path = os.path.join(components_dir, file_name)
    with open(file_path, "r", encoding="utf-8") as file:
        all_components.append(file.read().strip())

# Components are separated by one blank line.
daily_brief_text = "\n\n".join(all_components)

with open(output_file_path, "w", encoding="utf-8") as output_file:
    output_file.write(daily_brief_text)

print(f"Combined daily brief saved to: {output_file_path}")