# Web Scraper for Paleric Blog

This is meant for educational purposes, to scrape and export the blog posts from the Chamorro blog written by Pale' Eric Forbes and compile the texts into an e-book for easier access and reference for Chamorro language learners.

**Name:** Schyuler Lujan <br>
**Date Started:** 4-Nov-2024 <br>
**Date Completed:** 4-Nov-2024

In [1]:
# Import libraries
from bs4 import BeautifulSoup
import requests
import time

In [2]:
def get_older_urls(homepage, timeout=30):
    """
    Collect the URL of every listing page of the blog by following the Older Posts links

    Starting at the homepage, each page is requested and parsed, and the href of every
    anchor with class "blog-pager-older-link" is recorded. The most recently recorded URL
    is requested next. The walk stops when a page offers no Older Posts link that has not
    already been recorded, which also prevents an endless loop if a page links back to a
    URL that was visited before.

    Parameters:
    - homepage: URL of the first page to request; it is always the first item returned
    - timeout: seconds passed to requests.get for every request (default 30), so a
      stalled connection raises instead of hanging forever

    Returns:
    - List of page URLs in discovery order, starting with homepage, without duplicates

    Raises:
    - requests.exceptions.RequestException (including HTTPError for 4xx/5xx responses
      and Timeout) so a failed request does not silently cut the crawl short
    """
    older_urls = [homepage]  # Every listing page found so far, in order
    seen = {homepage}  # Same URLs as a set for fast "already recorded" checks
    next_page = homepage

    while next_page is not None:
        # Request the current listing page and parse the HTML
        page = requests.get(next_page, timeout=timeout)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "html.parser")

        # Record every new Older Posts link; the last new one is the page to visit next
        next_page = None
        for anchor in soup.find_all("a", class_="blog-pager-older-link"):
            href = anchor.get("href")
            # Skip anchors without an href and links we have already recorded
            if href and href not in seen:
                seen.add(href)
                older_urls.append(href)
                next_page = href

    return older_urls

In [3]:
def get_post_urls(navigation_urls, timeout=30):
    """
    Get the URL of each individual post linked from the given listing pages

    Every URL in navigation_urls is requested and parsed. On each page, all h3 tags with
    class_="post-title entry-title" are located and the href of the first link inside
    each one is collected. Headings that contain no link, or a link without an href,
    are skipped instead of raising an error or adding None to the result.

    Parameters:
    - navigation_urls: iterable of listing-page URLs to visit, in order
    - timeout: seconds passed to requests.get for every request (default 30), so a
      stalled connection raises instead of hanging forever

    Returns:
    - List of post URLs in the order they were found; empty if navigation_urls is empty

    Raises:
    - requests.exceptions.RequestException (including HTTPError for 4xx/5xx responses
      and Timeout) so a failed page is reported instead of silently contributing nothing
    """
    post_urls = []  # Initialize list

    # On each page, get the URLs for each blog post
    for url in navigation_urls:
        page = requests.get(url, timeout=timeout)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "html.parser")

        for heading in soup.find_all("h3", class_="post-title entry-title"):
            link = heading.find("a")  # None when the heading holds plain text only
            href = link.get("href") if link is not None else None
            if href:
                post_urls.append(href)

    return post_urls

In [10]:
def get_text(post_urls, blog_title, timeout=30):
    """
    Download every post and combine them into the text of a single HTML document

    We iterate through post_urls to navigate to each blog post, then do the following:
    - Get the post title -> find h3 class_="post-title entry-title"
    - Get the post date -> find h2 class_="date-header"
    - Get the post content -> find div class_="post-body entry-content"
    - Remove all images to lighten the file -> find img tags
    - Append the post as a <section> to the HTML structure

    The title, date and blog title are plain text, so they are HTML-escaped before being
    inserted; the post content is inserted as markup. If a page lacks one of the elements,
    the post URL is used as the title, and the date or content is left empty.

    Parameters:
    - post_urls: iterable of post URLs to download, in the order they should appear
    - blog_title: text for the <title> element of the document
    - timeout: seconds passed to requests.get for every request (default 30), so a
      stalled connection raises instead of hanging forever

    Returns:
    - The complete HTML document as a string; with no post_urls it is an empty shell

    Raises:
    - requests.exceptions.RequestException (including HTTPError for 4xx/5xx responses
      and Timeout) when a post cannot be downloaded
    """
    from html import escape  # Local import so this function does not depend on other cells

    # Initialize the HTML structure, declaring UTF-8 so special characters display correctly
    # Pieces are collected in a list and joined once at the end
    parts = [f"""
    <html>
    <head><meta charset="UTF-8"><title>{escape(str(blog_title), quote=False)}</title></head>
    <body>
    """]

    for url in post_urls:
        page = requests.get(url, timeout=timeout)
        page.raise_for_status()  # Fail clearly here rather than parse an error page
        page.encoding = "utf-8"  # Decode the response body as UTF-8 when reading page.text
        soup = BeautifulSoup(page.text, "html.parser")

        # Extract blog post title, date and content; find() gives None for a missing element
        title_tag = soup.find("h3", class_="post-title entry-title")
        date_tag = soup.find("h2", class_="date-header")
        content = soup.find("div", class_="post-body entry-content")

        title = title_tag.get_text(strip=True) if title_tag is not None else url
        date = date_tag.get_text(strip=True) if date_tag is not None else ""

        if content is None:
            body_html = ""
        else:
            # Remove images from content
            for img in content.find_all("img"):
                img.decompose()
            body_html = content.prettify()

        # Append the blog post title, date and content to the HTML structure
        parts.append(f"""
        <section>
        <h1>{escape(title, quote=False)}</h1>
        {escape(date, quote=False)}
        {body_html}
        </section>
        """)

    # Close the HTML structure
    parts.append("""
    </body>
    </html>
    """)

    return "".join(parts)

In [5]:
# Front page of the blog; the crawl for Older Posts links starts here
homepage = 'https://paleric.blogspot.com/'

In [6]:
# Follow the Older Posts links from the homepage to collect the URL of every listing page
# (this makes one HTTP request per listing page)
navigation_urls = get_older_urls(homepage)

In [7]:
# Request each listing page again and gather the URLs of the individual posts found on it
post_urls = get_post_urls(navigation_urls)

In [8]:
#test_post_urls = post_urls[:6] # For testing
# Text placed in the <title> element of the exported HTML document
blog_title = "Paleric Blog"

In [14]:
# Download every post (one HTTP request per post) and build the combined HTML as one string
blog_content = get_text(post_urls, blog_title)

In [13]:
# Test export
#with open("testexport.html", "w", encoding="utf-8") as file:
#    file.write(blog_content)

In [15]:
# Final export
with open("paleric.html", "w", encoding="utf-8") as file:
    file.write(blog_content)