# Comparative analysis of Robert Frost's poems

## Web scraping

In [1]:
# Dependencies

# to scrape the data from websites
from requests import get
from bs4 import BeautifulSoup

# to make the scraping more human-like by controlling the crawl rate
from IPython.core.display import clear_output
from time import sleep, time
from random import randint

# just in case of errors
from warnings import warn

In [2]:
# Data source: Poetry Foundation search results for "Robert Frost", limited to poems.
# The base URL ends with "page=", so a page number is appended at request time.
url = "https://www.poetryfoundation.org/search?query=Robert%20Frost&refinement=poems&page="

# The results are spread over four pages, numbered 1 through 4 (kept as strings
# so they can be concatenated straight onto the base URL)
pages = list(map(str, range(1, 5)))

In [3]:
# Create an empty list for parsed data from web scraping
# So there's no need to scrape repeatedly: later cells work from `soups`
soups = []

# Safety cap on the number of requests this cell may send
# (there are four result pages, so reaching the cap means something is off)
max_requests = 5

# Prepare the loop through the pages
start_time = time()
requests = 0

# Loop through the pages
for page in pages:
    # Check the cap *before* sending, so no surplus request reaches the server
    if requests >= max_requests:
        warn("Number of requests more than expected.")
        break

    # Make a get request for the contents of the URL
    # (the timeout keeps a stalled connection from hanging the cell forever)
    response = get(url + page, timeout = 30)

    # Pause the loop to keep the crawl rate low
    sleep(randint(8,15))

    # Monitor the requests
    requests += 1
    elapsed_time = time() - start_time
    print(f"Request {requests}; Frequency: {requests/elapsed_time} requests/sec")
    clear_output(wait = True) # to replace output with new one (instead of printing many outputs)

    # If there's an error, throw a warning and skip the page:
    # an error page must not be stored as if it were a page of search results
    if response.status_code != 200:
        warn(f"Request: {requests}; Status code: {response.status_code}")
        continue

    # Parse the page with BeautifulSoup and keep it for the next cells
    page_soup = BeautifulSoup(response.text, "html.parser")
    soups.append(page_soup)

Request 4; Frequency: 0.07980943292158231 requests/sec


In [4]:
# Create empty lists for links and titles of poems authored by Robert Frost
# NB: There are poems in the list that refer to him but not authored by him
# The two lists are filled in lockstep, so titles[i] always belongs to links[i]
links = []
titles = []

for soup in soups:
    # Each search result sits in its own "c-feature" div
    for item in soup.find_all("div", class_ = "c-feature"):

        # `find` returns None when the div is absent; skip such results
        # instead of crashing on `.text`
        byline = item.find("div", class_ = "c-feature-sub")
        if byline is None:
            continue

        # Keep only results whose byline names Robert Frost as the author
        # NOTE(review): the [4:-1] slice presumably strips a leading "By " style
        # prefix and one trailing character -- confirm against the live markup
        if byline.text[4:-1] != "Robert Frost":
            continue

        # The title and the link both come from the anchor inside the heading div
        heading = item.find("div", class_ = "c-feature-hd")
        anchor = heading.a if heading is not None else None
        if anchor is None:
            continue

        # `.get` returns None for a missing attribute (indexing would raise KeyError)
        href = anchor.get("href")
        if href is None:
            continue

        titles.append(anchor.text)
        links.append(href)

## Load the poems into a dataframe

In [5]:
# Dependencies
import pandas as pd

In [6]:
# Combine the scraped titles and links into one table:
# one row per poem, with the columns "title" and "link"
poems = pd.DataFrame(dict(title = titles, link = links))

# Report how many poems were collected (the row count of the table)
print(f"Robert Frost has {poems.shape[0]} poems that are currently shared in the Poetry Foundation.")

Robert Frost has 53 poems that are currently shared in the Poetry Foundation.


In [7]:
# Show the first five rows (titles with their links) as a quick sanity check
poems.head(n = 5)

Unnamed: 0,title,link
0,October,https://www.poetryfoundation.org/poems/53084/o...
1,"‘Out, Out—’",https://www.poetryfoundation.org/poems/53087/o...
2,Acquainted with the Night,https://www.poetryfoundation.org/poems/47548/a...
3,After Apple-Picking,https://www.poetryfoundation.org/poems/44259/a...
4,Birches,https://www.poetryfoundation.org/poems/44260/b...


## Scrape each poem and add the words to the dataframe

In [None]:
# Create an empty list of lines of poetry
# It receives one entry per requested poem, in the same order as the rows of `poems`
poem_lines = []

# Prepare for looping through the links
start_time = time()
requests = 0

# Loop through the links
for link in poems["link"]:

    # Make a get request for the contents of the link
    # (the timeout keeps a stalled connection from hanging the cell forever)
    response2 = get(link, timeout = 30)

    # Pause the loop to keep the crawl rate low
    sleep(randint(8,15))

    # Monitor the requests
    requests += 1
    elapsed_time = time() - start_time
    print(f"Request {requests}; Frequency: {requests/elapsed_time} requests/sec")
    clear_output(wait = True) # to replace output with new one (instead of printing many outputs)

    # If there's an error, throw an error warning
    # (check `response2`, the poem request -- `response` is left over from the search-page loop)
    if response2.status_code != 200:
        warn(f"Request: {requests}; Status code: {response2.status_code}")

    # If there are more than 70 requests, break the loop
    # (a safety cap; the dataframe built above holds fewer poems than that)
    if requests > 70:
        warn("Number of requests more than expected.")
        break

    # Scrape with BeautifulSoup
    poem_soup = BeautifulSoup(response2.text, "html.parser")

    # List the lines found in the URL
    lines_html = poem_soup.find("div", {"class": "o-poem"})

    if lines_html is not None:
        # Keep the children of the poem container, dropping the first and the last one
        lines_html = list(lines_html)[1:-1]
    else:
        # No poem container on this page: store an empty list so every entry has the same type
        lines_html = []

    # Store the result; without this the scraped lines are lost on the next iteration
    poem_lines.append(lines_html)

Request 25; Frequency: 0.08210563866809754 requests/sec


In [None]:
# Spot-check a single poem page: fetch it and print the raw children of its poem container
# Position of the poem to inspect within `poems`
poem_index = 69

# Guard the lookup: an index beyond the last row would otherwise raise an error
if 0 <= poem_index < len(poems):
    url2 = poems["link"].iloc[poem_index]

    # The timeout keeps a stalled connection from hanging the cell forever
    response2 = get(url2, timeout = 30)
    soup = BeautifulSoup(response2.text, "html.parser")
    soup2 = soup.find("div", {"class": "o-poem"})

    if soup2 is not None:
        # Same slicing as in the scraping loop: drop the first and the last child
        soup3 = list(soup2)[1:-1]
        print(soup3)
    else:
        # The page has no poem container
        print("Nope")
else:
    print(f"No poem at position {poem_index}; the dataframe has {len(poems)} rows.")