# Comparative analysis of Robert Frost's poems

## Webscraping

In [1]:
# Dependencies

# to scrape the data from websites
from requests import get
from bs4 import BeautifulSoup

# to make the scraping more human-like by controlling the crawl rate
# NB: clear_output is imported from the public IPython.display module;
# importing display helpers from IPython.core.display is deprecated.
from IPython.display import clear_output
from time import sleep, time
from random import randint

# just in case of errors
from warnings import warn

In [2]:
# Source of the data: the search results are spread over four pages, so the
# page numbers "1" through "4" are appended to the base URL one at a time.
pages = list(map(str, range(1, 5)))
url = "https://www.poetryfoundation.org/search?query=Robert%20Frost&refinement=poems&page="

In [3]:
# Collect one parsed BeautifulSoup document per search-results page,
# so there's no need to scrape repeatedly in later cells
soups = []

# Bookkeeping for the crawl-rate monitor
start_time = time()
requests = 0

# Loop through the pages
for page in pages:
    # Make a get request for the contents of the URL; the timeout keeps a
    # stalled connection from hanging the notebook indefinitely
    response = get(url + page, timeout = 30)

    # Pause the loop so the crawl rate stays low
    sleep(randint(8,15))

    # Monitor the requests
    requests += 1
    elapsed_time = time() - start_time
    print(f"Request {requests}; Frequency: {requests/elapsed_time} requests/sec")
    clear_output(wait = True) # to replace output with new one (instead of printing many outputs)

    # If there are more than 5 requests, break the loop
    # (safety cap in case the list of pages is ever extended by mistake)
    if requests > 5:
        warn("Number of requests more than expected.")
        break

    # If the request failed, warn and skip the page: an error page is not a
    # results page, so it must not be parsed and stored alongside real ones
    if response.status_code != 200:
        warn(f"Request: {requests}; Status code: {response.status_code}")
        continue

    # Parse the page with BeautifulSoup and keep it for later cells
    page_soup = BeautifulSoup(response.text, "html.parser")
    soups.append(page_soup)

Request 4; Frequency: 0.0888336426321358 requests/sec


In [4]:
# Create empty lists for links and titles of poems authored by Robert Frost
# NB: There are poems in the list that refer to him but not authored by him
links = []
titles = []

for soup in soups:
    # Each search result is rendered as a "c-feature" card
    for item in soup.find_all("div", class_ = "c-feature"):
        # The byline lives in the "c-feature-sub" div; cards without one
        # cannot be attributed to an author, so skip them instead of crashing
        byline = item.find("div", class_ = "c-feature-sub")
        if byline is None:
            continue

        # The slice trims the first four and the last character of the byline
        # text (presumably a "By"-style prefix and surrounding whitespace --
        # verify against the page markup) before comparing the author name
        if byline.text[4:-1] != "Robert Frost":
            continue

        # The title and its link live in the anchor of the "c-feature-hd" div
        heading = item.find("div", class_ = "c-feature-hd")
        if heading is None or heading.a is None:
            continue

        titles.append(heading.a.text)
        links.append(heading.a["href"])

## Load the poems into a dataframe

In [5]:
# Dependencies
import pandas as pd
import numpy as np

In [6]:
# Build the poems dataframe: one row per poem, with its title and its link
poem_columns = dict(title = titles, link = links)
poems = pd.DataFrame(data = poem_columns)

# Report how many poems were collected
print(f"Robert Frost has {len(poems)} poems that are currently shared by the Poetry Foundation.")

Robert Frost has 53 poems that are currently shared by the Poetry Foundation.


In [7]:
# Show the first six rows (titles and links) of the dataframe
poems.head(n = 6)

Unnamed: 0,title,link
0,October,https://www.poetryfoundation.org/poems/53084/o...
1,"‘Out, Out—’",https://www.poetryfoundation.org/poems/53087/o...
2,Acquainted with the Night,https://www.poetryfoundation.org/poems/47548/a...
3,After Apple-Picking,https://www.poetryfoundation.org/poems/44259/a...
4,Birches,https://www.poetryfoundation.org/poems/44260/b...
5,Christmas Trees,https://www.poetryfoundation.org/poems/57834/c...


## Scrape each poem, clean the lines of each poem for special characters, and add the lines to the dataframe

In [8]:
# Create an empty list of lines of poetry.
# One entry is appended per row of `poems`, in order, so the list stays
# aligned with the dataframe even when a poem page yields no text.
poem_lines = []

# Prepare for looping through the links
start_time = time()
requests = 0

# Loop through the links
for link in poems["link"]:

    # Make a get request for the contents of the link; the timeout keeps a
    # stalled connection from hanging the notebook indefinitely
    response2 = get(link, timeout = 30)

    # Pause the loop so the crawl rate stays low
    sleep(randint(8,15))

    # Monitor the requests
    requests += 1
    elapsed_time = time() - start_time
    print(f"Request {requests}; Frequency: {requests/elapsed_time} requests/sec")
    clear_output(wait = True) # to replace output with new one (instead of printing many outputs)

    # If there's an error, throw an error warning.
    # The check must look at response2 (this poem's response), not at the
    # `response` variable left over from scraping the search-result pages.
    if response2.status_code != 200:
        warn(f"Request: {requests}; Status code: {response2.status_code}")

    # NB: no cap on the number of requests is needed here: the loop makes
    # exactly one request per row of `poems`, so it can never exceed len(poems).

    # Scrape with BeautifulSoup
    poem_soup = BeautifulSoup(response2.text, "html.parser")

    # Find the container holding the lines of the poem
    poem_div = poem_soup.find("div", {"class": "o-poem"})

    if poem_div is not None:
        # Keep the container's children except the first and the last one
        lines_html = list(poem_div)[1:-1]
    else:
        # No poem container on the page (or a failed request):
        # store an empty placeholder so the row alignment is preserved
        lines_html = ""

    poem_lines.append(lines_html)

Request 53; Frequency: 0.08308599128190762 requests/sec


In [9]:
def extract_text(poem_list):
    """Extract the text of every line of every poem.

    Parameters
    ----------
    poem_list : iterable
        One entry per poem; each entry is an iterable of objects that
        expose a ``.text`` attribute.

    Returns
    -------
    list of list
        One inner list per poem, holding the ``.text`` value of each of
        its lines in the original order.
    """
    return [[line.text for line in poem] for poem in poem_list]

In [10]:
# Poems 1-5 and poems 7 onwards are relatively clean lists of lines,
# so extract_text can be applied to them directly
poem1 = extract_text(poem_lines[:5])
poem3 = extract_text(poem_lines[6:54])

# The 6th poem needs further cleaning: skip its first two entries and its
# last entry before extracting the text
poem2 = [entry.text for entry in poem_lines[5][2:-1]]

# Stitch the three parts back together in their original order.
# NB: poem2 is a single poem (a list of lines), so it goes in as ONE item
# of poem_lines3 rather than being spread out line by line
poem_lines3 = [*poem1, poem2, *poem3]

# Check the size of poem_lines3
len(poem_lines3)

53

In [11]:
# Strip leading/trailing whitespace from every line of every poem and throw
# away the lines that end up empty
clean = [
    [stripped for stripped in (raw.strip() for raw in lines) if stripped]
    for lines in poem_lines3
]

# Stringify each poem: join its remaining lines with single spaces
clean2 = [' '.join(lines) for lines in clean]

In [12]:
# Attach the cleaned text of each poem as the "lines" column
poems = poems.assign(lines = clean2)

# Show the first five rows of the dataframe
poems.head(n = 5)

Unnamed: 0,title,link,lines
0,October,https://www.poetryfoundation.org/poems/53084/o...,"O hushed October morning mild, Thy leaves have..."
1,"‘Out, Out—’",https://www.poetryfoundation.org/poems/53087/o...,The buzz saw snarled and rattled in the yard A...
2,Acquainted with the Night,https://www.poetryfoundation.org/poems/47548/a...,I have been one acquainted with the night. I h...
3,After Apple-Picking,https://www.poetryfoundation.org/poems/44259/a...,My long two-pointed ladder's sticking through ...
4,Birches,https://www.poetryfoundation.org/poems/44260/b...,When I see birches bend to left and right Acro...


In [13]:
# Remove rows with no text: mark empty strings in the lines column as missing.
# The result is assigned back to the column instead of calling
# replace(..., inplace = True) on poems["lines"]: an in-place edit of a
# selected column does not reach the dataframe under pandas Copy-on-Write.
poems["lines"] = poems["lines"].replace("", np.nan)

# Drop rows with NaNs in the lines column
poems = poems.dropna(subset = ["lines"])

# Preview the size of the dataframe after dropping the rows with NaNs in the lines column
len(poems)

37

### Save the dataframe in a SQLite database

In [14]:
# Dependencies
import sqlite3

In [15]:
# Open a connection to the SQLite database file "Poetry.db"
# (sqlite3 creates the file if it does not exist yet)
conn = sqlite3.connect(database = "Poetry.db")

In [16]:
# Write the dataframe to the "Frost" table, replacing the table if it already exists
poems.to_sql(name = "Frost", con = conn, if_exists = "replace")

# Read the table back to preview what was stored
pd.read_sql_query(sql = "select * from Frost;", con = conn)

Unnamed: 0,index,title,link,lines
0,0,October,https://www.poetryfoundation.org/poems/53084/o...,"O hushed October morning mild, Thy leaves have..."
1,1,"‘Out, Out—’",https://www.poetryfoundation.org/poems/53087/o...,The buzz saw snarled and rattled in the yard A...
2,2,Acquainted with the Night,https://www.poetryfoundation.org/poems/47548/a...,I have been one acquainted with the night. I h...
3,3,After Apple-Picking,https://www.poetryfoundation.org/poems/44259/a...,My long two-pointed ladder's sticking through ...
4,4,Birches,https://www.poetryfoundation.org/poems/44260/b...,When I see birches bend to left and right Acro...
5,5,Christmas Trees,https://www.poetryfoundation.org/poems/57834/c...,The city had withdrawn into itself And left at...
6,6,Dust of Snow,https://www.poetryfoundation.org/poems/44262/d...,The way a crow Shook down on me The dust of sn...
7,7,Fire and Ice,https://www.poetryfoundation.org/poems/44263/f...,"Some say the world will end in fire, Some say ..."
8,8,Fireflies in the Garden,https://www.poetryfoundation.org/poems/42892/f...,"Here come real stars to fill the upper skies, ..."
9,9,"For Once, Then, Something",https://www.poetryfoundation.org/poems/44264/f...,Others taunt me with having knelt at well-curb...
