# Collecting William Butler Yeats' poems

## Webscraping

In [1]:
# Dependencies
from pprint import pprint
from string import digits

# to scrape the data from websites
from requests import get
from bs4 import BeautifulSoup

# to make the scraping more human-like by controlling the crawl rate
from IPython.core.display import clear_output
from time import sleep, time
from random import randint

# just in case of errors
from warnings import warn

In [2]:
def webscraper(url_list, timeout=30):
    """Download each URL in turn and parse its HTML with BeautifulSoup.

    Requests are made one at a time, with a random pause of 8 to 15 seconds
    after each one to keep the crawl rate low. Progress is printed after
    every request and replaces the previous progress line.

    Parameters
    ----------
    url_list : iterable of str
        URLs to fetch. An empty string marks a page that is not available;
        it is not requested.
    timeout : float or None, optional
        Seconds to wait for the server before ``get`` raises instead of
        blocking indefinitely. ``None`` disables the limit.

    Returns
    -------
    list
        One entry per input URL, in input order: the parsed page for each
        fetched URL, or ``""`` for each empty URL.

    Notes
    -----
    A response with a status code other than 200 triggers a warning but is
    still parsed and returned.
    """
    soups = []

    # Bookkeeping for the progress line
    start_time = time()
    request_count = 0

    for url in url_list:
        # Keep a placeholder so the output stays aligned with the input
        if url == "":
            soups.append("")
            continue

        # A timeout keeps one stalled server from hanging the whole crawl
        response = get(url, timeout=timeout)

        # Pause so the requests are spread out
        sleep(randint(8, 15))

        # Report progress; wait=True defers clearing until the next output arrives
        request_count += 1
        elapsed_time = time() - start_time
        print(f"Request {request_count}; Frequency: {request_count/elapsed_time} requests/sec")
        clear_output(wait=True)

        # Flag unsuccessful responses without stopping the crawl
        if response.status_code != 200:
            warn(f"Request: {request_count}; Status code: {response.status_code}")

        soups.append(BeautifulSoup(response.text, "html.parser"))

    return soups

In [3]:
def extract_text(poem_list):
    """Return the text of every line element, grouped poem by poem.

    ``poem_list`` is an iterable of poems, each poem being an iterable of
    elements that expose a ``.text`` attribute. The result is a list with
    one list of strings per poem, in the same order.
    """
    return [[element.text for element in poem] for poem in poem_list]

In [4]:
# Search-results address for "yeats", restricted to poems; the page number is appended to it
url = "https://www.poetryfoundation.org/search?query=yeats&refinement=poems&page="

# The results span five pages, numbered 1 through 5
pages = list(map(str, range(1, 6)))

In [5]:
# Build one search-results URL per page number
urls = [f"{url}{page}" for page in pages]

# Download and parse every search-results page
soups = webscraper(urls)

# Preview the parsed HTML of the first search-results page
pprint(soups[0])


<!DOCTYPE doctype html>

<html class="no-js" lang="en-us">
<head>
<!-- Google Tag Manager -->
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
            new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
            j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
            'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
    })(window,document,'script','dataLayer','GTM-NTFVGQ3');</script>
<!-- End Google Tag Manager -->
<title>Search | Poetry Foundation</title>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="always" name="referrer"/>
<meta content="Poems, readings, poetry news, and the entire 100-year archive of POETRY magazine." name="description"/>
<link href="https://www.poetryfoundation.org/search" rel="canonical"/>
<meta content="Chicago" name="geo.region"/>
<meta content="41.8954002,-87.63025809999999" name="geo.posit

In [6]:
# Collect the title and link of every search result authored by William Butler Yeats.
# NB: The search also returns poems that refer to him but were not written by him,
# so each result's author line is checked before the result is kept.
links = []
titles = []

for soup in soups:
    for item in soup.find_all("div", class_="c-feature"):
        # find() returns None when an element is absent, so a result card without
        # an author line is skipped instead of raising AttributeError on .text.
        author = item.find("div", class_="c-feature-sub")

        # The slice drops the first four characters and the last one of the author
        # line (presumably a "By " prefix and surrounding whitespace -- TODO confirm).
        if author is None or author.text[4:-1] != "William Butler Yeats":
            continue

        # Likewise skip cards that have no headline or no link inside the headline
        headline = item.find("div", class_="c-feature-hd")
        if headline is None or headline.a is None:
            continue

        titles.append(headline.a.text)
        links.append(headline.a["href"])

## Load the poems into a dataframe

In [7]:
# Dependencies
import pandas as pd
import numpy as np

In [8]:
# Pair each poem title with its link, one row per poem
poem_columns = {"title": titles, "link": links}
poems = pd.DataFrame(poem_columns)

# Report how many poems were collected
print(f"William Butler Yeats has {len(poems)} poems that are currently shared by the Poetry Foundation.")

William Butler Yeats has 84 poems that are currently shared by the Poetry Foundation.


In [9]:
# Preview the first rows of the dataframe (poem titles and their links)
poems.head()

Unnamed: 0,title,link
0,Under Ben Bulben,https://www.poetryfoundation.org/poems/43298/u...
1,A Coat,https://www.poetryfoundation.org/poetrymagazin...
2,A Dialogue of Self and Soul,https://www.poetryfoundation.org/poems/43294/a...
3,A Drinking Song,https://www.poetryfoundation.org/poems/50337/a...
4,A Meditation in Time of War,https://www.poetryfoundation.org/poems/57318/a...


In [None]:
# Preview the last rows of the dataframe
poems.tail()

## Scrape each poem, clean the lines of each poem for special characters, and add the lines to the dataframe

In [10]:
# Download and parse each poem's page, using the links stored in the poems dataframe.
# webscraper pauses for several seconds after every request, so this cell runs for a while.
soups2 = webscraper(poems["link"])

Request 84; Frequency: 0.08052303316182197 requests/sec


In [11]:
# Collect, for every poem page, the child nodes of its poem container
poem_lines = []

for soup in soups2:
    # webscraper() stores "" in place of a page it did not request. A plain string
    # has no tag-aware find(), so it is checked for explicitly rather than relying
    # on an exception handler, and is treated like a page without a poem.
    if isinstance(soup, str):
        poem_div = None
    else:
        poem_div = soup.find("div", {"class": "o-poem"})

    if poem_div is not None:
        # Keep the container's children except the first and the last one
        lines_html = list(poem_div)[1:-1]
    else:
        # No poem container on this page: record an empty placeholder so that
        # poem_lines stays aligned with the rows of the poems dataframe
        lines_html = ""

    poem_lines.append(lines_html)

In [12]:
# Pull the text out of the first 48 entries of poem_lines (indices 0-47).
# Entries 15 and 20 are trimmed with [2:-1] before extraction; all the others
# are passed to extract_text unchanged.
# Poem [47] is intentionally blank because "A Full Moon in March" is a play
poems1 = extract_text(poem_lines[:15])
poems2 = extract_text([poem_lines[15][2:-1]])[0]
poems3 = extract_text(poem_lines[16:20])
poems4 = extract_text([poem_lines[20][2:-1]])[0]
poems5 = extract_text(poem_lines[21:48])

In [13]:
# Merge the five groups back into one list, keeping the original poem order.
# poems2 and poems4 each hold a single poem; the other three are lists of poems.
pm_list = [*poems1, poems2, *poems3, poems4, *poems5]

# Collapse each poem's lines into one comma-separated string.
# The result holds one string per poem scraped from Poetry Foundation.
poem_lines1 = list(map(",".join, pm_list))

In [None]:
# Titles of the poems whose text is missing (images were uploaded by the Poetry Foundation, not text).
# The slice starts at row 48 and stops before the last row.
# NOTE(review): the last row is excluded here -- confirm its text is collected elsewhere.
missing_titles = poems["title"].iloc[48:-1].tolist()
missing_titles

In [None]:
# Replacement sources for the poems whose text is missing on Poetry Foundation
# Source1: www.bartleby.com
# Source2: www.public-domain-poetry.com

# Map a running number (1-35) to the URL of each missing poem, following the
# order of the missing titles list; the trailing comment names the poem
mt_dict = {
    1: "http://www.public-domain-poetry.com/william-butler-yeats/needles-eye-271", # A Needle's Eye
    2: "https://www.bartleby.com/147/13.html", # Beggar to Beggar cried
    3: "http://www.public-domain-poetry.com/william-butler-yeats/conjunctions-272", # Conjunctions
    4: "https://www.bartleby.com/148/33.html", # Ego Dominus Tuus
    5: "https://www.bartleby.com/147/24.html", # Fallen Majesty
    6: "http://www.public-domain-poetry.com/william-butler-yeats/he-and-she-276", # He and She
    7: "https://www.bartleby.com/300/22.html", # Love and the Birds
    8: "https://www.bartleby.com/148/20.html", # Memory
    9: "http://www.public-domain-poetry.com/william-butler-yeats/meru-270", # Meru
    10: "https://www.bartleby.com/148/17.html", # On Woman
    11: "https://www.bartleby.com/147/7.html", # Paudeen
    12: "http://www.public-domain-poetry.com/william-butler-yeats/ribb-at-the-tomb-of-baile-and-aillinn-281", # Ribh at the Tomb of Baile and Aillinn
    13: "http://www.public-domain-poetry.com/william-butler-yeats/ribb-considers-christian-love-insufficient-277", # Ribh Considers Christian Love Insufficient
    14: "http://www.public-domain-poetry.com/william-butler-yeats/ribb-denounces-patrick-280", # Ribh Prefers an Older Theology (NOTE(review): URL slug differs from this title -- confirm)
    15: "https://www.bartleby.com/147/15.html", # Running to Paradise
    16: "https://www.bartleby.com/148/16.html", # The Dawn
    17: "http://www.public-domain-poetry.com/william-butler-yeats/four-ages-of-man-273", # The Four Ages of Man
    18: "https://www.bartleby.com/300/86.html", # The Grey Rock
    19: "https://www.bartleby.com/148/19.html", # The Hawk
    20: "http://www.public-domain-poetry.com/william-butler-yeats/mountain-tomb-64", # The Mountain Tomb
    21: "https://www.bartleby.com/300/1453.html", # The Mountain Tomb (NOTE(review): same title as entry 20 -- confirm which poem this URL holds)
    22: "https://www.bartleby.com/300/249.html", # The Peacock
    23: "https://www.bartleby.com/300/643.html", # The Phoenix
    24: "https://www.bartleby.com/300/251.html", # The Player Queen
    25: "https://www.bartleby.com/300/23.html", # The Realists
    26: "https://www.bartleby.com/148/12.html", # The Scholars
    27: "https://www.bartleby.com/300/642.html", # The Thorn Tree
    28: "https://www.bartleby.com/300/133.html", # The Two Kings
    29: "", # The Watch-Fire (no URL given; the empty string is a placeholder)
    30: "https://www.bartleby.com/300/248.html", # The Witch
    31: "https://www.bartleby.com/300/644.html", # There is a Queen in China
    32: "http://www.public-domain-poetry.com/william-butler-yeats/three-songs-to-the-same-tune-366", # Three Songs to the Same Tune
    33: "https://www.bartleby.com/147/22.html", # To a Child Dancing in the Wind
    34: "https://www.bartleby.com/300/20.html", # To a Child Dancing upon the Shore
    35: "https://www.bartleby.com/147/8.html", # To a Shade
}

# List the URLs in dictionary order, one per missing title (position = key - 1)
mt_url = list(mt_dict.values())

In [None]:
# Download and parse each replacement page; webscraper returns "" for the empty URL
soups3 = webscraper(mt_url)

In [None]:
# Indices (into soups3) of the pages that come from Public Domain Poetry (pdp)
pdp = [0, 2, 5, 8, 11, 12, 13, 16, 19, 31]

# Translation table that deletes every digit; used to remove the line counts
# (for poems with line counts)
remove_numbers = str.maketrans("", "", digits)

# Notes to delete from specific Bartleby pages, keyed by index into soups3
bartleby_notes = {
    23: ["Song from an Unfinished Play."],  # note below the title
    27: ["Note 1. Eochaid is pronounced Yohee.", "[back]"],  # endnote, in two parts
    34: ["September 29th, 1913."],  # endnote of the poem
}

# Bartleby pages for which only the first table cell is kept
bartleby_first_cell_only = (23, 27)


def _bartleby_cells(soup, notes=()):
    """Return the cleaned text of every <td> in the 8th <table> of a page.

    Each cell's text is stripped, then every string in ``notes`` is deleted
    from it, then all digits are deleted.
    """
    cells = [td.text.strip() for td in soup.find_all("table")[7].find_all("td")]
    for note in notes:
        cells = [cell.replace(note, "") for cell in cells]
    return [cell.translate(remove_numbers) for cell in cells]


# One string per entry of soups3, in the same order
poem_lines2 = []

for x, soup in enumerate(soups3):
    if x in pdp:
        # Public Domain Poetry: take the first <font class="t3a"> of the 11th table
        poem_fonts = soup.find_all("table")[10].find_all("font", {"class": "t3a"})
        lines_html = poem_fonts[0].text.strip()

    elif x == 28:
        # Empty soup: this entry had no URL, so there is nothing to parse
        lines_html = ""

    elif x in bartleby_first_cell_only:
        # Keep only the first cell, after removing this page's notes
        lines_html = _bartleby_cells(soup, bartleby_notes[x])[0]

    elif x == 3:
        # This page keeps every cell; nothing is dropped before joining
        lines_html = ",".join(_bartleby_cells(soup))

    else:
        # Remaining Bartleby pages: keep cells 0, 2, 4, ... and drop the ones in between
        lines_html = ",".join(_bartleby_cells(soup, bartleby_notes.get(x, ()))[::2])

    poem_lines2.append(lines_html)

In [None]:
# Combine both sets of poem texts: Poetry Foundation first, then the other sources
poem_lines3 = poem_lines1 + poem_lines2

In [None]:
# Number of poem texts collected
len(poem_lines3)

In [None]:
# Every entry of poem_lines3 is a single string per poem, so it is split into
# lines first; iterating over the string itself would visit one character at a
# time and put each character on its own row.
# NOTE(review): the poem strings are built with ",".join(...), so lines that are
# separated only by that comma stay on one row -- confirm the intended separator.

# Remove special characters and their adjacent white spaces from each line of each poem
clean = [[line.strip() for line in poem.splitlines()] for poem in poem_lines3]

# Remove the lines that are empty after stripping
clean = [[line for line in poem if line] for poem in clean]

# Stringify the lists of text per poem, one line of poetry per row
clean2 = ["\n".join(poem) for poem in clean]

In [None]:
# Inspect the cleaned poem texts
clean2

In [None]:
# Add the clean lines in the poems dataframe
# NOTE(review): assigning a list as a column needs exactly one item per row --
# confirm len(clean2) == len(poems) before running this cell
poems["lines"] = clean2

# Add a column for poet name (the same value is repeated in every row)
poems["poet"] = "William Butler Yeats"

# Preview the dataframe
poems.head()

In [None]:
# Mark poems without any text as missing. The result is assigned back to the
# column: an in-place replace on poems["lines"] acts on an intermediate Series
# and, under pandas Copy-on-Write, does not reach the dataframe.
poems["lines"] = poems["lines"].replace("", np.nan)

# Drop rows with NaNs in the lines column
poems.dropna(subset=["lines"], inplace=True)

# Preview the size of the dataframe after dropping the rows with NaNs in the lines column
len(poems)

### Save the dataframe in a SQLite database

In [None]:
# Dependencies
import sqlite3

In [None]:
# Open a connection to the SQLite database file
# NOTE(review): the path is relative and points inside db/ -- confirm that the
# directory exists in the working directory before running this cell
conn = sqlite3.connect("db/Poetry.db")

In [None]:
# Write the dataframe to a table named "Yeats", replacing the table if it already exists
poems.to_sql(name="Yeats", con=conn, if_exists="replace")

# Read the whole table back to preview what was stored
pd.read_sql_query(sql="select * from Yeats;", con=conn)

In [None]:
# Close the database connection
conn.close()