# Collecting Rudyard Kipling's poems

### Dependencies

In [1]:
# General
from pprint import pprint
import string
from string import digits
import pandas as pd
import numpy as np

# to scrape the data from websites
from requests import get
from bs4 import BeautifulSoup

# to make the scraping more human-like by controlling the crawl rate
from IPython.core.display import clear_output
from time import sleep, time
from random import randint

# just in case of errors
from warnings import warn

### Define functions for extracting information from websites

In [2]:
def webscraper(url_list, timeout=30):
    """Fetch every URL in ``url_list`` and return the parsed pages.

    Requests are issued one at a time, with a random pause of 8-15 seconds
    after each request to keep the crawl rate low. A progress line
    (request number and average request frequency) is printed after each
    request and replaces the previous one in the notebook output.

    Args:
        url_list: URLs to fetch, in order. An empty string is not
            requested; an empty-string placeholder is stored for it so the
            result stays aligned with the input.
        timeout: Seconds to wait for the server before ``get`` gives up
            and raises. Without a timeout a stalled connection would
            block the whole crawl indefinitely.

    Returns:
        A list with one entry per URL, in input order: a ``BeautifulSoup``
        object for each requested page, or ``""`` for each empty URL.
        Pages answered with a non-200 status trigger a warning but are
        still parsed and included.
    """
    soups = []

    # Bookkeeping for the progress line
    start_time = time()
    request_count = 0

    for url in url_list:
        if url == "":
            # Nothing to request; keep the position with a placeholder
            soups.append("")
            continue

        # Make a get request for the contents of the URL
        response = get(url, timeout=timeout)

        # Pause so the crawl rate stays low
        sleep(randint(8, 15))

        # Monitor the requests
        request_count += 1
        elapsed_time = time() - start_time
        print(f"Request {request_count}; Frequency: {request_count/elapsed_time} requests/sec")
        # wait=True defers clearing until the next output arrives, so each
        # progress line replaces the previous one instead of piling up
        clear_output(wait=True)

        # Flag unexpected responses; the page is still parsed below
        if response.status_code != 200:
            warn(f"Request: {request_count}; Status code: {response.status_code}")

        # Parse the page with BeautifulSoup
        soups.append(BeautifulSoup(response.text, "html.parser"))

    return soups

### Webscraping for titles and links to poems

In [3]:
# The two index pages that list Kipling's poems (1885–1918)
urls = [
    "https://www.bartleby.com/364/",
    "https://www.bartleby.com/people/Kipling.html",
]

# Download and parse both index pages
soups = webscraper(urls)

# Preview the html content of each index page, separated by a divider line
first_index_page = soups[0]
second_index_page = soups[1]
pprint(first_index_page)
print("==========")
pprint(second_index_page)

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "https://www.w3.org/TR/REC-html40/loose.dtd">

<html>
<head>
<title>Rudyard Kipling. 1922. Verse: 1885-1918</title>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="These 416 selections represent the best of the Nobel prize-winning poet" name="description"/>
<meta content="Rudyard Kipling. 1922. Verse: 1885-1918" name="keywords"/>
<!-- Google Tag Manager -->
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
 new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
 j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
 'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
 })(window,document,'script','dataLayer','GTM-56MW5HV');</script>
<!-- End Google Tag Manager -->
<style type="text/css">
 .FormLt1 { font-family: arial, helvetica;
 font-size: 12px;
 color: #000000;
 font-weight: normal;
 background-color: #fff

In [4]:
# Site root that is prepended to every href found on the index pages
BASE_URL = "https://www.bartleby.com"

# Collect all <table> elements of each index page
poem_tables = [soup.findAll("table") for soup in soups]

# First index page: the entries are the <li> items of the table at index 8
anchors1 = [item.a for item in poem_tables[0][8].findAll("li")]
link_list1 = [anchor["href"] for anchor in anchors1]
title_list1 = [anchor.text for anchor in anchors1]
url_list1 = [BASE_URL + href for href in link_list1]

# Second index page: the entries are the <dd> items of the table at index 5;
# the first seven and the last <dd> are skipped
# (presumably they are not poem entries; verify against the page)
dd = poem_tables[1][5].findAll("dd")
anchors2 = [entry.a for entry in dd[7:-1]]
link_list2 = [anchor["href"] for anchor in anchors2]
title_list2 = [anchor.text for anchor in anchors2]
url_list2 = [BASE_URL + href for href in link_list2]

# Combine both pages: first-page entries followed by second-page entries
url_list = url_list1 + url_list2
title_list = title_list1 + title_list2

# One row per poem: its title and the URL of its page
poems = pd.DataFrame({"title": title_list, "link": url_list})
poems.head()

Unnamed: 0,title,link
0,Prelude,https://www.bartleby.com/364/1.html
1,A General Summary,https://www.bartleby.com/364/2.html
2,Army Headquarters,https://www.bartleby.com/364/3.html
3,"Study of an Elevation, in Indian Ink",https://www.bartleby.com/364/4.html
4,Delilah,https://www.bartleby.com/364/5.html


### Webscraping for the actual poems

In [None]:
# Use the webscraper function to get the website contents for poem urls
# (webscraper returns one entry per URL in url_list, in the same order)
soups2 = webscraper(url_list)

# Preview the html content of the third poem (index 2)
pprint(soups2[2])

Request 407; Frequency: 0.08382705652093488 requests/sec


In [None]:
# Translation table that deletes every digit; used to strip the line counts
# (for poems with line counts). Note that it removes digits anywhere in the
# text, not only the line counts.
remove_numbers = str.maketrans("", "", digits)

# Create an empty list of poem texts (one string per scraped page)
poem_lines1 = []

for soup in soups2:
    # webscraper stores "" for URLs it did not request; those have no tables
    tables = [] if isinstance(soup, str) else soup.findAll("table")

    # The poem text is read from the table at index 7 of the page
    cells = tables[7].findAll("td") if len(tables) > 7 else []

    if cells:
        # Only the first cell is used: its inner text, trimmed, digits removed
        poem_text = cells[0].text.strip().translate(remove_numbers)
    else:
        # Page without the expected table/cell: store "" so the list stays
        # aligned with the poems dataframe; rows with empty text are dropped
        # in a later step
        poem_text = ""

    poem_lines1.append(poem_text)

In [None]:
# Number of extracted poem texts (one was appended per entry in soups2)
len(poem_lines1)

In [None]:
# Keep only the characters found in string.printable; every other character
# is dropped from each poem's text
clean = [
    "".join(char for char in poem if char in string.printable)
    for poem in poem_lines1
]

# Turn carriage returns (\r) into line breaks (\n)
clean1 = [poem.replace("\r", "\n") for poem in clean]

# Remove every ",," sequence
clean2 = [poem.replace(",,", "") for poem in clean1]

# Display the cleaned texts
clean2

In [None]:
# Add the clean lines in the poems dataframe
# (clean2 is a list, so it needs exactly one entry per row of poems)
poems["lines"] = clean2

# Add a column for poet name (the same constant value on every row)
poems["poet"] = "Rudyard Kipling"

# Preview the dataframe
poems.head()

In [None]:
# Mark rows with no text as missing.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on poems["lines"]: that chained in-place call
# acts on a temporary Series, which triggers a warning in recent pandas and
# leaves the dataframe unchanged when Copy-on-Write is enabled.
poems["lines"] = poems["lines"].replace("", np.nan)

# Drop rows with NaNs in the lines column
poems.dropna(subset=["lines"], inplace=True)

# Preview the size of the dataframe after dropping the rows with NaNs in the lines column
len(poems)

### Save the dataframe in a SQLite database

In [None]:
# Dependencies
import sqlite3

In [None]:
from pathlib import Path

# Location of the SQLite database file
DB_PATH = "db/Poetry.db"

# sqlite3.connect creates a missing database file but not a missing parent
# directory, so make sure the folder exists before connecting
Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True)

# Create (or open) the SQLite database
conn = sqlite3.connect(DB_PATH)

In [None]:
# Write the dataframe to a table named "Kipling", replacing the table if it already exists
poems.to_sql(name="Kipling", con=conn, if_exists="replace")

# Read the whole table back to preview what was stored
kipling_query = "select * from Kipling;"
pd.read_sql_query(kipling_query, con=conn)

In [None]:
# Close the database connection opened earlier in the notebook
conn.close()