# Natural language processing as critique for nursery rhymes

## Webscraping 

In [1]:
# Dependencies

# to scrape the data from websites
from requests import get
from requests.exceptions import RequestException
from bs4 import BeautifulSoup

# to make the scraping more human-like by controlling the crawl rate
# NOTE(review): IPython.display is the public import path for clear_output;
# importing it from IPython.core.display appears to be deprecated - confirm
# against the installed IPython version before switching
from IPython.core.display import clear_output
from time import sleep, time
from random import randint

# just in case of errors
from warnings import warn

# using pandas for creating dataframes and analysing data
import pandas as pd
import numpy as np

### Load nursery rhyme information in a dataframe

In [2]:
# State the source of the data: the page whose table holds the links and
# titles of the nursery rhymes
url = "https://www.nurseryrhymes.org/nursery-rhymes.html"

In [3]:
# Download and parse the source page once, so that the following cells can
# work on `soup` without scraping the site repeatedly

# Prepare the request monitoring
start_time = time()
requests = 0

# Make a get request for the contents of the URL; the timeout (in seconds)
# keeps a stalled connection from hanging the notebook indefinitely
response = get(url, timeout = 30)

# Pause before any further request is sent to the site
sleep(randint(8,15))

# Monitor the requests
requests += 1
elapsed_time = time() - start_time
print(f"Request {requests}; Frequency: {requests/elapsed_time} requests/sec")
clear_output(wait = True) # to replace output with new one (instead of printing many outputs)

# If there's an error, throw an error warning
if response.status_code != 200:
    warn(f"Request: {requests}; Status code: {response.status_code}")

# Scrape with BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

Request 1; Frequency: 0.10050531290470885 requests/sec


In [4]:
# Create empty lists for links and titles of nursery rhymes
# (filled in parallel: links[k] is the page of titles[k])
links = []
titles = []

# the links and the titles of the nursery rhymes are in a table for the source url
info = soup.find_all("td", class_ = "nursery-rhymes-info")

for cell in info:
    # href = True only matches anchors that actually carry a link target, so a
    # table cell without a usable link is skipped instead of raising a KeyError
    anchor = cell.find("a", href = True)
    if anchor is not None:
        links.append(anchor["href"]) # contains the link
        titles.append(anchor.text) # contains the title

In [5]:
# Preview the list of links (page names relative to the site root)
links

['a-sailor-went-to-sea.html',
 'a-tisket-a-tasket.html',
 'a-wise-old-owl.html',
 'a-you-re-adorable.html',
 'abc-song.html',
 'alice-the-camel.html',
 'all-the-pretty-little-horses.html',
 'alphabet-song.html',
 'animal-alphabet-song.html',
 'the-animal-fair.html',
 'apple-on-a-stick.html',
 'baa-baa-black-sheep.html',
 'baby-bumble-bee.html',
 'betty-botter.html',
 'billy-boy.html',
 'bingo.html',
 'blow-wind-blow.html',
 'bobby-shaftoes-gone-to-the-sea.html',
 'brahms-lullaby.html',
 'brother-john.html',
 'brush-brush-brush-your-teeth.html',
 'brush-your-teeth-song.html',
 'bye-baby-bunting.html',
 'our-baby-cheeks-of-rose-tiny-toes.html',
 'chubby-cheeks.html',
 'cobbler-cobbler.html',
 'cock-a-doodle-doo.html',
 'cold-and-raw-the-north-wind-doth-blow.html',
 'cotton-eyed-joe.html',
 'cradle-song-golden-slumbers.html',
 'daisy-bell-bicycle-built-for-two.html',
 'days-of-the-week.html',
 'did-you-ever-see-a-lassie.html',
 'diddle-diddle-dumpling.html',
 'ding-dong-bell.html',
 'doct

In [6]:
# Preview the list of titles (one title per scraped link)
titles

['A Sailor Went To Sea',
 'A-Tisket, A-Tasket',
 'A Wise Old Owl',
 "A, You're Adorable",
 'ABC Song',
 'Alice The Camel',
 'All That Pretty Horses',
 'Alphabet Song',
 'Animal Alphabet Song',
 'Animal Fair, The',
 'Apple On a Stick',
 'Baa Baa Black Sheep',
 'Baby Bumble Bee',
 'Betty Botter',
 'Billy Boy',
 'Bingo (B-I-N-G-O)',
 'Blow Wind, Blow',
 "Bobby Shaftoe's Gone To The Sea",
 'Brahms Lullaby',
 'Brother John',
 'Brush, Brush, Brush Your Teeth',
 'Brush-Your-Teeth-Song',
 'Bye Baby Bunting',
 'Cheeks of Rose',
 'Chubby Cheeks',
 'Cobbler, Cobbler',
 'Cock A Doodle Doo',
 'Cold And Raw The North Wind Doth Blow',
 'Cotton-Eyed Joe',
 'Cradle Song',
 'Daisy Bell',
 'Days of the Week',
 'Did You Ever See a Lassie?',
 'Diddle Diddle Dumpling',
 'Ding Dong Little Bell',
 'Doctor Foster',
 'Down Down Baby',
 'Early to Bed',
 'Eeny, Meeny, Miny, Moe',
 'Finger Family Song',
 'Five Fat Sausages',
 'Five Little Ducks',
 'Five Little Monkeys',
 'Five Little Snowmen',
 'Five Little Speckl

In [7]:
# Combine the scraped titles and links into one dataframe (one row per rhyme)
rhymes = pd.DataFrame(
    data = {
        "Title": titles,
        "URLs": links,
    }
)

# Show the first rows of the dataframe
rhymes.head()

Unnamed: 0,Title,URLs
0,A Sailor Went To Sea,a-sailor-went-to-sea.html
1,"A-Tisket, A-Tasket",a-tisket-a-tasket.html
2,A Wise Old Owl,a-wise-old-owl.html
3,"A, You're Adorable",a-you-re-adorable.html
4,ABC Song,abc-song.html


In [8]:
# Report how many nursery rhymes were found (one dataframe row per rhyme)
print(f'There are {len(rhymes["Title"])} nursery rhymes in the repository.')

There are 189 nursery rhymes in the repository.


### Scrape and clean the nursery rhyme text for special characters; store the cleaned text in the dataframe

In [None]:
# Create an empty list of lyrics for all nursery rhymes
# It holds the first <em> element of every nursery rhyme page and stays aligned
# with the rows of `rhymes`: a page that cannot be fetched, or that has no <em>
# element, contributes None
lyrics_list = []

# Links of the pages that could not be scraped; reported after the loop because
# clear_output wipes the warnings raised while the loop is running
lyrics_failures = []

# Prepare the loop through the pages
start_time = time()
requests = 0

# Loop through the links directly; the page address gets its own name so the
# `url` of the source page defined earlier in the notebook is not overwritten
for link in rhymes["URLs"]:
    rhyme_url = "https://nurseryrhymes.org/" + link

    requests += 1
    failure = None

    # Make a get request for the contents of the URL; the timeout (in seconds)
    # stops a stalled connection from blocking the crawl, and a network error
    # on one page no longer aborts the whole run
    try:
        response = get(rhyme_url, timeout = 30)
    except RequestException as error:
        response = None
        failure = f"Failed: {error}"
    else:
        if response.status_code != 200:
            failure = f"Status code: {response.status_code}"

    # Pause the loop
    sleep(randint(8,15))

    # Monitor the requests
    elapsed_time = time() - start_time
    print(f"Request {requests}; Frequency: {requests/elapsed_time} requests/sec")
    clear_output(wait = True) # to replace output with new one (instead of printing many outputs)

    # If there's an error, throw an error warning and keep a placeholder
    # instead of parsing an error page as if it were a nursery rhyme
    if failure is not None:
        warn(f"Request: {requests}; {failure}")
        lyrics_failures.append(link)
        lyrics_list.append(None)
        continue

    # Scrape with BeautifulSoup
    soup2 = BeautifulSoup(response.text, "html.parser")

    # Get the lyrics for the nursery rhyme
    lyrics = soup2.find("em")
    lyrics_list.append(lyrics)

# Summarise the failed pages once the progress output is no longer replaced
if lyrics_failures:
    warn(f"{len(lyrics_failures)} page(s) could not be scraped: {lyrics_failures}")

# Preview the lyrics list (first entries only, to keep the output short)
lyrics_list[:5]

Request 135; Frequency: 0.07140203476979347 requests/sec


In [None]:
# Create empty list of rhyme lines
# One entry per nursery rhyme link, aligned with the rows of `rhymes`: the list
# of child nodes of the lyrics container, or "" when the page could not be
# fetched or has no such container
# NOTE(review): this downloads every page that the previous lyrics cell has
# already requested - consider parsing each page once for both results
rhyme_lines = []

# Links of the pages that could not be scraped; reported after the loop because
# clear_output wipes the warnings raised while the loop is running
rhyme_line_failures = []

# Prepare for looping through the links
start_time = time()
requests = 0

# Loop through the links
for link in rhymes["URLs"]:
    # Complete the url for each nursery rhyme; a separate name keeps the `url`
    # of the source page defined earlier in the notebook intact
    rhyme_url = "https://nurseryrhymes.org/" + link

    requests += 1
    failure = None

    # Make a get request for the contents of the link; the timeout (in seconds)
    # stops a stalled connection from blocking the crawl, and a network error
    # on one page no longer aborts the whole run
    try:
        response2 = get(rhyme_url, timeout = 30)
    except RequestException as error:
        response2 = None
        failure = f"Failed: {error}"
    else:
        if response2.status_code != 200:
            failure = f"Status code: {response2.status_code}"

    # Pause the loop
    sleep(randint(8,15))

    # Monitor the requests
    elapsed_time = time() - start_time
    print(f"Request {requests}; Frequency: {requests/elapsed_time} requests/sec")
    clear_output(wait = True) # to replace output with new one (instead of printing many outputs)

    # If there's an error, throw an error warning and store the same ""
    # placeholder that is used for pages without a lyrics container
    if failure is not None:
        warn(f"Request: {requests}; {failure}")
        rhyme_line_failures.append(link)
        rhyme_lines.append("")
        continue

    # Scrape with BeautifulSoup
    rhyme_soup = BeautifulSoup(response2.text, "html.parser")

    # List the lines found in the URL
    lyrics_div = rhyme_soup.find("div", {"id": "nursery-rhymes-lyrics"})
    rhymes_html = list(lyrics_div) if lyrics_div is not None else ""

    rhyme_lines.append(rhymes_html)

# Summarise the failed pages once the progress output is no longer replaced
if rhyme_line_failures:
    warn(f"{len(rhyme_line_failures)} page(s) could not be scraped: {rhyme_line_failures}")

In [None]:
# Inspect the collected lines (one entry per nursery rhyme link)
rhyme_lines

### Store the cleaned nursery rhymes into a SQLite database