# Text mining and analyses using nursery rhymes

In [1]:
# Dependencies for creating the dataframe and for preliminary data analyses
import pandas as pd
import sqlite3
from pprint import pprint
import numpy as np
import math
import os
import re, string
from time import time
from IPython.core.display import clear_output

# Dependencies for natural language processing
import nltk
# nltk.download("punkt")
# nltk.download('stopwords')
# nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

## Load the data from the database into a dataframe

In [2]:
# Load the Nursery_Rhymes table from the SQLite database into a dataframe.
# The connection is closed in a `finally` clause so that it is released even
# when the query raises (for example, if the table cannot be read).
conn = sqlite3.connect("db/nursery_rhymes.sqlite") # connect to the database
try:
    df = pd.read_sql_query("SELECT * FROM Nursery_Rhymes;", conn) # get the contents of the Nursery_Rhymes table
finally:
    conn.close() # always close the connection to the database

In [3]:
# Show the first five rows of the dataframe as a quick sanity check
df.head(n=5)

Unnamed: 0,index,Title,URLs,Lyrics
0,0,A Sailor Went To Sea,a-sailor-went-to-sea.html,"A sailor went to sea, sea, sea \n To se..."
1,1,"A-Tisket, A-Tasket",a-tisket-a-tasket.html,"A-tisket, a-tasket\nA green and yellow basket\..."
2,2,A Wise Old Owl,a-wise-old-owl.html,A wise old owl lived in an oak.\nThe more he s...
3,3,"A, You're Adorable",a-you-re-adorable.html,"""A"" you're adorable\n ""B"" you're so beaut..."
4,4,ABC Song,abc-song.html,"A for Apple, A for Ant\nB for Ball, B for Bat\..."


## Natural language processing

### Data cleaning: Tokenise, remove stop words, lemmatise

In [4]:
# Define stop words for removal: NLTK's English stop words plus contraction
# fragments (e.g. "n't", "'re"), single letters, and the empty string
stops = stopwords.words("english")
stops.extend(["ca", "n't", "'re", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "dont",
              "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'s", "", "ch", "wo", "'d", "'ll", "'m", "im"])

# Define punctuation marks for removal
exclude = list(set(string.punctuation)) + [",", "``", "''", "...", "---------", "¦"]

# Define numbers for removal (regex: any single digit)
numbers = '[0-9]'

# Define special characters for removal (regex).
# These are the UTF-8 bytes of the right/left curly quotes and the en dash after
# being mis-decoded as single-byte characters, plus the broken bar.
# They are written as an alternation of whole sequences: a bracketed character
# class would instead match every individual "â", comma and space.
sp_char = "â\x80\x99|â\x80\x98|â\x80\x93|¦"

# Create the WordNet lemmatiser (for reducing words to their root forms)
lemmatiser = WordNetLemmatizer()

In [5]:
# Create tokens: one list of cleaned, lower-cased words per nursery rhyme.
# Each rhyme is tokenised, then digits and special characters are stripped from
# every token, and finally stop words and punctuation tokens are dropped.
# NOTE: rhymes without lyrics are reported and skipped, so positions in `tokens`
# stop lining up with dataframe rows once a rhyme has been skipped.
removable_words = set(stops) | set(exclude)  # one set lookup per token instead of scanning two lists

tokens = []
for x, (title, lyrics) in enumerate(zip(df["Title"], df["Lyrics"])):
    if pd.isna(lyrics):  # covers both None and NaN
        print(f"webscrape nursery rhyme no. {x}: {title}")
        continue
    lowered = [word.lower() for word in word_tokenize(lyrics)]
    no_digits = [re.sub(numbers, "", word) for word in lowered]
    no_special = [re.sub(sp_char, "", word) for word in no_digits]
    # Filter last: the substitutions above can turn a token into a stop word or ""
    tokens.append([word for word in no_special if word not in removable_words])

In [6]:
# Preview the tokenised nursery rhyme lyrics.
# `tokens` is a list of lists (one inner list of words per tokenised rhyme);
# leaving the bare name as the last expression displays the whole structure.
tokens

[['sailor',
  'went',
  'sea',
  'sea',
  'sea',
  'see',
  'could',
  'see',
  'see',
  'see',
  'could',
  'see',
  'see',
  'see',
  'bottom',
  'deep',
  'blue',
  'sea',
  'sea',
  'sea',
  'sailor',
  'went',
  'knee',
  'knee',
  'knee',
  'see',
  'could',
  'knee',
  'knee',
  'knee',
  'could',
  'knee',
  'knee',
  'knee',
  'bottom',
  'deep',
  'blue',
  'knee',
  'knee',
  'knee',
  'sea',
  'sea',
  'sea',
  'sailor',
  'went',
  'chop',
  'chop',
  'chop',
  'see',
  'could',
  'chop',
  'chop',
  'chop',
  'could',
  'chop',
  'chop',
  'chop',
  'bottom',
  'deep',
  'blue',
  'chop',
  'chop',
  'chop',
  'sea',
  'sea',
  'sea',
  'knee',
  'knee',
  'knee'],
 ['a-tisket',
  'a-tasket',
  'green',
  'yellow',
  'basket',
  'wrote',
  'letter',
  'love',
  'way',
  'dropped',
  'dropped',
  'dropped',
  'yes',
  'way',
  'dropped',
  'little',
  'boy',
  'picked',
  'put',
  'pocket'],
 ['wise',
  'old',
  'owl',
  'lived',
  'oak',
  'saw',
  'less',
  'spoke',
  'l