# Text mining and analyses using nursery rhymes

In [1]:
# Dependencies for creating the dataframe and for preliminary data analyses
import pandas as pd
import sqlite3
from pprint import pprint
import numpy as np
import math
import os
import re, string
from time import time
from IPython.core.display import clear_output

# Dependencies for natural language processing
import nltk
# nltk.download("punkt")
# nltk.download('stopwords')
# nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

## Load the data from the database into a dataframe

In [2]:
# Load the database table into a dataframe
conn = sqlite3.connect("db/nursery_rhymes.sqlite")  # connect to the database
try:
    # Get the contents of the Nursery_Rhymes table
    df = pd.read_sql_query("SELECT * FROM Nursery_Rhymes;", conn)
finally:
    # Always release the connection, even if the query above raises
    conn.close()

In [3]:
# Show the first five rows of the freshly loaded table
df.head(n=5)

Unnamed: 0,index,Title,URLs,Lyrics
0,0,A Sailor Went To Sea,a-sailor-went-to-sea.html,"A sailor went to sea, sea, sea \n To se..."
1,1,"A-Tisket, A-Tasket",a-tisket-a-tasket.html,"A-tisket, a-tasket\nA green and yellow basket\..."
2,2,A Wise Old Owl,a-wise-old-owl.html,A wise old owl lived in an oak.\nThe more he s...
3,3,"A, You're Adorable",a-you-re-adorable.html,"""A"" you're adorable\n ""B"" you're so beaut..."
4,4,ABC Song,abc-song.html,"A for Apple, A for Ant\nB for Ball, B for Bat\..."


## Natural language processing

### Data cleaning: Tokenise, remove stop words, lemmatise

In [4]:
# Stop words to drop: NLTK's English list, followed by contraction fragments
# (e.g. "n't", "'re"), every single letter, the empty string and a few extra words
stops = stopwords.words("english") + [
    "ca", "n't", "'re",
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o",
    "dont",
    "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
    "'s", "", "ch", "wo", "'d", "'ll", "'m", "im",
    "could",
]

# Punctuation tokens to drop: each ASCII punctuation character plus a few
# multi-character tokens
exclude = [*set(string.punctuation), ",", "``", "''", "...", "---------", "¦"]

# Regex matching any single digit (used to strip numbers written as digits)
numbers = "[0-9]"

# Regex for special characters to strip.
# NOTE: the square brackets make this a character class, so every character
# listed inside it (including the comma and the space) is matched individually.
sp_char = "[â\x80\x99, â\x80\x98, â\x80\x93¦]"

# Lemmatiser used to reduce each token to its root form
lemmatiser = WordNetLemmatizer()

In [5]:
# Create tokens

# Build the removal set once so each membership test is a hash lookup
# rather than a scan of two lists
removable_tokens = set(stops).union(exclude)


def preprocess_lyric(lyric):
    """Turn one nursery rhyme lyric into a list of cleaned tokens.

    Steps, applied in this order: tokenise, lower-case, strip digits, strip
    the special characters matched by ``sp_char``, lemmatise (as verb, noun,
    adverb, then adjective), drop stop words and punctuation tokens, and drop
    tokens of two characters or fewer.

    Parameters
    ----------
    lyric : str
        Raw lyric text of a single nursery rhyme.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Tokenise the lyrics and convert letters to lower case
    words = [word.lower() for word in word_tokenize(lyric)]

    # Remove the numbers (written as digits), then the special characters
    words = [re.sub(sp_char, "", re.sub(numbers, "", word)) for word in words]

    # Lemmatise the tokens, one part of speech at a time
    for pos in ("v", "n", "r", "a"):
        words = [lemmatiser.lemmatize(word, pos=pos) for word in words]

    # Remove stop words, punctuation marks and tokens of two characters or fewer
    return [word for word in words if word not in removable_tokens and len(word) > 2]


tokens = []
filtered_poems = []
# Iterate positionally so the loop does not depend on the dataframe's index labels
for x, (title, lyric) in enumerate(zip(df["Title"], df["Lyrics"])):
    if pd.isna(lyric):
        # Print out the nursery rhymes without lyrics
        print(f"webscrape nursery rhyme no. {x}: {title}")
        # Keep an empty placeholder so both lists stay the same length as df
        # and can still be assigned as columns
        tokens.append([])
        filtered_poems.append("")
        continue

    rhyme_tokens = preprocess_lyric(lyric)

    # Add the token list, and the tokens joined into one string per nursery rhyme
    tokens.append(rhyme_tokens)
    filtered_poems.append(" ".join(rhyme_tokens))

In [6]:
# Attach the token lists and the joined, cleaned lyrics as two new columns
# (assigned left to right, so "Tokens" is added before "Filtered_Poem")
df["Tokens"], df["Filtered_Poem"] = tokens, filtered_poems

# Show the first five rows with the new columns
df.head(n=5)

Unnamed: 0,index,Title,URLs,Lyrics,Tokens,Filtered_Poem
0,0,A Sailor Went To Sea,a-sailor-went-to-sea.html,"A sailor went to sea, sea, sea \n To se...","[sailor, sea, sea, sea, see, see, see, see, se...",sailor sea sea sea see see see see see see see...
1,1,"A-Tisket, A-Tasket",a-tisket-a-tasket.html,"A-tisket, a-tasket\nA green and yellow basket\...","[a-tisket, a-tasket, green, yellow, basket, wr...",a-tisket a-tasket green yellow basket write le...
2,2,A Wise Old Owl,a-wise-old-owl.html,A wise old owl lived in an oak.\nThe more he s...,"[wise, old, owl, live, oak, saw, speak, speak,...",wise old owl live oak saw speak speak hear lik...
3,3,"A, You're Adorable",a-you-re-adorable.html,"""A"" you're adorable\n ""B"" you're so beaut...","[adorable, beautiful, cute, full, charm, darli...",adorable beautiful cute full charm darling exc...
4,4,ABC Song,abc-song.html,"A for Apple, A for Ant\nB for Ball, B for Bat\...","[apple, ant, ball, bat, carrot, car, duck, doo...",apple ant ball bat carrot car duck door elevat...


In [8]:
# Determine the number of unique words per rhyme
unique_words = [list(set(rhyme)) for rhyme in df["Tokens"]]
length_unique_words = [len(rhyme) for rhyme in unique_words]

# Calculate lexical diversity (unique words relative to the whole nursery rhyme lyric).
# A rhyme with no tokens left after cleaning has no defined ratio, so it gets NaN
# instead of raising ZeroDivisionError. Rows are paired positionally, so this does
# not depend on the dataframe's index labels.
lex_div = [
    round(n_unique / len(rhyme), 3) if len(rhyme) > 0 else float("nan")
    for n_unique, rhyme in zip(length_unique_words, df["Tokens"])
]

# Add lexical diversity calculation in dataframe
df["Lexical_Diversity"] = lex_div

# Preview the dataframe
df.head()

Unnamed: 0,index,Title,URLs,Lyrics,Tokens,Filtered_Poem,Lexical_Diversity
0,0,A Sailor Went To Sea,a-sailor-went-to-sea.html,"A sailor went to sea, sea, sea \n To se...","[sailor, sea, sea, sea, see, see, see, see, se...",sailor sea sea sea see see see see see see see...,0.133
1,1,"A-Tisket, A-Tasket",a-tisket-a-tasket.html,"A-tisket, a-tasket\nA green and yellow basket\...","[a-tisket, a-tasket, green, yellow, basket, wr...",a-tisket a-tasket green yellow basket write le...,0.8
2,2,A Wise Old Owl,a-wise-old-owl.html,A wise old owl lived in an oak.\nThe more he s...,"[wise, old, owl, live, oak, saw, speak, speak,...",wise old owl live oak saw speak speak hear lik...,0.769
3,3,"A, You're Adorable",a-you-re-adorable.html,"""A"" you're adorable\n ""B"" you're so beaut...","[adorable, beautiful, cute, full, charm, darli...",adorable beautiful cute full charm darling exc...,0.97
4,4,ABC Song,abc-song.html,"A for Apple, A for Ant\nB for Ball, B for Bat\...","[apple, ant, ball, bat, carrot, car, duck, doo...",apple ant ball bat carrot car duck door elevat...,0.981
