# Text mining and analyses using nursery rhymes

In [1]:
# Dependencies for creating the dataframe and for preliminary data analyses
import pandas as pd
import sqlite3
from pprint import pprint
import numpy as np
import math
import os
import re, string
from time import time
from IPython.core.display import clear_output

# Dependencies for natural language processing
import nltk
# nltk.download("punkt")
# nltk.download('stopwords')
# nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rochiecuevas/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rochiecuevas/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rochiecuevas/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


## Load the data from the database into a dataframe

In [2]:
# Load the database table into a dataframe
conn = sqlite3.connect("db/nursery_rhymes.sqlite") # connect to the database
df = pd.read_sql_query("SELECT * FROM Nursery_Rhymes;", conn) # get the contents of the Nursery_Rhymes table
conn.close() # close the connection to the database

In [3]:
# Preview the dataframe
df.head()

Unnamed: 0,index,Title,URLs,Lyrics
0,0,A Sailor Went To Sea,a-sailor-went-to-sea.html,"A sailor went to sea, sea, sea \n To se..."
1,1,"A-Tisket, A-Tasket",a-tisket-a-tasket.html,"A-tisket, a-tasket\nA green and yellow basket\..."
2,2,A Wise Old Owl,a-wise-old-owl.html,A wise old owl lived in an oak.\nThe more he s...
3,3,"A, You're Adorable",a-you-re-adorable.html,"""A"" you're adorable\n ""B"" you're so beaut..."
4,4,ABC Song,abc-song.html,"A for Apple, A for Ant\nB for Ball, B for Bat\..."


## Natural language processing

### Data cleaning: Tokenise, remove stop words, lemmatise

In [4]:
# Remove stop words from the list
stops = stopwords.words("english")
exclude = list(set(string.punctuation))

# Lemmatise the words in each list to retain their roots
lemmatiser = WordNetLemmatizer()

In [16]:
words = word_tokenize(df["Lyrics"][0])
words1 = [word for word in words if word not in stops or word not in exclude]
words1

['A',
 'sailor',
 'went',
 'to',
 'sea',
 ',',
 'sea',
 ',',
 'sea',
 'To',
 'see',
 'what',
 'he',
 'could',
 'see',
 ',',
 'see',
 ',',
 'see',
 'But',
 'all',
 'that',
 'he',
 'could',
 'see',
 ',',
 'see',
 ',',
 'see',
 'Was',
 'the',
 'bottom',
 'of',
 'the',
 'deep',
 'blue',
 'sea',
 ',',
 'sea',
 ',',
 'sea',
 '!',
 'A',
 'sailor',
 'went',
 'to',
 'knee',
 ',',
 'knee',
 ',',
 'knee',
 'To',
 'see',
 'what',
 'he',
 'could',
 'knee',
 ',',
 'knee',
 ',',
 'knee',
 'But',
 'all',
 'that',
 'he',
 'could',
 'knee',
 ',',
 'knee',
 ',',
 'knee',
 'Was',
 'the',
 'bottom',
 'of',
 'the',
 'deep',
 'blue',
 'knee',
 ',',
 'knee',
 ',',
 'knee',
 '!',
 '(',
 'Sea',
 ',',
 'sea',
 ',',
 'sea',
 ')',
 'A',
 'sailor',
 'went',
 'to',
 'chop',
 ',',
 'chop',
 ',',
 'chop',
 'To',
 'see',
 'what',
 'he',
 'could',
 'chop',
 ',',
 'chop',
 ',',
 'chop',
 'But',
 'all',
 'that',
 'he',
 'could',
 'chop',
 ',',
 'chop',
 ',',
 'chop',
 'Was',
 'the',
 'bottom',
 'of',
 'the',
 'deep',
 'bl