## Robert Frost, meet Natural Language Processing

### Extract the data

In [1]:
# Dependencies to read the SQLite database
import pandas as pd
import sqlite3

In [2]:
# Connect to the poetry database
conn = sqlite3.connect("Poetry.db")

# Load the data into a dataframe
df = pd.read_sql_query("select * from Frost;", conn)
conn.close()

# Print the dataframe
df.head()

Unnamed: 0,index,title,link,lines
0,0,October,https://www.poetryfoundation.org/poems/53084/o...,"O hushed October morning mild, Thy leaves have..."
1,1,"‘Out, Out—’",https://www.poetryfoundation.org/poems/53087/o...,The buzz saw snarled and rattled in the yard A...
2,2,Acquainted with the Night,https://www.poetryfoundation.org/poems/47548/a...,I have been one acquainted with the night. I h...
3,3,After Apple-Picking,https://www.poetryfoundation.org/poems/44259/a...,My long two-pointed ladder's sticking through ...
4,4,Birches,https://www.poetryfoundation.org/poems/44260/b...,When I see birches bend to left and right Acro...


In [3]:
# Choose the relevant columns
df1 = df[["title", "lines"]]

In [4]:
# Put all letters in lower case
df1["lines"] = df1["lines"].str.lower()

# Preview the dataframe
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,title,lines
0,October,"o hushed october morning mild, thy leaves have..."
1,"‘Out, Out—’",the buzz saw snarled and rattled in the yard a...
2,Acquainted with the Night,i have been one acquainted with the night. i h...
3,After Apple-Picking,my long two-pointed ladder's sticking through ...
4,Birches,when i see birches bend to left and right acro...


### Transform the data

In [5]:
# Dependencies
import re, string

import nltk
nltk.download("punkt")
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rochiecuevas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rochiecuevas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rochiecuevas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Tokenise, Remove Stop Words, Lemmatise
Reference for lemmatisation: https://marcobonzanini.com/2015/01/26/stemming-lemmatisation-and-pos-tagging-with-python-and-nltk/

In [6]:
# Remove stop words from the list
stops = stopwords.words("english")
exclude = set(string.punctuation)

# Lemmatise the words in each list to retain their roots
lemmatiser = WordNetLemmatizer()

In [7]:
# Transform the poem in preparation for word counts
words_list = []
preprocessed_text = []
for poem in df1["lines"]:
    
    # Create a list of words per poem after the words are converted to lowercase    
    words = word_tokenize(poem)
    
    # Filter to remove stop words and punctuations    
    words2 = [word for word in words if word not in stops and word not in exclude]
    
    # Lemmatise each word (if it's a verb, convert to root verb)
    words3 = [lemmatiser.lemmatize(word, pos = "v") for word in words2]
    
    # Add the filtered list of words (representing each poem)
    words_list.append(words3)
    
    # Convert the list of strings back to one string
    words4 = " ".join(words3)
    
    # Add the filtered list of words (representing each poem)
    preprocessed_text.append(words4)

df1["tokens"] = words_list    
df1["filteredPoem"] = preprocessed_text
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,title,lines,tokens,filteredPoem
0,October,"o hushed october morning mild, thy leaves have...","[hush, october, morning, mild, thy, leave, rip...",hush october morning mild thy leave ripen fall...
1,"‘Out, Out—’",the buzz saw snarled and rattled in the yard a...,"[buzz, saw, snarl, rattle, yard, make, dust, d...",buzz saw snarl rattle yard make dust drop stov...
2,Acquainted with the Night,i have been one acquainted with the night. i h...,"[one, acquaint, night, walk, rain—and, back, r...",one acquaint night walk rain—and back rain out...
3,After Apple-Picking,my long two-pointed ladder's sticking through ...,"[long, two-pointed, ladder, 's, stick, tree, t...",long two-pointed ladder 's stick tree toward h...
4,Birches,when i see birches bend to left and right acro...,"[see, birch, bend, leave, right, across, line,...",see birch bend leave right across line straigh...


In [8]:
# Create a function that counts the number of words in each poem
def word_count(word_list):
    return len(word_list)

In [9]:
# Determine the length of each filtered poem
lengths = []
for poem in df1["tokens"]:
    length = word_count(poem)
    lengths.append(length)

# Add the filtered poem lengths in the df
df1["poemLength"] = lengths
df1.head()

Unnamed: 0,title,lines,tokens,filteredPoem,poemLength
0,October,"o hushed october morning mild, thy leaves have...","[hush, october, morning, mild, thy, leave, rip...",hush october morning mild thy leave ripen fall...,83
1,"‘Out, Out—’",the buzz saw snarled and rattled in the yard a...,"[buzz, saw, snarl, rattle, yard, make, dust, d...",buzz saw snarl rattle yard make dust drop stov...,159
2,Acquainted with the Night,i have been one acquainted with the night. i h...,"[one, acquaint, night, walk, rain—and, back, r...",one acquaint night walk rain—and back rain out...,54
3,After Apple-Picking,my long two-pointed ladder's sticking through ...,"[long, two-pointed, ladder, 's, stick, tree, t...",long two-pointed ladder 's stick tree toward h...,142
4,Birches,when i see birches bend to left and right acro...,"[see, birch, bend, leave, right, across, line,...",see birch bend leave right across line straigh...,252


In [10]:
# Longest and shortest poems
longest_poem = df1["poemLength"].max()
shortest_poem = df1["poemLength"].min()

for i in range(0, len(df1["poemLength"])):
    if df1["poemLength"][i] == longest_poem:
        print(f'Longest poem: {df1["title"][i]}; Filtered poem length: {df1["poemLength"][i]} words')
    if df1["poemLength"][i] == shortest_poem:
        print(f'Shortest poem: {df1["title"][i]}; Filtered poem length: {df1["poemLength"][i]} words')      

Shortest poem: Dust of Snow; Filtered poem length: 15 words
Longest poem: The Death of the Hired Man; Filtered poem length: 787 words


### Word importance
Source: https://stevenloria.com/tf-idf/

In [11]:
# Dependencies
import math
from textblob import TextBlob as tb

In [12]:
# Create a function that calculates term frequency
def tf(word, poem):
    return poem.words.count(word) / len(poem.words)

# Create a function that determines the number of documents that contain a certain word
def n_docs(word, poemlist):
    return sum(1 for poem in poemlist if word in poem.words)

# Create a function that determines the inverse document frequency (IDF)
# IDF = how common a word is among all the documents in poemlist
def idf(word, poemlist):
    return math.log(len(poemlist) / (1 + n_docs(word, poemlist)))

def tdidf(word, poem, poemlist):
    return tf(word, poem) * idf(word, poemlist)

In [13]:
# Create the poemlist from df["lines"]
poemlist = [tb(poem) for poem in df1["filteredPoem"]]
poemlist

[TextBlob("hush october morning mild thy leave ripen fall tomorrow ’ wind wild waste crow forest call tomorrow may form go hush october morning mild begin hours day slow make day seem us less brief hearts averse beguile beguile us way know release one leaf break day noon release another leaf one tree one far away retard sun gentle mist enchant land amethyst slow slow grapes ’ sake whose leave already burn frost whose cluster fruit must else lost— grapes ’ sake along wall"),
 TextBlob("buzz saw snarl rattle yard make dust drop stove-length stick wood sweet-scented stuff breeze draw across lift eye could count five mountain range one behind sunset far vermont saw snarl rattle snarl rattle run light bear load nothing happen day do call day wish might say please boy give half hour boy count much save work sister stand beside apron tell ‘ supper. ’ word saw prove saw know supper mean leap boy ’ hand seem leap— must give hand however neither refuse meet hand boy ’ first outcry rueful laugh s

In [14]:
# Create an empty list to be filled with text blobs from cleaning poemlist
poemlist2 = []

# Loop through the poemlist
for i in range(0, len(poemlist)):
    
    # Remove words that are shorter than 3 characters
    new_string = ' '.join([w for w in str(poemlist[i]).split() if len(w) > 3])
    
    # Replace emm dash with space
    new_string2 = new_string.replace("—", " ")
    
    # Convert string to text blob
    new_string2 = tb(new_string2)
    
    # Append the text blob to the list of text blobs
    poemlist2.append(new_string2)
    
poemlist2

[TextBlob("hush october morning mild leave ripen fall tomorrow wind wild waste crow forest call tomorrow form hush october morning mild begin hours slow make seem less brief hearts averse beguile beguile know release leaf break noon release another leaf tree away retard gentle mist enchant land amethyst slow slow grapes sake whose leave already burn frost whose cluster fruit must else lost  grapes sake along wall"),
 TextBlob("buzz snarl rattle yard make dust drop stove-length stick wood sweet-scented stuff breeze draw across lift could count five mountain range behind sunset vermont snarl rattle snarl rattle light bear load nothing happen call wish might please give half hour count much save work sister stand beside apron tell supper. word prove know supper mean leap hand seem leap  must give hand however neither refuse meet hand first outcry rueful laugh swing toward hold hand half appeal half keep life spill all  since enough know work though child heart  spoil hand off  doctor come

In [15]:
# Calculate the most important words
impt_words = []
for i, poem in enumerate(poemlist2):
    scores = {word: tdidf(word, poem, poemlist2) for word in poem.words}
    sorted_words = sorted(scores.items(), key = lambda x: x[1], reverse = True)
    
    for word, score in sorted_words[:5]:
        impt_words.append((i + 1, word, round(score, 5)))

In [16]:
# Create a dataframe of important words per poem
df2 = pd.DataFrame(impt_words, columns = ["PoemNo", "Word", "TD-IDF"])

# Add titles for each poem in df2
titles = []
for i in range(0, len(df)):
    for p in df2.PoemNo:
        if i == p - 1:
            title = df["title"][i]
            titles.append(title) 

df2["PoemTitle"] = titles

# Remove 
df2.head()

Unnamed: 0,PoemNo,Word,TD-IDF,PoemTitle
0,1,slow,0.1142,October
1,1,hush,0.08842,October
2,1,october,0.08842,October
3,1,mild,0.08842,October
4,1,beguile,0.08842,October


In [17]:
df3 = pd.DataFrame(df2.groupby(["PoemTitle", "Word"]).min())
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,PoemNo,TD-IDF
PoemTitle,Word,Unnamed: 2_level_1,Unnamed: 3_level_1
Acquainted with the Night,acquaint,3,0.12416
Acquainted with the Night,city,3,0.10691
Acquainted with the Night,furthest,3,0.06208
Acquainted with the Night,outwalked,3,0.06208
Acquainted with the Night,rain,3,0.10691
After Apple-Picking,apple-picking,4,0.04863
After Apple-Picking,apples,4,0.09726
After Apple-Picking,ladder,4,0.04863
After Apple-Picking,sleep,4,0.09096
After Apple-Picking,thousand,4,0.03708


### Visualise the important words

In [18]:
# Dependencies
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style = "whitegrid")
import numpy as np

from ipywidgets import widgets, interactive

In [19]:
# Create a widget containing poem titles (sorted alphabetically)
titles = list(df.title)
titles.sort()

poem_title = widgets.Dropdown(options = ["Choose a poem..."] + titles, value = "Choose a poem...", 
                              description = "Title:", disabled = False)

In [20]:
# Create a filter based on title
def plot_it(poem_title):
    if poem_title != "Choose a poem...":
        df3 = df2[df2["PoemTitle"] == poem_title]
        
        plt.figure(figsize = (10, 6))
        sns.set(font_scale = 1.5)
        graph = sns.barplot(y = "Word", x = "TD-IDF", data = df3, palette = "Blues_d")

In [21]:
# Plot the data by poem title
interactive(plot_it, poem_title = poem_title)

interactive(children=(Dropdown(description='Title:', options=('Choose a poem...', 'Acquainted with the Night',…