## Robert Frost, meet Natural Language Processing

### Extract the data

In [1]:
# Dependencies to read the SQLite database
import pandas as pd
import sqlite3

In [2]:
# Connect to the poetry database
conn = sqlite3.connect("Poetry.db")

# Load every row of the Frost table into a dataframe. The finally clause
# guarantees the connection is closed even when the query raises (for
# example, if the table does not exist in the database file).
try:
    df = pd.read_sql_query("select * from Frost;", conn)
finally:
    conn.close()

# Preview the first rows of the dataframe
df.head()

Unnamed: 0,index,title,link,lines
0,0,October,https://www.poetryfoundation.org/poems/53084/o...,"O hushed October morning mild, Thy leaves have..."
1,1,"‘Out, Out—’",https://www.poetryfoundation.org/poems/53087/o...,The buzz saw snarled and rattled in the yard A...
2,2,Acquainted with the Night,https://www.poetryfoundation.org/poems/47548/a...,I have been one acquainted with the night. I h...
3,3,After Apple-Picking,https://www.poetryfoundation.org/poems/44259/a...,My long two-pointed ladder's sticking through ...
4,4,Birches,https://www.poetryfoundation.org/poems/44260/b...,When I see birches bend to left and right Acro...


In [3]:
# Keep only the poem title and its text; the other columns are not used below
df = df.loc[:, ["title", "lines"]]

### Transform the data

In [4]:
# Dependencies
import nltk
# Download the NLTK resources this notebook relies on, in a fixed order
for resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(resource)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rochiecuevas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rochiecuevas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rochiecuevas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Tokenise, Remove Stop Words, Lemmatise

In [5]:
# English stop words that will be filtered out of every poem
stops = stopwords.words("english")

# Punctuation tokens (plus the empty string) that will also be filtered out
punctuations = list(",./?!;:-’‘") + [""]

# Lemmatiser used to reduce each remaining word to its base form
lemmatiser = WordNetLemmatizer()

In [6]:
# Transform each poem into a cleaned token list in preparation for word counts

# Build the exclusion set once, outside the loop: a set gives constant-time
# membership tests, instead of scanning both lists for every single token
excluded = set(stops) | set(punctuations)

words_list = []
for poem in df["lines"]:

    # Lower-case the poem and split it into word and punctuation tokens
    words = word_tokenize(poem.lower())

    # Drop stop words and punctuation tokens
    words2 = [word for word in words if word not in excluded]

    # Lemmatise every token as a verb (pos="v"). Note this is applied to all
    # tokens, not only actual verbs, so e.g. the noun "leaves" becomes "leave"
    words3 = [lemmatiser.lemmatize(word, pos="v") for word in words2]

    # Collect the cleaned token list for this poem
    words_list.append(words3)

# One token list per poem, in the same row order as df["lines"]
df["tokens"] = words_list
df.head()

Unnamed: 0,title,lines,tokens
0,October,"O hushed October morning mild, Thy leaves have...","[hush, october, morning, mild, thy, leave, rip..."
1,"‘Out, Out—’",The buzz saw snarled and rattled in the yard A...,"[buzz, saw, snarl, rattle, yard, make, dust, d..."
2,Acquainted with the Night,I have been one acquainted with the night. I h...,"[one, acquaint, night, walk, rain—and, back, r..."
3,After Apple-Picking,My long two-pointed ladder's sticking through ...,"[long, two-pointed, ladder, 's, stick, tree, t..."
4,Birches,When I see birches bend to left and right Acro...,"[see, birch, bend, leave, right, across, line,..."


In [7]:
# Helper that measures the size of a tokenised poem
def word_count(word_list):
    """Return the number of items in ``word_list``.

    Parameters
    ----------
    word_list : sized collection
        Typically the list of tokens of one poem; any object supporting
        ``len()`` is accepted.

    Returns
    -------
    int
        The length of ``word_list``.
    """
    total = len(word_list)
    return total

In [8]:
# Count the tokens left in each poem after filtering
lengths = [word_count(tokens) for tokens in df["tokens"]]

# Store the counts next to the poems they belong to
df["poemLength"] = lengths
df.head()

Unnamed: 0,title,lines,tokens,poemLength
0,October,"O hushed October morning mild, Thy leaves have...","[hush, october, morning, mild, thy, leave, rip...",80
1,"‘Out, Out—’",The buzz saw snarled and rattled in the yard A...,"[buzz, saw, snarl, rattle, yard, make, dust, d...",150
2,Acquainted with the Night,I have been one acquainted with the night. I h...,"[one, acquaint, night, walk, rain—and, back, r...",54
3,After Apple-Picking,My long two-pointed ladder's sticking through ...,"[long, two-pointed, ladder, 's, stick, tree, t...",142
4,Birches,When I see birches bend to left and right Acro...,"[see, birch, bend, leave, right, across, line,...",252


In [9]:
# Longest and shortest poems, measured by their filtered token counts
longest_poem = df["poemLength"].max()
shortest_poem = df["poemLength"].min()

# Iterate over titles and lengths together, by position. Looking rows up with
# df["col"][i] is label-based and breaks when the index is not exactly 0..n-1
# (e.g. after filtering or sorting the dataframe).
for title, length in zip(df["title"], df["poemLength"]):
    if length == longest_poem:
        print(f'Longest poem: {title}; Filtered poem length: {length} words')
    if length == shortest_poem:
        print(f'Shortest poem: {title}; Filtered poem length: {length} words')
        

Shortest poem: Dust of Snow; Filtered poem length: 15 words
Longest poem: The Death of the Hired Man; Filtered poem length: 654 words
