## Robert Frost, meet Natural Language Processing

### Extract the data

In [None]:
# Dependencies to read the SQLite database
import pandas as pd
import sqlite3

In [None]:
# Connect to the poetry database
conn = sqlite3.connect("Poetry.db")

# Load the whole Frost table into a dataframe. The finally clause closes the
# connection even when the query raises (for example if the table is missing).
try:
    df = pd.read_sql_query("select * from Frost;", conn)
finally:
    conn.close()

# Preview the first rows of the dataframe
df.head()

In [None]:
# Keep only the poem title and the poem text
keep_columns = ["title", "lines"]
df = df[keep_columns]

### Transform the data

In [None]:
# Dependencies
import nltk

# Fetch the NLTK resources used below. Newer NLTK releases load the
# "punkt_tab" tables for word_tokenize, while older ones use "punkt", so both
# are requested.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#### Tokenise, Remove Stop Words, Lemmatise

In [None]:
# Tokens to discard later: NLTK's English stop words plus punctuation marks
stops = stopwords.words("english")
punctuations = [
    ",", ".", "/", "?", "!", ";", ":", "-",
    "’", "‘",
    "",
]

In [None]:
# Tokenise each lower-cased poem and keep only the tokens that are neither
# stop words nor punctuation; the result is one token list per poem.
excluded = set(stops).union(punctuations)  # built once so each lookup is O(1)
words_list = []
for poem in df["lines"]:
    tokens = word_tokenize(poem.lower())
    words_list.append([token for token in tokens if token not in excluded])

# Store the per-poem token lists alongside the poems
df["tokens"] = words_list
df.head()

In [None]:
# Reduce the tokens of the first poem (row label 0) to their verb lemmas
# NOTE(review): only one poem is processed here; extend to every row if the
# whole "tokens" column was meant to be lemmatised.
lemmatiser = WordNetLemmatizer()
first_poem_tokens = df["tokens"][0]
lemmatised = [lemmatiser.lemmatize(token, pos="v") for token in first_poem_tokens]

lemmatised

In [None]:
# Helper that reports how many words a poem's word list holds
def word_count(word_list):
    """Return the number of items in ``word_list``."""
    total = len(word_list)
    return total

# Define the function as a user-defined function (UDF)
# NOTE(review): `udf` and `IntegerType` are not imported in any visible cell;
# presumably pyspark.sql.functions.udf and pyspark.sql.types.IntegerType.
# Confirm the import exists before this cell runs.
countWords = udf(word_count, IntegerType())

In [None]:
# Helper that subtracts one column value from another, row by row
def difference(a, b):
    """Return ``a`` minus ``b``."""
    delta = a - b
    return delta

# State that this function is UDF
# NOTE(review): `udf` and `IntegerType` are not imported in any visible cell;
# presumably pyspark.sql.functions.udf and pyspark.sql.types.IntegerType.
# Confirm the import exists before this cell runs.
diffLength = udf(difference, IntegerType())

In [None]:
# NOTE(review): `tokeniser`, `remover`, `stemmer_udf`, `col` and `spark_df`
# are not defined in the visible cells; confirm they are set up earlier.

# Tokenise: turn each poem's string of words into a list of words
spark_df2 = tokeniser.transform(spark_df)

# Filter: drop stop words from wordList, keeping only the columns used below
spark_df3 = (
    remover.transform(spark_df2)
    .select("title", "wordList", "filteredList")
)

# Stem: add a "stemmed" column derived from the filtered words
stemmed_column = stemmer_udf(col("filteredList"))
spark_df4 = spark_df3.withColumn("stemmed", stemmed_column)
spark_df4.show()

In [None]:
# Add per-poem counts: all words, words left after stop-word removal, and the
# difference between the two
spark_df5 = (
    spark_df4
    .withColumn("wordCount", countWords(col("wordList")))
    .withColumn("filteredCount", countWords(col("filteredList")))
    .withColumn("stopWordsCount", diffLength("wordCount", "filteredCount"))
)
spark_df5.show()

In [None]:
# Descriptive statistics for the raw and the stop-word-filtered word counts
count_columns = ["wordCount", "filteredCount"]
spark_df5.select(*count_columns).describe().show()

In [None]:
# The longest poem by Robert Frost
# Fetch the row once and read its fields by name, so the lookup is not
# repeated per printed field and does not depend on column order.
longest = spark_df5.rdd.max(key=lambda row: row["wordCount"])
print(f"--Longest Poem-- \
      \nTitle: '{longest['title']}', \
      \nNumber of words: {longest['wordCount']}")

# Shortest poem by Robert Frost
shortest = spark_df5.rdd.min(key=lambda row: row["wordCount"])
print(f"\n--Shortest Poem-- \
      \nTitle: '{shortest['title']}', \
      \nNumber of words: {shortest['wordCount']}")

In [None]:
# Keep the "stemmed" column as a DataFrame. show() only prints a preview and
# returns None, so its result must not be what gets bound to `text`.
text = spark_df5.select("stemmed")
text.show()
text

In [None]:
# Inspect the runtime type of `text`
type(text)