## Robert Frost, meet Natural Language Processing

### Extract the data

In [1]:
# Dependencies to read the SQLite database
import pandas as pd
import sqlite3

In [2]:
# Connect to the poetry database
conn = sqlite3.connect("Poetry.db")

# Load the whole Frost table into a pandas dataframe.
# The try/finally guarantees the connection is closed even when the query
# fails (e.g. the table is missing), so a failed cell run does not leave
# the database file open.
try:
    df = pd.read_sql_query("select * from Frost;", conn)
finally:
    conn.close()

# Display the dataframe
df

Unnamed: 0,index,title,link,lines
0,0,October,https://www.poetryfoundation.org/poems/53084/o...,"O hushed October morning mild, Thy leaves have..."
1,1,"‘Out, Out—’",https://www.poetryfoundation.org/poems/53087/o...,The buzz saw snarled and rattled in the yard A...
2,2,Acquainted with the Night,https://www.poetryfoundation.org/poems/47548/a...,I have been one acquainted with the night. I h...
3,3,After Apple-Picking,https://www.poetryfoundation.org/poems/44259/a...,My long two-pointed ladder's sticking through ...
4,4,Birches,https://www.poetryfoundation.org/poems/44260/b...,When I see birches bend to left and right Acro...
5,5,Christmas Trees,https://www.poetryfoundation.org/poems/57834/c...,The city had withdrawn into itself And left at...
6,6,Dust of Snow,https://www.poetryfoundation.org/poems/44262/d...,The way a crow Shook down on me The dust of sn...
7,7,Fire and Ice,https://www.poetryfoundation.org/poems/44263/f...,"Some say the world will end in fire, Some say ..."
8,8,Fireflies in the Garden,https://www.poetryfoundation.org/poems/42892/f...,"Here come real stars to fill the upper skies, ..."
9,9,"For Once, Then, Something",https://www.poetryfoundation.org/poems/44264/f...,Others taunt me with having knelt at well-curb...


### Transform the data

In [3]:
# Dependencies
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql.functions import col, udf, array
from pyspark.sql.types import IntegerType, ArrayType, StringType

from nltk.stem.porter import *

#### Load the data in PySpark

In [4]:
# Start the Spark session for this notebook (or reuse one that already exists)
spark = (
    SparkSession.builder
    .appName("frost")
    .getOrCreate()
)

# Convert the pandas dataframe into a Spark dataframe
spark_df = spark.createDataFrame(data=df)

#### Tokenise, Remove Stop Words, Stem

In [5]:
# Tokeniser: turns the "lines" text of each poem into a list of words ("wordList")
tokeniser = Tokenizer(outputCol="wordList", inputCol="lines")

# Remover: filters stop words out of "wordList" and stores the rest in "filteredList"
remover = StopWordsRemover(outputCol="filteredList", inputCol="wordList")

# Stemmer: reduces individual words to their roots
stemmer = PorterStemmer()

In [6]:
# Create a function that stems the words to retain their roots
def stem(word_list, min_length=3):
    """Stem every word in ``word_list`` and drop stems that are too short.

    The actual stemming is delegated to the notebook-level ``stemmer`` object.

    Args:
        word_list: Iterable of word strings, or None (e.g. a null array cell
            handed to the function when it runs as a Spark UDF).
        min_length: Minimum number of characters a stem must have to be kept.
            The default of 3 discards stems of one or two characters.

    Returns:
        A new list with the retained stems in their original order, or None
        when ``word_list`` is None, so a null row yields a null result
        instead of raising inside the UDF.
    """
    if word_list is None:
        return None

    stems = (stemmer.stem(word) for word in word_list)
    return [word_stem for word_stem in stems if len(word_stem) >= min_length]
        

# Create a UDF that stems the words in the filteredList.
# stem is registered directly (no wrapping lambda), the same way the
# word_count and difference functions are registered as UDFs in this notebook.
# The UDF returns an array of strings.
stemmer_udf = udf(stem, ArrayType(StringType()))

In [7]:
# Count the words of a poem, given the poem as a list of words
def word_count(word_list):
    """Return the number of items contained in ``word_list``."""
    n_words = len(word_list)
    return n_words

# Register word_count as a Spark user-defined function (UDF) returning an integer
countWords = udf(f=word_count, returnType=IntegerType())

In [8]:
# Row-wise difference between the values of two columns
def difference(a,b):
    """Return ``a`` minus ``b``."""
    result = a - b
    return result

# Register difference as a Spark user-defined function (UDF) returning an integer
diffLength = udf(f=difference, returnType=IntegerType())

In [9]:
# Tokenise: split each poem's string of words into a list of words
spark_df2 = tokeniser.transform(spark_df)

# Drop the stop words from wordList, then keep only the title and the two word lists
spark_df3 = (
    remover
    .transform(spark_df2)
    .select("title", "wordList", "filteredList")
)

# Stem the filtered words into a new "stemmed" column and preview the result
spark_df4 = spark_df3.withColumn(
    "stemmed",
    stemmer_udf(col("filteredList")),
)
spark_df4.show()

+--------------------+--------------------+--------------------+--------------------+
|               title|            wordList|        filteredList|             stemmed|
+--------------------+--------------------+--------------------+--------------------+
|             October|[o, hushed, octob...|[o, hushed, octob...|[hush, octob, mor...|
|         ‘Out, Out—’|[the, buzz, saw, ...|[buzz, saw, snarl...|[buzz, saw, snarl...|
|Acquainted with t...|[i, have, been, o...|[one, acquainted,...|[one, acquaint, n...|
| After Apple-Picking|[my, long, two-po...|[long, two-pointe...|[long, two-point,...|
|             Birches|[when, i, see, bi...|[see, birches, be...|[see, birch, bend...|
|     Christmas Trees|[the, city, had, ...|[city, withdrawn,...|[citi, withdrawn,...|
|        Dust of Snow|[the, way, a, cro...|[way, crow, shook...|[way, crow, shook...|
|        Fire and Ice|[some, say, the, ...|[say, world, end,...|[say, world, end,...|
|Fireflies in the ...|[here, come, real...|[come, real

In [10]:
# Build a dataframe with per-poem counts:
#   wordCount      - number of words before stop-word removal
#   filteredCount  - number of words after stop-word removal
#   stopWordsCount - wordCount minus filteredCount
spark_df5 = (
    spark_df4
    .withColumn("wordCount", countWords(col("wordList")))
    .withColumn("filteredCount", countWords(col("filteredList")))
    .withColumn("stopWordsCount", diffLength("wordCount", "filteredCount"))
)
spark_df5.show()

+--------------------+--------------------+--------------------+--------------------+---------+-------------+--------------+
|               title|            wordList|        filteredList|             stemmed|wordCount|filteredCount|stopWordsCount|
+--------------------+--------------------+--------------------+--------------------+---------+-------------+--------------+
|             October|[o, hushed, octob...|[o, hushed, octob...|[hush, octob, mor...|      128|           84|            44|
|         ‘Out, Out—’|[the, buzz, saw, ...|[buzz, saw, snarl...|[buzz, saw, snarl...|      295|          157|           138|
|Acquainted with t...|[i, have, been, o...|[one, acquainted,...|[one, acquaint, n...|      108|           54|            54|
| After Apple-Picking|[my, long, two-po...|[long, two-pointe...|[long, two-point,...|      283|          142|           141|
|             Birches|[when, i, see, bi...|[see, birches, be...|[see, birch, bend...|      503|          245|           258|


In [11]:
# Summary statistics of the word counts before and after stop-word removal
(
    spark_df5
    .select("wordCount", "filteredCount")
    .describe()
    .show()
)

+-------+------------------+------------------+
|summary|         wordCount|     filteredCount|
+-------+------------------+------------------+
|  count|                37|                37|
|   mean| 260.7027027027027| 131.2972972972973|
| stddev|286.52427324601865|150.85457766870584|
|    min|                34|                15|
|    max|              1399|               753|
+-------+------------------+------------------+



In [12]:
# Look up the rows with the largest and smallest wordCount once each, so every
# extreme is computed a single time instead of once per printed field.
longest_poem = spark_df5.rdd.max(key=lambda row: row["wordCount"])
shortest_poem = spark_df5.rdd.min(key=lambda row: row["wordCount"])

# The longest poem by Robert Frost.
# Fields are read by column name rather than by position, so the report keeps
# working if columns are added or reordered. The message is built from adjacent
# string literals so no source indentation leaks into the printed text.
print(
    "--Longest Poem--\n"
    f"Title: '{longest_poem['title']}',\n"
    f"Number of words: {longest_poem['wordCount']}"
)

# Shortest poem by Robert Frost
print(
    "\n--Shortest Poem--\n"
    f"Title: '{shortest_poem['title']}',\n"
    f"Number of words: {shortest_poem['wordCount']}"
)

--Longest Poem--       
Title: 'The Death of the Hired Man',       
Number of words: 1399

--Shortest Poem--       
Title: 'Dust of Snow',       
Number of words: 34
