In [14]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [15]:
# Build the SparkSession for this notebook (getOrCreate hands back an existing
# session when one is already running) and tag it with a descriptive app name.
spark = (
    SparkSession.builder
    .appName("Analyzing the vocabulary of Pride and Prejudice.")
    .getOrCreate()
)

In [16]:
# Load the plain-text book: the text reader yields one row per line of the
# file, held in a single string column named "value".
book = (
    spark.read
    .text("../data/gutenberg_books/1342-0.txt")
)

In [17]:
# Cut every line of text on single spaces, giving one array of tokens per row
# in a column named "line".
lines = book.select(
    F.split(F.col("value"), " ").alias("line")
)

In [18]:
# Flatten the token arrays: explode emits one output row per array element,
# so the frame goes from one row per line to one row per token ("word").
words = lines.select(
    F.explode(lines["line"]).alias("word")
)

In [19]:
# Normalise case so that e.g. "The" and "the" are counted as the same word.
words_lower = words.select(
    F.lower(words["word"]).alias("word_lower")
)

In [20]:
# Keep only the FIRST unbroken run of a-z letters found in each token (group 0
# is the whole regex match). Surrounding punctuation is dropped, but so is
# anything after an inner non-letter: "don't" becomes "don". A token with no
# a-z letters at all yields the empty string.
words_clean = words_lower.select(
    F.regexp_extract(words_lower["word_lower"], "[a-z]+", 0).alias("word")
)

In [21]:
# Discard the tokens that were reduced to the empty string by the cleaning
# step. Note the comparison is against "" (not an explicit null check),
# despite the "nonull" in the variable name.
words_nonull = words_clean.where(words_clean["word"] != "")

In [22]:
# Group identical words together, then produce a count for each one; the
# result has two columns, "word" and "count".
results = words_nonull.groupBy("word").count()


In [23]:
# Display the ten most frequent words, highest count first.
results.orderBy(F.col("count").desc()).show(10)

+----+-----+
|word|count|
+----+-----+
| the| 4496|
|  to| 4235|
|  of| 3719|
| and| 3602|
| her| 2223|
|   i| 2052|
|   a| 1997|
|  in| 1920|
| was| 1844|
| she| 1703|
+----+-----+
only showing top 10 rows



In [12]:
# Persist the word counts as CSV. Spark creates a *directory* at this path
# holding one part file per partition of `results`, not a single .csv file.
# mode="overwrite" replaces output left behind by an earlier run; with the
# default save mode the write raises as soon as the path already exists, which
# made this cell fail on every re-run of the notebook.
results.write.csv("../data/simple_count.csv", mode="overwrite")

In [11]:
# Persist the word counts as CSV after coalescing to one partition, so the
# output directory created at this path contains a single part file.
# mode="overwrite" replaces output left behind by an earlier run; with the
# default save mode the write raises as soon as the path already exists, which
# made this cell fail on every re-run of the notebook.
results.coalesce(1).write.csv(
    "../data/simple_count_single_partition.csv", mode="overwrite"
)