In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Attach to the running Spark session, or start one under this application name.
spark = (
    SparkSession.builder
    .appName("Counting word occurences from a book.")
    .getOrCreate()
)

# Only surface Spark log messages at WARN level or above.
spark.sparkContext.setLogLevel("WARN")

### Exercise 3.3

1) By modifying the `word_count_submit.py` program, return the number of distinct words in Jane Austen’s Pride and Prejudice. (Hint: results contains one
record for each unique word.)

2) (Challenge) Wrap your program in a function that takes a file name as a parameter. It should return the number of distinct words.

In [25]:
def distinct_words(file_name: str):
    """Build a one-row DataFrame with the number of distinct words in a text file.

    Parameters
    ----------
    file_name : str
        Path (or glob pattern) handed to ``spark.read.text``.

    Returns
    -------
    DataFrame
        A single row with one column, ``count``, holding the number of
        distinct cleaned words.
    """
    return (
        spark.read.text(file_name)
        # One array of space-separated tokens per line, then one row per token.
        .select(F.split(F.col("value"), " ").alias("line"))
        .select(F.explode(F.col("line")).alias("word"))
        .select(F.lower(F.col("word")).alias("word"))
        # Keep only the leading run of lowercase letters/apostrophes; a token
        # that starts with any other character becomes "" and is dropped by
        # the filter below.
        .select(F.regexp_extract(F.col("word"), "[a-z']*", 0).alias("word"))
        .where(F.col("word") != "")
        .agg(F.countDistinct(F.col("word")).alias("count"))
    )


def count_distinct_words(file_name: str) -> int:
    """Return the number of distinct words in ``file_name`` as a plain integer."""
    # Index the Row by key: attribute access (`row.count`) would resolve to the
    # tuple method of the same name rather than the column.
    return distinct_words(file_name).first()["count"]


# If you need to read multiple text files, replace `1342-0` by `*`.
results = distinct_words("../../data/gutenberg_books/1342-0.txt")

In [27]:
# Print the single-row DataFrame holding the distinct-word count.
results.show()

+-----+
|count|
+-----+
| 6595|
+-----+



### Exercise 3.4

Taking `word_count_submit.py`, modify the script to return a sample of five words that
appear only once in Jane Austen’s Pride and Prejudice.

In [36]:
# Exercise 3.4: pick five words that occur exactly once in the book.
results = (
    spark.read.text("../../data/gutenberg_books/1342-0.txt")
    # Split each line on spaces and emit one row per token.
    .select(F.explode(F.split("value", " ")).alias("word"))
    # Lower-case, then keep only the leading run of letters/apostrophes.
    .select(F.regexp_extract(F.lower("word"), "[a-z']*", 0).alias("word"))
    # Tokens with no leading letters were reduced to "" above; discard them.
    .filter(F.col("word") != "")
    .groupBy("word")
    .count()
    # Retain the words seen a single time and take any five of them.
    .filter(F.col("count") == 1)
    .limit(5)
)

In [38]:
# Print the five sampled words that occur exactly once.
results.show()

+------------+-----+
|        word|count|
+------------+-----+
|   imitation|    1|
|     solaced|    1|
|premeditated|    1|
|     elevate|    1|
|   destitute|    1|
+------------+-----+



### Exercise 3.5

1) Using the substring function (refer to PySpark’s API or the pyspark shell if
needed), return the top five most popular first letters (keep only the first letter
of each word).

2) Compute the number of words starting with a consonant or a vowel. (Hint: The
isin() function might be useful.)

In [43]:
# Exercise 3.5 (1): the five most frequent first characters among the words.
results = (
    spark.read.text("../../data/gutenberg_books/1342-0.txt")
    # Split each line on spaces and emit one row per token.
    .select(F.explode(F.split("value", " ")).alias("word"))
    # Lower-case, then keep only the leading run of letters/apostrophes.
    .select(F.regexp_extract(F.lower("word"), "[a-z']*", 0).alias("word"))
    .filter(F.col("word") != "")
    # substring() is 1-indexed: start at position 1, take one character.
    .groupBy(F.substring("word", 1, 1).alias("letter"))
    .count()
    # Most frequent first, then keep the top five.
    .orderBy(F.col("count").desc())
    .limit(5)
)

In [44]:
# Print the five most frequent first letters with their counts.
results.show()

+------+-----+
|letter|count|
+------+-----+
|     t|16101|
|     a|13684|
|     h|10419|
|     w| 9091|
|     s| 8791|
+------+-----+



In [62]:
# Exercise 3.5 (2): number of words starting with a vowel vs. a consonant.
results = (
    spark.read.text("../../data/gutenberg_books/1342-0.txt")
    .select(F.split(F.col("value"), " ").alias("line"))
    .select(F.explode(F.col("line")).alias("word"))
    .select(F.lower(F.col("word")).alias("word"))
    # Keep only the leading run of lowercase letters/apostrophes; a token
    # that starts with any other character becomes "" and is dropped below.
    .select(F.regexp_extract(F.col("word"), "[a-z']*", 0).alias("word"))
    .where(F.col("word") != "")
    # substring() is 1-indexed: start at position 1, take one character.
    .select(F.substring(F.col("word"), 1, 1).alias("letter"))
    # The pattern above also admits a leading apostrophe. Without this filter
    # the catch-all `otherwise("consonant")` branch would count such tokens
    # as consonants, so keep only first characters that are letters.
    .where(F.col("letter").rlike("[a-z]"))
    .select(
        F.when(F.col("letter").isin("a", "e", "i", "o", "u"), "vowel")
        .otherwise("consonant")
        .alias("category")
    )
    .groupby(F.col("category"))
    .count()
)

In [63]:
# Print the number of words in each first-letter category.
results.show()

+---------+-----+
| category|count|
+---------+-----+
|consonant|88653|
|    vowel|33522|
+---------+-----+



### Exercise 3.6

Let’s say you want to get both the count() and sum() of a GroupedData object. Why
doesn’t this code work? Map the inputs and outputs of each method.

`my_data_frame.groupby("my_column").count().sum()`

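The code `my_data_frame.groupby("my_column").count().sum()` doesn't work because of what each method returns. `groupby()` is called on a DataFrame and returns a `GroupedData` object. `count()` is called on that `GroupedData` object and returns a DataFrame again (one row per group, with a `count` column). `sum()` is a method of `GroupedData`, not of DataFrame, so calling it on the result of `count()` raises an `AttributeError`.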

To get both the count and the sum for each group, pass both aggregations to a single `agg()` call on the `GroupedData` object:

In [None]:
# Request both aggregates in a single agg() call on the GroupedData object.
# count(lit(1)) counts every row of the group, as GroupedData.count() does;
# count("my_column") would skip rows where the column is null, reporting 0
# for the null-key group.
result = my_data_frame.groupby("my_column").agg(
    F.count(F.lit(1)).alias("count"),
    F.sum("my_column").alias("sum"),
)