In [34]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    explode,
    greatest,
    length,
    lower,
    regexp_extract,
    split
)
from pyspark.sql.utils import AnalysisException

In [2]:
# Create the notebook's SparkSession (or reuse an existing one), labelled
# with an appName that identifies these exercises.
spark = (
    SparkSession.builder
    .appName("End of Chapter 2 exercises.")
    .getOrCreate()
)

### Exercise 2.2

Given the following data frame, programmatically count the number of columns that
aren’t strings (answer = only one column isn’t a string).
`createDataFrame()` allows you to create a data frame from a variety of sources,
such as a pandas data frame or (in this case) a list of lists.

In [30]:
# One-row data frame: two text values and one integer value, so exactly one
# column is expected to be non-string.
exo2_2_df = spark.createDataFrame(
    [["test", "more test", 10_000_000_000]],
    ["one", "two", "three"],
)

In [12]:
def non_type_columns(df, target_type="string"):
    """Count the columns of ``df`` whose data type differs from ``target_type``.

    Args:
        df: Any object exposing a ``dtypes`` attribute that iterates as
            ``(column_name, type_name)`` pairs, e.g. a PySpark data frame.
        target_type: Type name to exclude from the count. It is compared by
            equality against the type names found in ``df.dtypes``.

    Returns:
        int: The number of columns whose type name is not equal to
        ``target_type``.
    """
    # The loop variable is deliberately not named `col`, so it cannot be
    # confused with the `col` function imported from pyspark.sql.functions.
    return sum(1 for _name, dtype in df.dtypes if dtype != target_type)

In [13]:
# Count the columns of the exercise data frame that are not strings and report the result.
target_type = "string"
non_matching_count = non_type_columns(exo2_2_df, target_type)
print(f"Number of non-{target_type} columns: {non_matching_count}")

Number of non-string columns: 1


### Exercise 2.3

Rewrite the following code snippet, removing the `withColumnRenamed` method. Which
version is clearer and easier to read?

In [20]:
# The `length` function returns the number of characters in a string column.
# Exercise starting point: the selected column is first referred to by the
# name "length(value)" and is only renamed to "number_of_char" in a second,
# separate step.
exo2_3_df = (
 spark.read.text("../../data/gutenberg_books/1342-0.txt")
 .select(length(col("value")))
 .withColumnRenamed("length(value)", "number_of_char")
)

In [25]:
# Rewritten without `withColumnRenamed`: the column receives its final name
# directly through `alias`, inside the `select`.
char_count_column = length(col("value")).alias("number_of_char")
exo2_3_df = spark.read.text("../../data/gutenberg_books/1342-0.txt").select(
    char_count_column
)

### Exercise 2.4
Assume a data frame exo2_4_df. The following code block gives an error. What is the
problem, and how can you solve it?

In [29]:
# One-row data frame with a `key` column and two numeric value columns.
exo2_4_df = spark.createDataFrame(
    [["key", 10_000, 20_000]],
    ["key", "value1", "value2"],
)

In [35]:
# `greatest` returns the greatest value among the listed columns,
# skipping null values.

# The following statement raises an AnalysisException: the first `select`
# keeps only the `maximum_value` column, so the second `select` can no longer
# resolve `key`. In addition, `max_value` does not match the `maximum_value`
# alias that was just created.
try:
    exo2_4_mod = exo2_4_df.select(
        greatest(col("value1"), col("value2")).alias("maximum_value")
    ).select("key", "max_value")
except AnalysisException as err:
    print(err)

[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `key` cannot be resolved. Did you mean one of the following? [`maximum_value`].;
'Project ['key, 'max_value]
+- Project [greatest(value1#43L, value2#44L) AS maximum_value#54L]
   +- LogicalRDD [key#42, value1#43L, value2#44L], false



In [42]:
# Fix: select `key` together with the computed maximum in a single `select`,
# so both columns exist in the resulting data frame.
try:
    maximum_value = greatest(col("value1"), col("value2")).alias("maximum_value")
    exo2_4_mod = exo2_4_df.select(col("key"), maximum_value)
except AnalysisException as err:
    print(err)

### Exercise 2.5

Let’s take our words_nonull data frame, available in the next listing.

In [54]:
# Load the book text into a data frame with a single `value` column.
book = spark.read.text("../../data/gutenberg_books/1342-0.txt")

# Split every `value` on single spaces into an array column named `line`.
lines = book.select(split(book["value"], " ").alias("line"))

# Turn each array element into its own row, in a column named `word`.
words = lines.select(explode(col("line")).alias("word"))

# Lower-case every word.
words_lower = words.select(lower(col("word")).alias("word_lower"))

# Extract the match of the pattern `[a-z]*` (group 0) from each lower-cased word.
letters_only = regexp_extract(col("word_lower"), "[a-z]*", 0).alias("word")
words_clean = words_lower.select(letters_only)

# Discard the rows where the extraction left an empty string.
words_nonull = words_clean.filter(col("word") != "")

a) Remove all occurrences of the word *is*.

b) (Challenge) Using the length function, keep only the words with more than three
characters.

In [55]:
# a) Drop every row whose word is exactly "is".
words_without_is = words_nonull.where(col("word") != "is")

# b) Keep only the words that are more than three characters long.
words_filtered = words_nonull.where(length(col("word")) > 3)

### Exercise 2.6

The where clause takes a Boolean expression over one or many columns to filter the
data frame. Beyond the usual Boolean operators (>, <, ==, <=, >=, !=), PySpark provides other functions that return Boolean columns in the pyspark.sql.functions
module.

A good example is the `isin()` method (applied on a Column object, like
`col(…).isin(…)`), which takes a list of values as a parameter and returns only the
records where the value in the column equals a member of the list.

Let’s say you want to remove the words *is*, *not*, *the*, and *if* from your list of words,
using a single `where()` method on the `words_nonull` data frame. Write the code to
do so.

In [56]:
# `isin` flags the rows whose word is in the list; `~` negates that flag so
# the listed words are removed instead of kept.
words_without_is_not_the_if = words_nonull.where(
    ~col("word").isin(
        ["is", "not", "the", "if"]
    )
)

### Exercise 2.7

One of your friends comes to you with the following code. They have no idea why it
doesn’t work. Can you diagnose the problem in the try block, explain why it is an
error, and provide a fix?

In [47]:
# Exercise 2.7, broken version kept on purpose to demonstrate the error.
# Note that the `except` clause only handles AnalysisException, so the
# AttributeError triggered below is not caught and still propagates.
try:
    book = spark.read.text("../../data/gutenberg_books/1342-0.txt")
    # Problem: `printSchema()` prints the schema and returns None, so this
    # assignment rebinds `book` to None.
    book = book.printSchema()
    # `book` is None here, hence:
    # AttributeError: 'NoneType' object has no attribute 'select'
    lines = book.select(split(book.value, " ").alias("line"))
    words = lines.select(explode(col("line")).alias("word"))
except AnalysisException as err:
    print(err)


root
 |-- value: string (nullable = true)



AttributeError: 'NoneType' object has no attribute 'select'

In [51]:
# Fix: call `printSchema()` for its printed output only and do not assign its
# return value, so `book` keeps referring to the data frame.
book = spark.read.text("../../data/gutenberg_books/1342-0.txt")
book.printSchema()

line_array = split(book["value"], " ").alias("line")
lines = book.select(line_array)
words = lines.select(explode(col("line")).alias("word"))

root
 |-- value: string (nullable = true)

