In [17]:
from fractions import Fraction
from typing import Tuple, Optional
from operator import add
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from py4j.protocol import Py4JJavaError

In [2]:
# Reuse the active SparkSession if there is one; otherwise create it.
spark = SparkSession.builder.getOrCreate()

# RDD functions and methods are under the SparkContext object,
# which is an attribute of SparkSession. We alias it here for convenience.
sc = spark.sparkContext

In [3]:
# Collection of multiple unrelated types: int, str, float, tuple and dict.
# The mix is deliberate; later cells show how RDD operations cope with it.
collection = [1, "two", 3.0, ("four", 4), {"five": 5}]

In [4]:
# Promote the Python list to an RDD using the SparkContext parallelize() method
collection_rdd = sc.parallelize(collection)

In [5]:
# Printing an RDD shows its descriptor, not its elements;
# an action such as collect() is needed to see the data.
print(collection_rdd)

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:287


In [6]:
# Mapping a simple function, add_one(), to each element

def add_one(value):
    """Return ``value + 1``.

    No type checking is done: a value that does not support ``+ 1``
    (a string, tuple, dict, ...) raises ``TypeError``.
    """
    incremented = value + 1
    return incremented

# map() only records the transformation; add_one() is not run yet, so this
# line succeeds even though some elements cannot be incremented.
collection_rdd = collection_rdd.map(add_one)

try:
    # collect() materialises an RDD into a Python list on the master node
    print(collection_rdd.collect())
except Py4JJavaError as err:
    # The failure is the point of this cell. Report it briefly instead of
    # swallowing it, so the reader can see that collect() did not succeed.
    print(
        f"collect() failed with {type(err).__name__}: "
        "add_one() cannot be applied to every element of the RDD"
    )

In [7]:
# Mapping a safer_add_one() to each element in an RDD

# Rebuild the RDD from the original list: the previous cell rebound
# collection_rdd to the add_one() mapping.
collection_rdd = sc.parallelize(collection)

def safer_add_one(value):
    """Return ``value + 1``, or ``value`` unchanged when it cannot be incremented.

    Only ``TypeError`` is handled; any other exception propagates.
    """
    try:
        incremented = value + 1
    except TypeError:
        # Non-numeric element: hand it back untouched
        return value
    return incremented
    
# Keep the mapped RDD under the same name; the filtering cell below uses it.
collection_rdd = collection_rdd.map(safer_add_one)

# Numeric elements are incremented, everything else passes through unchanged.
print(collection_rdd.collect())

[2, 'two', 4.0, ('four', 4), {'five': 5}]


In [8]:
# Filtering an RDD with a lambda function:
# keep only the elements that are a float or an int.
collection_rdd = collection_rdd.filter(
    lambda elem: isinstance(elem, (float, int))
)

print(collection_rdd.collect())

[2, 4.0]


In [10]:
# Applying the add() function via reduce():
# the elements are combined with add(), i.e. summed (4 + 7 + 9 + 1 + 3).
collection_rdd = sc.parallelize([4, 7, 9, 1, 3])

print(collection_rdd.reduce(add))

24


### Exercise 8.1

The PySpark RDD API provides a `count()` method that returns the number of elements in the RDD as an integer. Reproduce the behavior of this method using `map()`, `filter()`, and/or `reduce()`.

In [11]:
# count() rebuilt from map() and reduce():
# replace every element with a 1, then sum the ones.
count = collection_rdd.map(lambda _: 1).reduce(add)

print(count)

5


### Exercise 8.2

What is the return value of the following code block?

```
a_rdd = sc.parallelize([0, 1, None, [], 0.0])
a_rdd.filter(lambda x: x).collect()
```

Answer: `[1]`

In [16]:
# Creating a data frame containing a single-array column

# Every numerator in 0..99 paired with every denominator in 1..99
fractions = [
    [num, denom]
    for num in range(100)
    for denom in range(1, 100)
]

frac_df = spark.createDataFrame(fractions, ["numerator", "denominator"])

# array() takes two or more columns of the same type and creates a single column
# containing an array of the columns passed as a parameter
fraction_col = F.array(F.col("numerator"), F.col("denominator"))
frac_df = frac_df.select(fraction_col.alias("fraction"))

frac_df.show(5, False)

+--------+
|fraction|
+--------+
|[0, 1]  |
|[0, 2]  |
|[0, 3]  |
|[0, 4]  |
|[0, 5]  |
+--------+
only showing top 5 rows



In [21]:
# Type synonym: "When you see Frac, assume it's a Tuple[int, int]"
Frac = Tuple[int, int]


def py_reduce_fraction(frac: Frac) -> Optional[Frac]:
    """Reduce a fraction given as a (numerator, denominator) pair.

    Returns the pair in lowest terms, or None when the denominator is zero.
    """
    numerator, denominator = frac
    if not denominator:
        # Division by zero: there is no fraction to reduce
        return None
    reduced = Fraction(numerator, denominator)
    return reduced.numerator, reduced.denominator

In [22]:
# Sanity checks: a reducible fraction, then the zero-denominator case.
assert py_reduce_fraction((3, 6)) == (1, 2) 
assert py_reduce_fraction((1, 0)) is None

In [23]:
def py_fraction_to_float(frac: Frac) -> Optional[float]:
    """Convert a (numerator, denominator) pair into a float.

    Returns None when the denominator is zero.
    """
    numerator, denominator = frac
    if not denominator:
        return None
    return numerator / denominator

In [24]:
# Sanity checks: an exactly representable quotient, then the zero-denominator case.
assert py_fraction_to_float((2, 8)) == 0.25
assert py_fraction_to_float((10, 0)) is None

In [25]:
# Creating a UDF explicitly with the udf() function

# Alias an array of long PySpark type to SparkFrac;
# this is the declared return type of the UDF below.
SparkFrac = T.ArrayType(T.LongType())

# Promote the Python function using the udf() function
reduce_fraction = F.udf(py_reduce_fraction, SparkFrac)

# Apply the UDF to the "fraction" array column, like any other column function.
frac_df = frac_df.withColumn(
    "reduced_fraction", reduce_fraction(F.col("fraction"))
)

frac_df.show(5, False)

+--------+----------------+
|fraction|reduced_fraction|
+--------+----------------+
|[0, 1]  |[0, 1]          |
|[0, 2]  |[0, 1]          |
|[0, 3]  |[0, 1]          |
|[0, 4]  |[0, 1]          |
|[0, 5]  |[0, 1]          |
+--------+----------------+
only showing top 5 rows



In [26]:
# Creating a UDF directly using the udf() decorator

# The decorator performs the same function as the udf() function,
# but returns a UDF bearing the name of the function defined under it
@F.udf(T.DoubleType())
def fraction_to_float(frac: Frac) -> Optional[float]:
    """Convert a (numerator, denominator) pair into a float.

    Returns None when the denominator is zero.
    """
    numerator, denominator = frac
    return numerator / denominator if denominator else None

In [29]:
# Convert each reduced fraction to its float value with the decorated UDF.
frac_df = frac_df.withColumn(
    "fraction_float", fraction_to_float(F.col("reduced_fraction"))
)

# distinct() removes the duplicate (reduced_fraction, fraction_float) pairs.
frac_df.select("reduced_fraction", "fraction_float").distinct().show(5, False)

+----------------+-------------------+
|reduced_fraction|fraction_float     |
+----------------+-------------------+
|[3, 50]         |0.06               |
|[3, 67]         |0.04477611940298507|
|[7, 76]         |0.09210526315789473|
|[9, 23]         |0.391304347826087  |
|[9, 25]         |0.36               |
+----------------+-------------------+
only showing top 5 rows



In [30]:
# The func attribute of the UDF is called here with a plain Python tuple,
# so the logic can be checked without a data frame.
assert fraction_to_float.func((1, 2)) == 0.5

### Exercise 8.3

Using the following definitions, create a `temp_to_temp(value, from, to)` function that takes a
numerical `value` in `from` degrees and converts it to `to` degrees.

- C = (F - 32) * 5 / 9 (Celsius)
- K = C + 273.15 (Kelvin)
- R = F + 459.67 (Rankine)

In [31]:
def temp_to_temp(value: float, from_unit: str, to_unit: str) -> float:
    """Convert a temperature from one unit to another.

    Supported units are 'F' (Fahrenheit), 'C' (Celsius), 'K' (Kelvin) and
    'R' (Rankine).

    Args:
        value: The temperature expressed in ``from_unit``.
        from_unit: Unit of ``value``.
        to_unit: Unit to convert to.

    Returns:
        The temperature expressed in ``to_unit``. When both units are the
        same, ``value`` is returned unchanged.

    Raises:
        ValueError: If either unit is not one of the supported units.
    """
    supported_units = ('F', 'C', 'K', 'R')

    # Validate against the unit names themselves. The formula table is keyed
    # by (from, to) tuples, so a membership test of a single unit against it
    # would never succeed.
    if from_unit not in supported_units or to_unit not in supported_units:
        raise ValueError("Invalid temperature units. Supported units: 'F', 'C', 'K', 'R'")

    # Same-unit conversions have no entry in the table: nothing to do.
    if from_unit == to_unit:
        return value

    conversion_formulas = {
        ('F', 'C'): lambda x: (x - 32) * 5 / 9,
        ('F', 'K'): lambda x: (x + 459.67) * 5 / 9,
        ('F', 'R'): lambda x: x + 459.67,
        ('C', 'F'): lambda x: x * 9 / 5 + 32,
        ('C', 'K'): lambda x: x + 273.15,
        ('C', 'R'): lambda x: (x + 273.15) * 9 / 5,
        ('K', 'F'): lambda x: x * 9 / 5 - 459.67,
        ('K', 'C'): lambda x: x - 273.15,
        ('K', 'R'): lambda x: x * 9 / 5,
        ('R', 'F'): lambda x: x - 459.67,
        ('R', 'C'): lambda x: (x - 491.67) * 5 / 9,
        ('R', 'K'): lambda x: x * 5 / 9,
    }

    return conversion_formulas[(from_unit, to_unit)](value)

### Exercise 8.4

Correct the following UDF, so it doesn’t generate an error.

```
@F.udf(T.IntegerType())
def naive_udf(t: str) -> str:
    return answer * 3.14159
```

In [33]:
# The declared Spark return type must match what the Python function returns.
# The product with 3.14159 is a float, so the UDF is declared as DoubleType;
# with IntegerType the mismatched result would not come back as the product
# (PySpark does not coerce it and typically yields null instead).
@F.udf(T.DoubleType())
def naive_udf(t: int) -> float:
    """Return ``t`` multiplied by 3.14159."""
    return t * 3.14159

In [37]:
# Call the Python function behind the UDF directly to check the arithmetic.
naive_udf.func(2)

6.28318

### Exercise 8.5

Create a UDF that adds two fractions together, and test it by adding the
`reduced_fraction` column to itself in the test_frac data frame.

In [40]:
@F.udf(SparkFrac)
def add_two_factions(frac_a: Frac, frac_b: Frac)  -> Optional[Frac]:
    """Add two fractions, each given as a (numerator, denominator) pair.

    Returns the sum in lowest terms, or None when either input is missing
    or has a zero denominator.
    """
    # py_reduce_fraction() returns None for a zero denominator, so a column
    # built from it can contain nulls. Guard before unpacking.
    if frac_a is None or frac_b is None:
        return None

    num_a, denom_a = frac_a
    num_b, denom_b = frac_b
    if not (denom_a and denom_b):
        return None

    # Cross-multiplying also covers equal denominators: the common factor is
    # removed again when the result is reduced.
    return py_reduce_fraction(
        (num_a * denom_b + num_b * denom_a, denom_a * denom_b)
    )

In [51]:
# Test the UDF by adding each reduced fraction to itself.
frac_df = frac_df.withColumn(
    "fraction_sum", add_two_factions(F.col("reduced_fraction"), F.col("reduced_fraction"))
)

### Exercise 8.6

Because of the `LongType()`, the `py_reduce_fraction` (see the previous exercise) will
not work if the numerator or denominator exceeds `pow(2, 63)-1` or is lower than
`-pow(2, 63)`. Modify `py_reduce_fraction` to return None if this is the case.

In [52]:
def py_reduce_fraction_mod(frac: Frac) -> Optional[Frac]:
    """Reduce a fraction, returning None when it does not fit in 64-bit longs.

    Args:
        frac: The fraction as a (numerator, denominator) pair of integers.

    Returns:
        The fraction in lowest terms, or None when the denominator is zero or
        when any input or output component lies outside
        ``[-pow(2, 63), pow(2, 63) - 1]``.
    """
    num, denom = frac
    long_min, long_max = -pow(2, 63), pow(2, 63) - 1

    def fits_long(number: int) -> bool:
        """Tell whether ``number`` is representable as a signed 64-bit integer."""
        return long_min <= number <= long_max

    # Check if the numerator or denominator exceeds the supported range
    if not (fits_long(num) and fits_long(denom)):
        return None

    if not denom:
        return None

    answer = Fraction(num, denom)

    # Fraction moves the sign to the numerator, so in-range inputs can still
    # produce an out-of-range result: (-2**63, -1) becomes 2**63 / 1.
    if not (fits_long(answer.numerator) and fits_long(answer.denominator)):
        return None

    return answer.numerator, answer.denominator