In [5]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session and keep a handle on its SparkContext,
# which the RDD calls in the following cells go through.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# A heterogeneous Python list: int, str, float, tuple and set.
# NOTE(review): `{"five", 5}` is a set literal, not a dict -- confirm intended.
collection = [
    1,
    "two",
    3.0,
    ("four", 4),
    {"five", 5},
]

# Distribute the list as an RDD.
collection_rdd = sc.parallelize(collection)

In [6]:
def add_one(value):
    """Return ``value + 1``, or ``value`` unchanged when that addition fails.

    Only ``TypeError`` is treated as "cannot be incremented"; any other
    exception raised by the addition propagates to the caller.
    """
    try:
        incremented = value + 1
    except TypeError:
        # Operand does not support `+ 1` (e.g. str, tuple, set): pass through.
        return value
    return incremented

# Apply add_one to every element of the RDD. The result is bound back to the
# same name, so re-running this cell maps add_one over the already-mapped RDD.
collection_rdd = collection_rdd.map(add_one)

# Bring the mapped elements back as a Python list so the notebook displays them.
collection_rdd.collect()

                                                                                

[2, 'two', 4.0, ('four', 4), {5, 'five'}]

In [8]:
# Keep only the elements that are instances of float or int, again rebinding
# the same name. Note that bool is a subclass of int in Python, so a boolean
# element would also pass this test.
collection_rdd = collection_rdd.filter(
    lambda elem: isinstance(elem, (float, int))
)

print(collection_rdd.collect())

[2, 4.0]


In [9]:
from operator import add

# Rebind collection_rdd to a fresh RDD of five integers; the mapped/filtered
# RDD built in the earlier cells is no longer reachable through this name.
collection_rdd = sc.parallelize([4,7,9,1,3])

# Fold the elements pairwise with operator.add, i.e. sum them (4+7+9+1+3 = 24).
print(collection_rdd.reduce(add))

24


In [15]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

# Every [numerator, denominator] pair with numerator in 0..99 and
# denominator in 1..99 -- 9,900 rows, none with a zero denominator.
fractions = [
    [numerator, denominator]
    for numerator in range(100)
    for denominator in range(1, 100)
]

# Load the pairs as two columns, then collapse them into a single
# two-element array column named "fraction".
frac_df = spark.createDataFrame(fractions, ["numerator", "denominator"]).select(
    F.array(F.col("numerator"), F.col("denominator")).alias("fraction")
)

frac_df.show()

+--------+
|fraction|
+--------+
|  [0, 1]|
|  [0, 2]|
|  [0, 3]|
|  [0, 4]|
|  [0, 5]|
|  [0, 6]|
|  [0, 7]|
|  [0, 8]|
|  [0, 9]|
| [0, 10]|
| [0, 11]|
| [0, 12]|
| [0, 13]|
| [0, 14]|
| [0, 15]|
| [0, 16]|
| [0, 17]|
| [0, 18]|
| [0, 19]|
| [0, 20]|
+--------+
only showing top 20 rows



In [18]:
from fractions import Fraction
from typing import Tuple, Optional

# A fraction represented as a (numerator, denominator) pair of integers.
Frac = Tuple[int, int]

def py_reduce_fraction(frac: Optional[Frac]) -> Optional[Frac]:
    """Reduce a fraction to lowest terms.

    Args:
        frac: A ``(numerator, denominator)`` pair, or ``None``.

    Returns:
        The reduced ``(numerator, denominator)`` pair, or ``None`` when the
        input is ``None``, the numerator is ``None``, or the denominator is
        zero or ``None``.
    """
    # This function is wrapped as a Spark UDF, where a null cell arrives as
    # None; unpacking None would raise TypeError, so propagate the null.
    if frac is None:
        return None
    num, denom = frac
    # A missing numerator or a zero/missing denominator has no reduced form.
    if num is None or not denom:
        return None
    answer = Fraction(num, denom)
    return answer.numerator, answer.denominator

assert py_reduce_fraction((3,6)) == (1,2)
assert py_reduce_fraction((1,0)) is None
assert py_reduce_fraction(None) is None
assert py_reduce_fraction((None,3)) is None

def py_fraction_to_float(frac: Optional[Frac]) -> Optional[float]:
    """Convert a fraction to its floating-point value.

    Args:
        frac: A ``(numerator, denominator)`` pair, or ``None``.

    Returns:
        ``numerator / denominator`` as a float, or ``None`` when the input is
        ``None``, the numerator is ``None``, or the denominator is zero or
        ``None``.
    """
    # Propagate a missing fraction instead of failing on the unpacking below.
    if frac is None:
        return None
    num, denom = frac
    # A missing numerator or a zero/missing denominator has no float value.
    if num is None or not denom:
        return None
    return num / denom

assert py_fraction_to_float((2,8)) == 0.25
assert py_fraction_to_float((10,0)) is None
assert py_fraction_to_float(None) is None

In [22]:
# Return type for the UDF below: an array of longs.
SparkFrac = T.ArrayType(T.LongType())

# Promote the plain Python function to a UDF that returns a SparkFrac.
reduce_fraction = F.udf(py_reduce_fraction, returnType=SparkFrac)

frac_df = frac_df.withColumn(
    "reduced_fraction",
    reduce_fraction(F.col("fraction")),
)

# Show five rows without truncating the cell contents.
frac_df.show(5, truncate=False)

+--------+----------------+
|fraction|reduced_fraction|
+--------+----------------+
|[0, 1]  |[0, 1]          |
|[0, 2]  |[0, 1]          |
|[0, 3]  |[0, 1]          |
|[0, 4]  |[0, 1]          |
|[0, 5]  |[0, 1]          |
+--------+----------------+
only showing top 5 rows



In [25]:
@F.udf(T.DoubleType())
def fraction_to_float(frac: Optional[Frac]) -> Optional[float]:
    """UDF: convert a ``[numerator, denominator]`` array to a double.

    Args:
        frac: A ``(numerator, denominator)`` pair, or ``None`` for a null cell.

    Returns:
        ``numerator / denominator``, or ``None`` when the input is ``None``,
        the numerator is ``None``, or the denominator is zero or ``None``.
    """
    # py_reduce_fraction returns None for a zero denominator, so the
    # "reduced_fraction" column this UDF is applied to can hold nulls.
    # Unpacking None would raise TypeError, so propagate the null instead.
    if frac is None:
        return None
    num, denom = frac
    if num is None or not denom:
        return None
    return num / denom

# Add the decimal value of each reduced fraction as a new double column.
frac_df = frac_df.withColumn(
    "fraction_float", fraction_to_float(F.col("reduced_fraction"))
)

# Show five distinct (reduced_fraction, fraction_float) pairs. There is no
# orderBy here, so which five rows are displayed is presumably not stable
# between runs -- verify before relying on the sample shown.
frac_df.select("reduced_fraction", "fraction_float").distinct().show(5)

+----------------+--------------------+
|reduced_fraction|      fraction_float|
+----------------+--------------------+
|         [3, 50]|                0.06|
|         [3, 67]| 0.04477611940298507|
|         [7, 76]| 0.09210526315789473|
|         [3, 76]|0.039473684210526314|
|         [2, 85]|0.023529411764705882|
+----------------+--------------------+
only showing top 5 rows

