In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer

# Here, we import the udf function
# http://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html#pyspark.sql.functions.udf
# This enables us to run custom Python code when querying our DataFrame
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [2]:
# Get the active SparkSession, or create one, under the app name 'udf'
spark = (
    SparkSession.builder
    .appName('udf')
    .getOrCreate()
)

In [3]:
# Build a four-row DataFrame: an integer id plus one line of the rhyme per row
dataframe = spark.createDataFrame(
    data=[
        (0, "Mary had a little lamb"),
        (1, "It's fleece was white as snow"),
        (2, "And everywhere Mary went"),
        (3, "The lamb was sure to go"),
    ],
    schema=["id", "Nursery Rhyme"],
)
# show() with default arguments truncates long string values in the printout
dataframe.show()

+---+--------------------+
| id|       Nursery Rhyme|
+---+--------------------+
|  0|Mary had a little...|
|  1|It's fleece was w...|
|  2|And everywhere Ma...|
|  3|The lamb was sure...|
+---+--------------------+



In [4]:
# Configure a Tokenizer that reads the "Nursery Rhyme" column and
# writes the resulting list of words to a new "words" column
tokenizer = Tokenizer(
    inputCol="Nursery Rhyme",
    outputCol="words",
)
tokenizer

Tokenizer_456e9a7e821f44896059

Behind the scenes, the DataFrame API is using SQL to query the RDD that was created to hold our data.

Remember methods like `select` that we ran on our DataFrame? They sound a lot like SQL to me!

When we want to run custom Python code against our DataFrame (again, this is running SQL on our RDD, using Spark), we must use the `udf` function from the `pyspark.sql.functions` module.

In [5]:
# Create a function to return the length of a list
def word_list_length(word_list):
    """Return the number of items in ``word_list``.

    This function is wrapped as a Spark UDF. Spark hands a SQL NULL to a
    Python UDF as ``None``, and ``len(None)`` would raise ``TypeError`` and
    fail the whole job, so ``None`` is passed straight through instead.

    Args:
        word_list: A sized collection of tokens (e.g. a list of words),
            or ``None``.

    Returns:
        ``len(word_list)`` as an int, or ``None`` when ``word_list`` is
        ``None``.
    """
    if word_list is None:
        return None
    return len(word_list)

In [7]:
# Wrap word_list_length as a Spark user defined function (UDF).
# udf() takes 1.) the plain Python function to run, and
# 2.) the type of its return value, taken from pyspark.sql.types
count_tokens = udf(
    word_list_length,
    returnType=IntegerType(),
)
count_tokens

<function __main__.word_list_length(word_list)>

In [8]:
# Run the tokenizer over the DataFrame; the result gains the "words" column
tokenized = tokenizer.transform(dataframe)

# Keep the rhyme and its words, add the per-row token count computed by the
# UDF, and print every value in full instead of truncating it
(
    tokenized
    .select("Nursery Rhyme", "words")
    .withColumn("tokens", count_tokens(col("words")))
    .show(truncate=False)
)

+-----------------------------+------------------------------------+------+
|Nursery Rhyme                |words                               |tokens|
+-----------------------------+------------------------------------+------+
|Mary had a little lamb       |[mary, had, a, little, lamb]        |5     |
|It's fleece was white as snow|[it's, fleece, was, white, as, snow]|6     |
|And everywhere Mary went     |[and, everywhere, mary, went]       |4     |
|The lamb was sure to go      |[the, lamb, was, sure, to, go]      |6     |
+-----------------------------+------------------------------------+------+



In [8]:
# Stop the SparkSession now that the example is finished
spark.stop()