In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [3]:
# Obtain the Spark session for this notebook (an already-running session is
# reused if one exists); the application is registered under the name 'nlp'.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [4]:
# Build a small three-row demo DataFrame: an integer "id" column and a
# free-text "sentence" column that the later cells tokenize.
dataframe = spark.createDataFrame(
    data=[
        (0, "Spark is great"),
        (1, "We are learning Spark"),
        (2, "Spark is better than hadoop no doubt"),
    ],
    schema=["id", "sentence"],
)

In [5]:
# Print the sample rows using the default display settings, spelled out here:
# at most 20 rows, with long cell values shortened (note the "..." in the output).
dataframe.show(n=20, truncate=True)

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|      Spark is great|
|  1|We are learning S...|
|  2|Spark is better t...|
+---+--------------------+



In [6]:
# Configure a Tokenizer that splits each sentence into a list of words.
# Reference: https://spark.apache.org/docs/2.1.0/ml-features.html#tokenizer
#   - input column:  the DataFrame column whose text should be tokenized
#   - output column: the new column that will receive the list of tokens
tokenizer = (
    Tokenizer()
    .setInputCol("sentence")
    .setOutputCol("words")
)
# Bare expression so the notebook echoes the transformer's identifier.
tokenizer

Tokenizer_47f4b3e12d7821e305b1

In [7]:
# Apply the tokenizer: the result keeps "id" and "sentence" and gains a
# "words" column holding each sentence's tokens.
tokenized = tokenizer.transform(dataset=dataframe)
# Display without truncation so the complete token lists are visible.
tokenized.show(n=20, truncate=False)

+---+------------------------------------+--------------------------------------------+
|id |sentence                            |words                                       |
+---+------------------------------------+--------------------------------------------+
|0  |Spark is great                      |[spark, is, great]                          |
|1  |We are learning Spark               |[we, are, learning, spark]                  |
|2  |Spark is better than hadoop no doubt|[spark, is, better, than, hadoop, no, doubt]|
+---+------------------------------------+--------------------------------------------+



In [8]:
# Shut down the Spark session now that the demo is finished. This is not
# strictly necessary for these exercises, but explicitly closing connections
# to resources (databases, files, etc.) is a good habit.
# Background: http://apache-spark-user-list.1001560.n3.nabble.com/SparkContext-stop-td17826.html
# NOTE: cells that use `spark` will need the session-creation cell re-run
# after this point.
spark.stop()