In [20]:
import os
# Find the latest version of spark 3.0 from http://www.apache.org/dist/spark/ and enter as the spark verion
# For example:
# spark_version = 'spark-3.<enter version>'
spark_version = 'spark-3.2.1'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:7 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:8 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:9 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Get:10 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Hit:11 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:13 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:15 htt

In [None]:
# Start Spark session
from pyspark.sql import SparkSession

# Get the existing session or create a new one, with the app name "Tokens"
spark = (
    SparkSession.builder
    .appName("Tokens")
    .getOrCreate()
)

In [None]:
from pyspark.ml.feature import Tokenizer

In [None]:
# Build a three-row sample DataFrame: an integer id and a sentence per row
dataframe = spark.createDataFrame(
    data=[
        (0, "Spark is great"),
        (1, "We are learning Spark"),
        (2, "Spark is better than hadoop no doubt"),
    ],
    schema=["id", "sentence"],
)

dataframe.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|      Spark is great|
|  1|We are learning S...|
|  2|Spark is better t...|
+---+--------------------+



In [None]:
# Tokenizer is configured with two column names:
# inputCol is the column whose text we want tokenized, and
# outputCol is the name of the new column that will hold the tokens.

# Tokenize sentences
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
# Creating the Tokenizer only configures it; no data is processed and there is
# no DataFrame to look at until we call transform() on one.

# transform() takes a DataFrame as input and returns a new DataFrame
# with the outputCol ("words") added
tokenized_df = tokenizer.transform(dataframe)

# Then we need to use .show() to print the result;
# .show(truncate=False) prints the tokenized DataFrame without shortening long values
tokenized_df.show(truncate=False)

+---+------------------------------------+--------------------------------------------+
|id |sentence                            |words                                       |
+---+------------------------------------+--------------------------------------------+
|0  |Spark is great                      |[spark, is, great]                          |
|1  |We are learning Spark               |[we, are, learning, spark]                  |
|2  |Spark is better than hadoop no doubt|[spark, is, better, than, hadoop, no, doubt]|
+---+------------------------------------+--------------------------------------------+



In [None]:
# User-Defined Functions (UDFs): Functions created by the user to add custom output columns
# Next, we'll want to create a function to enhance our tokenizer


In [None]:
# Let's add a function that returns a word count for each line

# Start by creating a Python function that takes a list of words as its input, 
# then returns the length of that list



# Create a function to return the length of a list
def word_list_length(word_list):
  """Return the number of items in ``word_list``.

  Args:
    word_list: A sized collection of tokens (for example a list of words),
      or None when there is no value.

  Returns:
    ``len(word_list)``, or None when ``word_list`` is None.
  """
  # This function is wrapped as a Spark UDF, and a null column value is passed
  # to a Python UDF as None. Propagate the null rather than letting len(None)
  # raise TypeError.
  if word_list is None:
    return None
  return len(word_list)



# Sanity check: a seven-word list should give a count of 7
word_list_0 = [
    "testing", "this", "function", "out",
    "using", "these", "words",
]
word_list_length(word_list_0)



# # Another Function - NOT AS ELEGANT, NOT AS CONVENIENT
# def word_list_length(word_list):
#   x = 0
#   for word in word_list:
#     x += 1
#   return x

# # Test function
# word_list_1 = ["spark", "is", "great"]
# word_list_length(word_list_1)

7

In [None]:
# Next we'll need to import the udf function from pyspark to create User-Defined Functions
# We'll also want to import the col function to select a column to be passed into a function
from pyspark.sql.functions import col, udf

# And finally, we'll want to import the IntegerType class so we can declare the return data type of our UDFs
from pyspark.sql.types import IntegerType

In [None]:
# Wrap the plain Python function as a Spark user-defined function (UDF) with an
# integer return type, so it can be applied to DataFrame columns
count_tokens = udf(f=word_list_length, returnType=IntegerType())

In [None]:
# Rebuild the tokenizer and the tokenized DataFrame, then add the token count

# Tokenizer that reads "sentence" and writes "words"
tokenizer = Tokenizer(outputCol="words", inputCol="sentence")

# Apply it to the sample DataFrame
tokenized_df = tokenizer.transform(dataframe)

# Add a "tokens" column holding the word count of each row, and print the
# result without truncating long values
(
    tokenized_df
    .withColumn("tokens", count_tokens(col("words")))
    .show(truncate=False)
)

+---+------------------------------------+--------------------------------------------+------+
|id |sentence                            |words                                       |tokens|
+---+------------------------------------+--------------------------------------------+------+
|0  |Spark is great                      |[spark, is, great]                          |3     |
|1  |We are learning Spark               |[we, are, learning, spark]                  |4     |
|2  |Spark is better than hadoop no doubt|[spark, is, better, than, hadoop, no, doubt]|7     |
+---+------------------------------------+--------------------------------------------+------+

