In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName('tools')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/22 22:32:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType



In [3]:
# Sample sentences for the tokenizer demos. Row id 3 uses commas as its only
# separators and row id 4 has a comma directly after its first word.
sen_df = spark.createDataFrame(
    data=[
        (0, "PySpark is a powerful tool for big data processing"),
        (1, "Creating DataFrames is easy with PySpark"),
        (2, "SparkSession provides a unified entry point for reading data and executing queries"),
        (3, "Logistic,regression,models,are,neat"),
        (4, "Finally, this is the last sentence"),
    ],
    schema=['id', 'sentence'],
)

In [4]:
# Preview the sentences; long values are cut off with "..." in the printed table
sen_df.show()

                                                                                

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|PySpark is a powe...|
|  1|Creating DataFram...|
|  2|SparkSession prov...|
|  3|Logistic,regressi...|
|  4|Finally, this is ...|
+---+--------------------+



In [5]:
# Basic tokenizer: reads the 'sentence' column and writes the resulting
# array of tokens to a new 'words' column.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

In [6]:
# Regex-based tokenizer with the same input/output columns as `tokenizer`.
# '\\W' matches any non-word character, so punctuation such as commas
# separates tokens in addition to white space.
regex_tokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)

In [7]:
# udf: user-defined function
def _token_count(words):
    """Return the number of tokens in ``words``.

    Returns None (a null in the resulting column) when ``words`` is None,
    instead of letting ``len(None)`` raise inside the Python worker and
    fail the whole job.
    """
    return len(words) if words is not None else None


# Wrap the plain Python function so it can be applied to a DataFrame column;
# IntegerType() declares the type of the column it produces.
count_tokens = udf(_token_count, IntegerType())

In [8]:
# Apply the basic tokenizer; the token arrays are added as a new 'words' column
tokenized = tokenizer.transform(sen_df)

In [9]:
# The truncated display makes it unclear whether the 4th row
# has been successfully tokenized or is still a single long string
tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|PySpark is a powe...|[pyspark, is, a, ...|
|  1|Creating DataFram...|[creating, datafr...|
|  2|SparkSession prov...|[sparksession, pr...|
|  3|Logistic,regressi...|[logistic,regress...|
|  4|Finally, this is ...|[finally,, this, ...|
+---+--------------------+--------------------+



In [10]:
# The 4th row stays a single token because it contains no white space to split on.
# The 5th row was split on white space, so its first token is "finally,"
# with the comma still attached.
(
    tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

                                                                                

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|PySpark is a powe...|[pyspark, is, a, ...|     9|
|  1|Creating DataFram...|[creating, datafr...|     6|
|  2|SparkSession prov...|[sparksession, pr...|    12|
|  3|Logistic,regressi...|[logistic,regress...|     1|
|  4|Finally, this is ...|[finally,, this, ...|     6|
+---+--------------------+--------------------+------+



In [11]:
# Tokenize with the regex tokenizer, which splits not only on white space
# but also on commas
regex_tokenized = regex_tokenizer.transform(sen_df)

In [12]:
# Same token count as before, now on the regex-tokenized sentences:
# the comma-separated row is broken into its individual words.
(
    regex_tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|PySpark is a powe...|[pyspark, is, a, ...|     9|
|  1|Creating DataFram...|[creating, datafr...|     6|
|  2|SparkSession prov...|[sparksession, pr...|    12|
|  3|Logistic,regressi...|[logistic, regres...|     5|
|  4|Finally, this is ...|[finally, this, i...|     6|
+---+--------------------+--------------------+------+



In [13]:
from pyspark.ml.feature import StopWordsRemover

In [14]:
# Two short sentences used to demonstrate stop-word removal.
sentenceDataFrame = spark.createDataFrame(
    data=[
        (0, "I saw the green horse"),
        (1, "Mary had a little lamb"),
    ],
    schema=["id", "sentence"],
)

In [15]:
# Preview the raw sentences before tokenizing
sentenceDataFrame.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|I saw the green h...|
|  1|Mary had a little...|
+---+--------------------+



In [16]:
# Reuse the basic tokenizer defined earlier to add the 'words' column
# that the stop-word remover takes as its input
sentenceDataFrame_tokenized = tokenizer.transform(sentenceDataFrame)

In [17]:
# Show the sentences next to their token arrays
sentenceDataFrame_tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|I saw the green h...|[i, saw, the, gre...|
|  1|Mary had a little...|[mary, had, a, li...|
+---+--------------------+--------------------+



In [18]:
# Removes stop words from the 'words' array and stores what remains in
# 'filtered'. The locale is pinned to en_US (the value Spark fell back to
# when it was left unset) so that the result does not depend on the default
# locale of the machine running the notebook.
remover = StopWordsRemover(
    inputCol='words',
    outputCol='filtered',
    locale='en_US',
)

24/01/22 22:32:49 WARN StopWordsRemover: Default locale set was [en_GR]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


In [19]:
# Compare each token array ('words') with its stop-word-free version ('filtered').
(
    remover
    .transform(sentenceDataFrame_tokenized)
    .show()
)

+---+--------------------+--------------------+--------------------+
| id|            sentence|               words|            filtered|
+---+--------------------+--------------------+--------------------+
|  0|I saw the green h...|[i, saw, the, gre...| [saw, green, horse]|
|  1|Mary had a little...|[mary, had, a, li...|[mary, little, lamb]|
+---+--------------------+--------------------+--------------------+

