In [1]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one, otherwise start one named 'nlp'.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [2]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer

In [3]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [4]:
# Sample sentences: the first two are space-separated, while the third uses
# '.' and ',' between words and contains no whitespace at all.
sen_df = spark.createDataFrame(
    data=[
        (0, 'Hi I heard about Spark'),
        (1, 'I wish Java could use case classes'),
        (2, 'Logistic.regression.models,are,neat'),
    ],
    schema=['id', 'sentence'],
)

In [5]:
# Plain tokenizer: reads the 'sentence' column and writes its tokens to 'words'.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

In [6]:
# In a regex, \W matches any NON-word character, i.e. anything other than a
# letter, digit or underscore. It is not a whitespace class (that would be \s).
# Used as the split pattern, it breaks sentences on spaces and punctuation alike.
regex_tokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)

In [7]:
# A UDF is a user-defined function that Spark applies to every row of a column.
# returnType=IntegerType() tells Spark the function returns an int.
@udf(returnType=IntegerType())
def count_tokens(words):
    """Return the number of tokens in ``words``, or None for a null cell.

    Args:
        words: The list of tokens for one row, or None when the cell is null.

    Returns:
        The token count as an int. A null cell yields None (a null count)
        rather than failing the whole job with ``len(None)``.
    """
    if words is None:
        return None
    return len(words)

In [8]:
# Apply the plain Tokenizer to the sample sentences; tokens land in the 'words' column.
tokenized = tokenizer.transform(sen_df)

In [9]:
# Print the tokenized rows to inspect the 'words' column.
tokenized.show()

In [10]:
# The third sentence contains no whitespace, so the plain Tokenizer keeps it as
# one single token. It needs to be split on punctuation ('.' and ',') instead,
# which is what the regex tokenizer is for.
(
    tokenized
    .withColumn('token', count_tokens(col('words')))
    .show()
)

In [11]:
# Tokenize the same sentences again, this time splitting on the \W regex pattern.
rg_tokenized = regex_tokenizer.transform(sen_df)

In [12]:
# Show the regex-tokenized rows together with the number of tokens in each.
(
    rg_tokenized
    .withColumn('token', count_tokens(col('words')))
    .show()
)

In [13]:
from pyspark.ml.feature import StopWordsRemover

In [14]:
# Two pre-tokenized sentences used to demonstrate stop-word removal.
sentenceDataFrame = spark.createDataFrame(
    data=[
        (0, ['I', 'saw', 'the', 'green', 'horse']),
        (1, ['Mary', 'had', 'a', 'little', 'lamb']),
    ],
    schema=['id', 'tokens'],
)

In [15]:
# Stop-word filter: reads 'tokens' and writes the surviving words to 'filtered'.
# No custom stop-word list is passed, so the transformer's default list applies.
remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered',
)

In [16]:
# Run the remover on the sample tokens and print the result; its output goes to 'filtered'.
remover.transform(sentenceDataFrame).show()

In [17]:
# n-gram
from pyspark.ml.feature import NGram

In [18]:
# Three pre-tokenized sentences used as input for the n-gram demo.
wordDataFrame = spark.createDataFrame(
    data=[
        (0, ['Hi', 'I', 'heard', 'about', 'Spark']),
        (1, ['I', 'wish', 'Java', 'could', 'use', 'case', 'classes']),
        (2, ['Logistic', 'regression', 'models', 'are', 'neat']),
    ],
    schema=['id', 'words'],
)

In [19]:
# Bigram builder (n=2): reads the 'words' column and writes to 'grams'.
ngram = NGram(
    n=2,
    inputCol='words',
    outputCol='grams',
)

In [20]:
# NGram groups every run of n consecutive words together; with n=2 that means
# pairs of adjacent words (bigrams). This is useful for studying relationships
# between words, e.g. which words frequently appear next to each other.
(
    ngram
    .transform(wordDataFrame)
    .select('grams')
    .show(truncate=False)
)