In [3]:
from pyspark.sql import SparkSession
# Build the SparkSession with app name 'nlp', or fetch the one already running.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [4]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer

In [5]:
from pyspark.sql.functions import col,udf

In [6]:
from pyspark.sql.types import IntegerType

In [7]:
# Three small example sentences; the last one mixes commas and spaces so the
# two tokenizers below can be compared on it.
sen_df = spark.createDataFrame(
    data=[
        (0, 'hi I heard about Spark'),
        (1, 'i wish java uses more classes'),
        (2, 'logistic regression,models, are need'),
    ],
    schema=['id', 'sentence'],
)

In [8]:
# Defaults spelled out: up to 20 rows, long cells cut off with "...".
sen_df.show(n=20, truncate=True)

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|hi I heard about ...|
|  1|i wish java uses ...|
|  2|logistic regressi...|
+---+--------------------+



In [9]:
# Plain tokenizer: reads 'sentence' and writes the token array to 'words'.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

In [11]:
# Regex-based tokenizer: '\\W' makes every non-word character (commas as well
# as spaces) a split point.
regex_tokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)

In [12]:
def _count_tokens(words):
    """Return the number of tokens in ``words``, or None when ``words`` is null.

    Args:
        words: the token array of one row, or None for a SQL NULL.

    Returns:
        int length of the array, or None so that a null input stays null.
    """
    # Spark passes SQL NULL to Python UDFs as None; len(None) would raise
    # TypeError inside the executor and fail the whole job.
    if words is None:
        return None
    return len(words)


# Column-level UDF: array column in, integer token count out.
count_tokens = udf(_count_tokens, IntegerType())

In [13]:
# Apply the plain tokenizer; the result carries an extra 'words' column.
tokenized = tokenizer.transform(dataset=sen_df)

In [14]:
# Defaults spelled out: up to 20 rows, long cells cut off with "...".
tokenized.show(n=20, truncate=True)

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|hi I heard about ...|[hi, i, heard, ab...|
|  1|i wish java uses ...|[i, wish, java, u...|
|  2|logistic regressi...|[logistic, regres...|
+---+--------------------+--------------------+



In [15]:
# Append the per-row token count for the plain tokenizer's output and display it.
(
    tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|i wish java uses ...|[i, wish, java, u...|     6|
|  2|logistic regressi...|[logistic, regres...|     4|
+---+--------------------+--------------------+------+



In [16]:
# Same input frame, this time run through the regex tokenizer.
rg_tokenized = regex_tokenizer.transform(dataset=sen_df)

In [17]:
# Append the per-row token count for the regex tokenizer's output and display it.
(
    rg_tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|i wish java uses ...|[i, wish, java, u...|     6|
|  2|logistic regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



In [18]:
from pyspark.ml.feature import StopWordsRemover

In [19]:
# Pre-tokenized rows (one upper-case, one lower-case) used as input for the
# stop-word remover.
sentencedataframe = spark.createDataFrame(
    data=[
        (0, ['I', 'SAW', 'THE', 'LION']),
        (1, ['mary', 'had', 'a', 'little', 'lamp']),
    ],
    schema=['id', 'tokens'],
)

In [20]:
# Defaults spelled out: up to 20 rows, long cells cut off with "...".
sentencedataframe.show(n=20, truncate=True)

+---+--------------------+
| id|              tokens|
+---+--------------------+
|  0| [I, SAW, THE, LION]|
|  1|[mary, had, a, li...|
+---+--------------------+



In [21]:
# Stop-word filter: reads the 'tokens' array and writes the surviving tokens
# to 'filtered'.
remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered',
)

In [22]:
# Run the stop-word filter and show the input and filtered arrays side by side.
(
    remover
    .transform(sentencedataframe)
    .show()
)

+---+--------------------+--------------------+
| id|              tokens|            filtered|
+---+--------------------+--------------------+
|  0| [I, SAW, THE, LION]|         [SAW, LION]|
|  1|[mary, had, a, li...|[mary, little, lamp]|
+---+--------------------+--------------------+



In [23]:
from pyspark.ml.feature import NGram

In [33]:
# Pre-tokenized rows used as input for the n-gram transformer.
wordDataframe = spark.createDataFrame(
    data=[
        (0, ['I', 'SAW', 'THE', 'LION']),
        (1, ['mary', 'had', 'a', 'little', 'lamp']),
    ],
    schema=['id', 'words'],
)

In [34]:
# Bigram builder (n=2): reads the 'words' array and writes the n-grams to 'grams'.
ngram = NGram(
    n=2,
    inputCol='words',
    outputCol='grams',
)

In [35]:
# Generate the bigrams and display them next to the source tokens
# (cells are truncated by the default show()).
(
    ngram
    .transform(wordDataframe)
    .show()
)

+---+--------------------+--------------------+
| id|               words|               grams|
+---+--------------------+--------------------+
|  0| [I, SAW, THE, LION]|[I SAW, SAW THE, ...|
|  1|[mary, had, a, li...|[mary had, had a,...|
+---+--------------------+--------------------+



In [37]:
# Show only the 'grams' column, untruncated, so every bigram is readable.
(
    ngram
    .transform(wordDataframe)
    .select('grams')
    .show(truncate=False)
)

+----------------------------------------+
|grams                                   |
+----------------------------------------+
|[I SAW, SAW THE, THE LION]              |
|[mary had, had a, a little, little lamp]|
+----------------------------------------+

