In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the active Spark session if there is one; otherwise start one named 'nlp'.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [4]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer

In [5]:
from pyspark.sql.functions import col, udf

In [6]:
from pyspark.sql.types import IntegerType

In [7]:
# Three sample sentences. The last one is comma-separated, so the plain
# Tokenizer and the RegexTokenizer produce different tokens for it.
sen_df = spark.createDataFrame(
    data=[
        (0, 'Hi I heared about Spark'),
        (1, 'I wish Java could use case classes'),
        (2, 'Logistic, regression, models, are, neat'),
    ],
    schema=['id', 'sentence'],
)

In [8]:
# Preview the raw sentences (show() truncates long strings by default).
sen_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heared about...|
|  1|I wish Java could...|
|  2|Logistic, regress...|
+---+--------------------+



In [9]:
# Plain tokenizer: lowercases the sentence and splits it on whitespace,
# so punctuation stays attached to the tokens.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

In [10]:
# Regex tokenizer: splits on any non-word character (\W), so punctuation
# such as commas is removed from the tokens.
regex_tokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)

In [11]:
def _count_tokens(words):
    """Return the number of tokens in ``words``, or None when ``words`` is null.

    Spark hands a SQL NULL to a Python UDF as ``None``. Calling ``len()`` on it
    would raise ``TypeError`` and fail the whole job, so nulls are propagated
    as null counts instead.
    """
    return None if words is None else len(words)


# UDF that maps an array-of-tokens column to its length as an integer column.
count_tokens = udf(_count_tokens, IntegerType())

In [12]:
# Apply the whitespace tokenizer: adds a 'words' column derived from 'sentence'.
tokenized = tokenizer.transform(sen_df)

In [13]:
# Inspect the tokens; note the trailing commas kept on row 2's tokens.
tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heared about...|[hi, i, heared, a...|
|  1|I wish Java could...|[i, wish, java, c...|
|  2|Logistic, regress...|[logistic,, regre...|
+---+--------------------+--------------------+



In [14]:
# Append a per-row token count for the whitespace tokenizer and display it.
(
    tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heared about...|[hi, i, heared, a...|     5|
|  1|I wish Java could...|[i, wish, java, c...|     7|
|  2|Logistic, regress...|[logistic,, regre...|     5|
+---+--------------------+--------------------+------+



In [15]:
# Apply the regex tokenizer to the same sentences for comparison.
rg_tokenized = regex_tokenizer.transform(sen_df)

In [16]:
# Append a per-row token count for the regex tokenizer and display it.
(
    rg_tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heared about...|[hi, i, heared, a...|     5|
|  1|I wish Java could...|[i, wish, java, c...|     7|
|  2|Logistic, regress...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



# StopWords

In [17]:
from pyspark.ml.feature import StopWordsRemover

In [23]:
# Pre-tokenized sample rows used to demonstrate stop-word removal.
sentenceDataFrame = spark.createDataFrame(
    data=[
        (0, ['I', 'saw', 'the', 'green', 'horse']),
        (1, ['Mary', 'had', 'a', 'little', 'lamb']),
    ],
    schema=['id', 'tokens'],
)

In [24]:
# Preview the token arrays before filtering.
sentenceDataFrame.show()

+---+--------------------+
| id|              tokens|
+---+--------------------+
|  0|[I, saw, the, gre...|
|  1|[Mary, had, a, li...|
+---+--------------------+



In [25]:
# Stop-word filter using the default stop-word list:
# reads 'tokens' and writes the surviving words to 'filtered'.
remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered',
)

In [26]:
# Show the tokens next to the filtered result (words such as 'I', 'the', 'had', 'a' are dropped).
remover.transform(sentenceDataFrame).show()

+---+--------------------+--------------------+
| id|              tokens|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw, the, gre...| [saw, green, horse]|
|  1|[Mary, had, a, li...|[Mary, little, lamb]|
+---+--------------------+--------------------+



In [27]:
# N-grams: build sequences of n consecutive tokens from each row's word list

In [28]:
from pyspark.ml.feature import NGram

In [39]:
# Pre-tokenized sample rows used to demonstrate n-gram generation.
wordDataFrame = spark.createDataFrame(
    data=[
        (0, ['Hi', 'I', 'heard', 'about', 'spark']),
        (1, ['I', 'wish', 'java', 'could', 'use', 'case', 'classes']),
        (2, ['Logistic', 'regression', 'models', 'are', 'neat']),
    ],
    schema=['id', 'words'],
)

In [40]:
# Bigram generator (n=2): joins each pair of adjacent words with a space.
ngram = NGram(
    n=2,
    inputCol='words',
    outputCol='grams',
)

In [41]:
# Show the words alongside the generated bigrams (truncated view).
ngram.transform(wordDataFrame).show()

+---+--------------------+--------------------+
| id|               words|               grams|
+---+--------------------+--------------------+
|  0|[Hi, I, heard, ab...|[Hi I, I heard, h...|
|  1|[I, wish, java, c...|[I wish, wish jav...|
|  2|[Logistic, regres...|[Logistic regress...|
+---+--------------------+--------------------+



In [42]:
# Display only the bigram column, untruncated, so every bigram is visible.
(
    ngram
    .transform(wordDataFrame)
    .select('grams')
    .show(truncate=False)
)

+------------------------------------------------------------------+
|grams                                                             |
+------------------------------------------------------------------+
|[Hi I, I heard, heard about, about spark]                         |
|[I wish, wish java, java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+

