In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the Spark session named 'nlp' that the following cells call into.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [3]:
from pyspark.ml.feature import Tokenizer,RegexTokenizer
from pyspark.sql.functions import col,udf
from pyspark.sql.types import IntegerType

In [4]:
# Three sample sentences keyed by id; the last one is comma-delimited instead of
# space-delimited, which is what separates the two tokenizers further down.
sentence_df = spark.createDataFrame(
    [
        (0, 'Hi I heard about Spark'),
        (1, 'I wish Java could use case classes'),
        (2, 'logistic,regression,models,are,neat'),
    ],
    schema=['id', 'sentence'],
)

In [5]:
# Preview the raw sentences (long strings are truncated in the default display).
sentence_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish Java could...|
|  2|logistic,regressi...|
+---+--------------------+



# Tokenizer

In [6]:
# Plain tokenizer: reads 'sentence' and writes the token array to 'words'.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

# Regex tokenizer on the same columns; the pattern '\W' matches non-word
# characters, so commas act as separators as well.
regexTokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)

In [7]:
# UDF that returns the number of tokens held in an array column.
# A null array yields a null count: calling len(None) would raise TypeError
# inside the Python worker and abort the whole Spark job.
countTokens = udf(
    lambda words: len(words) if words is not None else None,
    IntegerType(),
)

In [8]:
# Apply the plain Tokenizer; the result carries an extra 'words' array column.
tokenized = tokenizer.transform(sentence_df)

In [9]:
# Add a 'tokens' column with the size of each 'words' array and display it.
(
    tokenized
    .withColumn('tokens', countTokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish Java could...|[i, wish, java, c...|     7|
|  2|logistic,regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



In [10]:
# Apply the regex-based tokenizer to the same sentences for comparison.
regexTokenized = regexTokenizer.transform(sentence_df)

In [11]:
# Same token-count view as for the plain tokenizer, now on the regex output.
(
    regexTokenized
    .withColumn('tokens', countTokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish Java could...|[i, wish, java, c...|     7|
|  2|logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



# Stop Words Removal

In [12]:
from pyspark.ml.feature import StopWordsRemover

In [13]:
# Pre-tokenized sample rows for the stop-word demo.
# NOTE: this rebinds `sentence_df`, replacing the string-sentence frame used in
# the tokenizer section above.
sentence_df = spark.createDataFrame(
    [
        (0, ['I', 'saw', 'the', 'red', 'balloon']),
        (1, ['mary', 'had', 'a', 'little', 'lamb']),
    ],
    schema=['id', 'raw'],
)

In [14]:
# Preview the pre-tokenized rows held in the 'raw' column.
sentence_df.show()

+---+--------------------+
| id|                 raw|
+---+--------------------+
|  0|[I, saw, the, red...|
|  1|[mary, had, a, li...|
+---+--------------------+



In [15]:
# Stop-word filter: reads token arrays from 'raw' and writes the filtered
# arrays to 'featured' (default stop-word list, no extra options set).
remover = StopWordsRemover(
    inputCol='raw',
    outputCol='featured',
)

In [16]:
# Run the filter and print full (untruncated) rows to compare 'raw' with 'featured'.
(
    remover
    .transform(sentence_df)
    .show(truncate=False)
)

+---+----------------------------+--------------------+
|id |raw                         |featured            |
+---+----------------------------+--------------------+
|0  |[I, saw, the, red, balloon] |[saw, red, balloon] |
|1  |[mary, had, a, little, lamb]|[mary, little, lamb]|
+---+----------------------------+--------------------+



# n-grams

In [17]:
from pyspark.ml.feature import NGram

In [18]:
# Already-tokenized sentences used as input for the n-gram transformer.
wordDataFrame = spark.createDataFrame(
    [
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"]),
    ],
    schema=["id", "words"],
)

In [19]:
# Display the token arrays in full so the n-gram output can be compared against them.
wordDataFrame.show(truncate=False)

+---+------------------------------------------+
|id |words                                     |
+---+------------------------------------------+
|0  |[Hi, I, heard, about, Spark]              |
|1  |[I, wish, Java, could, use, case, classes]|
|2  |[Logistic, regression, models, are, neat] |
+---+------------------------------------------+



In [20]:
# Bigram generator (n=2): reads 'words' and writes the n-gram strings to 'ngrams'.
ngram = NGram(
    inputCol='words',
    outputCol='ngrams',
    n=2,
)

In [21]:
# Apply the bigram transformer; the result gains an 'ngrams' column.
ngramDataFrame = ngram.transform(wordDataFrame)

In [22]:
# Show only the generated bigrams, untruncated.
(
    ngramDataFrame
    .select('ngrams')
    .show(truncate=False)
)

+------------------------------------------------------------------+
|ngrams                                                            |
+------------------------------------------------------------------+
|[Hi I, I heard, heard about, about Spark]                         |
|[I wish, wish Java, Java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+



In [23]:
from pyspark.ml.feature import HashingTF,IDF,Tokenizer

In [24]:
# Labelled sentences for the TF-IDF walkthrough; the third row is
# comma-delimited rather than space-delimited.
sentenceData = spark.createDataFrame(
    [
        (0.0, 'Hi I heard about Spark'),
        (0.0, 'I wish Java could use case classes'),
        (1.0, 'logistic,regression,models,are,neat'),
    ],
    schema=['label', 'sentence'],
)

In [25]:
# Preview the labelled sentences.
sentenceData.show()

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|  0.0|Hi I heard about ...|
|  0.0|I wish Java could...|
|  1.0|logistic,regressi...|
+-----+--------------------+



In [26]:
# Rebind `tokenizer` for the TF-IDF section (same columns as the earlier one).
# NOTE: this tokenizer does not split on commas, so the comma-delimited third
# sentence stays a single token -- visible in the `words` column shown below.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

In [27]:
# Tokenize the labelled sentences; adds the 'words' array column.
words_data = tokenizer.transform(sentenceData)

In [28]:
# Inspect the tokenized rows before hashing.
words_data.show()

+-----+--------------------+--------------------+
|label|            sentence|               words|
+-----+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|
|  0.0|I wish Java could...|[i, wish, java, c...|
|  1.0|logistic,regressi...|[logistic,regress...|
+-----+--------------------+--------------------+



In [29]:
# Term-frequency hasher: maps the 'words' arrays to sparse count vectors in
# 'rawFeatures'. numFeatures is not set here; the vectors displayed further
# down have size 262144.
hashing_tf = HashingTF(
    inputCol='words',
    outputCol='rawFeatures',
)

In [30]:
# Hash the tokens into term-frequency vectors ('rawFeatures').
hashed_data = hashing_tf.transform(words_data)

In [31]:
# Show the full sparse vectors: (size, [hashed indices], [term counts]).
hashed_data.show(truncate=False)

+-----+-----------------------------------+------------------------------------------+--------------------------------------------------------------------------------------+
|label|sentence                           |words                                     |rawFeatures                                                                           |
+-----+-----------------------------------+------------------------------------------+--------------------------------------------------------------------------------------+
|0.0  |Hi I heard about Spark             |[hi, i, heard, about, spark]              |(262144,[24417,49304,73197,91137,234657],[1.0,1.0,1.0,1.0,1.0])                       |
|0.0  |I wish Java could use case classes |[i, wish, java, could, use, case, classes]|(262144,[20719,24417,55551,116873,147765,162369,192310],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
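|1.0  |logistic,regression,models,are,neat|[logistic,regression,models,are,neat]     |(262144,[84234],[1.0])                                                                |
+-----+-----------------------------------+------------------------------------------+--------------------------------------------------------------------------------------+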

In [32]:
# IDF estimator: rescales the raw term-frequency vectors into 'features'.
idf = IDF(
    inputCol='rawFeatures',
    outputCol='features',
)

In [33]:
# Fit the IDF estimator on the hashed term-frequency vectors.
idf_model = idf.fit(hashed_data)

In [34]:
# Apply the fitted IDF model; adds the rescaled 'features' column.
rescaled_data = idf_model.transform(hashed_data)

In [37]:
# Show each label next to its TF-IDF vector, untruncated.
(
    rescaled_data
    .select('label', 'features')
    .show(truncate=False)
)

+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                        |
+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(262144,[24417,49304,73197,91137,234657],[0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453])                                                     |
|0.0  |(262144,[20719,24417,55551,116873,147765,162369,192310],[0.6931471805599453,0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.693147180559945

In [38]:
from pyspark.ml.feature import CountVectorizer

In [39]:
# Two tiny token lists over the vocabulary {a, b, c} for the CountVectorizer demo.
df = spark.createDataFrame(
    [
        (0, ['a', 'b', 'c']),
        (1, ['a', 'b', 'b', 'c', 'a']),
    ],
    schema=['id', 'words'],
)

In [40]:
# Preview the two token lists.
df.show()

+---+---------------+
| id|          words|
+---+---------------+
|  0|      [a, b, c]|
|  1|[a, b, b, c, a]|
+---+---------------+



In [42]:
# Count-based vectorizer over 'words': vocabulary capped at 3 terms, with
# minDF=2.0 (per the Spark docs, a value >= 1 is the minimum number of
# documents a term must appear in to enter the vocabulary).
cv = CountVectorizer(
    inputCol='words',
    outputCol='features',
    vocabSize=3,
    minDF=2.0,
)

In [43]:
# Fit the vectorizer on the sample rows.
model = cv.fit(df)

In [44]:
# Transform the same rows into count vectors ('features').
result = model.transform(df)

In [45]:
# Show the full sparse count vectors: (vocab size, [term indices], [counts]).
result.show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+

