In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable
# so the notebook runs on other machines; fall back to the original local
# install path when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/siddharth/spark-2.4.1-bin-hadoop2.7/'))

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [4]:
from pyspark.ml.feature import CountVectorizer,Tokenizer,RegexTokenizer,StopWordsRemover,NGram,HashingTF,IDF

In [5]:
from pyspark.sql.functions import col,udf
from pyspark.sql.types import IntegerType

## Tokenizers

In [6]:
# Toy corpus: two space-separated sentences and one comma-separated string,
# so the tokenizers used below can be compared on both kinds of input.
data = spark.createDataFrame(
    data=[
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat"),
    ],
    schema=["id", "sentence"],
)

In [7]:
# Basic tokenizer: reads the 'sentence' column and writes the resulting
# list of words to a new 'token' column.
token = Tokenizer(
    inputCol='sentence',
    outputCol='token',
)

In [8]:
# Run the tokenizer over the sample sentences; the result keeps the input
# columns and adds the 'token' column.
token_data = token.transform(data)

In [9]:
# truncate=False prints every column in full instead of cutting long values
# short, so the complete token lists are visible.

token_data.show(truncate=False)

+---+-----------------------------------+------------------------------------------+
|id |sentence                           |token                                     |
+---+-----------------------------------+------------------------------------------+
|0  |Hi I heard about Spark             |[hi, i, heard, about, spark]              |
|1  |I wish Java could use case classes |[i, wish, java, could, use, case, classes]|
|2  |Logistic,regression,models,are,neat|[logistic,regression,models,are,neat]     |
+---+-----------------------------------+------------------------------------------+



In [10]:
# User-defined function (UDF) that returns the number of tokens in an array
# column. A null array yields null instead of making len() raise a TypeError
# inside the Python worker.

def _count_tokens(words):
    """Return the number of items in ``words``, or None when ``words`` is null."""
    return None if words is None else len(words)


countTokens = udf(_count_tokens, IntegerType())

In [11]:
# Append the per-row token count. The comma-separated sentence counts as a
# single token because this tokenizer did not split it on commas.
(
    token_data
    .withColumn('#token', countTokens(col('token')))
    .show(truncate=False)
)

+---+-----------------------------------+------------------------------------------+------+
|id |sentence                           |token                                     |#token|
+---+-----------------------------------+------------------------------------------+------+
|0  |Hi I heard about Spark             |[hi, i, heard, about, spark]              |5     |
|1  |I wish Java could use case classes |[i, wish, java, could, use, case, classes]|7     |
|2  |Logistic,regression,models,are,neat|[logistic,regression,models,are,neat]     |1     |
+---+-----------------------------------+------------------------------------------+------+



In [12]:
# Regex-based tokenizer: the pattern matches non-word characters, which are
# used as the separators between tokens, so commas split the text as well.
reg_token_data = (
    RegexTokenizer(pattern='\\W', inputCol='sentence', outputCol='regToken')
    .transform(data)
)
reg_token_data.show(truncate=False)

+---+-----------------------------------+------------------------------------------+
|id |sentence                           |regToken                                  |
+---+-----------------------------------+------------------------------------------+
|0  |Hi I heard about Spark             |[hi, i, heard, about, spark]              |
|1  |I wish Java could use case classes |[i, wish, java, could, use, case, classes]|
|2  |Logistic,regression,models,are,neat|[logistic, regression, models, are, neat] |
+---+-----------------------------------+------------------------------------------+



In [13]:
# Reuse the countTokens UDF on the regex-tokenized column; the
# comma-separated sentence now yields one token per word.

(
    reg_token_data
    .withColumn('#regToken', countTokens(col('regToken')))
    .show(truncate=False)
)

+---+-----------------------------------+------------------------------------------+---------+
|id |sentence                           |regToken                                  |#regToken|
+---+-----------------------------------+------------------------------------------+---------+
|0  |Hi I heard about Spark             |[hi, i, heard, about, spark]              |5        |
|1  |I wish Java could use case classes |[i, wish, java, could, use, case, classes]|7        |
|2  |Logistic,regression,models,are,neat|[logistic, regression, models, are, neat] |5        |
+---+-----------------------------------+------------------------------------------+---------+



## Stopwords

In [14]:
# Already-tokenized sentences used as input for stop-word removal.
sentenceData = spark.createDataFrame(
    data=[
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"]),
    ],
    schema=["id", "raw"],
)

sentenceData.show(truncate=False)

+---+----------------------------+
|id |raw                         |
+---+----------------------------+
|0  |[I, saw, the, red, balloon] |
|1  |[Mary, had, a, little, lamb]|
+---+----------------------------+



In [15]:
# Filter stop words out of the 'raw' token lists using the remover's default
# word list, and show the filtered lists next to the originals.
(
    StopWordsRemover(inputCol='raw', outputCol='removed_words')
    .transform(sentenceData)
    .show(truncate=False)
)

+---+----------------------------+--------------------+
|id |raw                         |removed_words       |
+---+----------------------------+--------------------+
|0  |[I, saw, the, red, balloon] |[saw, red, balloon] |
|1  |[Mary, had, a, little, lamb]|[Mary, little, lamb]|
+---+----------------------------+--------------------+



## n-grams

In [16]:
# Already-tokenized sentences used as input for n-gram generation.
wordDataFrame = spark.createDataFrame(
    data=[
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"]),
    ],
    schema=["id", "words"],
)

wordDataFrame.show(truncate=False)

+---+------------------------------------------+
|id |words                                     |
+---+------------------------------------------+
|0  |[Hi, I, heard, about, Spark]              |
|1  |[I, wish, Java, could, use, case, classes]|
|2  |[Logistic, regression, models, are, neat] |
+---+------------------------------------------+



In [17]:
# Build bigrams (n=2): each output element joins two consecutive words
# from the 'words' column with a space.
(
    NGram(n=2, inputCol='words', outputCol='ngram2')
    .transform(wordDataFrame)
    .show(truncate=False)
)

+---+------------------------------------------+------------------------------------------------------------------+
|id |words                                     |ngram2                                                            |
+---+------------------------------------------+------------------------------------------------------------------+
|0  |[Hi, I, heard, about, Spark]              |[Hi I, I heard, heard about, about Spark]                         |
|1  |[I, wish, Java, could, use, case, classes]|[I wish, wish Java, Java could, could use, use case, case classes]|
|2  |[Logistic, regression, models, are, neat] |[Logistic regression, regression models, models are, are neat]    |
+---+------------------------------------------+------------------------------------------------------------------+



## Feature Extractor

### TF-IDF

In [18]:
# Labelled sentences used for the TF-IDF example.
sentenceData = spark.createDataFrame(
    data=[
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat"),
    ],
    schema=["label", "sentence"],
)

sentenceData.show(truncate=False)

+-----+-----------------------------------+
|label|sentence                           |
+-----+-----------------------------------+
|0.0  |Hi I heard about Spark             |
|0.0  |I wish Java could use case classes |
|1.0  |Logistic regression models are neat|
+-----+-----------------------------------+



In [19]:
# Step 1 of TF-IDF: split each sentence into a list of words.
token_data = (
    Tokenizer(inputCol='sentence', outputCol='words')
    .transform(sentenceData)
)
token_data.show(truncate=False)

+-----+-----------------------------------+------------------------------------------+
|label|sentence                           |words                                     |
+-----+-----------------------------------+------------------------------------------+
|0.0  |Hi I heard about Spark             |[hi, i, heard, about, spark]              |
|0.0  |I wish Java could use case classes |[i, wish, java, could, use, case, classes]|
|1.0  |Logistic regression models are neat|[logistic, regression, models, are, neat] |
+-----+-----------------------------------+------------------------------------------+



In [20]:
# Step 2 of TF-IDF: hash every word into one of 20 buckets and count the
# occurrences per bucket. With so few buckets different words can share an
# index (the first row has five distinct words but only four indices).
tf_data = (
    HashingTF(numFeatures=20, inputCol='words', outputCol='tf')
    .transform(token_data)
)
tf_data.show(truncate=False)

+-----+-----------------------------------+------------------------------------------+-----------------------------------------+
|label|sentence                           |words                                     |tf                                       |
+-----+-----------------------------------+------------------------------------------+-----------------------------------------+
|0.0  |Hi I heard about Spark             |[hi, i, heard, about, spark]              |(20,[0,5,9,17],[1.0,1.0,1.0,2.0])        |
|0.0  |I wish Java could use case classes |[i, wish, java, could, use, case, classes]|(20,[2,7,9,13,15],[1.0,1.0,3.0,1.0,1.0]) |
|1.0  |Logistic regression models are neat|[logistic, regression, models, are, neat] |(20,[4,6,13,15,18],[1.0,1.0,1.0,1.0,1.0])|
+-----+-----------------------------------+------------------------------------------+-----------------------------------------+



### IDF is an estimator: `fit` learns the inverse document frequencies and returns a model, which `transform` then applies

In [21]:
# Step 3 of TF-IDF: fit IDF on the term-frequency vectors, then use the
# fitted model to rescale those same vectors into the 'features' column.
tf_idf_data = (
    IDF(inputCol='tf', outputCol='features')
    .fit(tf_data)
    .transform(tf_data)
)
tf_idf_data.select('features').show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------+
|features                                                                                                              |
+----------------------------------------------------------------------------------------------------------------------+
|(20,[0,5,9,17],[0.6931471805599453,0.6931471805599453,0.28768207245178085,1.3862943611198906])                        |
|(20,[2,7,9,13,15],[0.6931471805599453,0.6931471805599453,0.8630462173553426,0.28768207245178085,0.28768207245178085]) |
|(20,[4,6,13,15,18],[0.6931471805599453,0.6931471805599453,0.28768207245178085,0.28768207245178085,0.6931471805599453])|
+----------------------------------------------------------------------------------------------------------------------+



## CountVectorizer

In [22]:
# Input data: each row is a bag of words with an ID.
df = spark.createDataFrame(
    data=[
        (0, "a b c".split(" ")),
        (1, "a b b c a".split(" ")),
    ],
    schema=["id", "words"],
)

df.show(truncate=False)

+---+---------------+
|id |words          |
+---+---------------+
|0  |[a, b, c]      |
|1  |[a, b, b, c, a]|
+---+---------------+



### CountVectorizer is an estimator: `fit` learns the vocabulary and returns a model, which `transform` then applies

In [23]:
# Fit a vocabulary on the bags of words (at most 3 terms, each required to
# appear in at least 2 documents), then turn every row into a vector of
# per-term counts.
(
    CountVectorizer(inputCol='words', outputCol='count_Vect', vocabSize=3, minDF=2)
    .fit(df)
    .transform(df)
    .show(truncate=False)
)

+---+---------------+-------------------------+
|id |words          |count_Vect               |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+

