In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer, Tokenizer

In [2]:
# Start a Spark session for this notebook, or attach to one that already exists
spark = (
    SparkSession.builder
    .appName('vectorizer')
    .getOrCreate()
)

In [3]:
# Three short sample documents, each paired with an integer id
sample_rows = [
    (0, "many many some few many"),
    (1, "some many some few some"),
    (2, "one many"),
]
df = spark.createDataFrame(sample_rows, schema=["id", "words"])

In [4]:
# Break each string in `words` into an array of tokens stored in `tokens`
tokenizer = Tokenizer(
    inputCol="words",
    outputCol="tokens",
)
tokenized = tokenizer.transform(df)
tokenized.show()

+---+--------------------+--------------------+
| id|               words|              tokens|
+---+--------------------+--------------------+
|  0|many many some fe...|[many, many, some...|
|  1|some many some fe...|[some, many, some...|
|  2|            one many|         [one, many]|
+---+--------------------+--------------------+



In [5]:
# Count how often each token occurs in every row. The output column name
# spells out the three parts of the sparse vector that show() prints:
# (size, [indexes], [frequencies]).
countvectorizer = CountVectorizer(
    inputCol='tokens',
    outputCol='Vectors, [Indexes], [Frequencies]',
    minTF=1.0,
    minDF=1.0,
    vocabSize=20,
)

# Learn the vocabulary from the tokens, then append the count vectors
model = countvectorizer.fit(tokenized)
result = model.transform(tokenized)
result.show(truncate=False)

+---+-----------------------+-----------------------------+---------------------------------+
|id |words                  |tokens                       |Vectors, [Indexes], [Frequencies]|
+---+-----------------------+-----------------------------+---------------------------------+
|0  |many many some few many|[many, many, some, few, many]|(4,[0,1,2],[3.0,1.0,1.0])        |
|1  |some many some few some|[some, many, some, few, some]|(4,[0,1,2],[1.0,3.0,1.0])        |
|2  |one many               |[one, many]                  |(4,[0,3],[1.0,1.0])              |
+---+-----------------------+-----------------------------+---------------------------------+



### Checking Vectors
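To double-check that each word is matched with the correct index, we will list the index of every word in an easy-to-read format. `StringIndexer` numbers terms by descending frequency by default, so its indices should line up with the vocabulary indices shown in the `CountVectorizer` output above.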

In [6]:
# Flatten the token arrays into a single-column DataFrame (`terms`) with one
# row per token occurrence. Duplicates are kept on purpose: the indexer below
# needs the raw occurrences to rank words by frequency.
# explode() does this inside the DataFrame API, avoiding a round trip through
# the RDD and a Python lambda for every row.
from pyspark.sql.functions import explode

df_vocab = tokenized.select(explode('tokens').alias('terms'))
df_vocab.show()

+-----+
|terms|
+-----+
| many|
| many|
| some|
|  few|
| many|
| some|
| many|
| some|
|  few|
| some|
|  one|
| many|
+-----+



In [7]:
# Set up a StringIndexer that assigns a numeric index to each word
from pyspark.ml.feature import StringIndexer

stringindexer = StringIndexer(
    inputCol='terms',
    outputCol='StringIndexer(index)',
)

In [8]:
# Fit the indexer on every token occurrence and label each row with its index,
# then collapse to one row per word and list the words in index order
indexer_model = stringindexer.fit(df_vocab)
indexed_terms = indexer_model.transform(df_vocab)
indexed_terms.distinct().orderBy('StringIndexer(index)').show()

+-----+--------------------+
|terms|StringIndexer(index)|
+-----+--------------------+
| many|                 0.0|
| some|                 1.0|
|  few|                 2.0|
|  one|                 3.0|
+-----+--------------------+



In [9]:
# Stop the Spark session that was created at the top of the notebook
spark.stop()