In [2]:
from pyspark.ml.clustering import LDA
from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Example script: fit an LDA topic model on a libsvm-formatted dataset,
    # print its fit metrics and per-topic top terms, then show the dataset
    # after it has been passed through the fitted model.
    spark = SparkSession \
        .builder \
        .appName("LDAExample") \
        .getOrCreate()

    # Everything that uses the session runs inside try/finally so the session
    # is stopped even when loading, fitting, or displaying raises.
    try:
        # Loads data.
        # NOTE(review): the path has no directory component, so it is
        # presumably resolved against the working directory / default
        # filesystem -- confirm for the target deployment.
        dataset = spark.read.format("libsvm").load("sample_lda_libsvm_data.txt")

        # Trains a LDA model with 10 topics (k) and at most 10 iterations.
        lda = LDA(k=10, maxIter=10)
        model = lda.fit(dataset)

        # Fit metrics evaluated on the same dataset the model was trained on.
        ll = model.logLikelihood(dataset)
        lp = model.logPerplexity(dataset)
        print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
        print("The upper bound on perplexity: " + str(lp))

        # Describe topics, limited to the 3 top-weighted terms per topic.
        topics = model.describeTopics(3)
        print("The topics described by their top-weighted terms:")
        topics.show(truncate=False)

        # Shows the result of applying the fitted model to the dataset.
        transformed = model.transform(dataset)
        transformed.show(truncate=False)
    finally:
        spark.stop()

The lower bound on the log likelihood of the entire corpus: -790.8954414680816
The upper bound on perplexity: 3.041905544108006
The topics described by their top-weighted terms:
+-----+-----------+---------------------------------------------------------------+
|topic|termIndices|termWeights                                                    |
+-----+-----------+---------------------------------------------------------------+
|0    |[4, 8, 6]  |[0.10430883833529807, 0.10404870575437365, 0.09556068779773554]|
|1    |[5, 9, 1]  |[0.11219984333172794, 0.1008792924027928, 0.09701302380199746] |
|2    |[10, 6, 3] |[0.27769682778211396, 0.15610547738154668, 0.10421058096457766]|
|3    |[6, 10, 5] |[0.10004403404278889, 0.096828070818015, 0.09275715962797348]  |
|4    |[6, 1, 8]  |[0.1030676314016485, 0.10285888091794514, 0.100316438364213]   |
|5    |[7, 8, 3]  |[0.1033966444824766, 0.10008498686769687, 0.09915597626003157] |
|6    |[6, 1, 3]  |[0.10294822920495952, 0.09937010951136092, 0.09