In [27]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Spark_LDA_Pipeline") \
    .getOrCreate()

spark


25/12/09 18:00:34 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [28]:
from pyspark.sql import Row

data = [
    Row(text="This is a test document about machine learning and data science."),
    Row(text="Spark LDA is useful for topic modeling on large text corpora."),
    Row(text="Deep learning neural networks process text data differently."),
    Row(text="Topic modeling identifies latent themes in text documents."),
    Row(text="Clustering and classification are core machine learning tasks."),
]

df = spark.createDataFrame(data)
df.show(truncate=False)


+----------------------------------------------------------------+
|text                                                            |
+----------------------------------------------------------------+
|This is a test document about machine learning and data science.|
|Spark LDA is useful for topic modeling on large text corpora.   |
|Deep learning neural networks process text data differently.    |
|Topic modeling identifies latent themes in text documents.      |
|Clustering and classification are core machine learning tasks.  |
+----------------------------------------------------------------+



In [29]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.clustering import LDA
from pyspark.ml import Pipeline

# 1) Tokenize text into tokens
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# 2) Remove stopwords
remover = StopWordsRemover(inputCol="words", outputCol="filtered")

# 3) Convert tokens to term-frequency vectors
cv = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
    vocabSize=5000,
    minDF=1    # keep terms that appear in at least 1 document
)

# 4) LDA model (k = number of topics)
lda = LDA(
    k=3,
    maxIter=10,
    featuresCol="features"
)

# 5) Build pipeline
pipeline = Pipeline(stages=[tokenizer, remover, cv, lda])

pipeline


Pipeline_1f92ada587e9

In [30]:
model = pipeline.fit(df)

# Extract the CV and LDA sub-models
cv_model = model.stages[2]
lda_model = model.stages[3]

vocab = cv_model.vocabulary
len(vocab), vocab[:20]


25/12/09 18:00:48 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


(27,
 ['text',
  'learning',
  'topic',
  'data',
  'machine',
  'modeling',
  'differently.',
  'spark',
  'deep',
  'process',
  'clustering',
  'networks',
  'useful',
  'neural',
  'themes',
  'lda',
  'classification',
  'large',
  'identifies',
  'core'])

In [33]:
num_words = 10

# Spark older versions require positional args
topics = lda_model.describeTopics(num_words)

topics.show(truncate=False)

# Map term indices back to words and print nicely
for row in topics.collect():
    topic_id = row.topic
    term_indices = row.termIndices
    term_weights = row.termWeights
    terms = [vocab[idx] for idx in term_indices]
    
    print(f"Topic {topic_id}:")
    for term, weight in zip(terms, term_weights):
        print(f"  {term:18s} {weight:.4f}")
    print()



+-----+------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|topic|termIndices                         |termWeights                                                                                                                                                                                                             |
+-----+------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |[17, 10, 13, 18, 12, 5, 7, 0, 23, 9]|[0.042917006050659935, 0.04291467482002634, 0.04171040252853162, 0.04170473250332992, 0.041315260837475526, 0.041095392612391916, 0.040482448390576055, 0.0393712237733560

In [32]:
transformed = model.transform(df)

# Show the text and its topic distribution
transformed.select("text", "topicDistribution").show(truncate=False)


+----------------------------------------------------------------+-------------------------------------------------------------+
|text                                                            |topicDistribution                                            |
+----------------------------------------------------------------+-------------------------------------------------------------+
|This is a test document about machine learning and data science.|[0.056317115516664296,0.0580467944260681,0.8856360900572676] |
|Spark LDA is useful for topic modeling on large text corpora.   |[0.04321544726090904,0.9157818198481585,0.04100273289093252] |
|Deep learning neural networks process text data differently.    |[0.04596807084395182,0.9070182104424673,0.047013718713580774]|
|Topic modeling identifies latent themes in text documents.      |[0.05053219543309071,0.8990981717561027,0.05036963281080657] |
|Clustering and classification are core machine learning tasks.  |[0.8791474780115819,0.062251992