In [1]:
from datasets import load_dataset
import pandas as pd
import time

# LDA parameters, go to max in production
MAX_ITER = 1
CHECKPOINT = 5
K = 100

In [2]:
dataset = load_dataset("wikipedia", "20220301.simple")
df = pd.DataFrame(dataset['train'])

Found cached dataset wikipedia (C:/Users/vomin/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
# test with random 10000 documents
df = df.sample(n=10000)
len(df)

10000

In [4]:
import pyspark
from pyspark import SparkContext, SparkConf

# create the session
conf = SparkConf().set("spark.ui.port", "4050")

# create the context
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = pyspark.sql.SparkSession.builder.getOrCreate()

spark

In [5]:
sparkDF = spark.createDataFrame(df)
sparkDF.columns

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


['id', 'url', 'title', 'text']

In [6]:
from pyspark.ml.feature import StopWordsRemover, RegexTokenizer, CountVectorizer
from pyspark.sql.functions import col,udf
from pyspark.sql.types import IntegerType

In [8]:
tokenizer = RegexTokenizer(inputCol='text',outputCol='words', pattern = '[^a-zA-Z]')
tokenized_df = tokenizer.transform(sparkDF).drop('text')
tokenized_df.head()

Row(id='413960', url='https://simple.wikipedia.org/wiki/Aurelio%20Mosquera', title='Aurelio Mosquera', words=['don', 'aurelio', 'mosquera', 'narv', 'ez', 'august', 'november', 'was', 'an', 'ecuadorian', 'politician', 'who', 'was', 'the', 'president', 'of', 'ecuador', 'from', 'december', 'to', 'november', 'mosquera', 'was', 'born', 'in', 'quito', 'he', 'studied', 'medicine', 'in', 'quito', 'then', 'went', 'to', 'paris', 'to', 'carry', 'in', 'with', 'his', 'education', 'in', 'after', 'the', 'removal', 'of', 'manuel', 'mar', 'a', 'borrero', 'he', 'was', 'named', 'president', 'of', 'ecuador', 'during', 'his', 'short', 'time', 'as', 'president', 'with', 'the', 'support', 'of', 'the', 'army', 'he', 'dissolved', 'the', 'national', 'assembly', 'and', 'reestablished', 'the', 'constitution', 'of', 'known', 'for', 'its', 'secularity', 'his', 'term', 'as', 'president', 'ended', 'when', 'he', 'died', 'on', 'november', 'in', 'quito', 'from', 'a', 'heart', 'attack', 'aged', 'other', 'websites', 'offi

In [9]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
removed_df = remover.transform(tokenized_df).drop('words')
removed_df.head()

Row(id='413960', url='https://simple.wikipedia.org/wiki/Aurelio%20Mosquera', title='Aurelio Mosquera', filtered=['aurelio', 'mosquera', 'narv', 'ez', 'august', 'november', 'ecuadorian', 'politician', 'president', 'ecuador', 'december', 'november', 'mosquera', 'born', 'quito', 'studied', 'medicine', 'quito', 'went', 'paris', 'carry', 'education', 'removal', 'manuel', 'mar', 'borrero', 'named', 'president', 'ecuador', 'short', 'time', 'president', 'support', 'army', 'dissolved', 'national', 'assembly', 'reestablished', 'constitution', 'known', 'secularity', 'term', 'president', 'ended', 'died', 'november', 'quito', 'heart', 'attack', 'aged', 'websites', 'official', 'website', 'ecuadorian', 'government', 'country', 'president', 'history', 'births', 'deaths', 'deaths', 'myocardial', 'infarction', 'presidents', 'ecuador', 'people', 'quito'])

In [10]:
cv = CountVectorizer(inputCol="filtered", outputCol="features", minDF=2.0).fit(removed_df)

lda_count = cv.transform(removed_df).drop('filtered')
lda_count.head()

Row(id='413960', url='https://simple.wikipedia.org/wiki/Aurelio%20Mosquera', title='Aurelio Mosquera', features=SparseVector(32719, {0: 1.0, 7: 1.0, 10: 1.0, 11: 1.0, 14: 2.0, 17: 1.0, 25: 1.0, 32: 1.0, 33: 1.0, 60: 5.0, 62: 1.0, 67: 1.0, 79: 1.0, 91: 1.0, 92: 3.0, 108: 1.0, 118: 1.0, 134: 1.0, 143: 1.0, 157: 1.0, 233: 1.0, 291: 1.0, 324: 1.0, 357: 1.0, 376: 1.0, 382: 1.0, 432: 1.0, 569: 1.0, 617: 1.0, 623: 1.0, 686: 1.0, 771: 1.0, 988: 1.0, 1155: 1.0, 1303: 1.0, 1311: 1.0, 2375: 1.0, 2406: 1.0, 2460: 1.0, 2642: 1.0, 3045: 3.0, 3214: 1.0, 5905: 1.0, 7195: 2.0, 9392: 1.0, 10900: 1.0, 11863: 4.0, 28548: 1.0, 28700: 1.0}))

In [11]:
from pyspark.ml.clustering import LDA

# create LDA with 100 topics

start = time.time()
lda = LDA(k=K, seed=1, optimizer="em", maxIter=MAX_ITER, checkpointInterval=CHECKPOINT)
model = lda.fit(lda_count)
end = time.time()

print(f"Time elapsed: {end-start:.2f} seconds")

Time elapsed: 31.88 seconds


In [12]:
topicIndices = model.describeTopics(maxTermsPerTopic = 5)
vocabList = cv.vocabulary

for row in topicIndices.collect():
    print(f"Topic {row.topic + 1}: ")
    for topic, weight in zip(row.termIndices, row.termWeights):
        print(f"{vocabList[topic]} {weight:.2E}")
    print()

Topic 1: 
people 6.84E-03
american 6.62E-03
references 6.55E-03
also 4.63E-03
first 4.19E-03

Topic 2: 
people 6.77E-03
american 6.69E-03
references 6.46E-03
also 4.68E-03
first 4.12E-03

Topic 3: 
people 6.81E-03
american 6.53E-03
references 6.52E-03
also 4.61E-03
first 3.83E-03

Topic 4: 
people 6.98E-03
references 6.52E-03
american 6.31E-03
also 4.56E-03
first 4.12E-03

Topic 5: 
american 6.88E-03
people 6.83E-03
references 6.65E-03
also 4.58E-03
first 3.98E-03

Topic 6: 
american 7.13E-03
people 7.06E-03
references 6.51E-03
also 4.79E-03
first 3.91E-03

Topic 7: 
people 6.97E-03
american 6.86E-03
references 6.58E-03
also 4.70E-03
first 3.91E-03

Topic 8: 
american 6.58E-03
people 6.55E-03
references 6.55E-03
also 4.70E-03
first 4.02E-03

Topic 9: 
people 6.95E-03
american 6.88E-03
references 6.60E-03
also 4.90E-03
first 3.84E-03

Topic 10: 
people 7.00E-03
american 6.78E-03
references 6.50E-03
also 4.56E-03
first 3.89E-03

Topic 11: 
people 7.14E-03
american 6.77E-03
references 6.5