In [2]:
from datasets import load_dataset
import pandas as pd
import time

In [3]:
# Load the "20220301.simple" configuration of the "wikipedia" dataset and
# materialise its 'train' split as a pandas DataFrame.
dataset = load_dataset("wikipedia", "20220301.simple")
# Dataset.to_pandas() is the library's own conversion to pandas; it avoids
# pushing the whole split through the generic pd.DataFrame constructor.
df = dataset['train'].to_pandas()

Found cached dataset wikipedia (C:/Users/vomin/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Work on a random subset of (at most) 10000 documents.
# random_state pins the draw so a re-run selects the same documents; the
# min() guard keeps sample() from raising when fewer rows are available.
df = df.sample(n=min(10000, len(df)), random_state=1)
len(df)

10000

In [5]:
import pyspark
from pyspark import SparkContext, SparkConf

# Spark configuration: serve the Spark UI on port 4050.
conf = SparkConf()
conf.set("spark.ui.port", "4050")

# Obtain (or reuse) the SparkContext with that configuration, then the
# SparkSession.
sc = SparkContext.getOrCreate(conf=conf)
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Display the session as the cell output.
spark

In [6]:
# Turn the pandas frame into a Spark DataFrame and show its column names.
sparkDF = spark.createDataFrame(data=df)
sparkDF.columns

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


['id', 'url', 'title', 'text']

In [7]:
from pyspark.ml.feature import StopWordsRemover, RegexTokenizer, CountVectorizer
from pyspark.sql.functions import col,udf
from pyspark.sql.types import IntegerType

In [8]:
# UDF returning the length of its argument as an integer.
countTokens = udf(lambda tokens: len(tokens), IntegerType())

# Split 'text' on every character that is not an ASCII letter, writing the
# tokens to 'words'.
tokenizer = RegexTokenizer(
    pattern='[^a-zA-Z]',
    inputCol='text',
    outputCol='words',
)

# Tokenise, then drop the 'text' column.
tokenized_df = (
    tokenizer
    .transform(sparkDF)
    .drop('text')
)
tokenized_df.head()

Row(id='364479', url='https://simple.wikipedia.org/wiki/List%20of%20U.S.%20state%20ships', title='List of U.S. state ships', words=['this', 'is', 'a', 'list', 'of', 'official', 'u', 's', 'state', 'ships', 'references', 'ships', 'ships', 'transport', 'lists', 'transport', 'in', 'the', 'united', 'states'])

In [9]:
# Filter stop words out of 'words' into 'filtered', then drop the 'words'
# column.
remover = StopWordsRemover(
    outputCol="filtered",
    inputCol="words",
)
removed_df = (
    remover
    .transform(tokenized_df)
    .drop('words')
)
removed_df.head()

Row(id='364479', url='https://simple.wikipedia.org/wiki/List%20of%20U.S.%20state%20ships', title='List of U.S. state ships', filtered=['list', 'official', 'u', 'state', 'ships', 'references', 'ships', 'ships', 'transport', 'lists', 'transport', 'united', 'states'])

In [14]:
# Fit a CountVectorizer on the filtered tokens (minDF=2.0); `cv` holds the
# fitted model, not the unfitted estimator.
cv = CountVectorizer(
    minDF=2.0,
    inputCol="filtered",
    outputCol="features",
).fit(removed_df)

# Vectorise each document into 'features', then drop the 'filtered' column.
lda_count = (
    cv
    .transform(removed_df)
    .drop('filtered')
)
lda_count.head()

Row(id='364479', url='https://simple.wikipedia.org/wiki/List%20of%20U.S.%20state%20ships', title='List of U.S. state ships', features=SparseVector(32229, {0: 1.0, 8: 1.0, 11: 1.0, 35: 1.0, 72: 1.0, 103: 1.0, 163: 1.0, 1076: 2.0, 1084: 3.0, 1353: 1.0}))

In [11]:
from pyspark.ml.clustering import LDA

# Fit an LDA model with 100 topics (EM optimizer, fixed seed) on the count
# vectors and report how long the fit takes.
# NOTE(review): maxIter=1 runs a single optimizer iteration; presumably this
# is a timing run -- confirm, and raise maxIter if well-separated topics are
# needed.

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
lda = LDA(k=100, seed=1, optimizer="em", maxIter=1)
model = lda.fit(lda_count)
end = time.perf_counter()

print(f"Time elapsed: {end-start}")

Time elapsed: 29.626336574554443


In [41]:
# Top five terms per topic, as vocabulary indices plus weights.
topicIndices = model.describeTopics(maxTermsPerTopic = 5)
# Vocabulary of the fitted CountVectorizer model, used to map each term
# index to its word.
vocabList = cv.vocabulary

# Print each topic (numbered from 1) followed by its terms and weights.
for topic_row in topicIndices.collect():
    heading = f"Topic {topic_row.topic + 1}: "
    term_lines = [
        f"{vocabList[term_index]} {term_weight:.2E}"
        for term_index, term_weight in zip(topic_row.termIndices, topic_row.termWeights)
    ]
    print(heading)
    for line in term_lines:
        print(line)
    print()

Topic 1: 
references 6.60E-03
people 6.42E-03
american 5.86E-03
also 4.50E-03
one 3.87E-03

Topic 2: 
references 6.63E-03
people 6.46E-03
american 6.28E-03
also 4.60E-03
one 3.95E-03

Topic 3: 
references 6.61E-03
people 6.49E-03
american 5.93E-03
also 4.44E-03
one 3.81E-03

Topic 4: 
references 6.60E-03
people 6.54E-03
american 6.31E-03
also 4.67E-03
born 3.84E-03

Topic 5: 
references 6.61E-03
people 6.46E-03
american 6.03E-03
also 4.50E-03
one 3.84E-03

Topic 6: 
references 6.56E-03
people 6.47E-03
american 6.08E-03
also 4.72E-03
first 3.96E-03

Topic 7: 
references 6.64E-03
people 6.41E-03
american 6.01E-03
also 4.66E-03
first 4.02E-03

Topic 8: 
references 6.58E-03
people 6.53E-03
american 6.32E-03
also 4.52E-03
first 3.90E-03

Topic 9: 
references 6.67E-03
people 6.61E-03
american 6.11E-03
also 4.63E-03
one 3.92E-03

Topic 10: 
references 6.65E-03
people 6.53E-03
american 6.33E-03
also 4.68E-03
first 4.07E-03

Topic 11: 
references 6.63E-03
people 6.42E-03
american 6.36E-03
also 