In [16]:
# Cluster and filesystem configuration for this notebook.

# URL of the Spark master that the session connects to.
SPARK_MASTER = 'spark://cs303:53725'
# Directory for the fitted LDA model. Written as an absolute path (leading
# '/') like the other scratch paths in this notebook; without the slash the
# value would be a relative path.
SAVE_DIRECTORY = '/scratch/tmv7269/lda'
# Root directory under which the Hugging Face download and cache folders live.
TARGET_PATH = '/scratch/tmv7269/datasets/wikipedia.en'

In [17]:
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pyspark

In [18]:
# setting up spark
from pyspark.sql import SparkSession
import pyspark

# Describe the session first, then create it (or reuse an existing one).
session_builder = (
    SparkSession.builder
    .master(SPARK_MASTER)
    .appName('LDA')
    .config('spark.executor.memory', '4G')
    .config('spark.driver.memory', '60G')
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size", "16g")
)
spark = session_builder.getOrCreate()

# Turn Spark logging off so log lines do not flood the cell outputs.
spark.sparkContext.setLogLevel("Off")

# Last expression of the cell: display the session.
spark

In [19]:
from datasets import load_dataset
import datasets
import pandas as pd
import time
from pathlib import Path

# Keep both the downloaded files and the `datasets` cache under TARGET_PATH.
_hf_root = Path(TARGET_PATH)
datasets.config.DOWNLOADED_DATASETS_PATH = _hf_root / 'data'
datasets.config.HF_DATASETS_CACHE = _hf_root / 'cache'

# LDA parameters (raise them to their maximum for production runs):
#   K          - number of topics
#   MAX_ITER   - maximum number of fitting iterations
#   CHECKPOINT - checkpoint interval passed to the LDA estimator
K = 100
MAX_ITER = 100
CHECKPOINT = 5

In [None]:
# Which Wikipedia dump configuration to load.
# "20220301.simple" is the Simple English edition used for this run; swap in
# the commented line below to load the full English dump instead (far larger,
# so expect every downstream cell to take much longer).
WIKIPEDIA_CONFIG = "20220301.simple"
# WIKIPEDIA_CONFIG = "20220301.en"

dataset = load_dataset("wikipedia", WIKIPEDIA_CONFIG)
# Last expression of the cell: display the loaded dataset.
dataset

In [None]:
# Turn the 'train' split of the loaded dataset into a Spark DataFrame.
train_split = dataset['train']
sparkDF = spark.createDataFrame(train_split)

# Last expression of the cell: the column names of the new DataFrame.
sparkDF.columns

In [None]:
# Number of rows in the DataFrame built from the 'train' split (sanity check).
sparkDF.count()

In [None]:
from pyspark.ml.feature import StopWordsRemover, RegexTokenizer, CountVectorizer
from pyspark.sql.functions import col,udf
from pyspark.sql.types import IntegerType

In [None]:
# Tokenize the 'text' column into 'words' using the regex '[^a-zA-Z]'
# (with RegexTokenizer's default settings the pattern marks the gaps between
# tokens, so text is split on every non-letter character).
tokenizer = RegexTokenizer(
    inputCol='text',
    outputCol='words',
    pattern='[^a-zA-Z]',
)

# The raw text is not used after tokenization, so drop it.
tokenized_with_text = tokenizer.transform(sparkDF)
tokenized_df = tokenized_with_text.drop('text')

# Peek at the first tokenized row.
tokenized_df.head()

In [None]:
# Filter stop words out of 'words' into a new 'filtered' column
# (StopWordsRemover is used with its default stop-word list).
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)

# Only the filtered tokens are used from here on, so drop the unfiltered ones.
filtered_with_words = remover.transform(tokenized_df)
removed_df = filtered_with_words.drop('words')

# Peek at the first filtered row.
removed_df.head()

In [11]:
# Learn the vocabulary from the filtered tokens. minDF=2.0 drops terms that
# appear in fewer than two documents. `cv` is the *fitted* model; later cells
# read its vocabulary.
count_vectorizer = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
    minDF=2.0,
)
cv = count_vectorizer.fit(removed_df)

# Replace each token list with its term-count vector; the tokens themselves
# are not used afterwards.
vectorized_df = cv.transform(removed_df)
lda_count = vectorized_df.drop('filtered')

# Peek at the first vectorized row.
lda_count.head()

                                                                                

Row(id='1', title='April', url='https://simple.wikipedia.org/wiki/April', features=SparseVector(221737, {0: 5.0, 1: 1.0, 2: 7.0, 3: 3.0, 4: 17.0, 5: 1.0, 6: 10.0, 7: 3.0, 8: 13.0, 9: 8.0, 12: 3.0, 13: 1.0, 14: 1.0, 17: 13.0, 18: 2.0, 19: 1.0, 22: 1.0, 25: 10.0, 26: 6.0, 27: 1.0, 32: 7.0, 34: 1.0, 35: 4.0, 37: 4.0, 38: 5.0, 40: 1.0, 41: 1.0, 42: 20.0, 44: 7.0, 46: 3.0, 49: 4.0, 50: 5.0, 55: 25.0, 56: 1.0, 59: 1.0, 60: 6.0, 61: 13.0, 62: 1.0, 65: 1.0, 66: 1.0, 67: 4.0, 71: 3.0, 72: 3.0, 76: 3.0, 77: 5.0, 78: 208.0, 81: 1.0, 82: 3.0, 85: 3.0, 86: 1.0, 90: 1.0, 93: 2.0, 94: 2.0, 96: 1.0, 97: 1.0, 103: 2.0, 106: 1.0, 109: 96.0, 111: 10.0, 117: 5.0, 120: 1.0, 123: 1.0, 126: 2.0, 129: 2.0, 140: 1.0, 142: 5.0, 147: 4.0, 152: 3.0, 153: 5.0, 154: 1.0, 155: 1.0, 156: 1.0, 158: 3.0, 159: 2.0, 160: 1.0, 161: 2.0, 162: 1.0, 170: 1.0, 174: 4.0, 176: 2.0, 181: 5.0, 183: 2.0, 184: 4.0, 186: 2.0, 187: 2.0, 188: 1.0, 192: 1.0, 194: 1.0, 198: 1.0, 199: 1.0, 200: 4.0, 203: 1.0, 204: 1.0, 205: 2.0, 206: 3.0

In [12]:
from pyspark.ml.clustering import LDA

# Fit an EM-optimized LDA model with K topics and report how long it took.

# Spark ignores `checkpointInterval` unless a checkpoint directory has been
# set on the SparkContext, so set one before fitting; otherwise CHECKPOINT has
# no effect.
# NOTE(review): the directory must be reachable by every executor, the same
# assumption the model save path under /scratch already makes - confirm.
CHECKPOINT_DIR = '/scratch/tmv7269/lda_checkpoints'
spark.sparkContext.setCheckpointDir(CHECKPOINT_DIR)

# perf_counter() is a monotonic clock, so the elapsed time cannot be skewed by
# wall-clock adjustments during a long fit.
start = time.perf_counter()
lda = LDA(k=K, seed=1, optimizer="em", maxIter=MAX_ITER, checkpointInterval=CHECKPOINT)
model = lda.fit(lda_count)
end = time.perf_counter()

print(f"Time elapsed: {end-start:.2f} seconds")

                                                                                

Time elapsed: 678.89 seconds


In [13]:
# For every topic, fetch its five highest-weighted terms (as vocabulary
# indices plus weights).
topicIndices = model.describeTopics(maxTermsPerTopic = 5)
# Vocabulary of the fitted CountVectorizer; used to turn term indices into words.
vocabList = cv.vocabulary

for topic_row in topicIndices.collect():
    # Print a 1-based topic number (the row's `topic` value plus one).
    print(f"Topic {topic_row.topic + 1}: ")
    term_lines = [
        f"{vocabList[term_index]} {term_weight:.2E}"
        for term_index, term_weight in zip(topic_row.termIndices, topic_row.termWeights)
    ]
    for term_line in term_lines:
        print(term_line)
    # Blank line between topics.
    print()

Topic 1: 
south 1.38E-01
australia 7.48E-02
australian 5.47E-02
africa 4.27E-02
african 4.11E-02

Topic 2: 
january 2.01E-01
president 1.43E-01
former 1.12E-01
office 4.85E-02
head 3.85E-02

Topic 3: 
california 8.23E-02
park 6.91E-02
o 6.77E-02
san 5.62E-02
brazilian 2.92E-02

Topic 4: 
may 3.38E-01
radio 6.07E-02
news 4.32E-02
bbc 2.57E-02
host 2.46E-02

Topic 5: 
people 8.27E-01
lived 7.02E-02
live 4.12E-02
million 2.44E-02
oldest 6.86E-03

Topic 6: 
series 1.22E-01
television 9.81E-02
show 6.04E-02
tv 3.19E-02
season 2.54E-02

Topic 7: 
church 8.75E-02
switzerland 6.50E-02
municipality 6.47E-02
municipalities 6.36E-02
canton 4.78E-02

Topic 8: 
north 1.34E-01
local 3.31E-02
asia 2.91E-02
areas 2.86E-02
government 2.69E-02

Topic 9: 
called 8.39E-02
made 8.02E-02
like 6.09E-02
many 5.92E-02
used 4.27E-02

Topic 10: 
world 2.79E-01
championship 4.59E-02
times 3.86E-02
time 3.79E-02
one 3.44E-02

Topic 11: 
animated 1.78E-02
disney 1.75E-02
additional 9.99E-03
cat 9.98E-03
voice 9.65E

In [14]:
# Persist the fitted LDA model, overwriting anything already saved at this path.
# NOTE(review): the path is hardcoded here rather than taken from the
# SAVE_DIRECTORY constant defined at the top of the notebook - consider unifying.
model.write().overwrite().save('/scratch/tmv7269/lda')

                                                                                

In [21]:
# Vocabulary size of the fitted LDA model (number of distinct terms).
model.vocabSize()

221737

In [None]:
# Log perplexity of the model on `lda_count`, the same data it was fitted on,
# so this is a training-set figure rather than a held-out evaluation.
model.logPerplexity(lda_count)

In [None]:
# Log likelihood of the model on `lda_count`, the same data it was fitted on,
# so this is a training-set figure rather than a held-out evaluation.
model.logLikelihood(lda_count)