In [5]:
import sys

# bandaid fix for running this notebook in its folder on ROAR
external_folder_path = '../'
sys.path.insert(0, external_folder_path)

from util.sparkhandler import SparkHandler
from util.dataproctools import get_extracted_wet, save_rdd, load_rdd
from util.dataproctools import extracted_wet_to_df
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover, CountVectorizer
from pyspark.ml.clustering import LDA
from pyspark.ml import Pipeline

In [6]:
handler = SparkHandler(available_cores=4) # look at the parameters for SparkHandler to increase max memory
ss = handler.get_spark_session()
sc = handler.get_spark_context()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/10 18:49:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/10 18:49:29 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/12/10 18:49:29 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [7]:
loaded_str_data = load_rdd(spark_context=sc, path_to_load="../saved_intermediates/rawStrRDD")
df = extracted_wet_to_df(spark_session=ss, extracted_wet_rdd=loaded_str_data)
filtered = df.filter(((df.tld == '.gov') | (df.tld == '.edu')) & (df.languages == 'eng'))

                                                                                

                                                                                

In [15]:
text_df = filtered.select('raw_content')
train, val, test = text_df.randomSplit([0.7, 0.2, 0.1], seed=1237)

In [13]:
# 1) Tokenize text into tokens
tokenizer = RegexTokenizer(minTokenLength=2, gaps=False, pattern=r"\b[a-zA-Z]+[\d]*(?:[-'][a-zA-Z]+[\d]*)*\b", inputCol="raw_content", outputCol="tokenized")

# 2) Remove stopwords
remover = StopWordsRemover(inputCol="tokenized", outputCol="filtered")

# 3) Convert tokens to term-frequency vectors
cv = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
    vocabSize=5000,
    minDF=1    # keep terms that appear in at least 1 document
)

# 4) LDA model (k = number of topics)
lda = LDA(
    k=12,
    maxIter=10,
    featuresCol="features"
)

# 5) Build pipeline
pipeline = Pipeline(stages=[tokenizer, remover, cv, lda])

pipeline

Pipeline_0a4d4ab2431f

In [26]:
model = pipeline.fit(train)
valed = model.transform(val)

                                                                                

In [28]:
perplexity = model.stages[-1].logPerplexity(valed)
log_likelyhood = model.stages[-1].logLikelihood(valed)

                                                                                

In [29]:
perplexity, log_likelyhood

(11.83888759514212, -208648.55497678474)

In [30]:
import optuna

In [38]:
def objective(trial: optuna.trial.Trial):

    minTokenLength = trial.suggest_int("min word len", 1, 3)
    k = trial.suggest_int("clusters", 1, 20)
    learningDecay = trial.suggest_float("lr decay", 0.5, 1.0)
    learningOffset = trial.suggest_float("offset", 0, 10)
    maxItr = trial.suggest_int("intrs", 10, 100)
    subsamplingRate = trial.suggest_float("subsampling rate", 0, 1)

    # 1) Tokenize text into tokens
    tokenizer = RegexTokenizer(minTokenLength=minTokenLength, gaps=False, pattern=r"\b[a-zA-Z]+[\d]*(?:[-'][a-zA-Z]+[\d]*)*\b", inputCol="raw_content", outputCol="tokenized")

    # 2) Remove stopwords
    remover = StopWordsRemover(inputCol="tokenized", outputCol="filtered")

    # 3) Convert tokens to term-frequency vectors
    cv = CountVectorizer(
        inputCol="filtered",
        outputCol="features",
        vocabSize=5000,
        minDF=1    # keep terms that appear in at least 1 document
    )

    # 4) LDA model (k = number of topics)
    lda = LDA(
        k=k,
        learningDecay=learningDecay,
        learningOffset=learningOffset,
        maxIter=maxItr,
        subsamplingRate=subsamplingRate,
        featuresCol="features"
    )

    # 5) Build pipeline
    pipeline = Pipeline(stages=[tokenizer, remover, cv, lda])
    model = pipeline.fit(train)
    valed = model.transform(val)
    perplexity = model.stages[-1].logPerplexity(valed)
    log_likelyhood = model.stages[-1].logLikelihood(valed)
    return perplexity, log_likelyhood

In [39]:
study = optuna.create_study(directions=["minimize", "maximize"])

[I 2025-12-10 17:17:12,324] A new study created in memory with name: no-name-ef03757c-472a-4e07-b6da-668199426365


In [40]:
study.optimize(objective, n_trials=10, timeout=300, n_jobs=8)

[I 2025-12-10 17:19:03,777] Trial 0 finished with values: [9.275974872763742, -163424.1253083516] and parameters: {'min word len': 2, 'clusters': 18, 'lr decay': 0.8014164819018568, 'offset': 2.744765026723408, 'intrs': 430, 'subsampling rate': 0.16450075545546827}.
[I 2025-12-10 17:19:07,152] Trial 1 finished with values: [12.304311381681629, -218376.91840208555] and parameters: {'min word len': 1, 'clusters': 12, 'lr decay': 0.5721986112891881, 'offset': 5.535232146211829, 'intrs': 15, 'subsampling rate': 0.003297773681030458}.
[I 2025-12-10 17:19:19,681] Trial 1 finished with values: [8.957066041741726, -152449.26403044417] and parameters: {'min word len': 3, 'clusters': 6, 'lr decay': 0.7154492260592116, 'offset': 9.224750438192697, 'intrs': 15, 'subsampling rate': 0.8770127231377713}.
[I 2025-12-10 17:19:26,161] Trial 7 finished with values: [8.945142743592205, -158651.05170035135] and parameters: {'min word len': 1, 'clusters': 9, 'lr decay': 0.5835880704022454, 'offset': 0.57466

In [42]:
import pickle
with open("study.pkl", "wb") as f:
    pickle.dump(study, f)

In [3]:
import optuna
import pickle
with open("study.pkl", "rb") as f:
    study = pickle.load(f)

In [4]:
optuna.visualization.plot_pareto_front(study, target_names=["perplexity", "log likelyhood"])