In [1]:
from datasets import load_dataset
import pandas as pd
import time

# LDA parameters consumed by the LDA(...) call further down.
# The values here are deliberately small; raise them for a production run.
MAX_ITER = 1  # passed as LDA(maxIter=...)
CHECKPOINT = 5  # passed as LDA(checkpointInterval=...)
K = 100  # passed as LDA(k=...)

In [2]:
# Corpus: the "20220301.simple" configuration of the "wikipedia" dataset,
# requested with streaming=True.
# To run on the full English dump instead, load this configuration:
#   load_dataset("wikipedia", "20220301.en")
dataset = load_dataset("wikipedia", "20220301.simple", streaming=True)

# Collect the train split into a pandas DataFrame.
train_split = dataset['train']
df = pd.DataFrame(train_split)

Found cached dataset wikipedia (C:/Users/vomin/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
# Work on a random subset of at most 10000 documents to keep test runs fast.
# random_state pins the draw so that Restart & Run All picks the same documents
# every time; min() keeps the cell from raising when the corpus holds fewer
# rows than requested.
df = df.sample(n=min(10000, len(df)), random_state=1)
len(df)

10000

In [4]:
import pyspark
from pyspark import SparkContext, SparkConf

# Spark configuration: expose the Spark UI on port 4050.
conf = SparkConf()
conf.set("spark.ui.port", "4050")

# Obtain the SparkContext (an existing one is reused, otherwise one is created with `conf`).
sc = SparkContext.getOrCreate(conf=conf)

# Session handle used for the DataFrame work in the following cells.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Last expression of the cell: display the session.
spark

In [5]:
# Convert the sampled pandas DataFrame into a Spark DataFrame.
sparkDF = spark.createDataFrame(df)
# Last expression of the cell: show the resulting column names.
sparkDF.columns

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


['id', 'url', 'title', 'text']

In [6]:
from pyspark.ml.feature import StopWordsRemover, RegexTokenizer, CountVectorizer
from pyspark.sql.functions import col,udf
from pyspark.sql.types import IntegerType

In [7]:
# Tokenise the 'text' column into 'words' using the pattern '[^a-zA-Z]'
# (with the tokenizer's defaults, runs of non-letters act as separators).
tokenizer = RegexTokenizer(
    inputCol='text',
    outputCol='words',
    pattern='[^a-zA-Z]',
)

# The raw text is no longer needed once the tokens exist, so drop it.
with_words = tokenizer.transform(sparkDF)
tokenized_df = with_words.drop('text')
tokenized_df.head()

Row(id='719307', url='https://simple.wikipedia.org/wiki/John%20W.%20Nicholson', title='John W. Nicholson', words=['john', 'william', 'nicholson', 'born', 'c', 'is', 'an', 'american', 'retired', 'brigadier', 'general', 'of', 'the', 'united', 'states', 'army', 'he', 'was', 'appointed', 'secretary', 'of', 'the', 'american', 'battle', 'monuments', 'commission', 'abmc', 'by', 'president', 'george', 'w', 'bush', 'in', 'january', 'he', 'is', 'the', 'brother', 'of', 'jim', 'nicholson', 'a', 'former', 'secretary', 'of', 'veterans', 'affairs', 'he', 'was', 'born', 'in', 'struble', 'iowa', 'references', 'births', 'living', 'people', 'american', 'generals', 'american', 'military', 'personnel', 'of', 'the', 'vietnam', 'war', 'politicians', 'from', 'iowa'])

In [8]:
# Strip stop words from 'words', writing the surviving tokens to 'filtered'.
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)

# Keep only the filtered tokens; the unfiltered column is dropped.
with_filtered = remover.transform(tokenized_df)
removed_df = with_filtered.drop('words')
removed_df.head()

Row(id='719307', url='https://simple.wikipedia.org/wiki/John%20W.%20Nicholson', title='John W. Nicholson', filtered=['john', 'william', 'nicholson', 'born', 'c', 'american', 'retired', 'brigadier', 'general', 'united', 'states', 'army', 'appointed', 'secretary', 'american', 'battle', 'monuments', 'commission', 'abmc', 'president', 'george', 'w', 'bush', 'january', 'brother', 'jim', 'nicholson', 'former', 'secretary', 'veterans', 'affairs', 'born', 'struble', 'iowa', 'references', 'births', 'living', 'people', 'american', 'generals', 'american', 'military', 'personnel', 'vietnam', 'war', 'politicians', 'iowa'])

In [9]:
# Fit the vocabulary on the filtered tokens. minDF=2.0 asks the vectorizer to
# ignore terms that occur in fewer than two documents.
vectorizer = CountVectorizer(inputCol="filtered", outputCol="features", minDF=2.0)
cv = vectorizer.fit(removed_df)

# Term-count vectors go to 'features'; the token lists are dropped afterwards.
with_features = cv.transform(removed_df)
lda_count = with_features.drop('filtered')
lda_count.head()

Row(id='719307', url='https://simple.wikipedia.org/wiki/John%20W.%20Nicholson', title='John W. Nicholson', features=SparseVector(32348, {0: 1.0, 1: 4.0, 2: 1.0, 7: 1.0, 8: 2.0, 9: 1.0, 11: 1.0, 31: 1.0, 46: 1.0, 57: 1.0, 73: 1.0, 76: 1.0, 93: 1.0, 100: 1.0, 165: 1.0, 169: 1.0, 233: 1.0, 292: 1.0, 305: 1.0, 311: 1.0, 410: 1.0, 465: 1.0, 576: 1.0, 633: 2.0, 730: 2.0, 788: 1.0, 1263: 1.0, 1459: 1.0, 1490: 1.0, 1704: 1.0, 1726: 1.0, 1823: 1.0, 2238: 1.0, 3297: 1.0, 4106: 1.0, 8579: 1.0, 11829: 2.0, 14168: 1.0}))

In [10]:
from pyspark.ml.clustering import LDA

# Configure LDA with K topics and the EM optimizer. seed=1 fixes the random
# initialisation; maxIter and checkpointInterval come from the constants
# MAX_ITER and CHECKPOINT.
lda = LDA(k=K, seed=1, optimizer="em", maxIter=MAX_ITER, checkpointInterval=CHECKPOINT)

# Time only the fit. perf_counter() is a monotonic, high-resolution clock, so
# the measurement cannot be skewed by wall-clock adjustments as time.time() can.
start = time.perf_counter()
model = lda.fit(lda_count)
end = time.perf_counter()

print(f"Time elapsed: {end-start:.2f} seconds")

Time elapsed: 32.88 seconds


In [11]:
# For every topic, fetch its 5 highest-weighted terms (as vocabulary indices plus weights).
topicIndices = model.describeTopics(maxTermsPerTopic=5)
# Index -> term lookup taken from the fitted CountVectorizer model.
vocabList = cv.vocabulary

for topic_row in topicIndices.collect():
    # The topic id is shifted by one for display.
    print(f"Topic {topic_row.topic + 1}: ")
    term_pairs = zip(topic_row.termIndices, topic_row.termWeights)
    for term_index, term_weight in term_pairs:
        print(f"{vocabList[term_index]} {term_weight:.2E}")
    print()

Topic 1: 
references 6.73E-03
people 6.64E-03
american 6.51E-03
also 4.55E-03
first 3.71E-03

Topic 2: 
american 7.02E-03
references 6.72E-03
people 6.49E-03
also 4.75E-03
first 3.64E-03

Topic 3: 
american 6.88E-03
references 6.86E-03
people 6.42E-03
also 4.78E-03
new 3.88E-03

Topic 4: 
american 6.90E-03
people 6.83E-03
references 6.82E-03
also 4.72E-03
new 3.67E-03

Topic 5: 
references 6.67E-03
people 6.60E-03
american 6.55E-03
also 4.71E-03
new 3.79E-03

Topic 6: 
references 6.78E-03
people 6.71E-03
american 6.65E-03
also 4.84E-03
new 3.79E-03

Topic 7: 
references 6.90E-03
american 6.69E-03
people 6.59E-03
also 4.73E-03
new 3.71E-03

Topic 8: 
american 7.12E-03
references 6.72E-03
people 6.62E-03
also 4.81E-03
united 3.65E-03

Topic 9: 
references 6.73E-03
people 6.45E-03
american 6.45E-03
also 4.86E-03
new 3.73E-03

Topic 10: 
american 6.88E-03
references 6.75E-03
people 6.68E-03
also 4.82E-03
first 3.71E-03

Topic 11: 
american 6.86E-03
references 6.71E-03
people 6.67E-03
also 

In [2]:
from IPython.display import display, Javascript, HTML
import json

In [3]:
# Sample values; serialised with json.dumps and handed to the `circles` JavaScript module below.
data = [5, 10, 20, 40, 50, 30, 10, 20, 40, 10, 5]

In [4]:
# JavaScript snippet: loads the 'circles' module through require() and calls it
# with the cell's output element and the data. The %s slot receives the JSON
# encoding of `data`.
circles_js = """
(function(element){
    require(['circles'], function(circles) {
        circles(element.get(0), %s)
    });
})(element);
"""

# Last expression of the cell: the Javascript object is what the notebook displays.
Javascript(circles_js % json.dumps(data))

<IPython.core.display.Javascript object>

In [7]:
from IPython.display import display, HTML
from string import Template
import json

# NOTE(review): this HTML('') is empty and is not the last expression of the
# cell, so it renders nothing. The JavaScript below calls d3.select(...), yet
# nothing visible here loads d3 - presumably this line originally injected a
# <script> tag for the d3 library; confirm and restore it.
HTML('')

# CSS for the animation; currently only a newline. It is passed as `css_text`
# to html_template.substitute() at the end of this cell.
css_text = '''
'''

# JavaScript for the animation, kept as string.Template source.
# Placeholders filled in later by substitute():
#   $bogoanimation - id of the element that d3.select() targets
#   $python_data   - JSON literal assigned to the JS variable `data`
# The script appends a 300x300 <svg>, draws one circle from data[0] and then
# transitions it (after a 100 ms delay, over 10000 ms) to radius 10 and a blue fill.
animation_js_source = '''
       var bogoSVG = d3.select("#$bogoanimation") 
          .append("svg")
          .attr("width", 300)
          .attr("height", 300);    

      var data = $python_data ;
       bogoSVG.append("circle")
          .style("stroke", "gray")
          .style("fill", "cyan")
          .attr("r", data[0]['r'])
          .attr("cx", data[0]['cx'])
          .attr("cy", data[0]['cy'])
          .transition()
             .delay(100)
             .duration(10000)  
             .attr("r", 10)
             .attr("cx", data[0]['cx'])
             .style("fill", "blue"); 
'''
js_text_template = Template(animation_js_source)

# HTML scaffold for the animation. The template previously contained only
# blank lines, so the css_text / js_text values given to substitute() were
# silently discarded and no element with id "animation" existed for the
# script to select.
# Placeholders:
#   $css_text - stylesheet rules placed inside <style>
#   $js_text  - script source placed inside <script>
# The <div> comes before the <script> so the element already exists when the
# script runs d3.select("#animation").
html_template = Template('''
<style> $css_text </style>
<div id="animation"></div>
<script> $js_text </script>
''')

# Circle parameters handed to the script as JSON (read there as data[0]).
circle_spec = [{'r': 130, 'cx': 150, 'cy': 150}]

# Fill the JavaScript template: the JSON payload plus the id of the target element.
js_text = js_text_template.substitute(
    python_data=json.dumps(circle_spec),
    bogoanimation='animation',
)

# Last expression of the cell: render the HTML template with the CSS and the script.
HTML(html_template.substitute(css_text=css_text, js_text=js_text))