Notebook written by [Zhedong Zheng](https://github.com/zhedongzheng)

<img src="img/lda.png" width="600">

In [1]:
"""
brew install apache-spark
pip3 install findspark
"""
from nltk.corpus import stopwords

import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StringType, ArrayType
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.clustering import LDA

In [2]:
# Number of topics the LDA model is fitted with (passed as `k` to LDA).
N_TOPICS = 20
# Number of terms requested per topic from `describeTopics` when printing.
MAX_TERMS = 5

In [3]:
# Topic-modelling pipeline: read book titles, tokenize them, build TF-IDF
# features, fit an LDA model and print the top terms of every topic.

# The stop-word collection gets its own name. Rebinding `stopwords` would
# replace the imported NLTK corpus reader with a set, so a second run of this
# cell would fail on `stopwords.words(...)` with AttributeError.
STOP_WORDS = frozenset(stopwords.words('english')).union({
    'introduction', 'edition', 'series', 'application',
    'approach', 'card', 'access', 'package', 'plus', 'etext',
    'brief', 'vol', 'fundamental', 'guide', 'essential', 'printed',
    'third', 'second', 'fourth'})


def _tokenize(line):
    """Return the content words of one raw title line.

    The line is stripped, lower-cased and split on whitespace. A word is kept
    only when it is purely alphabetic, longer than three characters and not
    in STOP_WORDS. The result may be an empty list.
    """
    return [w for w in line.strip().lower().split()
            if w.isalpha() and len(w) > 3 and w not in STOP_WORDS]


# getOrCreate() reuses a session/context that is already running, whereas
# constructing SparkContext directly raises if one exists (e.g. when this
# cell is executed again). Master and application name are unchanged.
sess = SparkSession.builder.master('local').appName('nlp').getOrCreate()
sc = sess.sparkContext

# One (words, idx) pair per title; idx is the position assigned by
# zipWithIndex.
lines = sc.textFile('data/all_book_titles.txt') \
    .map(_tokenize) \
    .zipWithIndex()

df = sess.createDataFrame(lines, ['words', 'idx'])

# `cv` is the fitted CountVectorizer model; its vocabulary is needed below to
# translate term indices back into words.
cv = CountVectorizer(inputCol='words', outputCol='tf').fit(df)
df = cv.transform(df)
df = IDF(inputCol='tf', outputCol='tfidf').fit(df).transform(df)

lda = LDA(k=N_TOPICS,
          featuresCol='tfidf',
          optimizer='em').fit(df)

# Look the vocabulary up once instead of on every printed term.
vocabulary = cv.vocabulary
for i, indices in enumerate(lda.describeTopics(MAX_TERMS).toPandas().termIndices):
    print('Topic %d:' % (i + 1), ' '.join(vocabulary[idx] for idx in indices))

Topic 1: bioinformatics astronomy science microbiology statistical
Topic 2: engineering statistics thermodynamics online student
Topic 3: evolution cultural actuarial norton systems
Topic 4: buddhism aerodynamics relativity database ethics
Topic 5: accounting sociology religions social modern
Topic 6: east feminism asian readings physical
Topic 7: anatomy perspective basic connect islam
Topic 8: anthology exploring earth insurance literature
Topic 9: design forensic nutrition mechanics microprocessor
Topic 10: general handbook philosophy medicine first
Topic 11: real analysis finance corporate pathophysiology
Topic 12: computer world organization greek geography
Topic 13: american history political latin science
Topic 14: biology machine organic research chemistry
Topic 15: language natural international processing systems
Topic 16: practice security theory sciences nursing
Topic 17: data integrated early design structures
Topic 18: quantum mechanics business solutions communication
To