In [None]:
import pyspark

from pyspark.sql import SparkSession

In [None]:
# Create (or reuse) the Spark session for this chapter.
# The driver gets 8g of memory, maxResultSize "0" lifts the cap on collected
# results (per the Spark configuration docs), and the Spark NLP package is
# pulled in through spark.jars.packages.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "8g")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.0.2")
    .appName('chapter_6')
    .getOrCreate()
)

### Getting the Data

In [None]:
# Data-preparation commands, left commented out.
# NOTE(review): presumably these were run once to produce the `wikidump`
# directory from `wikidump.xml` — confirm before uncommenting.
# ! wikiextractor wikidump.xml

# ! mv text wikidump
# List the contents of `wikidump` as a tree.
! tree wikidump

In [None]:
# Peek at the first five lines of one file in the dump.
! head -n 5 wikidump/AA/wiki_00

### Spark NLP

If you intend to use the PySpark shell, start the shell with the following command:  
`pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.0.2`

In [None]:
import sparknlp

# Rebind `spark` to whatever Spark NLP's start helper returns.
# NOTE(review): a session was already created above with getOrCreate();
# presumably this call hands back that same session — confirm.
spark = sparknlp.start()

In [None]:
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import (Lemmatizer, Stemmer,
                                Tokenizer, Normalizer,
                                StopWordsCleaner)
from sparknlp.pretrained import PretrainedPipeline

### Parsing the Data

In [None]:
# Glob matching every entry one level below each sub-directory of `wikidump`.
data_source = "wikidump/*/*"

In [None]:
# Read every matching file whole and turn the resulting pairs into a DataFrame.
raw_data = (
    spark.sparkContext
    .wholeTextFiles(data_source)
    .toDF()
)
# Preview a single record, one field per line.
raw_data.show(n=1, vertical=True)

In [None]:
from pyspark.sql import functions as fun
# Break each file's text (column `_2`) at every closing "</doc>" tag and give
# each resulting fragment its own row in a new `content` column; the two
# original columns are then discarded.
# NOTE(review): whatever follows the last "</doc>" in a file also becomes a
# row — check whether such fragments need to be filtered out.
df = (
    raw_data
    .withColumn('content', fun.explode(fun.split(fun.col("_2"), "</doc>")))
    .drop(fun.col('_2'))
    .drop(fun.col('_1'))
)

df.show(n=4, vertical=True)

In [None]:
# Show one fragment in full (no truncation) to inspect its line layout.
df.show(n=1, truncate=False, vertical=True)

In [None]:
# Split each fragment into lines once and reuse that expression for both columns.
# NOTE(review): assumes the title is on the third line (index 2) and the
# article text on the fifth (index 4) of every fragment — confirm against the dump.
# NOTE(review): `content` is overwritten here, so re-running this cell on its
# own output gives a different result; re-run the split cell first.
content_lines = fun.split(fun.col('content'), '\n')
df = (
    df
    .withColumn('title', content_lines.getItem(2))
    .withColumn('content', content_lines.getItem(4))
)
df.show(n=4, vertical=True)

### Preparing the Data Using Spark NLP

In [None]:
# Turn the `content` text column into a `document` column for the downstream
# annotators, using the "shrink" cleanup mode.
document_assembler = (
    DocumentAssembler()
    .setInputCol("content")
    .setOutputCol("document")
    .setCleanupMode("shrink")
)

# Sanity check: materialise a single assembled row.
(
    document_assembler.transform(df)
    .select('document')
    .limit(1)
    .collect()
)

In [None]:
# Break each document into an array of tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

In [None]:
# Strip unwanted characters from the tokens and lowercase them.
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
    .setLowercase(True)
)

In [None]:
# Drop stop words from the normalized tokens, matching case-insensitively.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols("normalized")
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

In [None]:
# Reduce each remaining token to its stem (root form).
stemmer = (
    Stemmer()
    .setInputCols(["cleanTokens"])
    .setOutputCol("stem")
)

In [None]:
# Convert the `stem` annotations into a plain `tokens` array column.
# Clean-up of the annotation columns is switched off.
finisher = (
    Finisher()
    .setInputCols(["stem"])
    .setOutputCols(["tokens"])
    .setOutputAsArray(True)
    .setCleanAnnotations(False)
)

In [None]:
from pyspark.ml import Pipeline
# Stages run in order; each one reads the column written by the one before it.
pipeline_stages = [
    document_assembler,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    stemmer,
    finisher,
]
nlp_pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Fit the pipeline on the parsed articles, then apply the fitted model to the
# same DataFrame.
nlp_model = nlp_pipeline.fit(df)
processed_df = nlp_model.transform(df)

# Inspect the columns produced by the pipeline.
processed_df.printSchema()

In [None]:
# Keep only each article's title and its processed tokens.
tokens_df = processed_df.select("title", "tokens")
tokens_df.show(n=2, vertical=True)

### Computing the TF-IDF

In [None]:
from pyspark.ml.feature import CountVectorizer
# Term counts: learn the vocabulary from `tokens`, then write each article's
# count vector to `raw_features`.
cv = CountVectorizer(
    inputCol="tokens",
    outputCol="raw_features",
)
cv_model = cv.fit(tokens_df)
vectorized_tokens = cv_model.transform(tokens_df)

In [None]:
from pyspark.ml.feature import IDF
# Re-weight the raw counts with IDF; the weighted vectors land in `features`.
idf = IDF(
    inputCol="raw_features",
    outputCol="features",
)
idf_model = idf.fit(vectorized_tokens)
vectorized_df = idf_model.transform(vectorized_tokens)

In [None]:
# The raw counts are no longer needed now that `features` exists.
vectorized_df = vectorized_df.drop('raw_features')

vectorized_df.show(n=6)

### Creating Our LDA Model

In [None]:
from pyspark.ml.clustering import LDA

# Topic-model settings.
num_topics = 5
max_iter = 50
# LDA is randomly initialised; pinning the seed lets repeated runs of the
# notebook start from the same random state instead of drifting between runs.
lda_seed = 42

lda = LDA(k=num_topics, maxIter=max_iter, seed=lda_seed)
model = lda.fit(vectorized_df)

# Evaluate the fitted model on the same data it was trained on.
lp = model.logPerplexity(vectorized_df)

print("The upper bound on perplexity: " + str(lp))

In [None]:
# Vocabulary learned by the count vectorizer; term indices point into it.
vocab = cv_model.vocabulary

# One collected row per topic, each carrying the term indices that describe it.
raw_topics = model.describeTopics().collect()
topic_inds = [row.termIndices for row in raw_topics]

# Translate every term index into its vocabulary word, topic by topic.
topics = [[vocab[term_id] for term_id in indices] for indices in topic_inds]

In [None]:
# Print the word list of every topic, numbering topics from 1.
for number, words in enumerate(topics, 1):
    print(f"topic {number}: {words}")

In [None]:
# Apply the fitted LDA model to every article and preview two of the
# resulting topic distributions.
lda_df = model.transform(vectorized_df)
(
    lda_df
    .select(fun.col('title'), fun.col('topicDistribution'))
    .show(n=2, vertical=True, truncate=False)
)

In [None]:
from pyspark.sql.types import IntegerType
def _dominant_topic(distribution):
    """Return the 1-based position of the largest entry in ``distribution``.

    When several entries share the maximum, the first one wins.
    """
    return distribution.tolist().index(max(distribution)) + 1


# Wrap the helper as a UDF and label every article with its strongest topic.
max_index = fun.udf(_dominant_topic, IntegerType())
lda_df = lda_df.withColumn(
    'topic_index',
    max_index(fun.col('topicDistribution')),
)

In [None]:
# Show the assigned topic for the first ten articles, titles untruncated.
lda_df.select("title", "topic_index").show(n=10, truncate=False)