# Topic Modeling with Gensim

*Notebook version: 1.2402.0701*

We use the gensim implementations of LDA and LSI because they offer more functionality out of the box.

## Library

In [None]:
!pip install sastrawi

In [None]:
import nltk
from bs4 import BeautifulSoup
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import re

from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string

from nltk import word_tokenize
from nltk.corpus import stopwords

# Download the NLTK resources: the stopword lists (read later through
# nltk.corpus.stopwords) and the 'punkt' tokenizer data. word_tokenize is
# imported above but is not called in the visible cells.
nltk.download('stopwords')
nltk.download('punkt')

<br>
<br>

## Helper Functions

In [None]:
def tokenize_clean(text):
    """Lower-case, clean and tokenize raw text without language-specific steps.

    gensim's default ``preprocess_string`` filter list also removes English
    stopwords and applies the English Porter stemmer. On Indonesian text that
    corrupts tokens (for example a trailing "s" is stripped) before the
    Indonesian stopword filter and the Sastrawi stemmer ever see them, so an
    explicit, language-neutral filter list is used here instead.

    Args:
        text: Raw document as a string.

    Returns:
        list of str: Lower-cased tokens with markup tags, punctuation, digits
        and very short tokens removed.
    """
    # Local import keeps this cell self-contained; only preprocess_string is
    # imported by the notebook's import cell.
    from gensim.parsing.preprocessing import (
        strip_multiple_whitespaces,
        strip_numeric,
        strip_punctuation,
        strip_short,
        strip_tags,
    )

    # Same order as gensim's defaults, minus remove_stopwords and stem_text
    # (both are English-only).
    filters = [
        lambda s: s.lower(),
        strip_tags,
        strip_punctuation,
        strip_multiple_whitespaces,
        strip_numeric,
        strip_short,
    ]
    return preprocess_string(text, filters)

In [None]:
# Indonesian stopword list from NLTK, read by remove_stopwords() below.
# NOTE: this rebinds the name `stopwords`, which the import cell bound to the
# nltk.corpus.stopwords reader; the reader stays reachable through the full
# nltk.corpus.stopwords path used on this line.
stopwords = nltk.corpus.stopwords.words('indonesian')
def remove_stopwords(tokenized_text, stop_words=None):
    """Drop stopwords from a list of tokens, keeping order and duplicates.

    Args:
        tokenized_text: Iterable of string tokens.
        stop_words: Optional iterable of words to remove. When omitted, the
            notebook-level ``stopwords`` list is used.

    Returns:
        list: The tokens that are not stopwords, in their original order.
    """
    if stop_words is None:
        stop_words = stopwords

    # A set gives constant-time membership tests; scanning the list for every
    # token is linear in the size of the stopword list.
    stop_set = frozenset(stop_words)

    return [token for token in tokenized_text if token not in stop_set]

In [None]:
# Shared Sastrawi stemmer, created lazily on first use.
_SASTRAWI_STEMMER = None


def _get_stemmer():
    """Return the shared Sastrawi stemmer, building it only once."""
    global _SASTRAWI_STEMMER
    if _SASTRAWI_STEMMER is None:
        _SASTRAWI_STEMMER = StemmerFactory().create_stemmer()
    return _SASTRAWI_STEMMER


def stemming_text(tokenized_text, stemmer=None):
    """Stem every token with the Sastrawi (Indonesian) stemmer.

    The stemmer is reused across calls instead of building a new
    StemmerFactory for every document, which is needless repeated setup when
    this function is called once per article.

    Args:
        tokenized_text: Iterable of string tokens.
        stemmer: Optional object exposing ``stem(token)``. Defaults to the
            shared Sastrawi stemmer.

    Returns:
        list: One stem per input token, in the same order.
    """
    if stemmer is None:
        stemmer = _get_stemmer()

    return [stemmer.stem(token) for token in tokenized_text]

In [None]:
def text_preprocessing(text):
    """Run the full preprocessing pipeline on one raw document.

    The steps are applied in order: tokenize and clean, remove stopwords,
    then stem.

    Args:
        text: Raw document as a string.

    Returns:
        The list of stemmed tokens produced by ``stemming_text``.
    """
    tokens = tokenize_clean(text)
    return stemming_text(remove_stopwords(tokens))

<br>
<br>

## Read Dataset

In [None]:
!mkdir -p dataset
!wget https://raw.githubusercontent.com/project303/dataset/master/Berita.txt -P dataset
!wget https://raw.githubusercontent.com/project303/dataset/master/Judul-Berita.txt -P dataset

In [None]:
# Read the article titles, one per line.
# The context manager closes the file handle; the encoding is given explicitly
# to match how the article file is read (assumes the titles file is UTF-8 too).
with open('dataset/Judul-Berita.txt', encoding="utf8") as title_file:
    article_titles = title_file.read().split('\n')
len(article_titles)

In [None]:
# Read the article bodies. The file holds all articles in one text stream,
# separated by the marker 'BERHENTI DISINI'.
# The context manager closes the file handle once the content is read.
with open('dataset/Berita.txt', encoding="utf8") as article_file:
    article = article_file.read().split('BERHENTI DISINI')
len(article)

In [None]:
# Inspect the first article as read from the file (before any cleaning).
article[0]

## Preprocessing

In [None]:
# Strip HTML markup: parse each article and keep only its text content.
article_clean = [
    BeautifulSoup(raw_text, 'html.parser').getText()
    for raw_text in article
]
# From here on `article` refers to the cleaned texts.
article = article_clean

In [None]:
# First article again, now after the HTML-stripping step (shown as a string repr).
article[0]

In [None]:
# Same article via print(), so line breaks are rendered instead of escaped.
print(article[0])

In [None]:
# This step takes roughly 3 minutes.
# Run the preprocessing pipeline on every article; one token list per article.
tokenized_data = [text_preprocessing(document) for document in article]

In [None]:
# Number of preprocessed documents (one entry per element of `article`).
len(tokenized_data)

In [None]:
# Tokens produced by text_preprocessing() for the first article.
print(tokenized_data[0])

In [None]:
# How many tokens the first article has after preprocessing.
len(tokenized_data[0])

<br>
<br>

## Create The Model

In [None]:
# Map every distinct token to an integer id.
dictionary = corpora.Dictionary(tokenized_data)

# Convert each token list into its bag-of-words form using that mapping.
corpus = list(map(dictionary.doc2bow, tokenized_data))

In [None]:
NUM_TOPICS = 3
# Fixed seed so LDA training (which is randomized) yields the same topics on
# every Restart & Run All.
RANDOM_SEED = 42

# Build the LDA model
lda_model = models.LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    alpha='auto',
    eval_every=5,
    random_state=RANDOM_SEED,
)

# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [None]:
# Print every topic of both models in the same layout: a heading, one line per
# topic, then a separator.
for heading, topic_model in (("LDA Model:", lda_model), ("LSI Model:", lsi_model)):
    print(heading)

    for idx in range(NUM_TOPICS):
        # print_topic(idx, 10): description of topic `idx` limited to 10 terms
        print("Topic #%s:" % idx, topic_model.print_topic(idx, 10))

    print("=" * 20)

<br>
<br>

## Test The Model

In [None]:
# Topic representation of the first training document under both models.
first_doc_bow = corpus[0]
first_doc_lda = lda_model[first_doc_bow]
first_doc_lsi = lsi_model[first_doc_bow]

print("article[0]")
print("LDA Model:")
print(first_doc_lda)

print("")
print("LSA Model:")
print(first_doc_lsi)

In [None]:
# Tokens of the first document, for comparison with its topic weights above.
print(tokenized_data[0])

In [None]:
# Title at index 0 - presumably the title of article[0]; this assumes the title
# file lists titles in the same order as the articles (TODO confirm).
article_titles[0]

In [None]:
# Score a new, unseen sentence: preprocess it the same way as the training
# articles, convert it to bag-of-words, then ask both models for its topics.
text = "Pertandingan berjalan dengan seru. Tim lawan berhasil dikalahkan dengan skor 1-0."
query_tokens = text_preprocessing(text)
bow = dictionary.doc2bow(query_tokens)

query_lda = lda_model[bow]
print("LDA Model:")
print(query_lda)
print("")
query_lsi = lsi_model[bow]
print("LSA Model:")
print(query_lsi)

In [None]:
# Size of the dictionary built from tokenized_data.
len(dictionary)

In [None]:
from gensim import similarities

# Index every document by its LDA topic vector.
# num_features is passed explicitly: otherwise gensim must scan the transformed
# corpus to infer the vector width (an extra inference pass), and the inferred
# width can be smaller than the topic count if the highest-numbered topic is
# never reported for any document.
lda_index = similarities.MatrixSimilarity(
    lda_model[corpus],
    num_features=lda_model.num_topics,
)

# Query the index with the topic vector of the sample sentence (`bow`).
# The result gets its own name so the imported `similarities` module is not
# overwritten by the query output.
query_scores = lda_index[lda_model[bow]]

# Rank documents from most to least similar as (document_id, score) pairs
ranked_documents = sorted(enumerate(query_scores), key=lambda item: item[1], reverse=True)

# Top most similar documents:
print(ranked_documents[:10])

# Let's see what's the most similar document
document_id, similarity = ranked_documents[0]
print(article[document_id][:1000])

<br>
<br>

## Visualization

In [None]:
# Install pyLDAvis (specific version for Google Colab)
#!pip install pyLDAvis==2.1.2
#!pip install pyLDAvis

In [None]:
import pyLDAvis

# Newer pyLDAvis releases ship the gensim adapter as `pyLDAvis.gensim_models`;
# older ones (such as the 2.1.2 pin mentioned in the install cell) call it
# `pyLDAvis.gensim`. Try the new name first and fall back to the old one.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

pyLDAvis.enable_notebook()
panel = gensimvis.prepare(lda_model, corpus, dictionary)
panel

<br>
<br>

## How Dictionary and doc2bow Work

In [None]:
# Two tiny example documents, already tokenized.
texts = [
    ['durian', 'belimbing', 'cempedak'],
    ['apel', 'belimbing'],
]

# Build a Dictionary from the example documents
dct = corpora.Dictionary(documents=texts)

In [None]:
# Number of entries in the example dictionary.
len(dct)

In [None]:
# The keys of the example dictionary (the ids assigned to the tokens).
dct.keys()

In [None]:
# Look up the entry stored under id 1.
dct[1]

In [None]:
# Convert a token list to bag-of-words with the example dictionary.
# "non_existent_word" was not in `texts`; presumably it is left out of the
# result - check the cell output to confirm.
dct.doc2bow(["belimbing", "apel", "non_existent_word"])

<br>
<br>

## Revision History


Release: 1.2102.0601
*   First release

Release: 1.2402.0701
*   Change preprocessing process
*   Add how Dictionary and doc2bow work