# Topic Modeling with Gensim

*Notebook version: 1.2402.0701*

We’re going to use the gensim implementations because they offer more functionality out of the box

## Library

In [1]:
!pip install sastrawi

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sastrawi
Successfully installed sastrawi-1.0.1


In [2]:
import nltk
from bs4 import BeautifulSoup
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import re

from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string

from nltk import word_tokenize
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

<br>
<br>

## Helper Function

In [3]:
def tokenize_clean(text):
  return preprocess_string(text)

In [4]:
stopwords = nltk.corpus.stopwords.words('indonesian')
def remove_stopwords(tokenized_text):

    cleaned_token = []
    for token in tokenized_text:
        if token not in stopwords:
            cleaned_token.append(token)

    return cleaned_token

In [5]:
def stemming_text(tokenized_text):

    #stem using Sastrawi StemmerFactory
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    stems = []
    for token in tokenized_text:
        stems.append(stemmer.stem(token))

    return stems

In [6]:
def text_preprocessing(text):

    prep01 = tokenize_clean(text)
    prep02 = remove_stopwords(prep01)
    prep03 = stemming_text(prep02)

    return prep03

<br>
<br>

## Read Dataset

In [7]:
!mkdir -p dataset
!wget https://raw.githubusercontent.com/project303/dataset/master/Berita.txt -P dataset
!wget https://raw.githubusercontent.com/project303/dataset/master/Judul-Berita.txt -P dataset

--2024-07-07 19:27:27--  https://raw.githubusercontent.com/project303/dataset/master/Berita.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 76212 (74K) [text/plain]
Saving to: ‘dataset/Berita.txt’


2024-07-07 19:27:27 (27.7 MB/s) - ‘dataset/Berita.txt’ saved [76212/76212]

--2024-07-07 19:27:27--  https://raw.githubusercontent.com/project303/dataset/master/Judul-Berita.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1751 (1.7K) [text/plain]
Saving to: ‘dataset/Judul-Berita.txt’


2024-07-07 19:27:27 (22.3 MB/s) - ‘dataset

In [8]:
# read article title
article_titles = open('dataset/Judul-Berita.txt').read().split('\n')
len(article_titles)

31

In [9]:
# read article content
article = open('dataset/Berita.txt', encoding="utf8").read().split('BERHENTI DISINI')
len(article)

31

In [None]:
article[0]

## Preprocessing

In [10]:
# remove HTML tag
article_clean = []
for text in article:
    text = BeautifulSoup(text, 'html.parser').getText()
    article_clean.append(text)
article = article_clean

In [None]:
article[0]

In [None]:
print(article[0])

In [11]:
# proses ini memerlukan waktu sekitar 3 menit
tokenized_data = []
for text in article:
    tokenized_data.append(text_preprocessing(text))

In [12]:
len(tokenized_data)

31

In [None]:
print(tokenized_data[0])

In [None]:
len(tokenized_data[0])

<br>
<br>

## Create The Model

In [13]:
# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)

# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]

In [14]:
NUM_TOPICS = 3

# Build the LDA model
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary, alpha = 'auto', eval_every=5)#, per_word_topics=True)

# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)



In [15]:
print("LDA Model:")

for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))

print("=" * 20)

print("LSI Model:")

for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lsi_model.print_topic(idx, 10))

print("=" * 20)

LDA Model:
Topic #0: 0.020*"persen" + 0.009*"main" + 0.007*"dunia" + 0.006*"jakarta" + 0.005*"cnn" + 0.005*"belanja" + 0.005*"indonesia" + 0.005*"menteri" + 0.005*"dolar" + 0.005*"balap"
Topic #1: 0.019*"persen" + 0.006*"indonesia" + 0.006*"jakarta" + 0.005*"lapor" + 0.005*"lemah" + 0.005*"uang" + 0.005*"duga" + 0.005*"dolar" + 0.004*"diskon" + 0.004*"polisi"
Topic #2: 0.011*"persen" + 0.009*"indonesia" + 0.007*"duga" + 0.007*"oknum" + 0.006*"novel" + 0.005*"jakarta" + 0.005*"cnn" + 0.004*"polisi" + 0.004*"kasu" + 0.004*"dunia"
LSI Model:
Topic #0: -0.744*"persen" + -0.218*"lemah" + -0.172*"dolar" + -0.164*"mu" + -0.135*"indek" + -0.129*"bunga" + -0.116*"kuat" + -0.112*"uang" + -0.104*"dagang" + -0.101*"indonesia"
Topic #1: 0.401*"novel" + 0.324*"oknum" + 0.315*"duga" + 0.282*"jender" + 0.178*"polisi" + 0.171*"kpk" + 0.147*"teror" + 0.145*"kera" + 0.143*"kasu" + 0.137*"air"
Topic #2: -0.442*"main" + -0.262*"dunia" + -0.251*"argentina" + -0.250*"tanding" + -0.238*"lawan" + -0.202*"piala

<br>
<br>

## Test The Model

In [16]:
print("article[0]")
print("LDA Model:")
print(lda_model[corpus[0]])

print("")
print("LSA Model:")
print(lsi_model[corpus[0]])

article[0]
LDA Model:
[(0, 0.075148284), (2, 0.91709095)]

LSA Model:
[(0, -1.6138115051806652), (1, 1.843478527288506), (2, -0.29634072900010583)]


In [None]:
print(tokenized_data[0])

In [None]:
article_titles[0]

In [17]:
text = "Pertandingan berjalan dengan seru. Tim lawan berhasil dikalahkan dengan skor 1-0."
bow = dictionary.doc2bow(text_preprocessing(text))

print("LDA Model:")
print(lda_model[bow])
print("")
print("LSA Model:")
print(lsi_model[bow])

#print(bow)

LDA Model:
[(0, 0.8225948), (1, 0.07296258), (2, 0.1044426)]

LSA Model:
[(0, -0.1259820147827382), (1, 0.277101767140785), (2, -0.7973742999991429)]


In [None]:
len(dictionary)

In [None]:
from gensim import similarities

lda_index = similarities.MatrixSimilarity(lda_model[corpus])

# Let's perform some queries
similarities = lda_index[lda_model[bow]]
# Sort the similarities
similarities = sorted(enumerate(similarities), key=lambda item: -item[1])

# Top most similar documents:
print(similarities[:10])

# Let's see what's the most similar document
document_id, similarity = similarities[0]
print(article[document_id][:1000])

<br>
<br>

## Visualization

In [18]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/2.6 MB[0m [31m1.9 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/2.6 MB[0m [31m18.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.6/2.6 MB[0m [31m24.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [19]:
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
panel = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
panel

<br>
<br>

## How Dictionary and doc2bow Work

In [None]:
texts = [['durian', 'belimbing', 'cempedak' ], ['apel', 'belimbing']]

dct = corpora.Dictionary(texts)  # initialize a Dictionary

In [None]:
len(dct)

In [None]:
dct.keys()

In [None]:
dct[1]

In [None]:
dct.doc2bow(["belimbing", "apel", "non_existent_word"])

<br>
<br>

## Revision History


Release: 1.2102.0601
*   First release

Release: 1.2402.0701
*   Change preprocessing process
*   Add how Dictionary dan doc2bow works