# V1 Latent Dirichlet Allocation - Narasipal Topic Modelling
## Pipeline:
1. Case folding
2. Normalization
3. Bigram & trigram detection
4. Tokenization (w/ NLTK)
5. Stopword removal
6. Stemming
7. POS filtering (currently disabled: the Stanza cell is commented out, so BoW/LDA run on the stemmed tokens)
8. BoW
9. LDA
10. Coherence score test

- NO NER (Named Entity Removal)

## 0. Installing Libraries & Packages

In [1]:
pip install --upgrade gensim



In [2]:
pip install pyldavis==3.2.1

Collecting pyldavis==3.2.1
  Downloading pyLDAvis-3.2.1.tar.gz (1.7 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━[0m [32m1.3/1.7 MB[0m [31m37.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting funcy (from pyldavis==3.2.1)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Building wheels for collected packages: pyldavis
  Building wheel for pyldavis (setup.py) ... [?25l[?25hdone
  Created wheel for pyldavis: filename=pyLDAvis-3.2.1-py2.py3-none-any.whl size=136162 sha256=495ec4bd5b983d3a573c3617a531bc5e6a2f08fde1fa0500705a57609aa0f7f2
  Stored in directory: /root/.cache/pip/wheels/ae/69/44/1708c54aad5a7e712e19

In [3]:
pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [4]:
pip install swifter

Collecting swifter
  Downloading swifter-1.4.0.tar.gz (1.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: swifter
  Building wheel for swifter (setup.py) ... [?25l[?25hdone
  Created wheel for swifter: filename=swifter-1.4.0-py3-none-any.whl size=16507 sha256=ae5b89b2ab5238b223dc1b36841f5aa70b7fae1adfc48bdb37536bc13737bc43
  Stored in directory: /root/.cache/pip/wheels/ef/7f/bd/9bed48f078f3ee1fa75e0b29b6e0335ce1cb03a38d3443b3a3
Successfully built swifter
Installing collected packages: swifter
Successfully installed swifter-1.4.0


In [5]:
import pandas as pd
import numpy as np
import nltk
import spacy
import gensim
import pyLDAvis
import Sastrawi
import swifter
import string
import re

## 1. Data Pre-processing

In [8]:
# Mount Google Drive (Colab only) so the spreadsheets under /content/drive/ can be read and written.
from google.colab import drive
drive.mount("/content/drive/")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [9]:
# Load the merged news headlines; the raw title text lives in the `judul_berita` column.
df_titles = pd.read_excel('/content/drive/MyDrive/+Skripsi/Narasipal LDA/merged_titles.xlsx')
# Preview the first rows as the cell output.
df_titles.head()

Unnamed: 0,judul_berita
0,Panas! 5.000 Roket Ditembakkan dari Gaza ke Is...
1,Militer Israel Mulai Operasi Skala Besar Peran...
2,"Perang Hamas Vs Israel Pecah, Rusia Desak Semu..."
3,"Perang Hamas Vs Israel, Rusia Serukan Gencatan..."
4,6 Fakta Dampak Hamas Vs Israel: 40 Orang Tewas...


### 1. Case folding & punctuation removal

In [10]:
# Case folding & cleaning of the raw headlines.
#
# The column is converted to plain strings *first*: if the cast happened last,
# a missing title would pass through every `.str` step as NaN and then become
# the literal text "nan", which would later be treated as a real token.
titles = df_titles['judul_berita'].fillna('').astype(str)

titles = (
    titles
    # Hyphens become spaces so hyphenated words split into their parts
    .str.replace('-', ' ', regex=False)
    # Lowercase
    .str.lower()
    # Remove ASCII punctuation
    .str.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    .str.replace(r'\d+', '', regex=True)
    # Remove single-letter words
    .str.replace(r'\b[a-zA-Z]\b', '', regex=True)
    # Whitespace clean-up runs last: the removals above leave gaps behind, so
    # collapsing and stripping earlier would leave double or edge spaces
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

df_titles['judul_berita'] = titles
df_titles.head()

Unnamed: 0,judul_berita
0,panas roket ditembakkan dari gaza ke israel
1,militer israel mulai operasi skala besar peran...
2,perang hamas vs israel pecah rusia desak semua...
3,perang hamas vs israel rusia serukan gencatan ...
4,fakta dampak hamas vs israel orang tewas rs in...


### 2. Normalization

In [11]:
# Load the normalization table: first column = term to look for,
# second column = text that replaces it.
normalized_word = pd.read_excel('/content/drive/MyDrive/+Skripsi/Narasipal LDA/normalisasi.xlsx')

# Build the term -> replacement lookup.
# - Columns are read by position with `.iloc`, so the header names do not matter.
# - Terms are stripped here, while they are being inserted; stripping the keys
#   of the still-empty dict beforehand has no effect.
# - The first occurrence of a term wins; rows with a missing cell are skipped.
normalized_word_dict = {}
for raw_term, replacement in zip(normalized_word.iloc[:, 0], normalized_word.iloc[:, 1]):
    if pd.isna(raw_term) or pd.isna(replacement):
        continue
    term = str(raw_term).strip()
    if term and term not in normalized_word_dict:
        normalized_word_dict[term] = replacement

# Compile each whole-word pattern once instead of once per headline.
# NOTE: rebuild this list if `normalized_word_dict` is modified afterwards.
_normalization_rules = [
    (re.compile(rf'\b{re.escape(term)}\b'), f' {replacement} ')
    for term, replacement in normalized_word_dict.items()
]


def normalized_term(title):
    """Return *title* with every known term replaced by its normalized form.

    Rules are applied one after another in table order, each matching whole
    words only. Replacements are padded with spaces so they never fuse with a
    neighbouring word; the final split/join collapses the extra whitespace.
    """
    for pattern, padded_replacement in _normalization_rules:
        title = pattern.sub(padded_replacement, title)
    return ' '.join(title.split())


df_titles['textdata_normalized'] = df_titles['judul_berita'].apply(normalized_term)
df_titles.head(10)

Unnamed: 0,judul_berita,textdata_normalized
0,panas roket ditembakkan dari gaza ke israel,panas roket ditembakkan dari gaza ke israel
1,militer israel mulai operasi skala besar peran...,militer israel mulai operasi skala besar peran...
2,perang hamas vs israel pecah rusia desak semua...,perang hamas vs israel pecah rusia desak semua...
3,perang hamas vs israel rusia serukan gencatan ...,perang hamas vs israel rusia serukan gencatan ...
4,fakta dampak hamas vs israel orang tewas rs in...,fakta dampak hamas vs israel orang tewas rumah...
5,orang di gaza tewas akibat serangan balik isra...,orang di gaza tewas akibat serangan balik isra...
6,kbri amman soal hamas vs israel sejauh ini tak...,kbri amman soal hamas vs israel sejauh ini tak...
7,korban serangan hamas di israel lebih dari ora...,korban serangan hamas di israel lebih dari ora...
8,kemlu indonesia prihatin meningkatnya eskalasi...,kementerian luar negeri indonesia prihatin men...
9,prihatin ketegangan palestina israel china min...,prihatin ketegangan palestina israel cina mint...


### 3. Bigram & trigram detection
Parameters:
- `min_count` : **FREQUENCY FILTER** Controls how many times a word pair must appear before being considered a bigram/trigram
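- `threshold` : **STRENGTH OF ASSOCIATION** Minimum score a word pair must reach before it is joined into a phrase (higher = fewer phrases). With gensim's default scorer this score is the count-based formula from Mikolov et al. (2013), not PMI; pass `scoring='npmi'` to `Phrases` to use normalized PMI instead (the threshold then lies between -1 and 1).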

In [12]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser


def _split_on_whitespace(text):
    """Turn one normalized headline into a list of tokens."""
    return text.split()


def _apply_bigrams(tokens):
    """Join detected two-word phrases inside one token list."""
    return bigram_phraser[tokens]


def _apply_trigrams(tokens):
    """Run the second phrase pass over an already bigram-merged token list."""
    return trigram_phraser[tokens]


# Token lists are the training input for both phrase models
df_titles['tokenized'] = df_titles['textdata_normalized'].apply(_split_on_whitespace)

# First pass: learn two-word phrases, then freeze the model into a Phraser
bigram = Phrases(df_titles['tokenized'], min_count=10, threshold=20)
bigram_phraser = Phraser(bigram)

# Second pass: train on the bigram-merged stream so a phrase can grow by one more word
trigram = Phrases(bigram_phraser[df_titles['tokenized']], min_count=5, threshold=10)
trigram_phraser = Phraser(trigram)

# Materialize both passes as DataFrame columns
df_titles['bigrams'] = df_titles['tokenized'].apply(_apply_bigrams)
df_titles['trigrams'] = df_titles['bigrams'].apply(_apply_trigrams)

# Compare input and output for the first rows
df_titles[['textdata_normalized', 'bigrams', 'trigrams']].head(10)

Unnamed: 0,textdata_normalized,bigrams,trigrams
0,panas roket ditembakkan dari gaza ke israel,"[panas, roket, ditembakkan, dari, gaza, ke, is...","[panas, roket, ditembakkan_dari, gaza, ke, isr..."
1,militer israel mulai operasi skala besar peran...,"[militer, israel, mulai, operasi, skala, besar...","[militer, israel, mulai, operasi, skala_besar,..."
2,perang hamas vs israel pecah rusia desak semua...,"[perang, hamas, vs, israel, pecah, rusia, desa...","[perang, hamas_vs, israel, pecah, rusia, desak..."
3,perang hamas vs israel rusia serukan gencatan ...,"[perang, hamas, vs, israel, rusia, serukan, ge...","[perang, hamas_vs, israel, rusia, serukan_genc..."
4,fakta dampak hamas vs israel orang tewas rumah...,"[fakta, dampak, hamas, vs, israel, orang_tewas...","[fakta, dampak, hamas_vs, israel, orang_tewas,..."
5,orang di gaza tewas akibat serangan balik isra...,"[orang, di, gaza, tewas, akibat_serangan, bali...","[orang, di, gaza, tewas, akibat_serangan, bali..."
6,kbri amman soal hamas vs israel sejauh ini tak...,"[kbri, amman, soal, hamas, vs, israel, sejauh,...","[kbri, amman, soal, hamas_vs, israel, sejauh, ..."
7,korban serangan hamas di israel lebih dari ora...,"[korban, serangan, hamas, di, israel, lebih_da...","[korban, serangan, hamas, di, israel, lebih_da..."
8,kementerian luar negeri indonesia prihatin men...,"[kementerian_luar, negeri, indonesia, prihatin...","[kementerian_luar_negeri, indonesia, prihatin,..."
9,prihatin ketegangan palestina israel cina mint...,"[prihatin, ketegangan, palestina, israel, cina...","[prihatin, ketegangan, palestina, israel, cina..."


In [13]:
def _phrase_to_text(phrase):
    """Return *phrase* as str, decoding it as UTF-8 when it is not already a str."""
    if isinstance(phrase, str):
        return phrase
    return phrase.decode("utf-8")


# Phrases learned by the bigram model, as readable strings
detected_bigrams = [_phrase_to_text(phrase) for phrase in bigram.export_phrases()]

# Phrases learned by the trigram model, as readable strings
detected_trigrams = [_phrase_to_text(phrase) for phrase in trigram.export_phrases()]

In [14]:
detected_bigrams[:50]

['gencatan_senjata',
 'orang_tewas',
 'rumah_sakit',
 'akibat_serangan',
 'tak_ada',
 'lebih_dari',
 'kementerian_luar',
 'luar_negeri',
 'dalam_jam',
 'detik_detik',
 'baku_tembak',
 'minta_maaf',
 'amerika_serikat',
 'pesawat_tempur',
 'ramai_ramai',
 'tel_aviv',
 'uni_eropa',
 'besar_besaran',
 'jadi_sasaran',
 'iron_dome',
 'fadli_zon',
 'evakuasi_wni',
 'tepi_barat',
 'keluar_dari',
 'harus_dihentikan',
 'kapal_induk',
 'apa_itu',
 'festival_musik',
 'tank_tank',
 'ketum_pbnu',
 'gus_yahya',
 'tembak_mati',
 'situasi_terkini',
 'liga_arab',
 'balas_dendam',
 'sekjen_pbb',
 'bantuan_kemanusiaan',
 'kamp_pengungsian',
 'jet_tempur',
 'tak_boleh',
 'turun_tangan',
 'new_york',
 'gal_gadot',
 'lancarkan_serangan',
 'putra_mahkota',
 'hari_ini',
 'unjuk_rasa',
 'terus_gempur',
 'korban_jiwa',
 'warga_sipil']

In [15]:
detected_trigrams[:50]

['ditembakkan_dari',
 'skala_besar',
 'hamas_vs',
 'semua_pihak',
 'gencatan_senjata',
 'serukan_gencatan_senjata',
 'orang_tewas',
 'rumah_sakit',
 'rumah_sakit_indonesia',
 'akibat_serangan',
 'tak_ada',
 'jadi_korban',
 'lebih_dari',
 'kementerian_luar',
 'kementerian_luar_negeri',
 'eskalasi_konflik',
 'tewas_ditembak',
 'ditembak_tentara',
 'dalam_jam',
 'detik_detik',
 'detik_detik_rudal',
 'baku_tembak',
 'bertambah_jadi',
 'minta_maaf',
 'amerika_serikat',
 'amerika_serikat_kirim',
 'kirim_kapal',
 'pesawat_tempur',
 'ramai_ramai',
 'klaim_berhasil',
 'kok_bisa',
 'perang_lawan',
 'uni_eropa',
 'besar_besaran',
 'presiden_iran',
 'sejarah_rumah_sakit',
 'jadi_sasaran',
 'iron_dome',
 'negara_muslim',
 'asal_usul',
 'evakuasi_wni',
 'tepi_barat',
 'di_tepi_barat',
 'imbau_wni',
 'keluar_dari',
 'harus_dihentikan',
 'jalur_gaza',
 'kapal_induk',
 'hentikan_kekerasan',
 'pertahanan_udara']

### 4. Tokenization
With NLTK `word_tokenize` (requires the `punkt_tab` resource)

In [16]:
from nltk.tokenize import word_tokenize

In [17]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [18]:
def _retokenize(parts):
    """Join the phrase-merged tokens into one string and tokenize it with NLTK."""
    sentence = " ".join(parts)
    return word_tokenize(sentence)


df_titles['tokens'] = df_titles['trigrams'].apply(_retokenize)
df_titles[['textdata_normalized', 'trigrams', 'tokens']].head(10)

Unnamed: 0,textdata_normalized,trigrams,tokens
0,panas roket ditembakkan dari gaza ke israel,"[panas, roket, ditembakkan_dari, gaza, ke, isr...","[panas, roket, ditembakkan_dari, gaza, ke, isr..."
1,militer israel mulai operasi skala besar peran...,"[militer, israel, mulai, operasi, skala_besar,...","[militer, israel, mulai, operasi, skala_besar,..."
2,perang hamas vs israel pecah rusia desak semua...,"[perang, hamas_vs, israel, pecah, rusia, desak...","[perang, hamas_vs, israel, pecah, rusia, desak..."
3,perang hamas vs israel rusia serukan gencatan ...,"[perang, hamas_vs, israel, rusia, serukan_genc...","[perang, hamas_vs, israel, rusia, serukan_genc..."
4,fakta dampak hamas vs israel orang tewas rumah...,"[fakta, dampak, hamas_vs, israel, orang_tewas,...","[fakta, dampak, hamas_vs, israel, orang_tewas,..."
5,orang di gaza tewas akibat serangan balik isra...,"[orang, di, gaza, tewas, akibat_serangan, bali...","[orang, di, gaza, tewas, akibat_serangan, bali..."
6,kbri amman soal hamas vs israel sejauh ini tak...,"[kbri, amman, soal, hamas_vs, israel, sejauh, ...","[kbri, amman, soal, hamas_vs, israel, sejauh, ..."
7,korban serangan hamas di israel lebih dari ora...,"[korban, serangan, hamas, di, israel, lebih_da...","[korban, serangan, hamas, di, israel, lebih_da..."
8,kementerian luar negeri indonesia prihatin men...,"[kementerian_luar_negeri, indonesia, prihatin,...","[kementerian_luar_negeri, indonesia, prihatin,..."
9,prihatin ketegangan palestina israel cina mint...,"[prihatin, ketegangan, palestina, israel, cina...","[prihatin, ketegangan, palestina, israel, cina..."


### 5. Stop word removal

In [19]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [20]:
from nltk.corpus import stopwords

# Domain-specific additions to NLTK's Indonesian stopword list.
extra_stopwords = ['bikin', 'masuk', 'gegara', 'update', 'puluhan', 'detik', 'potret', 'foto', 'ada apa']

list_stopwords = set(stopwords.words('indonesian'))
list_stopwords.update(extra_stopwords)
# Multi-word phrases reach this step as one underscore-joined token
# (e.g. 'ditembakkan_dari'), so an entry containing a space can never match.
# Register the underscore spelling as well so such entries take effect.
list_stopwords.update(phrase.replace(' ', '_') for phrase in extra_stopwords if ' ' in phrase)


def stopwords_removal(words):
    """Return the tokens of *words* that are not in `list_stopwords`, in order."""
    return [word for word in words if word not in list_stopwords]


# nsw = no stop words
df_titles['textdata_tokens_nsw'] = df_titles['tokens'].apply(stopwords_removal)
df_titles.head()

Unnamed: 0,judul_berita,textdata_normalized,tokenized,bigrams,trigrams,tokens,textdata_tokens_nsw
0,panas roket ditembakkan dari gaza ke israel,panas roket ditembakkan dari gaza ke israel,"[panas, roket, ditembakkan, dari, gaza, ke, is...","[panas, roket, ditembakkan, dari, gaza, ke, is...","[panas, roket, ditembakkan_dari, gaza, ke, isr...","[panas, roket, ditembakkan_dari, gaza, ke, isr...","[panas, roket, ditembakkan_dari, gaza, israel]"
1,militer israel mulai operasi skala besar peran...,militer israel mulai operasi skala besar peran...,"[militer, israel, mulai, operasi, skala, besar...","[militer, israel, mulai, operasi, skala, besar...","[militer, israel, mulai, operasi, skala_besar,...","[militer, israel, mulai, operasi, skala_besar,...","[militer, israel, operasi, skala_besar, perang..."
2,perang hamas vs israel pecah rusia desak semua...,perang hamas vs israel pecah rusia desak semua...,"[perang, hamas, vs, israel, pecah, rusia, desa...","[perang, hamas, vs, israel, pecah, rusia, desa...","[perang, hamas_vs, israel, pecah, rusia, desak...","[perang, hamas_vs, israel, pecah, rusia, desak...","[perang, hamas_vs, israel, pecah, rusia, desak..."
3,perang hamas vs israel rusia serukan gencatan ...,perang hamas vs israel rusia serukan gencatan ...,"[perang, hamas, vs, israel, rusia, serukan, ge...","[perang, hamas, vs, israel, rusia, serukan, ge...","[perang, hamas_vs, israel, rusia, serukan_genc...","[perang, hamas_vs, israel, rusia, serukan_genc...","[perang, hamas_vs, israel, rusia, serukan_genc..."
4,fakta dampak hamas vs israel orang tewas rs in...,fakta dampak hamas vs israel orang tewas rumah...,"[fakta, dampak, hamas, vs, israel, orang, tewa...","[fakta, dampak, hamas, vs, israel, orang_tewas...","[fakta, dampak, hamas_vs, israel, orang_tewas,...","[fakta, dampak, hamas_vs, israel, orang_tewas,...","[fakta, dampak, hamas_vs, israel, orang_tewas,..."


### 6. Stemming
Sekitar 10 menit tanpa GPU, 7 menit w/ GPU

In [21]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Tokens listed here bypass the stemmer and are mapped to the given form instead.
custom_words = {
    "pasukan": "pasukan",
    "bantu": "bantuan",
    "bantuan": "bantuan",
    "hizbullah": "hizbullah",
    "pengungsi": "pengungsi",
    "pengungsian": "pengungsi",
    "bombardir": "bom",
    "akui": "akui",
}


def stem_words(words):
    """Return one output token per input token: the override if listed, else the stem.

    NOTE(review): in the displayed output, underscore-joined phrases come back
    space-separated (e.g. 'ditembakkan_dari' -> 'tembak dari'); confirm this is
    intended, since the overrides above are only checked against the whole token.
    """
    stemmed = []
    for token in words:
        if token in custom_words:
            stemmed.append(custom_words[token])
        else:
            stemmed.append(stemmer.stem(token))
    return stemmed


df_titles['textdata_tokens_stemmed'] = df_titles['textdata_tokens_nsw'].apply(stem_words)
df_titles[['textdata_tokens_nsw', 'textdata_tokens_stemmed']].head()


Unnamed: 0,textdata_tokens_nsw,textdata_tokens_stemmed
0,"[panas, roket, ditembakkan_dari, gaza, israel]","[panas, roket, tembak dari, gaza, israel]"
1,"[militer, israel, operasi, skala_besar, perang...","[militer, israel, operasi, skala besar, rang, ..."
2,"[perang, hamas_vs, israel, pecah, rusia, desak...","[perang, hamas vs, israel, pecah, rusia, desak..."
3,"[perang, hamas_vs, israel, rusia, serukan_genc...","[perang, hamas vs, israel, rusia, seru gencat ..."
4,"[fakta, dampak, hamas_vs, israel, orang_tewas,...","[fakta, dampak, hamas vs, israel, orang tewas,..."


### 7. POS Tagging
10 minutes w/ GPU

In [22]:
# pip install stanza

In [23]:
# import stanza

# # Download and load Indonesian Stanza model
# stanza.download('id')
# nlp = stanza.Pipeline('id', processors='tokenize,pos')

# # Function to filter nouns & verbs
# def pos_filtering_stanza(words):
#     doc = nlp(" ".join(words))
#     filtered_words = [word.text for sentence in doc.sentences for word in sentence.words if word.upos in {"NOUN", "VERB"}]
#     return filtered_words

# # Apply POS filtering
# df_titles['textdata_tokens_pos'] = df_titles['textdata_tokens_stemmed'].apply(pos_filtering_stanza)

### 8. Bag of Words for LDA

In [24]:
import pickle

from gensim.corpora import Dictionary

# The stemmed token lists are the documents for everything below
stemmed_docs = df_titles["textdata_tokens_stemmed"]

# Token <-> integer id mapping built from all documents
dictionary = Dictionary(stemmed_docs)

# One bag-of-words (list of (token_id, count) pairs) per document
corpus = [dictionary.doc2bow(doc) for doc in stemmed_docs]

# Persist both artefacts so later sessions can reuse them
dictionary.save("lda_dictionary.dict")
with open("lda_corpus.pkl", "wb") as corpus_file:
    pickle.dump(corpus, corpus_file)

print(f"Dictionary size: {len(dictionary)} unique tokens")
print(f"Example BoW for first document: {corpus[0]}")

Dictionary size: 8445 unique tokens
Example BoW for first document: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]


## 2. LDA Model

In [28]:
from gensim.models import LdaModel

# Number of topics for this run (to be tuned with the coherence test)
num_topics = 6

# Hyperparameters gathered in one place so the training call stays short
lda_settings = dict(
    num_topics=num_topics,
    random_state=42,     # fixed seed for reproducible topics
    passes=10,           # number of passes over the corpus
    chunksize=100,       # documents per training chunk
    alpha="asymmetric",
    eta=0.01,
)

lda_model = LdaModel(corpus=corpus, id2word=dictionary, **lda_settings)

# Show every topic (-1 = all) with its top weighted words
for topic_id, description in lda_model.print_topics(-1):
    print(f"Topic {topic_id}: {description}")

# Persist the trained model
lda_model.save("lda_model")

Topic 0: 0.359*"israel" + 0.213*"gaza" + 0.073*"serang" + 0.041*"pbb" + 0.032*"as" + 0.030*"iran" + 0.021*"dunia" + 0.013*"senjata" + 0.012*"tampak" + 0.010*"merdeka"
Topic 1: 0.482*"palestina" + 0.071*"dukung" + 0.032*"rusia" + 0.028*"serbu" + 0.027*"bela" + 0.027*"lawan" + 0.024*"orang" + 0.023*"pecah" + 0.021*"jalur gaza" + 0.021*"tahan"
Topic 2: 0.260*"amerika serikat" + 0.225*"israel" + 0.110*"gaza" + 0.077*"orang tewas" + 0.075*"panas" + 0.053*"rudal" + 0.045*"setop" + 0.027*"gagal" + 0.023*"takut" + 0.023*"kondisi"
Topic 3: 0.353*"hamas" + 0.293*"perang" + 0.237*"israel" + 0.037*"vs" + 0.021*"tank" + 0.019*"inggris" + 0.010*"akibat" + 0.009*"drone" + 0.006*"lumpuh" + 0.004*"internasional"
Topic 4: 0.209*"indonesia" + 0.180*"palestina" + 0.129*"bantuan" + 0.077*"konflik" + 0.067*"presiden" + 0.049*"rencana" + 0.042*"rumah sakit indonesia" + 0.038*"ribu" + 0.035*"menteri luar negeri" + 0.034*"fakta"
Topic 5: 0.245*"warga" + 0.236*"israel" + 0.166*"gaza" + 0.113*"tewas" + 0.043*"ha

In [29]:
from collections import Counter

# (topic_id, [(word, weight), ...]) pairs for the topics returned by the model
topics = lda_model.show_topics(formatted=False)

# Corpus-wide frequency of every stemmed token
data_flat = [token for doc_tokens in df_titles['textdata_tokens_stemmed'] for token in doc_tokens]
counter = Counter()
counter.update(data_flat)

# One row per (topic, top word): its weight in the topic and its corpus frequency
out = [
    [term, topic_id, importance, counter[term]]
    for topic_id, term_weights in topics
    for term, importance in term_weights
]

df_imp_wcount = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])
df_imp_wcount

Unnamed: 0,word,topic_id,importance,word_count
0,israel,0,0.35914,12459
1,gaza,0,0.213123,7577
2,serang,0,0.073235,2108
3,pbb,0,0.040586,837
4,as,0,0.032311,624
5,iran,0,0.030468,526
6,dunia,0,0.021307,401
7,senjata,0,0.012681,233
8,tampak,0,0.012319,204
9,merdeka,0,0.010212,245


#### Download word-topic list

In [31]:
# Destination workbook on the mounted Drive for the word/topic importance table.
file_path = "/content/drive/MyDrive/+Skripsi/Narasipal LDA/GOOD_it9_df_imp_wcount.xlsx"
# index=False: write only the data columns, not the DataFrame's row index.
df_imp_wcount.to_excel(file_path, index=False)
print(f"File saved to: {file_path}")

File saved to: /content/drive/MyDrive/+Skripsi/Narasipal LDA/GOOD_it9_df_imp_wcount.xlsx


## 3. Coherence Score Testing
6 menit w/ GPU

In [30]:
from gensim.models import CoherenceModel

# Score the trained model with the c_v coherence measure, using the same
# tokenized documents and dictionary the model was trained from
coherence_settings = {
    "model": lda_model,
    "texts": df_titles["textdata_tokens_stemmed"],
    "dictionary": dictionary,
    "coherence": "c_v",
}
coherence_model_lda = CoherenceModel(**coherence_settings)

coherence_score = coherence_model_lda.get_coherence()
print(f"Coherence Score: {coherence_score}")

Coherence Score: 0.4587663155930384


In [33]:
import numpy as np


def compute_coherence_values(dictionary, corpus, texts, start=2, limit=12, step=1, **lda_kwargs):
    """Train one LDA model per candidate topic count and score each with c_v coherence.

    Args:
        dictionary: gensim Dictionary, used as `id2word` and by the coherence model.
        corpus: bag-of-words corpus the models are trained on.
        texts: tokenized documents used to compute coherence.
        start: first topic count to try.
        limit: end of the range (exclusive, as in `range`).
        step: increment between candidate topic counts.
        **lda_kwargs: optional LdaModel keyword arguments that override the
            defaults below (`random_state`, `passes`, `chunksize`, `alpha`, `eta`).

    Returns:
        List of `(num_topics, coherence_score)` tuples in the order tried.
    """
    lda_params = {
        "random_state": 42,
        "passes": 10,
        "chunksize": 100,
        "alpha": "asymmetric",
        "eta": 0.01,
    }
    lda_params.update(lda_kwargs)

    coherence_values = []
    for num_topics in range(start, limit, step):
        model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            **lda_params,
        )
        coherence_model = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence="c_v")
        coherence_values.append((num_topics, coherence_model.get_coherence()))
    return coherence_values


# Run coherence test
coherence_scores = compute_coherence_values(dictionary, corpus, df_titles["textdata_tokens_stemmed"])

# Print results.
# The loop variable is deliberately not named `num_topics`: at notebook level
# that name holds the topic count of the trained model, and reusing it here
# would overwrite it with the last candidate value.
for n_topics, score in coherence_scores:
    print(f"Num Topics: {n_topics}, Coherence Score: {score}")

# Find best number of topics (highest coherence)
best_num_topics = max(coherence_scores, key=lambda x: x[1])[0]
print(f"Best number of topics: {best_num_topics}")


## 4. Frequency Test

In [34]:
from collections import Counter

# Flatten every document's stemmed tokens into one list
all_words = []
for tokens in df_titles['textdata_tokens_stemmed']:
    all_words.extend(tokens)

# Corpus-wide token frequencies, most frequent first
word_freq = Counter(all_words)
top_200_words = word_freq.most_common(200)

for word, freq in top_200_words:
    print(f"{word}: {freq}")

israel: 12459
palestina: 8453
gaza: 7577
hamas: 2664
warga: 2438
serang: 2108
perang: 1393
indonesia: 1171
netanyahu: 1024
dukung: 965
pbb: 837
tewas: 831
amerika serikat: 734
bantuan: 710
rafah: 670
as: 624
negara: 573
bom: 534
iran: 526
biden: 495
bunuh: 456
konflik: 445
tentara: 444
desak: 429
militer: 422
dunia: 401
hizbullah: 387
menteri luar negeri: 383
jalur gaza: 379
jokowi: 379
temu: 373
henti: 349
prabowo: 347
tolak: 346
presiden: 341
bela: 339
di tepi barat: 334
ancam: 331
orang: 314
anak: 310
orang tewas: 308
kecam: 307
genosida: 307
seru: 304
mesir: 304
hancur: 302
perintah: 302
tahan: 299
lebanon: 295
gencat senjata: 291
pasukan: 278
klaim: 278
anak anak: 274
pengungsi: 270
gempur: 268
damai: 256
korban: 252
tembak: 250
lawan: 250
merdeka: 245
momen: 240
pm: 237
houthi: 234
senjata: 233
korban tewas: 233
sandera: 223
cina: 217
rudal: 217
menteri luar negeri retno: 214
ribu: 211
rumah sakit: 209
bebas: 207
aksi: 206
bahas: 205
tampak: 204
arab: 202
unrwa: 200
jalan: 196
ki

## 5. Visualize LDA