# Latent Dirichlet Allocation - Narasipal Topic Modelling
With NLTK

# 0. Installing Libraries & Packages

In [39]:
import pandas as pd
import numpy as np
import nltk
import spacy
import gensim
import pyLDAvis
import Sastrawi
import swifter
import string
import re

ModuleNotFoundError: No module named 'spacy'

In [33]:
df_titles = pd.read_excel('/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/cleaning/merged_titles.xlsx')
df_titles.head()

Unnamed: 0,judul_berita
0,Panas! 5.000 Roket Ditembakkan dari Gaza ke Is...
1,Militer Israel Mulai Operasi Skala Besar Peran...
2,"Perang Hamas Vs Israel Pecah, Rusia Desak Semu..."
3,"Perang Hamas Vs Israel, Rusia Serukan Gencatan..."
4,6 Fakta Dampak Hamas Vs Israel: 40 Orang Tewas...


# 1. Data pre-processing

## 1. Case folding & punctuation removal

In [34]:
# Replace '-' with ' '
df_titles['judul_berita'] = df_titles['judul_berita'].str.replace('-', ' ')

# Lowercase
df_titles['judul_berita'] = df_titles['judul_berita'].str.lower()

# Remove punctuation
df_titles['judul_berita'] = df_titles['judul_berita'].str.translate(str.maketrans('', '', string.punctuation))

# Remove number
df_titles['judul_berita'] = df_titles['judul_berita'].str.replace(r'\d+', '', regex=True)

# Remove whitespaces
df_titles['judul_berita'] = df_titles['judul_berita'].str.strip()

# Remove multiple whitespaces into a single whitespace
df_titles['judul_berita'] = df_titles['judul_berita'].str.replace(r'\s+', ' ', regex=True)

# Remove single characters
df_titles['judul_berita'] = df_titles['judul_berita'].str.replace(r'\b[a-zA-Z]\b', '', regex=True)

df_titles['judul_berita'] = df_titles['judul_berita'].astype(str)
df_titles.head()

Unnamed: 0,judul_berita
0,panas roket ditembakkan dari gaza ke israel
1,militer israel mulai operasi skala besar peran...
2,perang hamas vs israel pecah rusia desak semua...
3,perang hamas vs israel rusia serukan gencatan ...
4,fakta dampak hamas vs israel orang tewas rs in...


## 2. Normalization

In [35]:
# Load normalized word excel
normalized_word = pd.read_excel('/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/cleaning/normalisasi.xlsx')

# Create normalized word dictionary
normalized_word_dict = {}
normalized_word_dict = {k.strip(): v for k, v in normalized_word_dict.items()}

# If word isn't already in normalized_word_dict, add it
for index, row in normalized_word.iterrows():
    if row[0] not in normalized_word_dict:
        normalized_word_dict[row[0]] = row[1]

# Function for normalizing word
def normalized_term(title):
    for term, replacement in normalized_word_dict.items():
        title = re.sub(rf'\b{re.escape(term)}\b', f' {replacement} ', title)  # Add spaces around replacement
    return ' '.join(title.split()) 
 
df_titles['textdata_normalized'] = df_titles['judul_berita'].apply(normalized_term)
df_titles.head(10)

Unnamed: 0,judul_berita,textdata_normalized
0,panas roket ditembakkan dari gaza ke israel,panas roket ditembakkan dari gaza ke israel
1,militer israel mulai operasi skala besar peran...,militer israel mulai operasi skala besar peran...
2,perang hamas vs israel pecah rusia desak semua...,perang hamas versus israel pecah rusia desak s...
3,perang hamas vs israel rusia serukan gencatan ...,perang hamas versus israel rusia serukan genca...
4,fakta dampak hamas vs israel orang tewas rs in...,fakta dampak hamas versus israel orang tewas r...
5,orang di gaza tewas akibat serangan balik isra...,orang di gaza tewas akibat serangan balik isra...
6,kbri amman soal hamas vs israel sejauh ini tak...,kbri amman soal hamas versus israel sejauh ini...
7,korban serangan hamas di israel lebih dari ora...,korban serangan hamas di israel lebih dari ora...
8,kemlu indonesia prihatin meningkatnya eskalasi...,kementerian luar negeri indonesia prihatin men...
9,prihatin ketegangan palestina israel china min...,prihatin ketegangan palestina israel cina mint...


## 3. Bigram & Trigram Detection
Parameters:
- `min_count` : **FREQUENCY FILTER** Controls how many times a word pair must appear before being considered a bigram/trigram
- `threshold` : **STRENGTH OF ASSOCIATION** How strongly words must be associated before forming a phrase. Based on Pointwise Mutual Information (PMI)


In [36]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Convert normalized text into tokenized lists
df_titles['tokenized'] = df_titles['textdata_normalized'].apply(lambda x: x.split())

# Train bigram model
bigram = Phrases(df_titles['tokenized'], min_count=5, threshold=10)
bigram_phraser = Phraser(bigram)  # Optimized for faster processing

# Train trigram model on bigram-transformed data
trigram = Phrases(bigram_phraser[df_titles['tokenized']], min_count=5, threshold=10)
trigram_phraser = Phraser(trigram)

# Apply the models to detect bigrams and trigrams
df_titles['bigrams'] = df_titles['tokenized'].apply(lambda x: bigram_phraser[x])
df_titles['trigrams'] = df_titles['bigrams'].apply(lambda x: trigram_phraser[x])

# View the processed output
df_titles[['textdata_normalized', 'bigrams', 'trigrams']].head(10)

Unnamed: 0,textdata_normalized,bigrams,trigrams
0,panas roket ditembakkan dari gaza ke israel,"[panas, roket, ditembakkan, dari, gaza, ke, is...","[panas, roket, ditembakkan_dari, gaza, ke, isr..."
1,militer israel mulai operasi skala besar peran...,"[militer, israel, mulai, operasi, skala_besar,...","[militer, israel, mulai, operasi, skala_besar,..."
2,perang hamas versus israel pecah rusia desak s...,"[perang, hamas_versus, israel, pecah, rusia, d...","[perang_hamas_versus, israel, pecah, rusia, de..."
3,perang hamas versus israel rusia serukan genca...,"[perang, hamas_versus, israel, rusia, serukan_...","[perang_hamas_versus, israel, rusia, serukan_g..."
4,fakta dampak hamas versus israel orang tewas r...,"[fakta, dampak, hamas_versus, israel, orang_te...","[fakta, dampak, hamas_versus, israel, orang_te..."
5,orang di gaza tewas akibat serangan balik isra...,"[orang, di, gaza, tewas_akibat, serangan, bali...","[orang, di, gaza, tewas_akibat, serangan, bali..."
6,kbri amman soal hamas versus israel sejauh ini...,"[kbri, amman, soal, hamas_versus, israel, seja...","[kbri, amman, soal, hamas_versus, israel, seja..."
7,korban serangan hamas di israel lebih dari ora...,"[korban, serangan, hamas, di, israel, lebih_da...","[korban, serangan, hamas, di, israel, lebih_da..."
8,kementerian luar negeri indonesia prihatin men...,"[kementerian_luar, negeri_indonesia, prihatin,...","[kementerian_luar_negeri_indonesia, prihatin, ..."
9,prihatin ketegangan palestina israel cina mint...,"[prihatin, ketegangan, palestina, israel, cina...","[prihatin, ketegangan, palestina, israel, cina..."


# 4. Tokenization

### With `punkt`

In [37]:
from nltk.tokenize import word_tokenize

In [38]:
nltk.download('punkt')
df_titles['tokens'] = df_titles['trigrams'].apply(lambda x: word_tokenize(" ".join(x)))
df_titles[['textdata_normalized', 'trigrams', 'tokens']].head(10)

[nltk_data] Downloading package punkt to /Users/salmadanu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,textdata_normalized,trigrams,tokens
0,panas roket ditembakkan dari gaza ke israel,"[panas, roket, ditembakkan_dari, gaza, ke, isr...","[panas, roket, ditembakkan_dari, gaza, ke, isr..."
1,militer israel mulai operasi skala besar peran...,"[militer, israel, mulai, operasi, skala_besar,...","[militer, israel, mulai, operasi, skala_besar,..."
2,perang hamas versus israel pecah rusia desak s...,"[perang_hamas_versus, israel, pecah, rusia, de...","[perang_hamas_versus, israel, pecah, rusia, de..."
3,perang hamas versus israel rusia serukan genca...,"[perang_hamas_versus, israel, rusia, serukan_g...","[perang_hamas_versus, israel, rusia, serukan_g..."
4,fakta dampak hamas versus israel orang tewas r...,"[fakta, dampak, hamas_versus, israel, orang_te...","[fakta, dampak, hamas_versus, israel, orang_te..."
5,orang di gaza tewas akibat serangan balik isra...,"[orang, di, gaza, tewas_akibat, serangan, bali...","[orang, di, gaza, tewas_akibat, serangan, bali..."
6,kbri amman soal hamas versus israel sejauh ini...,"[kbri, amman, soal, hamas_versus, israel, seja...","[kbri, amman, soal, hamas_versus, israel, seja..."
7,korban serangan hamas di israel lebih dari ora...,"[korban, serangan, hamas, di, israel, lebih_da...","[korban, serangan, hamas, di, israel, lebih_da..."
8,kementerian luar negeri indonesia prihatin men...,"[kementerian_luar_negeri_indonesia, prihatin, ...","[kementerian_luar_negeri_indonesia, prihatin, ..."
9,prihatin ketegangan palestina israel cina mint...,"[prihatin, ketegangan, palestina, israel, cina...","[prihatin, ketegangan, palestina, israel, cina..."


## 5. Stop Word Removal

In [40]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/salmadanu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords

list_stopwords = stopwords.words('indonesian')
list_stopwords = set(list_stopwords)

def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

# nsw = no stop words
df_titles['textdata_tokens_nsw'] = df_titles['tokens'].apply(stopwords_removal)
df_titles.head()

Unnamed: 0,judul_berita,textdata_normalized,tokenized,bigrams,trigrams,tokens,textdata_tokens_nsw
0,panas roket ditembakkan dari gaza ke israel,panas roket ditembakkan dari gaza ke israel,"[panas, roket, ditembakkan, dari, gaza, ke, is...","[panas, roket, ditembakkan, dari, gaza, ke, is...","[panas, roket, ditembakkan_dari, gaza, ke, isr...","[panas, roket, ditembakkan_dari, gaza, ke, isr...","[panas, roket, ditembakkan_dari, gaza, israel]"
1,militer israel mulai operasi skala besar peran...,militer israel mulai operasi skala besar peran...,"[militer, israel, mulai, operasi, skala, besar...","[militer, israel, mulai, operasi, skala_besar,...","[militer, israel, mulai, operasi, skala_besar,...","[militer, israel, mulai, operasi, skala_besar,...","[militer, israel, operasi, skala_besar, perang..."
2,perang hamas vs israel pecah rusia desak semua...,perang hamas versus israel pecah rusia desak s...,"[perang, hamas, versus, israel, pecah, rusia, ...","[perang, hamas_versus, israel, pecah, rusia, d...","[perang_hamas_versus, israel, pecah, rusia, de...","[perang_hamas_versus, israel, pecah, rusia, de...","[perang_hamas_versus, israel, pecah, rusia, de..."
3,perang hamas vs israel rusia serukan gencatan ...,perang hamas versus israel rusia serukan genca...,"[perang, hamas, versus, israel, rusia, serukan...","[perang, hamas_versus, israel, rusia, serukan_...","[perang_hamas_versus, israel, rusia, serukan_g...","[perang_hamas_versus, israel, rusia, serukan_g...","[perang_hamas_versus, israel, rusia, serukan_g..."
4,fakta dampak hamas vs israel orang tewas rs in...,fakta dampak hamas versus israel orang tewas r...,"[fakta, dampak, hamas, versus, israel, orang, ...","[fakta, dampak, hamas_versus, israel, orang_te...","[fakta, dampak, hamas_versus, israel, orang_te...","[fakta, dampak, hamas_versus, israel, orang_te...","[fakta, dampak, hamas_versus, israel, orang_te..."


# 6. Stemming

In [43]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_words(words):
    return [stemmer.stem(word) for word in words]

df_titles['textdata_tokens_stemmed'] = df_titles['textdata_tokens_nsw'].apply(stem_words)
df_titles[['textdata_tokens_nsw', 'textdata_tokens_stemmed']].head()


Unnamed: 0,textdata_tokens_nsw,textdata_tokens_stemmed
0,"[panas, roket, ditembakkan_dari, gaza, israel]","[panas, roket, tembak dari, gaza, israel]"
1,"[militer, israel, operasi, skala_besar, perang...","[militer, israel, operasi, skala besar, rang, ..."
2,"[perang_hamas_versus, israel, pecah, rusia, de...","[perang hamas versus, israel, pecah, rusia, de..."
3,"[perang_hamas_versus, israel, rusia, serukan_g...","[perang hamas versus, israel, rusia, seru genc..."
4,"[fakta, dampak, hamas_versus, israel, orang_te...","[fakta, dampak, hamas versus, israel, orang te..."


# 7. POS Tagging

In [None]:
import stanza

# Download and load Indonesian Stanza model
stanza.download('id')
nlp = stanza.Pipeline('id', processors='tokenize,pos')

# Function to filter nouns & verbs
def pos_filtering_stanza(words):
    doc = nlp(" ".join(words))
    filtered_words = [word.text for sentence in doc.sentences for word in sentence.words if word.upos in {"NOUN", "VERB"}]
    return filtered_words

# Apply POS filtering
df_titles['textdata_tokens_pos'] = df_titles['textdata_tokens_stemmed'].apply(pos_filtering_stanza)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 424kB [00:00, 36.3MB/s]                    
2025-02-28 14:14:42 INFO: Downloaded file to /Users/salmadanu/stanza_resources/resources.json
2025-02-28 14:14:42 INFO: Downloading default packages for language: id (Indonesian) ...
Downloading https://huggingface.co/stanfordnlp/stanza-id/resolve/v1.10.0/models/default.zip: 100%|██████████| 396M/396M [02:23<00:00, 2.75MB/s] 
2025-02-28 14:17:10 INFO: Downloaded file to /Users/salmadanu/stanza_resources/id/default.zip
2025-02-28 14:17:22 INFO: Finished downloading models and saved to /Users/salmadanu/stanza_resources
2025-02-28 14:17:22 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 424kB [00:00,

: 

: 