Import

In [243]:
from math import log
from gensim import corpora
from gensim.models import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
from collections import Counter
from fractions import Fraction

DATASET

In [244]:
# Four toy documents, each already tokenized into a tuple of words
doc1 = ('mary', 'had', 'a', 'lamp')
doc2 = ('mary', 'has', 'roses')
doc3 = ('roses', 'are', 'red')
doc4 = ('violets', 'are', 'light', 'blue')

# The corpus is the ordered list of those documents
corpus = [doc1, doc2, doc3, doc4]

# Display the corpus, labelling documents from 1
for i, doc in enumerate(corpus):
    print(f"Dokumen {i+1}:\n{doc}\n")

Dokumen 1:
('mary', 'had', 'a', 'lamp')

Dokumen 2:
('mary', 'has', 'roses')

Dokumen 3:
('roses', 'are', 'red')

Dokumen 4:
('violets', 'are', 'light', 'blue')



LDA MODEL

In [259]:

# Custom stop words removed before topic modelling
custom_stopwords = {'are', 'had', 'has', 'a'}

# Convert each document tuple to a list
corpus = [list(doc) for doc in corpus]

# Pre-process the text: drop the stop words from every document
tokenized_corpus = [
    [word for word in doc if word not in custom_stopwords]
    for doc in corpus
]

# Build the dictionary from the filtered tokens
dictionary = corpora.Dictionary(tokenized_corpus)

# Bag-of-words representation of every document
bow_corpus = [dictionary.doc2bow(text) for text in tokenized_corpus]

# Train the LDA model. LDA training is randomized, so the seed is pinned:
# without random_state every run may produce different topics, and the
# results would not be reproducible.
lda_model = LdaModel(
    bow_corpus,
    num_topics=3,
    id2word=dictionary,
    passes=10,
    random_state=42,
)

# Show the two highest-weighted words of each topic
topics = lda_model.print_topics(num_words=2)
for topic in topics:
    print(topic)


(0, '0.427*"roses" + 0.249*"red"')
(1, '0.249*"light" + 0.249*"blue"')
(2, '0.427*"mary" + 0.250*"lamp"')


TOPIC COHERENCE

Segmentasi Corpus Sliding Windows 3

In [247]:
def generate_sliding_windows(document, window_size):
    """Return every run of ``window_size`` consecutive words in ``document``.

    Args:
        document: Sliceable sequence of words (list or tuple).
        window_size: Number of words per window.

    Returns:
        A list of tuples, one per window, in left-to-right order. The list
        is empty when the document has fewer words than ``window_size``.
    """
    last_start = len(document) - window_size

    # Not enough words to fill even a single window
    if last_start < 0:
        return []

    return [
        tuple(document[start:start + window_size])
        for start in range(last_start + 1)
    ]


# Sliding window size = 3
window_size = 3

# Gather the sliding windows of every document in the corpus into one flat
# list (no per-document index is needed, so the corpus is iterated directly)
all_sliding_windows = []
for document in corpus:
    all_sliding_windows.extend(generate_sliding_windows(document, window_size))

# Print the sliding windows of all documents, numbered from 1
print("Sliding Windows untuk Semua Dokumen:")
for j, window in enumerate(all_sliding_windows):
    print(f"  Window {j+1}: {window}")


Sliding Windows untuk Semua Dokumen:
  Window 1: ('mary', 'had', 'a')
  Window 2: ('had', 'a', 'lamp')
  Window 3: ('mary', 'has', 'roses')
  Window 4: ('roses', 'are', 'red')
  Window 5: ('violets', 'are', 'light')
  Window 6: ('are', 'light', 'blue')


Probability Calculation

In [248]:
# Compute the probability of every word across the sliding windows
def calculate_all_word_probabilities(all_sliding_windows):
    """Return P(word) for every word that occurs in the sliding windows.

    P(word) is the fraction of windows that contain the word at least once.
    A word repeated inside a single window is counted once for that window,
    so the result never exceeds 1 and uses the same "word in window"
    definition as a window-based joint probability.

    Args:
        all_sliding_windows: Sized collection of windows, each an iterable
            of hashable words.

    Returns:
        dict mapping each word to a ``Fraction`` (windows containing the
        word / total windows), in first-seen order. Empty when there are
        no windows.
    """
    total_windows = len(all_sliding_windows)

    # dict.fromkeys removes duplicates inside one window while keeping the
    # first-seen order, so the ordering of the result stays deterministic.
    windows_containing_word = Counter(
        word
        for window in all_sliding_windows
        for word in dict.fromkeys(window)
    )

    return {
        word: Fraction(count, total_windows)
        for word, count in windows_containing_word.items()
    }

# Estimate the probability of every word seen in the sliding windows
all_word_probabilities = calculate_all_word_probabilities(all_sliding_windows)

# Report each probability as a fraction
print("Probabilitas untuk Semua Kata (dalam bentuk pecahan):")
for word in all_word_probabilities:
    probability = all_word_probabilities[word]
    print(f"  Kata '{word}': {probability}")


Probabilitas untuk Semua Kata (dalam bentuk pecahan):
  Kata 'mary': 1/3
  Kata 'had': 1/3
  Kata 'a': 1/3
  Kata 'lamp': 1/6
  Kata 'has': 1/6
  Kata 'roses': 1/3
  Kata 'are': 1/2
  Kata 'red': 1/6
  Kata 'violets': 1/6
  Kata 'light': 1/3
  Kata 'blue': 1/6


In [261]:
# Compute the probability that a window contains one of the target word combinations
def calculate_custom_combination_window_probabilities(all_sliding_windows, target_combinations):
    """Return the share of windows that fully contain any target combination.

    A window matches when, for at least one combination in
    ``target_combinations``, every word of that combination is in the window.

    Args:
        all_sliding_windows: Sized collection of windows (sequences of words).
        target_combinations: Iterable of word combinations, each an iterable
            of words.

    Returns:
        ``Fraction`` of matching windows over all windows, or ``Fraction(0)``
        when there are no windows (instead of dividing by zero).
    """
    total_windows = len(all_sliding_windows)

    # With no windows there is nothing to match; avoid Fraction(0, 0).
    if total_windows == 0:
        return Fraction(0)

    def contains_any_combination(window):
        # True when every word of at least one combination is in the window.
        return any(
            all(word in window for word in combination)
            for combination in target_combinations
        )

    matching_windows = sum(
        1 for window in all_sliding_windows if contains_any_combination(window)
    )

    return Fraction(matching_windows, total_windows)

# Topic word combinations whose window probability is to be measured
target_combinations = [
    ['roses', 'red'],
    ['light', 'blue'],
    ['mary', 'lamp'],
]

# Score the combinations one at a time: each call receives a single-element
# list so the returned probability belongs to that combination alone
for combination in target_combinations:
    combination_window_prob = calculate_custom_combination_window_probabilities(
        all_sliding_windows,
        [combination],
    )

    # Print the probability of this combination as a fraction
    print(f"Kata hasil modelling : {', '.join(combination)}:", combination_window_prob)

Kata hasil modelling : roses, red: 1/6
Kata hasil modelling : light, blue: 1/6
Kata hasil modelling : mary, lamp: 0


Confirmation Measure

In [262]:
# Compute the NPMI between two words
def calculate_npmi(word1, word2, all_word_probabilities, all_sliding_windows):
    """Return the normalized pointwise mutual information of two words.

    NPMI = log(P(w1, w2) / (P(w1) * P(w2))) / -log(P(w1, w2)), where
    P(w1, w2) is the fraction of windows containing both words.

    Args:
        word1: First word.
        word2: Second word.
        all_word_probabilities: Mapping word -> P(word); a missing word is
            treated as probability 0.
        all_sliding_windows: Sized collection of windows (sequences of words).

    Returns:
        The NPMI score, or 0 when it cannot be computed: there are no
        windows, any of the three probabilities is 0, or the normalizing
        denominator is 0 (joint probability of 1).
    """
    total_windows = len(all_sliding_windows)

    # No windows means no evidence at all; return 0 like the other
    # zero-probability cases instead of dividing by zero.
    if total_windows == 0:
        return 0

    # Joint probability P(w1, w2): share of windows containing both words
    co_occurrences = sum(
        1 for window in all_sliding_windows if word1 in window and word2 in window
    )
    joint_probability = co_occurrences / total_windows

    # Marginal probabilities P(w1) and P(w2)
    probability_word1 = all_word_probabilities.get(word1, 0)
    probability_word2 = all_word_probabilities.get(word2, 0)

    # A zero probability would make the logarithms undefined
    if joint_probability == 0 or probability_word1 == 0 or probability_word2 == 0:
        return 0

    # The formula: PMI normalized by -log of the joint probability
    numerator = log(joint_probability / (probability_word1 * probability_word2))
    denominator = -log(joint_probability)
    return numerator / denominator if denominator != 0 else 0

# Example usage: pick the word pair to score
word1, word2 = 'roses', 'red'

# Compute the NPMI between word1 and word2, then print the score
npmi_value = calculate_npmi(
    word1,
    word2,
    all_word_probabilities,
    all_sliding_windows,
)
print(f"Skor '{word1}' dan '{word2}': {npmi_value}")


Skor 'roses' dan 'red': 0.6131471927654585
