In [2]:
# Sample corpus: ten short Indonesian sentences. The recurring words are
# brokoli, bisbol, kesehatan, mengemudi and tekanan (darah).
doc_a = "Brokoli bagus untuk dimakan. Adikku suka makan brokoli, tetapi ibuku tidak."
doc_b = "Ibuku menghabiskan banyak waktu berkeliling melihat latihan bisbol adikku."
doc_c = ("Beberapa ahli kesehatan menyarankan bahwa mengemudi "
         "dapat menyebabkan ketegangan dan tekanan darah meningkat.")
doc_d = "Saya sering merasakan tekanan untuk tampil baik saat presentasi di sekolah."
doc_e = "Profesional kesehatan mengatakan bahwa brokoli itu baik untuk kesehatan Anda."
doc_f = "Teman saya seorang pemain bisbol yang pernah mendapatkan juara."
doc_g = "Pemain bisbol yang bernama Flash itu sangat suka memakan brokoli."
doc_h = "Sopir yang mengemudi taksi itu mendapatkan tekanan dari penumpangnya."
doc_i = ("Saat tanding, olahraga bisbol memberikan ketegangan "
         "dan meningkatkan tekanan darah para penonton.")
doc_j = "Ibuku menyarankan saya untuk memakan brokoli agar tekanan darah terkontrol."

# Gather the individual sentences, in order, into one list.
doc_set = [
    doc_a, doc_b, doc_c, doc_d, doc_e,
    doc_f, doc_g, doc_h, doc_i, doc_j,
]

# POS Tagging

In [3]:
# PYTHON 2.7
import os
from hmmtagger import MainTagger
from tokenization import *

# Module-level tagger handle; init_tag() assigns the MainTagger instance on first use.
mt = None

In [4]:
import pandas as pd
import numpy as np

## Daftar Tagging
Berikut adalah daftar tag POS berdasarkan makalah **HMM Based Part-of-Speech Tagger for Bahasa Indonesia** (Alfan Farizki Wicaksono & Ayu Purwarianti, 2010).

In [5]:
# Tag reference table: one (POS tag, full name, example tokens) row per tag.
# Keeping each tag's three fields on one row avoids the misalignment risk of
# three parallel 35-item lists.
tag_rows = [
    # punctuation
    ('OP', 'Open Parenthesis', '({['),
    ('CP', 'Close Parenthesis', ')}]'),
    ('GM', 'Slash', '/'),
    (';', 'Semicolon', ';'),
    (':', 'Colon', ':'),
    ('"', 'Quotation', '"'),
    ('.', 'Sentence Terminator', '.!?'),
    (',', 'Comma', ','),
    ('-', 'Dash', '-'),
    ('...', 'Ellipsis', '...'),
    # word classes
    ('JJ', 'Adjective', 'Kaya, Manis'),
    ('RB', 'Adverb', 'Sementara, Nanti'),
    ('NN', 'Common Noun', 'Mobil'),
    ('NNP', 'Proper Noun', 'Bekasi, Indonesia'),
    ('NNG', 'Genitive Noun', 'Bukunya'),
    ('VBI', 'Intransitive Verb', 'Pergi'),
    ('VBT', 'Transitive Verb', 'Membeli'),
    ('IN', 'Preposition', 'Di, Ke, Dari'),
    ('MD', 'Modal', 'Bisa'),
    # spelling fixed: was 'Coor-Conjuction'
    ('CC', 'Coor-Conjunction', 'Dan, Atau, Tetapi'),
    ('SC', 'Subor-Conjunction', 'Jika, Ketika'),
    ('DT', 'Determiner', 'Para, Ini, Itu'),
    ('UH', 'Interjection', 'Wah, Aduh, Oi'),
    ('CDO', 'Ordinal Numerals', 'Pertama, Kedua'),
    ('CDC', 'Collective Numerals', 'Bertiga'),
    ('CDP', 'Primary Numerals', 'Satu, Dua'),
    ('CDI', 'Irregular Numerals', 'Beberapa'),
    ('PRP', 'Personal Pronouns', 'Saya, Kamu'),
    ('WP', 'WH-Pronouns', 'Apa, Siapa'),
    ('PRN', 'Number Pronouns', 'Kedua-duanya'),
    ('PRL', 'Locative Pronouns', 'Sini, Situ, Sana'),
    ('NEG', 'Negation', 'Bukan, Tidak'),
    ('SYM', 'Symbol', '@#$%^&'),
    ('RP', 'Particles', 'Pun, Kah'),
    ('FW', 'Foreign Word', 'Foreign, Word'),
]

# Passing `columns` fixes the column order directly, so no re-selection is needed.
df = pd.DataFrame(tag_rows, columns=['POS', 'POS Name', 'Example'])
# Number the rows from 1 instead of pandas' default 0.
df.index = np.arange(1, len(df) + 1)
df

Unnamed: 0,POS,POS Name,Example
1,OP,Open Parenthesis,({[
2,CP,Close Parenthesis,)}]
3,GM,Slash,/
4,;,Semicolon,;
5,:,Colon,:
6,"""",Quotation,""""
7,.,Sentence Terminator,.!?
8,",",Comma,","
9,-,Dash,-
10,...,Ellipsis,...


## Pembagian Tagging
Tag dibagi menjadi dua kelas, yaitu **Class Content** dan **Class Function**.

**Class Content:**
1. JJ = Adjective
2. NN = Common Noun
3. NNP = Proper Noun
4. NNG = Genitive Noun
5. VBI = Intransitive Verb
6. VBT = Transitive Verb
7. FW = Foreign Word


**Class Function:**
1. OP = Open Parenthesis
2. CP = Close Parenthesis
3. GM = Slash
4. ; = Semicolon
5. : = Colon
6. " = Quotation
7. . = Sentence Terminator
8. , = Comma
9. '-' = Dash
10. ... = Ellipsis
11. RB = Adverb
12. IN = Preposition
13. MD = Modal
14. CC = Coor-Conjunction
15. SC = Subor-Conjunction
16. DT = Determiner
17. UH = Interjection
18. CDO = Ordinal Numerals
19. CDC = Collective Numerals
20. CDP = Primary Numerals
21. CDI = Irregular Numerals
22. PRP = Personal Pronouns
23. WP = WH-Pronouns
24. PRN = Number Pronouns
25. PRL = Locative Pronouns
26. NEG = Negation
27. SYM = Symbol
28. RP = Particles

In [6]:
# Content-class tags: adjective, the three noun tags, the two verb tags, foreign word.
Ccon = [
    'JJ',
    'NN', 'NNP', 'NNG',
    'VBI', 'VBT',
    'FW',
]

# Function-class tags: the punctuation tags followed by the remaining
# (non-content) word-class tags.
Cfunc = (
    ['OP', 'CP', 'GM', ';', ':', '"', '.', ',', '-', '...']
    + ['RB', 'IN', 'MD', 'CC', 'SC', 'DT', 'UH',
       'CDO', 'CDC', 'CDP', 'CDI',
       'PRP', 'WP', 'PRN', 'PRL',
       'NEG', 'SYM', 'RP']
)

In [7]:
"""
Parameters:
    String fileLexicon
    String fileNGram
    int NGramType = 0
    int maxAffixLength = 3
    int Treshold = 3
    int minWordFreq = 0
    int modeAffixTree = 0
    boolean debug = False
    double LambdaBigram = 0.2
    int TwoPhaseType = 0
    double beamFactor = 500.0
    int useLexicon = 0
"""

def init_tag():
    """Create the shared MainTagger in the global ``mt`` if it does not exist yet.

    Calling this again after the tagger has been created does nothing.
    The twelve positional arguments presumably follow the order of the
    parameter listing above this function (fileLexicon, fileNGram,
    NGramType, ...) -- TODO confirm against hmmtagger.MainTagger.
    """
    global mt
    if mt is not None:
        return
    mt = MainTagger("resource/Lexicon.trn", "resource/Ngram.trn",
                    0, 3, 3, 0, 0, False, 0.2, 0, 500.0, 1)

## Proses Tagging

In [None]:
def do_tag(documents):
    """Tag every sentence of every document and return the result as text.

    Each document is split into lines; empty lines are skipped. Every
    remaining line goes through cleaning() and sentence_extraction(), and
    each extracted sentence is tokenised and passed to ``mt.taggingStr``.

    Parameters:
        documents: iterable of strings, one string per document.

    Returns:
        One string holding the tagged sentences of all documents, in
        order, one sentence per line ("" when there is nothing to tag).
        If anything fails while tagging, the string "Error Exception" is
        returned instead.
    """
    # Accumulate across ALL documents (previously reset for each document,
    # and left undefined when `documents` was empty).
    result = []
    try:
        for doc in documents:
            # No-op after the first call; init_tag() only builds the tagger once.
            init_tag()
            # Previously `lines` was never defined, so every call failed.
            for l in doc.strip().split("\n"):
                if len(l) == 0:
                    continue
                for sentence in sentence_extraction(cleaning(l)):
                    strtag = " ".join(tokenisasi_kalimat(sentence)).strip()
                    result.append(" ".join(mt.taggingStr(strtag)))
    except Exception:
        # Same error value as before, but no longer swallows
        # KeyboardInterrupt / SystemExit like the bare `except:` did.
        return "Error Exception"
    return "\n".join(result)

# LDA

In [None]:
def sample_from(weights):
    """Return index i with probability weights[i] / sum(weights).

    Parameters:
        weights: sequence of non-negative numbers (they need not sum to 1).

    Returns:
        An index into ``weights``, or None when ``weights`` is empty.
    """
    total = sum(weights)
    rnd = total * random.random()     # uniform between 0 and total
    last_index = None                 # last index seen at all
    last_positive = None              # last index whose weight is > 0
    for i, w in enumerate(weights):
        last_index = i
        if w > 0:
            last_positive = i
        rnd -= w                      # return the smallest i such that
        if rnd <= 0:                  # weights[0] + ... + weights[i] >= rnd
            return i
    # Floating-point rounding can leave a tiny positive remainder after the
    # last subtraction. Return a valid index instead of falling through
    # with an implicit None.
    if last_positive is not None:
        return last_positive
    return last_index

In [None]:
from collections import Counter

In [None]:
# number of topics; the per-topic structures below are sized with range(K)
K = 5

In [None]:
# One Counter per document. It is indexed as [d][topic] further down, i.e. it
# records how many words of document d are currently assigned to each topic.
document_topic_counts = [Counter() for _doc in documents]

In [None]:
# One Counter per topic. It is indexed as [topic][word] further down, i.e. it
# records how many times each word is currently assigned to that topic.
topic_word_counts = [Counter() for _topic in range(K)]

In [None]:
# Per-topic totals: how many words are currently assigned to each topic.
# All totals start at zero.
topic_counts = [0] * K

In [None]:
# Length of every document, in the same order as `documents`.
document_lengths = list(map(len, documents))

In [None]:
# Vocabulary: every distinct word that occurs in any document.
distinct_words = {word for document in documents for word in document}
# Vocabulary size.
W = len(distinct_words)

In [None]:
# total number of documents in the corpus
D = len(documents)

In [None]:
def p_topic_given_document(topic, d, alpha=0.1):
    """Smoothed share of document ``d``'s words that are assigned to ``topic``.

    ``alpha`` is added to the topic's count and ``K * alpha`` to the
    document length, so a topic with no words still gets a non-zero value.
    Reads the globals ``document_topic_counts``, ``document_lengths`` and ``K``.
    """
    smoothed_count = document_topic_counts[d][topic] + alpha
    smoothed_length = document_lengths[d] + K * alpha
    return smoothed_count / smoothed_length

In [None]:
def p_word_given_topic(word, topic, beta=0.1):
    """Smoothed share of the words assigned to ``topic`` that equal ``word``.

    ``beta`` is added to the word's count and ``W * beta`` to the topic
    total, so a word never seen in the topic still gets a non-zero value.
    Reads the globals ``topic_word_counts``, ``topic_counts`` and ``W``.
    """
    smoothed_count = topic_word_counts[topic][word] + beta
    smoothed_total = topic_counts[topic] + W * beta
    return smoothed_count / smoothed_total

In [None]:
def topic_weight(d, word, k):
    """Weight of topic ``k`` for ``word`` occurring in document ``d``.

    This is the product of how likely the word is under the topic and
    how prominent the topic is in the document.
    """
    word_part = p_word_given_topic(word, k)
    document_part = p_topic_given_document(k, d)
    return word_part * document_part

def choose_new_topic(d, word):
    """Draw a topic index for ``word`` in document ``d``.

    Computes topic_weight() for each of the ``K`` topics and hands the
    resulting list of weights to sample_from().
    """
    weights = []
    for k in range(K):
        weights.append(topic_weight(d, word, k))
    return sample_from(weights)

In [None]:
import random

In [None]:
# Fixed seed so the random starting assignment is the same on every run.
random.seed(0)

# Give every word of every document a random topic in range(K).
document_topics = []
for document in documents:
    document_topics.append([random.randrange(K) for _word in document])

# Fill the three count structures from that starting assignment.
for d in range(D):
    doc_counts = document_topic_counts[d]
    for word, topic in zip(documents[d], document_topics[d]):
        doc_counts[topic] += 1
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1

In [None]:
# Repeatedly resample the topic of every word in every document.
# The loop variable used to be called `iter`, which replaced the builtin
# iter() for every cell executed afterwards; `sweep` avoids that.
for sweep in range(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d],
                                              document_topics[d])):

            # remove this word / topic from counts
            # so that it doesn't influence the weights
            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1

            # choose a new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic

            # and now add it back to the counts
            document_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[new_topic] += 1
            document_lengths[d] += 1

In [None]:
# For each topic, list its words from most to least frequent,
# leaving out words whose count has dropped to zero.
for k, word_counts in enumerate(topic_word_counts):
    for word, count in word_counts.most_common():
        if count <= 0:
            continue
        print(k, word, count)

In [None]:
# Hand-picked labels for the topics; position in the list = topic index.
topic_names = [
    "kata sambung",          # topic 0
    "kesehatan",             # topic 1
    "juara dan prestasi",    # topic 2
    "bisbol",                # topic 3
    "makan brokoli",         # topic 4
]

In [None]:
# For every document, print it followed by its topics (by label), from the
# most to the least assigned, leaving out topics with a zero count.
# The loop variable is deliberately NOT named `topic_counts`: that name is the
# global list of per-topic totals read by p_word_given_topic, and rebinding it
# here would break any later re-run of the sampling cells.
for document, doc_topic_counts in zip(documents, document_topic_counts):
    print(document)
    for topic, count in doc_topic_counts.most_common():
        if count > 0:
            print(topic_names[topic], count)