In [1]:
# Sample corpus: ten short Indonesian texts (doc_a .. doc_j).
# NOTE: doc_a holds two sentences; every other text is a single sentence.
doc_a = "Brokoli bagus untuk dimakan. Adikku suka makan brokoli, tetapi ibuku tidak."
doc_b = "Ibuku menghabiskan banyak waktu berkeliling melihat latihan bisbol adikku."
doc_c = "Beberapa ahli kesehatan menyarankan bahwa mengemudi dapat menyebabkan ketegangan dan tekanan darah meningkat."
doc_d = "Saya sering merasakan tekanan untuk tampil baik saat presentasi di sekolah."
doc_e = "Profesional kesehatan mengatakan bahwa brokoli itu baik untuk kesehatan Anda."
doc_f = "Teman saya seorang pemain bisbol yang pernah mendapatkan juara."
doc_g = "Pemain bisbol yang bernama Flash itu sangat suka memakan brokoli."
doc_h = "Sopir yang mengemudi taksi itu mendapatkan tekanan dari penumpangnya."
doc_i = "Saat tanding, olahraga bisbol memberikan ketegangan dan meningkatkan tekanan darah para penonton."
doc_j = "Ibuku menyarankan saya untuk memakan brokoli agar tekanan darah terkontrol."

# Gather the sample texts into one list, ordered doc_a .. doc_j.
documents = [doc_a, doc_b, doc_c, doc_d, doc_e, doc_f, doc_g, doc_h, doc_i, doc_j]

# POS Tagging

In [2]:
import os
from hmmtagger.tagger import MainTagger
from tokenization import *

mt = None

In [3]:
import pandas as pd
import numpy as np

## Daftar Tagging
Berikut adalah daftar tagging berdasarkan makalah **HMM Based Part-of-Speech Tagger for Bahasa Indonesia** (Alfan Farizki Wicaksono & Ayu Purwarianti, 2010).

In [4]:
# Reference table of the POS tag set: one (tag, tag name, example) record per
# tag. Keeping each tag's three fields on one row avoids the misalignment
# risk of maintaining three parallel 35-item lists.
pos_rows = [
    # punctuation / symbols
    ('OP', 'Open Parenthesis', '({['),
    ('CP', 'Close Parenthesis', ')}]'),
    ('GM', 'Slash', '/'),
    (';', 'Semicolon', ';'),
    (':', 'Colon', ':'),
    ('"', 'Quotation', '"'),
    ('.', 'Sentence Terminator', '.!?'),
    (',', 'Comma', ','),
    ('-', 'Dash', '-'),
    ('...', 'Ellipsis', '...'),
    # adjectives, adverbs, nouns, verbs
    ('JJ', 'Adjective', 'Kaya, Manis'),
    ('RB', 'Adverb', 'Sementara, Nanti'),
    ('NN', 'Common Noun', 'Mobil'),
    ('NNP', 'Proper Noun', 'Bekasi, Indonesia'),
    ('NNG', 'Genitive Noun', 'Bukunya'),
    ('VBI', 'Intransitive Verb', 'Pergi'),
    ('VBT', 'Transitive Verb', 'Membeli'),
    # prepositions, modals, conjunctions, determiners, interjections
    ('IN', 'Preposition', 'Di, Ke, Dari'),
    ('MD', 'Modal', 'Bisa'),
    ('CC', 'Coor-Conjunction', 'Dan, Atau, Tetapi'),  # was misspelled 'Coor-Conjuction'
    ('SC', 'Subor-Conjunction', 'Jika, Ketika'),
    ('DT', 'Determiner', 'Para, Ini, Itu'),
    ('UH', 'Interjection', 'Wah, Aduh, Oi'),
    # numerals
    ('CDO', 'Ordinal Numerals', 'Pertama, Kedua'),
    ('CDC', 'Collective Numerals', 'Bertiga'),
    ('CDP', 'Primary Numerals', 'Satu, Dua'),
    ('CDI', 'Irregular Numerals', 'Beberapa'),
    # pronouns
    ('PRP', 'Personal Pronouns', 'Saya, Kamu'),
    ('WP', 'WH-Pronouns', 'Apa, Siapa'),
    ('PRN', 'Number Pronouns', 'Kedua-duanya'),
    ('PRL', 'Locative Pronouns', 'Sini, Situ, Sana'),
    # everything else
    ('NEG', 'Negation', 'Bukan, Tidak'),
    ('SYM', 'Symbol', '@#$%^&'),
    ('RP', 'Particles', 'Pun, Kah'),
    ('FW', 'Foreign Word', 'Foreign, Word'),
]

df = pd.DataFrame(pos_rows, columns=['POS', 'POS Name', 'Example'])
df.index = np.arange(1, len(df) + 1)  # number the rows from 1 instead of 0
df

Unnamed: 0,POS,POS Name,Example
1,OP,Open Parenthesis,({[
2,CP,Close Parenthesis,)}]
3,GM,Slash,/
4,;,Semicolon,;
5,:,Colon,:
6,"""",Quotation,""""
7,.,Sentence Terminator,.!?
8,",",Comma,","
9,-,Dash,-
10,...,Ellipsis,...


## Pembagian Tagging
Dibagi menjadi 2 yaitu **Class Content** dan **Class Function**.

**Class Content:**
1. JJ = Adjective
2. NN = Common Noun
3. NNP = Proper Noun
4. NNG = Genitive Noun
5. VBI = Intransitive Verb
6. VBT = Transitive Verb
7. FW = Foreign Word


**Class Function:**
1. OP = Open Parenthesis
2. CP = Close Parenthesis
3. GM = Slash
4. ; = Semicolon
5. : = Colon
6. " = Quotation
7. . = Sentence Terminator
8. , = Comma
9. '-' = Dash
10. ... = Ellipsis
11. RB = Adverb
12. IN = Preposition
13. MD = Modal
14. CC = Coor-Conjunction
15. SC = Subor-Conjunction
16. DT = Determiner
17. UH = Interjection
18. CDO = Ordinal Numerals
19. CDC = Collective Numerals
20. CDP = Primary Numerals
21. CDI = Irregular Numerals
22. PRP = Personal Pronouns
23. WP = WH-Pronouns
24. PRN = Number Pronouns
25. PRL = Locative Pronouns
26. NEG = Negation
27. SYM = Symbol
28. RP = Particles

In [5]:
# Content-class tags: adjective, the noun tags, the verb tags, foreign word.
Ccon = 'JJ NN NNP NNG VBI VBT FW'.split()

# Function-class tags: punctuation/symbol tags first, then the remaining
# word-level tags. None of the tags contains whitespace, so a plain split()
# yields exactly one list entry per tag.
Cfunc = (
    'OP CP GM ; : " . , - ...'.split()
    + 'RB IN MD CC SC DT UH'.split()
    + 'CDO CDC CDP CDI'.split()
    + 'PRP WP PRN PRL NEG SYM RP'.split()
)

In [6]:
"""
Parameters:
    String fileLexicon
    String fileNGram
    int NGramType = 0
    int maxAffixLength = 3
    int Treshold = 3
    int minWordFreq = 0
    int modeAffixTree = 0
    boolean debug = False
    double LambdaBigram = 0.2
    int TwoPhaseType = 0
    double beamFactor = 500.0
    int useLexicon = 0
"""

def init_tag():
    """Build the shared POS tagger on first use and store it in the global ``mt``.

    The tagger is only constructed while ``mt`` is ``None``, so calling this
    repeatedly is cheap. Construction is best effort: if it fails, the reason
    is printed and ``mt`` is left unchanged, so callers must be prepared for
    ``mt`` still being ``None``.

    The positional arguments follow the order of the parameter listing in the
    string above this function. Note that the last one (``useLexicon``) is
    passed as ``1`` here, while that listing shows ``0``.
    """
    global mt
    try:
        if mt is None:
            mt = MainTagger("resource/Lexicon.trn", "resource/Ngram.trn", 0, 3, 3, 0, 0, False, 0.2, 0, 500.0, 1)
    except Exception as exc:
        # Catch Exception rather than everything, so Ctrl-C / SystemExit still
        # propagate, and say *why* the tagger could not be created.
        print("Error Exception: %s: %s" % (type(exc).__name__, exc))

## Proses Tagging

In [7]:
import re

In [8]:
# Scratch input for trying out sentence splitting; it contains the
# abbreviation "H." in the middle of the first sentence.
text = "Saya suka makan bakso H. Hadi. Baksonya sangat enak dan mantap. Aku kepedesan hehe."

In [9]:
# Split the scratch text into sentences and display the resulting list.
# sentence_extraction is presumably supplied by the star-import from
# `tokenization` -- TODO confirm.
a = sentence_extraction(text)
a

['Saya suka makan bakso H. Hadi.',
 'Baksonya sangat enak dan mantap.',
 'Aku kepedesan hehe.']

In [10]:
# Tokenise each sentence and join its tokens back together with single spaces.
# NOTE(review): `hasil` is reassigned on every pass, so only the result for
# the LAST sentence in `a` survives the loop and is displayed. Using `_` as
# the loop variable also rebinds IPython's "last output" name.
for _ in a:
    hasil = " ".join(tokenisasi_kalimat(_)).strip()
hasil

'Aku kepedesan hehe .'

In [11]:
# tokenisasi dan tagging
tagged_doc = []
for doc in documents:
    lines = doc.strip().split("\n")
    try:
        init_tag()
        for l in lines:
            if len(l) == 0: continue
            out = sentence_extraction(cleaning(l))
            for o in out:
                strtag = " ".join(tokenisasi_kalimat(o)).strip()
#                 result += [" ".join(mt.taggingStr(strtag))]
                tagged_doc += [mt.taggingStr(strtag)]
    except:
        print ("Error Exception")

for _ in tagged_doc:
    print (_)

['Brokoli/NN', 'bagus/NN', 'untuk/IN', 'dimakan/VBT', './.']
['Adikku/NN', 'suka/VBI', 'makan/VBT', 'brokoli/NN', ',/,', 'tetapi/CC', 'ibuku/NN', 'tidak/NEG', './.']
['Ibuku/NN', 'menghabiskan/VBT', 'banyak/JJ', 'waktu/NN', 'berkeliling/NN', 'melihat/VBT', 'latihan/NN', 'bisbol/NN', 'adikku/NNP', './.']
['Beberapa/CDI', 'ahli/NN', 'kesehatan/NN', 'menyarankan/VBT', 'bahwa/SC', 'mengemudi/VBT', 'dapat/MD', 'menyebabkan/VBT', 'ketegangan/NN', 'dan/CC', 'tekanan/NN', 'darah/NN', 'meningkat/VBI', './.']
['Saya/PRP', 'sering/JJ', 'merasakan/VBT', 'tekanan/NN', 'untuk/IN', 'tampil/NN', 'baik/JJ', 'saat/SC', 'presentasi/NN', 'di/IN', 'sekolah/NN', './.']
['Profesional/NNP', 'kesehatan/NN', 'mengatakan/VBI', 'bahwa/SC', 'brokoli/NN', 'itu/DT', 'baik/JJ', 'untuk/IN', 'kesehatan/NN', 'Anda/PRP', './.']
['Teman/NN', 'saya/PRP', 'seorang/NN', 'pemain/NN', 'bisbol/NN', 'yang/SC', 'pernah/NN', 'mendapatkan/VBT', 'juara/NN', './.']
['Pemain/NN', 'bisbol/NN', 'yang/SC', 'bernama/VBT', 'Flash/NN', 'itu

## Cleaning tagging buat cari topik

In [12]:
# Keep only the content-class tokens (tag listed in Ccon) of every tagged
# sentence; the "word/TAG" strings themselves are kept unchanged.
doc_con = []
for tagged in tagged_doc:
    con = []
    for token in tagged:
        # The tag is whatever follows the LAST slash. Splitting on the first
        # slash would mis-read tokens whose word part contains "/" itself.
        _word, slash, tag = token.rpartition("/")
        # Tokens without any slash carry no tag and are skipped.
        if slash and tag in Ccon:
            con.append(token)
    doc_con.append(con)

for con in doc_con:
    print(con)

['Brokoli/NN', 'bagus/NN', 'dimakan/VBT']
['Adikku/NN', 'suka/VBI', 'makan/VBT', 'brokoli/NN', 'ibuku/NN']
['Ibuku/NN', 'menghabiskan/VBT', 'banyak/JJ', 'waktu/NN', 'berkeliling/NN', 'melihat/VBT', 'latihan/NN', 'bisbol/NN', 'adikku/NNP']
['ahli/NN', 'kesehatan/NN', 'menyarankan/VBT', 'mengemudi/VBT', 'menyebabkan/VBT', 'ketegangan/NN', 'tekanan/NN', 'darah/NN', 'meningkat/VBI']
['sering/JJ', 'merasakan/VBT', 'tekanan/NN', 'tampil/NN', 'baik/JJ', 'presentasi/NN', 'sekolah/NN']
['Profesional/NNP', 'kesehatan/NN', 'mengatakan/VBI', 'brokoli/NN', 'baik/JJ', 'kesehatan/NN']
['Teman/NN', 'seorang/NN', 'pemain/NN', 'bisbol/NN', 'pernah/NN', 'mendapatkan/VBT', 'juara/NN']
['Pemain/NN', 'bisbol/NN', 'bernama/VBT', 'Flash/NN', 'suka/VBI', 'memakan/VBT', 'brokoli/NN']
['Sopir/NNP', 'mengemudi/VBT', 'mendapatkan/VBT', 'tekanan/NN', 'penumpangnya/NN']
['Saat/NN', 'tanding/NN', 'olahraga/NN', 'memberikan/VBT', 'ketegangan/NN', 'meningkatkan/VBT', 'tekanan/NN', 'darah/NN', 'penonton/NN']
['Ibuku/NN'

In [13]:
# Drop the "/TAG" suffix so that every sentence becomes a plain list of words.
# NOTE: this rebinds `documents` from the raw strings to per-sentence word
# lists; the LDA cells further down read this new value.
documents = [
    # rsplit on the LAST slash keeps words that contain "/" intact; a token
    # without any slash is passed through unchanged.
    [token.rsplit("/", 1)[0] for token in con]
    for con in doc_con
]

for words in documents:
    print(words)

['Brokoli', 'bagus', 'dimakan']
['Adikku', 'suka', 'makan', 'brokoli', 'ibuku']
['Ibuku', 'menghabiskan', 'banyak', 'waktu', 'berkeliling', 'melihat', 'latihan', 'bisbol', 'adikku']
['ahli', 'kesehatan', 'menyarankan', 'mengemudi', 'menyebabkan', 'ketegangan', 'tekanan', 'darah', 'meningkat']
['sering', 'merasakan', 'tekanan', 'tampil', 'baik', 'presentasi', 'sekolah']
['Profesional', 'kesehatan', 'mengatakan', 'brokoli', 'baik', 'kesehatan']
['Teman', 'seorang', 'pemain', 'bisbol', 'pernah', 'mendapatkan', 'juara']
['Pemain', 'bisbol', 'bernama', 'Flash', 'suka', 'memakan', 'brokoli']
['Sopir', 'mengemudi', 'mendapatkan', 'tekanan', 'penumpangnya']
['Saat', 'tanding', 'olahraga', 'memberikan', 'ketegangan', 'meningkatkan', 'tekanan', 'darah', 'penonton']
['Ibuku', 'menyarankan', 'memakan', 'brokoli', 'tekanan', 'darah', 'terkontrol']


In [14]:
# Scratch check: the part of a "word/TAG" token before the first separator.
text = 'name/NN'
sep = '/'
rest = text.partition(sep)[0]
print(rest)

name


In [15]:
# Scratch check: the part of a "word/TAG" token after the first "/".
my_string = "mengatakan/VBI"
tag_part = my_string.partition("/")[2]
print(tag_part)

VBI


# LDA

In [16]:
def sample_from(weights):
    """Draw a random index, with index ``i`` chosen proportionally to ``weights[i]``.

    Returns ``i`` with probability ``weights[i] / sum(weights)``.

    Args:
        weights: non-empty iterable of non-negative numbers.

    Returns:
        An int in ``range(len(weights))``.

    Raises:
        ValueError: if ``weights`` is empty (previously this silently
            returned ``None``).
    """
    # Local import: the notebook only imports `random` in a later cell, so
    # this keeps the function usable regardless of cell order.
    import random

    weights = list(weights)           # allow any iterable, not only sequences
    if not weights:
        raise ValueError("weights must not be empty")

    total = sum(weights)
    rnd = total * random.random()     # uniform between 0 and total
    for i, w in enumerate(weights):
        rnd -= w                      # return the smallest i such that
        if rnd <= 0:                  # weights[0] + ... + weights[i] >= rnd
            return i

    # Floating-point rounding can leave a tiny positive remainder after the
    # last subtraction; pick the last index instead of falling through to None.
    return len(weights) - 1

In [17]:
from collections import Counter

In [18]:
# number of topics (the per-topic lists below are sized with range(K))
K = 5

In [19]:
# One Counter per document: document_topic_counts[d][t] is the number of
# words in document d currently assigned to topic t.
document_topic_counts = [Counter() for _ in documents]

In [20]:
# One Counter per topic: topic_word_counts[t][w] is the number of times
# word w is currently assigned to topic t.
topic_word_counts = [Counter() for _ in range(K)]

In [21]:
# One integer per topic: total number of words currently assigned to it.
topic_counts = [0] * K

In [22]:
# One integer per document: its number of words, in document order.
document_lengths = list(map(len, documents))

In [23]:
# Vocabulary: every distinct word across all documents, and its size W.
distinct_words = {word for document in documents for word in document}
W = len(distinct_words)

In [24]:
# The number of documents; the sampling loops iterate over range(D).
D = len(documents)

In [25]:
def p_topic_given_document(topic, d, alpha=0.1):
    """Smoothed share of document ``d``'s words that are assigned to ``topic``.

    ``alpha`` is added to the topic's count and ``K * alpha`` to the document
    length, so with a positive ``alpha`` the result is never zero.
    """
    assigned = document_topic_counts[d][topic] + alpha
    normaliser = document_lengths[d] + K * alpha
    return assigned / normaliser

In [26]:
def p_word_given_topic(word, topic, beta=0.1):
    """Smoothed share of the words assigned to ``topic`` that equal ``word``.

    ``beta`` is added to the word's count and ``W * beta`` to the topic total,
    so with a positive ``beta`` the result is never zero.
    """
    occurrences = topic_word_counts[topic][word] + beta
    normaliser = topic_counts[topic] + W * beta
    return occurrences / normaliser

In [27]:
def topic_weight(d, word, k):
    """Unnormalised weight of topic ``k`` for ``word`` in document ``d``.

    This is the product of the smoothed word-given-topic and
    topic-given-document fractions.
    """
    word_fraction = p_word_given_topic(word, k)
    topic_fraction = p_topic_given_document(k, d)
    return word_fraction * topic_fraction

def choose_new_topic(d, word):
    """Sample a topic index for ``word`` in document ``d``.

    Builds the list of weights for topics ``0 .. K-1`` and hands it to
    ``sample_from``.
    """
    weights = []
    for k in range(K):
        weights.append(topic_weight(d, word, k))
    return sample_from(weights)

In [28]:
import random

In [29]:
random.seed(0)  # fixed seed so the random starting assignment is repeatable

# Give every word of every document a uniformly random starting topic.
document_topics = [[random.randrange(K) for _word in doc]
                   for doc in documents]

# Record the starting assignment in the three count structures.
# NOTE: the counts are accumulated in place, so re-running this cell without
# first re-creating the counters counts every word twice.
for d in range(D):
    doc_counts = document_topic_counts[d]
    for word, topic in zip(documents[d], document_topics[d]):
        doc_counts[topic] += 1
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1

In [30]:
# Resampling sweeps: on every sweep, each word of each document is taken out
# of the counts, given a freshly sampled topic, and counted again.
# The sweep variable is deliberately not named `iter`: a module-level `iter`
# would shadow the builtin iter() for every later cell in the kernel.
for sweep in range(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d],
                                              document_topics[d])):

            # remove this word / topic from counts
            # so that it doesn't influence the weights
            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1

            # choose a new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic

            # and now add it back to the counts
            document_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[new_topic] += 1
            document_lengths[d] += 1

In [31]:
# For each topic, list the words still assigned to it, most frequent first.
for k, word_counts in enumerate(topic_word_counts):
    for word, count in word_counts.most_common():
        if count <= 0:
            # entries whose count dropped to zero remain in the Counter
            continue
        print(k, word, count)

0 tekanan 4
0 darah 3
0 ketegangan 2
0 menyarankan 2
0 Saat 1
0 tanding 1
0 meningkatkan 1
0 olahraga 1
0 memberikan 1
0 penonton 1
0 penumpangnya 1
0 meningkat 1
0 ahli 1
1 Ibuku 2
1 waktu 1
1 bisbol 1
1 banyak 1
1 Brokoli 1
1 dimakan 1
1 terkontrol 1
1 menghabiskan 1
1 latihan 1
1 adikku 1
1 presentasi 1
2 brokoli 4
2 bisbol 2
2 memakan 2
2 suka 2
2 Adikku 1
2 Flash 1
2 Profesional 1
2 bagus 1
2 menyebabkan 1
2 Pemain 1
2 bernama 1
3 kesehatan 3
3 baik 2
3 sering 1
3 mengatakan 1
3 merasakan 1
3 mengemudi 1
3 sekolah 1
3 tampil 1
3 melihat 1
3 berkeliling 1
4 mendapatkan 2
4 mengemudi 1
4 seorang 1
4 juara 1
4 tekanan 1
4 makan 1
4 ibuku 1
4 Sopir 1
4 pernah 1
4 Teman 1
4 pemain 1


In [32]:
# Display labels for the five topics, numbered from 1: "Topik 1: " .. "Topik 5: ".
topic_names = ["Topik %d: " % number for number in range(1, 6)]

In [33]:
# Show each document followed by its topics, most frequent first.
# The per-document Counter is bound to `doc_topic_counts`, NOT `topic_counts`:
# reusing that name would overwrite the global per-topic totals list that
# p_word_given_topic reads, breaking any later re-run of the sampler.
for document, doc_topic_counts in zip(documents, document_topic_counts):
    print(document)
    for topic, count in doc_topic_counts.most_common():
        if count > 0:
            print(topic_names[topic], count)

['Brokoli', 'bagus', 'dimakan']
Topik 2:  2
Topik 3:  1
['Adikku', 'suka', 'makan', 'brokoli', 'ibuku']
Topik 3:  3
Topik 5:  2
['Ibuku', 'menghabiskan', 'banyak', 'waktu', 'berkeliling', 'melihat', 'latihan', 'bisbol', 'adikku']
Topik 2:  7
Topik 4:  2
['ahli', 'kesehatan', 'menyarankan', 'mengemudi', 'menyebabkan', 'ketegangan', 'tekanan', 'darah', 'meningkat']
Topik 1:  6
Topik 4:  2
Topik 3:  1
['sering', 'merasakan', 'tekanan', 'tampil', 'baik', 'presentasi', 'sekolah']
Topik 4:  5
Topik 1:  1
Topik 2:  1
['Profesional', 'kesehatan', 'mengatakan', 'brokoli', 'baik', 'kesehatan']
Topik 4:  4
Topik 3:  2
['Teman', 'seorang', 'pemain', 'bisbol', 'pernah', 'mendapatkan', 'juara']
Topik 5:  6
Topik 3:  1
['Pemain', 'bisbol', 'bernama', 'Flash', 'suka', 'memakan', 'brokoli']
Topik 3:  7
['Sopir', 'mengemudi', 'mendapatkan', 'tekanan', 'penumpangnya']
Topik 5:  4
Topik 1:  1
['Saat', 'tanding', 'olahraga', 'memberikan', 'ketegangan', 'meningkatkan', 'tekanan', 'darah', 'penonton']
Topik 

In [56]:
from lda.ldamodel import LdaModel

In [57]:
# Second toy corpus: each document is already a list of terms.
# NOTE: this rebinds `documents`, replacing the word lists built earlier in
# the notebook.
documents = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

In [58]:
# Build the packaged LDA model on the term lists above. The arguments are
# presumably (documents, number of topics, alpha, beta) -- TODO confirm
# against lda/ldamodel.py.
# NOTE(review): the recorded run of this cell failed with
# "TypeError: object() takes no parameters", which suggests LdaModel does not
# define a constructor accepting these arguments (e.g. a misspelled
# __init__) -- verify in lda/ldamodel.py before relying on this cell.
lda = LdaModel(documents, 5, 0.1, 0.1)

TypeError: object() takes no parameters