In [None]:
# Sample corpus: ten short Indonesian-language texts, kept in one list.
documents = [
    "Brokoli bagus untuk dimakan. Adikku suka makan brokoli, tetapi ibuku tidak.",
    "Ibuku menghabiskan banyak waktu berkeliling melihat latihan bisbol adikku.",
    "Beberapa ahli kesehatan menyarankan bahwa mengemudi dapat menyebabkan ketegangan dan tekanan darah meningkat.",
    "Saya sering merasakan tekanan untuk tampil baik saat presentasi di sekolah.",
    "Profesional kesehatan mengatakan bahwa brokoli itu baik untuk kesehatan Anda.",
    "Teman saya seorang pemain bisbol yang pernah mendapatkan juara.",
    "Pemain bisbol yang bernama Flash itu sangat suka memakan brokoli.",
    "Sopir yang mengemudi taksi itu mendapatkan tekanan dari penumpangnya.",
    "Saat tanding, olahraga bisbol memberikan ketegangan dan meningkatkan tekanan darah para penonton.",
    "Ibuku menyarankan saya untuk memakan brokoli agar tekanan darah terkontrol.",
]

# Expose each text under its own name as well (doc_a is documents[0], ... doc_j is documents[9]).
(doc_a, doc_b, doc_c, doc_d, doc_e,
 doc_f, doc_g, doc_h, doc_i, doc_j) = documents

# POS Tagging

In [None]:
import os
from hmmtagger.tagger import MainTagger
from tokenization import *

# Shared tagger handle: stays None until init_tag() assigns a MainTagger to it.
mt = None

In [None]:
import pandas as pd
import numpy as np

## Daftar Tagging
Berikut adalah daftar tagging berdasarkan jurnal **HMM Based Part-of-Speech Tagger for Bahasa Indonesia** (Alfan Farizki Wicaksono & Ayu Purwarianti, 2010).

In [None]:
# POS tag reference table: one (tag, tag name, example) row per tag.
# Rows are written together so a tag, its name and its example cannot drift
# out of alignment the way three parallel lists can.
# Fix: the name for 'CC' was misspelled 'Coor-Conjuction'.
pos_rows = [
    ('OP', 'Open Parenthesis', '({['),
    ('CP', 'Close Parenthesis', ')}]'),
    ('GM', 'Slash', '/'),
    (';', 'Semicolon', ';'),
    (':', 'Colon', ':'),
    ('"', 'Quotation', '"'),
    ('.', 'Sentence Terminator', '.!?'),
    (',', 'Comma', ','),
    ('-', 'Dash', '-'),
    ('...', 'Ellipsis', '...'),
    ('JJ', 'Adjective', 'Kaya, Manis'),
    ('RB', 'Adverb', 'Sementara, Nanti'),
    ('NN', 'Common Noun', 'Mobil'),
    ('NNP', 'Proper Noun', 'Bekasi, Indonesia'),
    ('NNG', 'Genitive Noun', 'Bukunya'),
    ('VBI', 'Intransitive Verb', 'Pergi'),
    ('VBT', 'Transitive Verb', 'Membeli'),
    ('IN', 'Preposition', 'Di, Ke, Dari'),
    ('MD', 'Modal', 'Bisa'),
    ('CC', 'Coor-Conjunction', 'Dan, Atau, Tetapi'),
    ('SC', 'Subor-Conjunction', 'Jika, Ketika'),
    ('DT', 'Determiner', 'Para, Ini, Itu'),
    ('UH', 'Interjection', 'Wah, Aduh, Oi'),
    ('CDO', 'Ordinal Numerals', 'Pertama, Kedua'),
    ('CDC', 'Collective Numerals', 'Bertiga'),
    ('CDP', 'Primary Numerals', 'Satu, Dua'),
    ('CDI', 'Irregular Numerals', 'Beberapa'),
    ('PRP', 'Personal Pronouns', 'Saya, Kamu'),
    ('WP', 'WH-Pronouns', 'Apa, Siapa'),
    ('PRN', 'Number Pronouns', 'Kedua-duanya'),
    ('PRL', 'Locative Pronouns', 'Sini, Situ, Sana'),
    ('NEG', 'Negation', 'Bukan, Tidak'),
    ('SYM', 'Symbol', '@#$%^&'),
    ('RP', 'Particles', 'Pun, Kah'),
    ('FW', 'Foreign Word', 'Foreign, Word'),
]

# Passing `columns=` fixes the column order explicitly.
df = pd.DataFrame(pos_rows, columns=['POS', 'POS Name', 'Example'])

# Number the rows from 1 instead of 0 for display.
df.index = np.arange(1, len(df) + 1)
df

## Pembagian Tagging
Tag dibagi menjadi 2 kelas, yaitu **Class Content** dan **Class Function**.

**Class Content:**
1. JJ = Adjective
2. NN = Common Noun
3. NNP = Proper Noun
4. NNG = Genitive Noun
5. VBI = Intransitive Verb
6. VBT = Transitive Verb
7. FW = Foreign Word


**Class Function:**
1. OP = Open Parenthesis
2. CP = Close Parenthesis
3. GM = Slash
4. ; = Semicolon
5. : = Colon
6. " = Quotation
7. . = Sentence Terminator
8. , = Comma
9. '-' = Dash
10. ... = Ellipsis
11. RB = Adverb
12. IN = Preposition
13. MD = Modal
14. CC = Coor-Conjunction
15. SC = Subor-Conjunction
16. DT = Determiner
17. UH = Interjection
18. CDO = Ordinal Numerals
19. CDC = Collective Numerals
20. CDP = Primary Numerals
21. CDI = Irregular Numerals
22. PRP = Personal Pronouns
23. WP = WH-Pronouns
24. PRN = Number Pronouns
25. PRL = Locative Pronouns
26. NEG = Negation
27. SYM = Symbol
28. RP = Particles

In [None]:
# Content-class tags: adjective, the three noun tags, the two verb tags, foreign word.
Ccon = "JJ NN NNP NNG VBI VBT FW".split()

# Function-class tags: punctuation and bracket tags first, then every
# remaining word-level tag that is not in Ccon.
Cfunc = (
    ['OP', 'CP', 'GM', ';', ':', '"', '.', ',', '-', '...']
    + "RB IN MD CC SC DT UH".split()
    + "CDO CDC CDP CDI".split()
    + "PRP WP PRN PRL".split()
    + "NEG SYM RP".split()
)

In [None]:
"""
Parameters:
    String fileLexicon
    String fileNGram
    int NGramType = 0
    int maxAffixLength = 3
    int Treshold = 3
    int minWordFreq = 0
    int modeAffixTree = 0
    boolean debug = False
    double LambdaBigram = 0.2
    int TwoPhaseType = 0
    double beamFactor = 500.0
    int useLexicon = 0
"""

def init_tag():
    """Create the shared ``MainTagger`` in the global ``mt`` on first use.

    Does nothing when ``mt`` is already set, so it is safe to call repeatedly.
    The tagger is built from ``resource/Lexicon.trn`` and ``resource/Ngram.trn``
    with fixed positional settings.

    Failures are reported on stdout instead of being raised (best-effort, as
    before); ``mt`` is then left unchanged. Only ``Exception`` subclasses are
    caught, so ``KeyboardInterrupt`` and ``SystemExit`` still propagate, and
    the message now includes the actual error.
    """
    global mt
    try:
        if mt is None:
            mt = MainTagger("resource/Lexicon.trn", "resource/Ngram.trn", 0, 3, 3, 0, 0, False, 0.2, 0, 500.0, 1)
    except Exception as exc:
        print("Error Exception: {!r}".format(exc))

## Proses Tagging

In [None]:
import re

In [None]:
# Sample paragraph for trying out sentence extraction; note the abbreviation
# period in "H. Hadi", which is not a sentence boundary.
text = "Saya suka makan bakso H. Hadi. Baksonya sangat enak dan mantap. Aku kepedesan hehe."

In [None]:
# Run sentence_extraction on the sample text and display what it returns.
a = sentence_extraction(text)
a

In [None]:
# Tokenize every extracted sentence and keep all results. The previous loop
# overwrote `hasil` on each pass, so only the last sentence was ever shown,
# and `hasil` was undefined when `a` was empty.
hasil_semua = [" ".join(tokenisasi_kalimat(kalimat)).strip() for kalimat in a]

# `hasil` keeps its earlier meaning (the last tokenized sentence) so any cell
# that still reads it keeps working; it is "" when there are no sentences.
hasil = hasil_semua[-1] if hasil_semua else ""
hasil_semua

In [None]:
# Tokenize and POS-tag every sentence of every document.
# Build the shared tagger once up front; init_tag() does nothing if `mt` is already set.
init_tag()

# One entry per sentence: whatever mt.taggingStr returns for it
# (presumably a list of "word/TAG" strings - the later cells split on "/").
tagged_doc = []
for doc in documents:
    lines = doc.strip().split("\n")
    try:
        for line in lines:
            if not line:
                continue
            for sentence in sentence_extraction(cleaning(line)):
                strtag = " ".join(tokenisasi_kalimat(sentence)).strip()
                tagged_doc.append(mt.taggingStr(strtag))
    except Exception as exc:
        # Best effort: skip the rest of this document and go on to the next
        # one, but show which error occurred instead of hiding it.
        print("Error Exception: {!r}".format(exc))

for tagged in tagged_doc:
    print(tagged)

## Cleaning tagging buat cari topik

In [None]:
# Keep only the tokens whose POS tag belongs to the content class (Ccon).
# doc_con holds one list per tagged sentence, with tokens still in "word/TAG" form.
doc_con = []
for tagged in tagged_doc:
    con = []
    for token in tagged:
        # The tag is whatever follows the LAST slash, so a word that itself
        # contains "/" is still read correctly. A token with no slash at all
        # has no tag and is skipped instead of raising IndexError.
        parts = token.rsplit("/", 1)
        if len(parts) == 2 and parts[1] in Ccon:
            con.append(token)
    doc_con.append(con)

for con in doc_con:
    print(con)

In [None]:
# Strip the "/TAG" suffix from every content token, leaving bare words.
# rsplit cuts at the LAST slash, so a word that itself contains "/" is kept
# whole (cutting at the first slash would truncate it).
# NOTE: this rebinds `documents` from the raw strings to lists of words, so the
# tagging cell cannot be re-run afterwards without first re-running the cell
# that defines the raw documents.
documents = [[c.rsplit('/', 1)[0] for c in con] for con in doc_con]

for document in documents:
    print(document)

In [None]:
# Scratch check: the part before the first separator is the word.
text, sep = 'name/NN', '/'
rest = text.partition(sep)[0]
print(rest)

In [None]:
# Scratch check: the part after the first "/" is the tag.
my_string = "mengatakan/VBI"
tag_part = my_string.partition("/")[2]
print(tag_part)

# LDA

In [None]:
# Display labels for topic indices 0-4 ("Topik 1: " ... "Topik 5: ").
topic_names = ["Topik {}: ".format(number) for number in range(1, 6)]

In [None]:
# For each document, print it and then its topics with a non-zero count,
# in the order given by most_common().
# NOTE(review): `document_topic_counts` is not defined in any cell visible
# here, so this cell fails on a fresh kernel unless it is created elsewhere.
for doc_tokens, counts_for_doc in zip(documents, document_topic_counts):
    print(doc_tokens)
    for topic_id, hits in counts_for_doc.most_common():
        if not hits > 0:
            continue
        print(topic_names[topic_id], hits)

In [1]:
from lda.ldamodel import LdaModel

In [2]:
# Second sample corpus: 15 documents, each a list of term strings
# (multi-word terms such as "Big Data" are single items).
documents2 = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

In [3]:
# Build the LDA model on documents2.
# NOTE(review): presumably 4 is the number of topics (the printed output uses
# topic ids 0-3) and the two 0.1 values are smoothing priors - confirm against
# lda.ldamodel.LdaModel. No random seed is set in this notebook, so results may
# differ between runs if the model samples randomly.
lda = LdaModel(documents2, 4, 0.1, 0.1)

In [4]:
# Print the model's topics; the captured output appears to list
# "<topic id> <term> <count>" lines, grouped by topic.
lda.print_topics()

0 Java 3
0 Big Data 3
0 Hadoop 2
0 HBase 1
0 C++ 1
0 Spark 1
0 Storm 1
0 programming languages 1
0 MapReduce 1
0 Cassandra 1
0 deep learning 1
1 HBase 2
1 neural networks 2
1 Postgres 2
1 MongoDB 2
1 machine learning 2
1 Cassandra 1
1 numpy 1
1 decision trees 1
1 deep learning 1
1 databases 1
1 MySQL 1
1 NoSQL 1
1 artificial intelligence 1
1 scipy 1
2 regression 3
2 Python 2
2 R 2
2 libsvm 2
2 scikit-learn 2
2 mathematics 1
2 support vector machines 1
2 Haskell 1
2 Mahout 1
3 statistics 3
3 probability 3
3 Python 2
3 R 2
3 pandas 2
3 statsmodels 2
3 C++ 1
3 artificial intelligence 1
3 theory 1


In [5]:
# Print a per-document view; the captured output appears to show each document
# followed by "Topik <letter>:  <count>" lines.
lda.print_topics2()

['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']
Topik a:  7
['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']
Topik b:  5
['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas']
Topik b:  2
Topik c:  2
Topik d:  2
['R', 'Python', 'statistics', 'regression', 'probability']
Topik d:  3
Topik c:  2
['machine learning', 'regression', 'decision trees', 'libsvm']
Topik c:  2
Topik b:  2
['Python', 'R', 'Java', 'C++', 'Haskell', 'programming languages']
Topik c:  3
Topik a:  3
['statistics', 'probability', 'mathematics', 'theory']
Topik d:  3
Topik c:  1
['machine learning', 'scikit-learn', 'Mahout', 'neural networks']
Topik c:  2
Topik b:  2
['neural networks', 'deep learning', 'Big Data', 'artificial intelligence']
Topik b:  3
Topik a:  1
['Hadoop', 'Java', 'MapReduce', 'Big Data']
Topik a:  4
['statistics', 'R', 'statsmodels']
Topik d:  3
['C++', 'deep learning', 'artificial intelligence', 'probability']
Topik d:  3
Topik a:  1
['pandas', 'R', 'Pyt