# LDA Bahasa Indonesia

Instalasi beberapa library yang diperlukan, untuk NLTK dan GENSIM sudah include di Colab

In [1]:
pip install --upgrade gensim



In [2]:
pip install pyldavis==3.2.1



In [3]:
pip install Sastrawi



In [4]:
pip install swifter



# Preparing Data

In [5]:
import pandas as pd 
import numpy as np
import nltk

In [6]:
#import data dari GDrive
#bisa lihat di video ini https://youtu.be/AAwgqf2d82k
! gdown --id 1JXKfpv2rAPVn9OcPQAVh3YCXGn7XnACR #id file di drive, bisa dilihat di link file dengan menu share file

Downloading...
From: https://drive.google.com/uc?id=1JXKfpv2rAPVn9OcPQAVh3YCXGn7XnACR
To: /content/dataBerita.xlsx
  0% 0.00/25.9k [00:00<?, ?B/s]100% 25.9k/25.9k [00:00<00:00, 22.4MB/s]


In [7]:
dataSB = pd.read_excel('/content/dataBerita.xlsx', sheet_name="Sheet1") #lokasi file

dataSB.head()

Unnamed: 0,articlename,textdata
0,http://regional./read/2016/10/20/12504331/gama...,Gamawan Sebut Anggaran KTP Elektronik Dibahas ...
1,https://biz./read/2016/02/28/090910928/Pemkot....,Pemkot Tangerang Permudah Investasi Dengan Per...
2,https://biz./read/2016/03/28/142218228/Wajah.B...,"Wajah Baru Pengadaan Barang/Jasa, Simpel tapi ..."
3,https://biz./read/2016/03/29/160443028/Pengusa...,Pengusaha Kecil dan Menengah Bisa Ikut Jadi Pe...
4,https://biz./read/2016/11/08/080000028/mengopt...,Mengoptimalkan Manajemen Modal Kerja Bisnis sa...


In [8]:
# ------ Case Folding --------
# gunakan fungsi Series.str.lower() pada Pandas

dataSB['textdata'] = dataSB['textdata'].str.lower()

print('Case Folding Result : \n')
print(dataSB['textdata'].head(5))

Case Folding Result : 

0    gamawan sebut anggaran ktp elektronik dibahas ...
1    pemkot tangerang permudah investasi dengan per...
2    wajah baru pengadaan barang/jasa, simpel tapi ...
3    pengusaha kecil dan menengah bisa ikut jadi pe...
4    mengoptimalkan manajemen modal kerja bisnis sa...
Name: textdata, dtype: object


# Tokenizing

Menghapus karakter-karakter yang tidak penting dari text

In [9]:
import string 
import re #regex library

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

In [10]:
# ------ Tokenizing ---------

nltk.download('punkt')

def remove_tweet_special(text):
    # remove tab, new line, ans back slice
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    # remove non ASCII (emoticon, chinese word, .etc)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")
                
dataSB['textdata'] = dataSB['textdata'].apply(remove_tweet_special)

#remove number
def remove_number(text):
    return  re.sub(r"\d+", "", text)

dataSB['textdata'] = dataSB['textdata'].apply(remove_number)

#remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans("","",string.punctuation))

dataSB['textdata'] = dataSB['textdata'].apply(remove_punctuation)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    return text.strip()

dataSB['textdata'] = dataSB['textdata'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

dataSB['textdata'] = dataSB['textdata'].apply(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

dataSB['textdata'] = dataSB['textdata'].apply(remove_singl_char)

# NLTK word tokenize 
def word_tokenize_wrapper(text):
    return word_tokenize(text)

dataSB['textdata_tokens'] = dataSB['textdata'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
print(dataSB['textdata_tokens'].head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Tokenizing Result : 

0    [gamawan, sebut, anggaran, ktp, elektronik, di...
1    [pemkot, tangerang, permudah, investasi, denga...
2    [wajah, baru, pengadaan, barangjasa, simpel, t...
3    [pengusaha, kecil, dan, menengah, bisa, ikut, ...
4    [mengoptimalkan, manajemen, modal, kerja, bisn...
Name: textdata_tokens, dtype: object


In [11]:
# NLTK calc frequency distribution
def freqDist_wrapper(text):
    return FreqDist(text)

dataSB['textdata_tokens_fdist'] = dataSB['textdata_tokens'].apply(freqDist_wrapper)

print('Frequency Tokens : \n') 
print(dataSB['textdata_tokens_fdist'].head().apply(lambda x : x.most_common()))

Frequency Tokens : 

0    [(dan, 9), (anggaran, 8), (gamawan, 6), (ktp, ...
1    [(tangerang, 28), (yang, 26), (kota, 22), (onl...
2    [(yang, 49), (pengadaan, 30), (ini, 16), (peny...
3    [(yang, 27), (dan, 23), (penyedia, 22), (penga...
4    [(vendor, 14), (yang, 13), (pembiayaan, 8), (d...
Name: textdata_tokens_fdist, dtype: object


# Stopwords
Membersihkan teks dari kata-kata yang termasuk ke dalam stopwords

In [12]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
from nltk.corpus import stopwords

# ----------------------- get stopword from NLTK stopword -------------------------------
# get stopword indonesia
list_stopwords = stopwords.words('indonesian')


# ---------------------------- manualy add stopword  ------------------------------------
# append additional stopword
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah', 'bisnis', 'pandemi', 'indonesia'])

# convert list to dictionary
list_stopwords = set(list_stopwords)

#remove stopword pada list token
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

dataSB['textdata_tokens_WSW'] = dataSB['textdata_tokens'].apply(stopwords_removal) 

print(dataSB['textdata_tokens_WSW'].head())

0    [gamawan, anggaran, ktp, elektronik, dibahas, ...
1    [pemkot, tangerang, permudah, investasi, periz...
2    [wajah, pengadaan, barangjasa, simpel, akuntab...
3    [pengusaha, menengah, penyedia, barangjasa, pe...
4    [mengoptimalkan, manajemen, modal, kerja, meng...
Name: textdata_tokens_WSW, dtype: object


# Normalisasi
Mengganti kata-kata tertentu dengan kata lain yang lebih tepat

In [14]:
#import file xlsx untuk proses normalisasi
! gdown --id 1eJ5g-7fzbLXlTl0e_dC9CLqmShBOfRkf #id file di drive

Downloading...
From: https://drive.google.com/uc?id=1eJ5g-7fzbLXlTl0e_dC9CLqmShBOfRkf
To: /content/normalisasi.xlsx
  0% 0.00/8.56k [00:00<?, ?B/s]100% 8.56k/8.56k [00:00<00:00, 14.9MB/s]


In [15]:
normalizad_word = pd.read_excel('/content/normalisasi.xlsx') #lokasi file

normalizad_word_dict = {}

for index, row in normalizad_word.iterrows():
    if row[0] not in normalizad_word_dict:
        normalizad_word_dict[row[0]] = row[1] 

def normalized_term(document):
    return [normalizad_word_dict[term] if term in normalizad_word_dict else term for term in document]

dataSB['textdata_normalized'] = dataSB['textdata_tokens_WSW'].apply(normalized_term)

dataSB['textdata_normalized'].head(10)

0    [gamawan, anggaran, ktp, elektronik, dibahas, ...
1    [pemkot, tangerang, permudah, investasi, periz...
2    [wajah, pengadaan, barangjasa, simpel, akuntab...
3    [pengusaha, menengah, penyedia, barangjasa, pe...
4    [mengoptimalkan, manajemen, modal, kerja, meng...
5    [ilmu, perbendaharaan, dibutuhkan, bank, pelak...
6    [bayar, belanjaan, forum, jual, beli, kaskus, ...
7    [libur, lebaran, pelayanan, publik, banyuwangi...
8    [semarang, gandeng, kpk, pencegahan, korupsi, ...
9    [pembangunan, jalan, tol, cisumdawu, dipacu, d...
Name: textdata_normalized, dtype: object

In [16]:
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter

# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in dataSB['textdata_normalized']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))


1278


In [17]:
for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    
    # untuk melihat hasilnya silahkan jalankan baris di bawah ini
    # print(term,":" ,term_dict[term])

In [18]:
# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

dataSB['textdata_tokens_stemmed'] = dataSB['textdata_normalized'].swifter.apply(get_stemmed_term)

print(dataSB['textdata_tokens_stemmed'])

Pandas Apply:   0%|          | 0/10 [00:00<?, ?it/s]

0    [gamawan, anggar, ktp, elektronik, bahas, wapr...
1    [pemkot, tangerang, mudah, investasi, izin, on...
2    [wajah, ada, barangjasa, simpel, akuntabel, up...
3    [usaha, tengah, sedia, barangjasa, perintah, l...
4    [optimal, manajemen, modal, kerja, hadap, pasa...
5    [ilmu, bendahara, butuh, bank, laku, ilmu, ben...
6    [bayar, belanja, forum, jual, beli, kaskus, pa...
7    [libur, lebaran, layan, publik, banyuwangi, la...
8    [semarang, gandeng, kpk, cegah, korupsi, semar...
9    [bangun, jalan, tol, cisumdawu, pacu, direktor...
Name: textdata_tokens_stemmed, dtype: object


In [26]:
#stopwords #2

from nltk.corpus import stopwords

# ----------------------- get stopword from NLTK stopword -------------------------------
# get stopword indonesia
list_stopwords = stopwords.words('indonesian')


# ---------------------------- manualy add stopword  ------------------------------------
# append additional stopword
list_stopwords.extend(["ada", "tan", "ton", "pt", "komentar", "juta", "unit", "menang", "artikel", 
                       "smartphone", "tagar", "sedia", "kaskus", "seksi"])

# convert list to dictionary
list_stopwords = set(list_stopwords)

#remove stopword pada list token
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

dataSB['textdata_tokens_stemmed2'] = dataSB['textdata_tokens_stemmed'].apply(stopwords_removal) 

print(dataSB['textdata_tokens_stemmed2'].head())

0    [gamawan, anggar, ktp, elektronik, bahas, wapr...
1    [pemkot, tangerang, mudah, investasi, izin, on...
2    [wajah, barangjasa, simpel, akuntabel, upaya, ...
3    [usaha, barangjasa, perintah, laksana, barang,...
4    [optimal, manajemen, modal, kerja, hadap, pasa...
Name: textdata_tokens_stemmed2, dtype: object


In [27]:
for i in range(len(dataSB)):
        a=dataSB.iloc[i][6]
        document.append(a)
        
document[0:5]

['pembangunan', 'jalan', 'tol', 'cisumdawu', 'dipacu']

In [28]:
doc_clean = dataSB['textdata_tokens_stemmed2']
doc_clean

0    [gamawan, anggar, ktp, elektronik, bahas, wapr...
1    [pemkot, tangerang, mudah, investasi, izin, on...
2    [wajah, barangjasa, simpel, akuntabel, upaya, ...
3    [usaha, barangjasa, perintah, laksana, barang,...
4    [optimal, manajemen, modal, kerja, hadap, pasa...
5    [ilmu, bendahara, butuh, bank, laku, ilmu, ben...
6    [bayar, belanja, forum, jual, beli, pakai, sak...
7    [libur, lebaran, layan, publik, banyuwangi, la...
8    [semarang, gandeng, kpk, cegah, korupsi, semar...
9    [bangun, jalan, tol, cisumdawu, pacu, direktor...
Name: textdata_tokens_stemmed2, dtype: object

# LDA model using gensim
Proses topic modeling dengan LDA gensim

In [29]:
import gensim
from gensim import corpora

dictionary = corpora.Dictionary(doc_clean)
print(dictionary)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

Dictionary(983 unique tokens: ['administrasi', 'aju', 'anggar', 'audit', 'auditor']...)


In [30]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

total_topics = 3 # jumlah topik yang akan di extract
number_words = 10 # jumlah kata per topik

In [31]:
# Running and Trainign LDA model on the document term matrix.
lda_model = Lda(doc_term_matrix, num_topics=total_topics, id2word = dictionary, passes=50)

lda_model.show_topics(num_topics=total_topics, num_words=number_words)

[(0,
  '0.024*"perintah" + 0.016*"korupsi" + 0.014*"proses" + 0.013*"informasi" + 0.012*"usaha" + 0.012*"sikap" + 0.012*"kerja" + 0.011*"barangjasa" + 0.011*"semarang" + 0.008*"kpk"'),
 (1,
  '0.021*"tangerang" + 0.019*"kota" + 0.018*"kerja" + 0.018*"jalan" + 0.016*"layan" + 0.016*"tol" + 0.014*"online" + 0.011*"aplikasi" + 0.008*"duduk" + 0.008*"izin"'),
 (2,
  '0.022*"bayar" + 0.022*"beli" + 0.016*"vendor" + 0.012*"biaya" + 0.012*"jual" + 0.011*"saku" + 0.010*"anggar" + 0.008*"elektronik" + 0.008*"fjb" + 0.007*"ktp"')]

In [32]:
# Word Count of Topic Keywords

from collections import Counter
topics = lda_model.show_topics(formatted=False)
data_flat = [w for w_list in doc_clean for w in w_list]
counter = Counter(data_flat)

out = []
for i, topic in topics:
    for word, weight in topic:
        out.append([word, i , weight, counter[word]])

df_imp_wcount = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count']) 
print(df_imp_wcount)

          word  topic_id  importance  word_count
0     perintah         0    0.024323          49
1      korupsi         0    0.016070          25
2       proses         0    0.013536          29
3    informasi         0    0.012900          32
4        usaha         0    0.011631          29
5        sikap         0    0.011629          18
6        kerja         0    0.011629          48
7   barangjasa         0    0.010997          18
8     semarang         0    0.010995          17
9          kpk         0    0.007824          18
10   tangerang         1    0.020900          28
11        kota         1    0.018690          36
12       kerja         1    0.017953          48
13       jalan         1    0.017949          24
14       layan         1    0.015739          24
15         tol         1    0.015736          21
16      online         1    0.014266          23
17    aplikasi         1    0.010576          25
18       duduk         1    0.008361          15
19        izin      

In [33]:
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [34]:
#simpan ke google drive
df_imp_wcount.to_csv('df_imp_wcount.csv')
!cp df_imp_wcount.csv "drive/My Drive/Colab Notebooks/LDA Indonesia/"

In [None]:
#jika simpan ke local drive
#filedisimpan='df_imp_wcount.xlsx'
#df_imp_wcount.to_excel(filedisimpan, index = False, header=True)

In [35]:
#Dominant topic and its percentage contribution in each topic
def format_topics_sentences(ldamodel=None, corpus=doc_term_matrix, texts=document):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [36]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=doc_term_matrix, texts=doc_clean)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
print(df_dominant_topic.head(10))

   Document_No  ...                                               Text
0            0  ...  [gamawan, anggar, ktp, elektronik, bahas, wapr...
1            1  ...  [pemkot, tangerang, mudah, investasi, izin, on...
2            2  ...  [wajah, barangjasa, simpel, akuntabel, upaya, ...
3            3  ...  [usaha, barangjasa, perintah, laksana, barang,...
4            4  ...  [optimal, manajemen, modal, kerja, hadap, pasa...
5            5  ...  [ilmu, bendahara, butuh, bank, laku, ilmu, ben...
6            6  ...  [bayar, belanja, forum, jual, beli, pakai, sak...
7            7  ...  [libur, lebaran, layan, publik, banyuwangi, la...
8            8  ...  [semarang, gandeng, kpk, cegah, korupsi, semar...
9            9  ...  [bangun, jalan, tol, cisumdawu, pacu, direktor...

[10 rows x 5 columns]


In [37]:
#simpan ke google drive
df_dominant_topic.to_csv('df_dominant_topic.csv')
!cp df_dominant_topic.csv "drive/My Drive/Colab Notebooks/LDA Indonesia/"

In [None]:
# jika simpan ke local drive
# filedisimpan='df_dominant_topic.xlsx'
# df_dominant_topic.to_excel(filedisimpan, index = False, header=True)

In [38]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()

  from collections import Iterable
  from collections import Mapping


In [39]:
import os
LDAvis_data_filepath = os.path.join('ldavis_prepared_'+str(total_topics))

In [40]:
corpus = [dictionary.doc2bow(text) for text in doc_clean]

In [41]:
# proses ini mungkin agak lama
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

In [42]:
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

In [43]:
pyLDAvis.save_html(LDAvis_prepared, 'drive/My Drive/Colab Notebooks/LDA Indonesia/ldavis_prepared_'+ str(total_topics) +'.html')

In [44]:
# proses ini mungkin agak lama
LDAvis_prepared