In [24]:
import pandas as pd
import nltk
from nltk.tokenize import  word_tokenize
from nltk.corpus import stopwords
from string import punctuation

sw_indo = stopwords.words('indonesian') + list(punctuation)

# Import Data

In [25]:
df = pd.read_csv("data/kompas.csv")
df.head()

Unnamed: 0,teks
0,Ginandjar Tetap Ditahan. Jaksa Agung Dilaporka...
1,Jakarta Dikangkangi Para Preman\nKALAU tak pun...
2,Penyimpangan di Setpres Seolah Terjadi Sekaran...
3,"Dibayarkan, Rapel Kenaikan Gaji Pegawai Pos\nK..."
4,"Stop Kekerasan, Elite agar Duduk Bersama\nSeju..."


# Extract BoW

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
bow = CountVectorizer(ngram_range=(1,2), tokenizer=word_tokenize, stop_words=sw_indo, min_df=5)
bow_matrix = bow.fit_transform(df.teks)

  'stop_words.' % sorted(inconsistent))


# Topic Modeling

In [28]:
vocab = bow.get_feature_names()

In [33]:
def get_topic(model):
    return[[vocab[idx] for idx in reversed(comp.argsort()[-6:]) if vocab[idx].isalnum()] 
        for comp in model.components_]

# Latent Semantic Analysis (LSA)

In [29]:
from sklearn.decomposition import TruncatedSVD

In [30]:
lsa = TruncatedSVD(n_components=10, n_iter=10, random_state=42)

In [31]:
lsa_matrix = lsa.fit_transform(bow_matrix)

In [32]:
print(bow_matrix.shape) # hidden
print(lsa_matrix.shape) # weight / code
print(lsa.components_.shape) # fitur / topic


(2008, 25132)
(2008, 10)
(10, 25132)


In [35]:
get_topic(lsa)

[['presiden', 'indonesia', 'pemerintah', 'dpr'],
 ['presiden', 'dpr', 'ketua', 'partai', 'mpr', 'tandjung'],
 ['pemerintah', 'rp', 'indonesia', 'bank', 'persen', 'utang'],
 ['rp', 'tandjung', 'dana', 'bulog', 'hukum', 'harga'],
 ['presiden', 'air', 'banjir', 'harga', 'rp', 'dpr'],
 ['harga', 'beras', 'rp', 'bbm'],
 ['mpr', 'konstitusi', 'bppn', 'uud'],
 ['indonesia', 'mpr', 'konstitusi', 'uud', 'perubahan', '1945'],
 ['pemerintah', 'dpr', 'israel', 'bppn', 'kota', 'aceh'],
 ['massa', 'rupiah', 'bunga', 'mpr', 'bank', 'suku']]

# Latent Dirichlet Allocation (LDA)

In [36]:
from sklearn.decomposition import LatentDirichletAllocation

In [37]:
lda = LatentDirichletAllocation(n_components=10, max_iter=10, random_state=42)

In [38]:
lda_matrix = lda.fit_transform(bow_matrix)

In [39]:
get_topic(lda)

[['hukum', 'tandjung', 'agung', 'tim'],
 ['rumah', 'polisi', 'orang', 'jakarta'],
 ['pemerintah', 'rp', 'bppn', 'utang'],
 ['presiden', 'dpr', 'ketua', 'politik'],
 ['israel', 'aceh', 'keamanan', 'pemerintah'],
 ['indonesia', 'rp', 'dollar', 'as'],
 ['kota', 'warga', 'orang', 'korban'],
 ['indonesia', 'beras', 'harga', 'petani'],
 ['banjir', 'jakarta', 'air', 'jalan'],
 ['pesawat', 'pt', 'bca', 'ka']]