Library

In [3]:
# Standard-library helpers used by the preprocessing function.
import string
import re
from collections import OrderedDict
# Third-party packages: table rendering, data frames and NLTK.
from tabulate import tabulate
import pandas as pd
import nltk
# Download the NLTK 'stopwords' package before the corpus is read further down.
nltk.download('stopwords')
from nltk.corpus import stopwords
# NOTE(review): word_tokenize is imported but not referenced in the cells visible here.
from nltk.tokenize import word_tokenize
# Sastrawi factories that build the stemmer and the stop-word remover.
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
# Last expression of the cell: shows the Indonesian stop-word set as cell output (the set is not stored).
set(stopwords.words('indonesian'))

[nltk_data] Downloading package stopwords to C:\Users\rovan
[nltk_data]     wardana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'ada',
 'adalah',
 'adanya',
 'adapun',
 'agak',
 'agaknya',
 'agar',
 'akan',
 'akankah',
 'akhir',
 'akhiri',
 'akhirnya',
 'aku',
 'akulah',
 'amat',
 'amatlah',
 'anda',
 'andalah',
 'antar',
 'antara',
 'antaranya',
 'apa',
 'apaan',
 'apabila',
 'apakah',
 'apalagi',
 'apatah',
 'artinya',
 'asal',
 'asalkan',
 'atas',
 'atau',
 'ataukah',
 'ataupun',
 'awal',
 'awalnya',
 'bagai',
 'bagaikan',
 'bagaimana',
 'bagaimanakah',
 'bagaimanapun',
 'bagi',
 'bagian',
 'bahkan',
 'bahwa',
 'bahwasanya',
 'baik',
 'bakal',
 'bakalan',
 'balik',
 'banyak',
 'bapak',
 'baru',
 'bawah',
 'beberapa',
 'begini',
 'beginian',
 'beginikah',
 'beginilah',
 'begitu',
 'begitukah',
 'begitulah',
 'begitupun',
 'bekerja',
 'belakang',
 'belakangan',
 'belum',
 'belumlah',
 'benar',
 'benarkah',
 'benarlah',
 'berada',
 'berakhir',
 'berakhirlah',
 'berakhirnya',
 'berapa',
 'berapakah',
 'berapalah',
 'berapapun',
 'berarti',
 'berawal',
 'berbagai',
 'berdatangan',
 'beri',
 'berikan',
 'berikut'

Membaca dataset

In [4]:
# Load the raw news articles and keep only the article body column.
df = pd.read_csv("news_data.csv")

# Drop rows without text, keep the first 20 articles, and renumber the index
# so row positions are contiguous again after the dropped rows.
df = df[['article_text']].dropna().head(20).reset_index(drop=True)

# Build [title, content] pairs. Titles are numbered by position (1-based)
# instead of by the original CSV index label, which would leave gaps in the
# "Doc N" numbering whenever dropna() removed a row.
text = [
    [f"Doc {doc_number}", article]
    for doc_number, article in enumerate(df['article_text'], start=1)
]


Setup Library

In [5]:
# Build both Sastrawi factories first, then the objects they produce.
stopfactory, stemfactory = StopWordRemoverFactory(), StemmerFactory()

# Stop-word remover created from its factory.
# NOTE(review): `stopword` is not referenced in the cells visible here.
stopword = stopfactory.create_stop_word_remover()

# Stemmer used by preprocess_text to stem every filtered token.
stemmer = stemfactory.create_stemmer()

Preprocessing Function

In [6]:
# Stop-word sets keyed by language name, filled lazily so the corpus is
# loaded once per kernel session instead of once per token.
_STOPWORD_SETS = {}


def _stopword_set(language='indonesian'):
    """Return the NLTK stop words for ``language`` as a frozenset, loading them once."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


def preprocess_text(text):
    """Run the cleaning pipeline on one document and return every intermediate stage.

    Parameters
    ----------
    text : sequence of length 2
        ``[title, content]`` where ``content`` is the raw article string.

    Returns
    -------
    list
        ``[title, clean, tokens, filtered, stemmed, terms]`` where

        * ``clean``    - content with ASCII punctuation and digit runs removed, lowercased
        * ``tokens``   - ``clean`` split on whitespace
        * ``filtered`` - ``tokens`` without Indonesian NLTK stop words
        * ``stemmed``  - ``filtered`` passed through the module-level ``stemmer``
        * ``terms``    - ``stemmed`` de-duplicated, first-occurrence order kept
    """
    title, content = text

    # Clean the content. Punctuation characters are deleted (not replaced by
    # a space), so "a,b" becomes "ab"; only string.punctuation is covered.
    clean = "".join([c for c in content if c not in string.punctuation]).lower()
    clean = re.sub(r'\d+', '', clean)  # Remove numbers

    # Tokenize the content
    tokens = clean.split()

    # Stopword removal: fetch the set once, then do constant-time lookups
    # instead of rebuilding and scanning the stop-word list for every token.
    stop_words = _stopword_set('indonesian')
    filtered = [word for word in tokens if word not in stop_words]

    # Stemming
    stemmed = [stemmer.stem(word) for word in filtered]

    # Terms: unique stems in order of first appearance
    terms = list(OrderedDict.fromkeys(stemmed))
    return [title, clean, tokens, filtered, stemmed, terms]

    

Preprocessing Output in Table

In [7]:
# Column labels, one per stage returned by preprocess_text (also reused when
# the results are turned into a DataFrame).
headers = ["Title", "Clean", "Tokens", "Filtered", "Stemmed", "Terms"]

# Width of the rule printed after every table.
separator = '=' * 100

# Show each document's preprocessing stages as its own single-row grid table.
for doc_number, doc in enumerate(text, start=1):
    hasil = preprocess_text(doc)
    print(f"Document {doc_number}:")
    print(tabulate([hasil], headers=headers, tablefmt="grid"))
    print(separator)
    print("\n")

Document 1:
+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Save as .csv

In [8]:
# Run the pipeline once per document and collect the six-field result rows.
semua_hasil = [preprocess_text(doc) for doc in text]

# One row per document, columns labelled by `headers`.
hasil_df = pd.DataFrame(semua_hasil, columns=headers)

# Persist the table without the index column.
# NOTE(review): the list-valued columns end up as text in the CSV; confirm
# that downstream readers expect that.
hasil_df.to_csv("preprocessed_news_data.csv", index=False)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer


def _join_tokens(value):
    """Join a list of words into one space-separated string; return anything else unchanged."""
    if isinstance(value, list):
        return ' '.join(value)
    return value


# If the 'Stemmed' column holds lists of words, merge each list into a string.
hasil_df['Stemmed'] = hasil_df['Stemmed'].apply(_join_tokens)

# With plain strings in place, the TF-IDF step can run.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(hasil_df['Stemmed'])

In [10]:
from sklearn.cluster import KMeans

# Group the TF-IDF rows into five clusters; random_state fixes the seed.
# NOTE(review): n_init is left at the library default, which may differ
# between scikit-learn releases - consider pinning it for reproducibility.
kmeans = KMeans(n_clusters=5, random_state=42)

# Fit the model and take the per-document cluster ids in one step.
labels = kmeans.fit_predict(X)

In [11]:
# Store each document's k-means label in a new 'cluster' column (in place),
# then print the whole frame.
hasil_df['cluster'] = labels
print(hasil_df)

     Title                                              Clean  \
0    Doc 1  sekjen pdip sekaligus sekretaris tim pemenang ...   
1    Doc 2  mantan ketua umum pengurus besar nahdlatul ula...   
2    Doc 3  surat pernyataan kpps pemilu  merupakan bagian...   
3    Doc 4  sekitar  personel dikerahkan untuk mengamankan...   
4    Doc 5  ukraina marah usai rusia menjadikan empat wila...   
5    Doc 6  sekretaris tim kampanye nasional tkn prabowo g...   
6    Doc 7  cawapres nomor urut  gibran rakabuming raka me...   
7    Doc 8  sekretaris tim pemenangan nasional tpn capres ...   
8    Doc 9  tim kampanye nasional tkn menegaskan kapasitas...   
9   Doc 10  pasangan capres dan cawapres nomor urut  prabo...   
10  Doc 11  surat pendaftaran kpps pemilu  merupakan bagia...   
11  Doc 12  debat perdana pilpres  bakal digelar kpu ri se...   
12  Doc 13  partai gerindra resmi memberikan rekomendasi c...   
13  Doc 14  wakil ketua umum partai golkar bambang soesaty...   
14  Doc 15  cawapres nomo

In [12]:
# Set the column display width so long text is not truncated.
pd.options.display.max_colwidth = None

# Print the DataFrame, restricted to the stemmed text and its cluster id.
columns_to_show = ['Stemmed', 'cluster']
print(hasil_df.loc[:, columns_to_show])

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [13]:
from collections import Counter

# Helper that shows the most frequent words in every cluster.
def top_words_per_cluster(df, cluster_col='cluster', text_col='Stemmed', n_top=5):
    """Print the ``n_top`` most frequent words for each cluster, in sorted cluster order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one document per row.
    cluster_col : str
        Column with the cluster id of each row.
    text_col : str
        Column with the document text as whitespace-separated words.
    n_top : int
        How many (word, count) pairs to print per cluster.

    Returns
    -------
    None
        The result is only printed.
    """
    cluster_ids = sorted(df[cluster_col].unique())
    for cluster in cluster_ids:
        # Merge all texts of this cluster into one string, then split into words.
        belongs_here = df[cluster_col] == cluster
        merged = ' '.join(df.loc[belongs_here, text_col])
        frequency = Counter(merged.split())
        most_common = frequency.most_common(n_top)
        print(f'\nCluster {cluster} Top Words: {most_common}')

# Report the top words per cluster using the function defaults
# ('cluster' / 'Stemmed' columns, five words per cluster).
top_words_per_cluster(hasil_df)


Cluster 0 Top Words: [('abuya', 15), ('khofifah', 13), ('dukung', 12), ('jatim', 8), ('muhtadi', 6)]

Cluster 1 Top Words: [('debat', 37), ('gugat', 32), ('hukum', 24), ('kpu', 19), ('tim', 18)]

Cluster 2 Top Words: [('anggota', 42), ('milu', 38), ('daftar', 36), ('kpps', 32), ('surat', 25)]

Cluster 3 Top Words: [('gibran', 23), ('doa', 13), ('said', 10), ('pesantren', 10), ('ketua', 9)]

Cluster 4 Top Words: [('milu', 20), ('aman', 13), ('tps', 13), ('prabowo', 11), ('rawan', 11)]
