<a href="https://colab.research.google.com/github/raffi-ns/datashet/blob/main/Klasterisasi_K_Means_STKI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import normalize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
import numpy as np

# word_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK releases
# (3.9+) load it from the 'punkt_tab' resource instead of the pickled 'punkt'
# package, so fetch both to keep this cell working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Preprocessing
def preprocess_document(document):
    """Return *document* tokenized, stemmed, and re-joined with single spaces.

    The text is split with NLTK's ``word_tokenize`` and every token is passed
    through a ``PorterStemmer``; the resulting stems are joined by ``' '``.
    """
    porter = PorterStemmer()
    return ' '.join(map(porter.stem, word_tokenize(document)))

# Documents: the corpus used in this cell, one short Indonesian-language
# text per list entry. The list order defines each document's index.
documents = [
    "PSIS berburu juara Liga Indonesia",
    "Hasil putusan Sidang Elit Politik",
    "Partai politik berebut suara",
    "Manchester United Juara Liga Inggris",
    "Timnas Indonesia juara Liga AFC",
]

# Run every document through the tokenize-and-stem preprocessing step.
preprocessed_documents = list(map(preprocess_document, documents))

# Term weighting with TF-IDF: learn the vocabulary and build the
# document-term matrix from the preprocessed texts in a single call.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)

# K-means clustering
num_clusters = 2
# n_init is pinned explicitly: scikit-learn 1.2/1.3 emit a FutureWarning when
# it is omitted and 1.4 changed the default from 10 to 'auto', so leaving it
# out makes the clustering depend on the installed scikit-learn version.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(tfidf_matrix)

# For each fitted centroid, find the index of the document nearest to it.
# The centroids are L2-normalized first; pairwise_distances_argmin_min then
# returns, per centroid row, the index of the closest row of tfidf_matrix
# (the matching distances are discarded).
# NOTE: despite its name, centroid_d1_idx holds one document index per
# cluster; it does not choose or fix the initial centroids. The name is kept
# so any code that already refers to it keeps working.
normalized_centroids = normalize(kmeans.cluster_centers_, axis=1, norm='l2')
centroid_d1_idx, _ = pairwise_distances_argmin_min(normalized_centroids, tfidf_matrix)

# Group the original (unstemmed) documents by their assigned cluster label.
# One bucket is created per cluster of the fitted model, so raising the
# number of clusters no longer silently drops everything beyond label 1.
documents_by_cluster = [[] for _ in range(kmeans.n_clusters)]
for doc, label in zip(documents, kmeans.labels_):
    documents_by_cluster[label].append(doc)

# Names kept for the first two clusters so existing references still work.
cluster1_documents = documents_by_cluster[0]
cluster2_documents = documents_by_cluster[1] if kmeans.n_clusters > 1 else []

# Print the results: a header per cluster (1-based numbering) followed by its
# documents, with a blank line separating consecutive clusters.
for cluster_number, cluster_docs in enumerate(documents_by_cluster, start=1):
    header = f"Dokumen dalam Cluster {cluster_number}:"
    print(header if cluster_number == 1 else "\n" + header)
    for doc in cluster_docs:
        print(doc)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Dokumen dalam Cluster 1:
PSIS berburu juara Liga Indonesia
Manchester United Juara Liga Inggris
Timnas Indonesia juara Liga AFC

Dokumen dalam Cluster 2:
Hasil putusan Sidang Elit Politik
Partai politik berebut suara


