In [1]:
# Mount Google Drive into the Colab runtime; the input and output folders
# used further down live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# IPython shell escape (not plain Python): install Sastrawi, which supplies
# the stemmer and stop-word list imported below.
!pip install Sastrawi

import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt_tab')

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import re

# Stop-word list shipped with Sastrawi, stored as a set so the per-token
# membership test in preprocessing() is a hash lookup.
stopword_factory = StopWordRemoverFactory()
stopwords = {*stopword_factory.get_stop_words()}

# One shared stemmer instance, created once and reused for every token.
stemmer = StemmerFactory().create_stemmer()

def preprocessing(teks):
    """Normalise a document sentence by sentence.

    The text is split into sentences with ``sent_tokenize``. Each sentence is
    lowercased, stripped of every character that is not ``a``-``z`` or
    whitespace, split on whitespace, filtered against the module-level
    ``stopwords`` set, and stemmed token by token with the module-level
    ``stemmer``.

    Args:
        teks: Raw document text.

    Returns:
        A list with one space-joined string of stemmed tokens per sentence,
        in document order. A sentence that loses all of its tokens yields an
        empty string, so the list has as many entries as
        ``sent_tokenize(teks)``.
    """
    def _normalise(sentence):
        # Characters are deleted, not replaced by a space, so text on either
        # side of a removed symbol is fused into one token.
        letters_only = re.sub(r'[^a-z\s]', '', sentence.lower())
        stems = (
            stemmer.stem(word)
            for word in letters_only.split()
            if word not in stopwords
        )
        return " ".join(stems)

    return [_normalise(sentence) for sentence in sent_tokenize(teks)]

def mmr_euclidean(tfidf_matrix, distance_matrix, lambda_param=0.7, summary_length=3):
    """Select sentence indices with a greedy Maximal Marginal Relevance loop.

    In every round each remaining candidate is scored as::

        lambda_param * relevance - (1 - lambda_param) * redundancy

    and the highest-scoring candidate is moved into the summary.

    * ``relevance`` is the mean of the candidate's row in ``tfidf_matrix``.
    * ``redundancy`` is the candidate's highest similarity to any sentence
      already selected, where a distance ``d`` is turned into a similarity
      with ``1 / (1 + d)``. It is 0 while nothing has been selected.

    Args:
        tfidf_matrix: Per-sentence weight vectors, indexable by sentence
            index (``tfidf_matrix[i]``).
        distance_matrix: Pairwise distances, indexable as
            ``distance_matrix[i][j]``; its length defines how many sentences
            are considered.
        lambda_param: Trade-off weight; larger values favour relevance over
            avoiding redundancy.
        summary_length: Maximum number of sentences to select.

    Returns:
        List of selected sentence indices in selection order, containing at
        most ``min(summary_length, len(distance_matrix))`` entries. Ties are
        resolved in favour of the lowest index.
    """
    candidates = list(range(len(distance_matrix)))
    # Relevance does not depend on what has been selected so far, so it is
    # computed once per sentence instead of once per candidate per round.
    relevance = {idx: np.mean(tfidf_matrix[idx]) for idx in candidates}
    selected = []

    def mmr_score(idx):
        if selected:
            redundancy = max(1 / (1 + distance_matrix[idx][j]) for j in selected)
        else:
            redundancy = 0
        return lambda_param * relevance[idx] - (1 - lambda_param) * redundancy

    while len(selected) < summary_length and candidates:
        # max() returns the first maximal element; candidates stay in
        # ascending order, so ties go to the lowest index.
        best = max(candidates, key=mmr_score)
        selected.append(best)
        candidates.remove(best)

    return selected

input_folder = "/content/drive/MyDrive/Summarization_MMR/teks_asli/"
output_folder = "/content/drive/MyDrive/Summarization_MMR/ringkasan_mmr_euclidean/"
os.makedirs(output_folder, exist_ok=True)

# Batch job: summarise 1.txt .. 100.txt from input_folder into
# S1.txt .. S100.txt in output_folder.
for i in range(1, 101):
    input_path = os.path.join(input_folder, f"{i}.txt")
    output_path = os.path.join(output_folder, f"S{i}.txt")

    # Explicit UTF-8 so the result does not depend on the runtime's locale.
    with open(input_path, "r", encoding="utf-8") as f:
        teks_asli = f.read()

    # preprocessing() also splits with sent_tokenize, so index k refers to
    # the same sentence in both lists.
    kalimat_asli = sent_tokenize(teks_asli)
    kalimat_prep = preprocessing(teks_asli)

    tfidf_matrix = None
    if len(kalimat_prep) >= 2:
        vectorizer = TfidfVectorizer()
        try:
            tfidf_matrix = vectorizer.fit_transform(kalimat_prep).toarray()
        except ValueError:
            # scikit-learn raises ValueError ("empty vocabulary") when no
            # token survives preprocessing; without this guard one such
            # document would abort the whole batch.
            print(f"⚠️ Dokumen {i}: kosakata kosong setelah preprocessing, teks asli disalin.")

    if tfidf_matrix is None:
        # Nothing to rank (fewer than two sentences, or no usable terms):
        # copy the original text through unchanged.
        with open(output_path, "w", encoding="utf-8") as out:
            out.write(teks_asli.strip())
        continue

    # NOTE: despite the name, this holds pairwise Euclidean *distances*;
    # mmr_euclidean() converts them to similarities itself.
    euclidean_sim = euclidean_distances(tfidf_matrix)

    # Keep roughly 30% of the sentences, but never fewer than one.
    # round() rounds exact halves to the nearest even integer.
    summary_len = max(1, round(0.3 * len(kalimat_prep)))
    selected_indices = mmr_euclidean(tfidf_matrix, euclidean_sim, lambda_param=0.7, summary_length=summary_len)

    # Emit the chosen sentences in their original document order.
    selected_indices.sort()

    with open(output_path, "w", encoding="utf-8") as out:
        out.writelines(kalimat_asli[idx].strip() + "\n" for idx in selected_indices)

print("✅ Ringkasan MMR + Euclidean selesai disimpan untuk 100 dokumen.")

Mounted at /content/drive
Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 209.7/209.7 kB 3.1 MB/s eta 0:00:00
Installing collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


✅ Ringkasan MMR + Euclidean selesai disimpan untuk 100 dokumen.
