# Segmentasi Teks

## Impor library yang dibutuhkan

In [11]:
import time
import gensim
import word2vec
import pandas as pd
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer

## Load pre-trained model word2vec Bahasa Indonesia

In [12]:
# Source: https://github.com/deryrahman/word2vec-bahasa-indonesia
# Load the pre-trained Indonesian word2vec model from a local file; the path is
# resolved relative to the notebook's working directory.
model = Word2Vec.load("idwiki_word2vec_100.model")

## Data untuk segmentasi

In [13]:
# Read the "Data Awal" sheet of the annotation workbook into a DataFrame.
# NOTE: despite its name, `csv_file` holds data read from an Excel file.
file_path = "Split Anotasi Data 2-2.xlsx"
csv_file = pd.read_excel(file_path, sheet_name="Data Awal")

# The column label is "Answer " with a trailing space -- presumably this matches
# the spreadsheet header exactly; verify before "fixing" it.
answers = csv_file["Answer "].to_list()
# The answer at index 194 is the example text used by the following cells.
sentence_example = answers[194]

## Teks terlebih dahulu di-tokenize

In [14]:
from textsplit.tools import SimpleSentenceTokenizer
sentence_tokenizer = SimpleSentenceTokenizer()

# Split the example answer into sentences; `sentenced_text` is the input to
# the vectorizer and to every segmenter used later in the notebook.
sentenced_text = sentence_tokenizer(sentence_example)

## Membangun wordvector dari model

In [15]:
# Adapted from:
# https://stackoverflow.com/questions/46885454/how-to-create-a-dataframe-with-the-word2ve-vectors-as-data-and-the-terms-as-row

# One (term, vector row index, "count" attribute) triple per vocabulary entry,
# ordered by ascending count.
ordered_vocab = sorted(
    (
        (term, model.wv.key_to_index[term], model.wv.get_vecattr(term, "count"))
        for term in model.wv.index_to_key
    ),
    key=lambda entry: entry[2],
)
# Transpose the triples into three parallel tuples.
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)
# Table of embedding rows, labelled by term, in the sorted order.
wordvec = pd.DataFrame(model.wv.vectors[term_indices, :], index=ordered_terms)

## Membangun CountVectorizer

In [16]:
# Fix the vectorizer's vocabulary to the embedding terms so that the columns of
# the count matrix line up with the rows of `wordvec`.
count_vec = CountVectorizer(vocabulary=wordvec.index)

# Per-sentence vector: the term-count matrix multiplied by the word-vector
# table, i.e. a count-weighted sum of the word vectors found in each sentence.
sentence_vec = count_vec.transform(sentenced_text).dot(wordvec)

## Segmentasi menggunakan algoritma optimal

In [17]:
from textsplit.tools import get_penalty, get_segments
from textsplit.algorithm import split_optimal, split_greedy, get_total
from pprint import pprint

# Segment length handed to get_penalty -- presumably the desired average number
# of sentences per segment; confirm against the textsplit documentation.
segment_len = 4
penalty = get_penalty([sentence_vec], segment_len)

# Compute the split points with the optimal algorithm, then group the
# sentences according to those split points.
optimal_segmentation = split_optimal(sentence_vec, penalty, seg_limit=250)
segmented_text = get_segments(sentenced_text, optimal_segmentation)

# NOTE: `//` is floor division, so the reported average is rounded down.
print(f"{len(sentenced_text)} sentences, {len(segmented_text)} segments, avg. {len(sentenced_text) // len(segmented_text)} sentences per segment")
pprint(segmented_text)

25 sentences, 7 segments, avg. 3 sentences per segment
[["Halo   Bunda Rizkisusan, terima kasih  telah bertanya di Alodokter :')  "
  'Berdasarkan panduan WHO dan IDAI (Ikatan Dokter Anak Indonesia), pemberian '
  'MPASI (Makanan Pendamping ASI) diberikan pada saat anak berusia 6 bulan. '],
 ['Hal ini disebabkan karena anak usia 6 bulan dinilai sudah memiliki saluran '
  'pencernaan yang lebih kuat untuk dapat menerima makanan. ',
  'Namun kendati demikian terdapat beberapa kondisi yang memperboleh seorang '
  'anak mendapat MPASI dibawah usia 6 bulan yaitu jika anak dinilai tidak '
  'memiliki berat badan yang ideal, atau anak sulit mengalami peningkatan '
  'berat badan. '],
 ['Pemberian awal MPASI dapat dimulai dengan pemberian menu tunggal selama 1 - '
  '2 minggu. ',
  'Dalam satu hari anak sudah dapat mulai diajarkan untuk makan 2 - 3 kali '
  'sehari dengan menu yang berbeda - beda, dengan komposisi makanan yang '
  'mengandung karbohidrat - protein - sayuran serta buah - buahan

## Segmentasi menggunakan algoritma Greedy

In [None]:
# Run the greedy splitter, capped at the same number of splits that the optimal
# segmentation produced (requires the optimal-segmentation cell to have run).
greedy_segmentation = split_greedy(sentence_vec, max_splits=len(optimal_segmentation.splits))
greedy_segmented_text = get_segments(sentenced_text, greedy_segmentation)

# NOTE: `//` is floor division, so the reported average is rounded down.
print(f"{len(sentenced_text)} sentences, {len(greedy_segmented_text)} segments, avg. {len(sentenced_text) // len(greedy_segmented_text)} sentences per segment")
pprint(greedy_segmented_text)

# UTS Package 

In [20]:
#https://github.com/intfloat/uts

import uts

In [25]:
# Segment the tokenized sentences with the C99 algorithm from the `uts` package.
document = sentenced_text
# Use a dedicated name: rebinding `model` here would overwrite the word2vec
# model loaded earlier and break every cell that reads `model.wv` on re-run.
c99_segmenter = uts.C99(window=2)
boundary = c99_segmenter.segment(document)
# `boundary` holds one 0/1 flag per input sentence; presumably 1 marks the
# first sentence of a segment -- confirm against the uts documentation.
print(boundary)

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0]


# Centre Borelli - ha

In [29]:
from pathlib import Path

import nltk
import numpy as np
import ruptures as rpt  # our package
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import regexp_tokenize
from ruptures.base import BaseCost
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import LogNorm

In [45]:
nltk.download("stopwords")
# The answers being segmented are written in Indonesian, so filter with NLTK's
# Indonesian stopword list rather than the English one.
STOPWORD_SET = set(stopwords.words("indonesian"))
PUNCTUATION_SET = set("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/selomitazhafiirah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
def preprocess(list_of_sentences: list) -> list:
    """Normalize sentences for bag-of-words comparison.

    Each sentence is lower-cased, tokenized into runs of word characters
    (which drops punctuation), filtered against ``STOPWORD_SET`` and stemmed
    with NLTK's ``PorterStemmer``.

    Args:
        list_of_sentences: Sentences (strings) to preprocess.

    Returns:
        A new list holding, for each input sentence and in the same order,
        the surviving stemmed tokens joined by single spaces.
    """
    # The stemmer does not depend on the sentence: build it once.
    # NOTE(review): PorterStemmer targets English -- confirm that it is
    # appropriate for the Indonesian data used in this notebook.
    stemmer = PorterStemmer()
    transformed = []
    for sentence in list_of_sentences:
        # Raw string: "\w" in a plain literal is an invalid escape sequence.
        tokens = regexp_tokenize(text=sentence.lower(), pattern=r"\w+")
        transformed.append(
            " ".join(
                stemmer.stem(token)
                for token in tokens
                if token not in STOPWORD_SET
            )
        )
    return transformed

In [32]:
def draw_square_on_ax(start, end, ax, linewidth=0.8):
    """Outline the square covering ``start``..``end`` on both axes of ``ax``.

    Both corner coordinates are shifted by -0.5 before drawing (presumably so
    the outline falls on cell borders of an image-style plot -- confirm with
    the caller). Two vertical and two horizontal lines are drawn.

    Args:
        start: Lower corner coordinate, used for both x and y.
        end: Upper corner coordinate, used for both x and y.
        ax: Object exposing ``vlines`` and ``hlines``.
        linewidth: Line width forwarded to both drawing calls.

    Returns:
        The same ``ax`` object that was passed in.
    """
    low, high = start - 0.5, end - 0.5
    # Left and right edges.
    ax.vlines(x=[low, high], ymin=low, ymax=high, linewidth=linewidth)
    # Bottom and top edges.
    ax.hlines(y=[low, high], xmin=low, xmax=high, linewidth=linewidth)
    return ax


In [None]:
# Print each period-delimited piece of the example answer next to its index.
# NOTE: this splits on every '.', independently of the tokenizer output stored
# in `sentenced_text`.
for (line_number, sentence) in enumerate(sentence_example.split('.')):
    # Drop newline characters at either end of the piece before printing.
    sentence = sentence.strip("\n")
    print(f"{line_number:>2}: {sentence}")

## id-sentence-segmenter
from https://github.com/yudanta/id-sentence-segmenter

In [46]:
from idsentsegmenter.sentence_segmentation import SentenceSegmentation

ModuleNotFoundError: No module named 'idsentsegmenter'

In [None]:
# Create a sentence segmenter instance from the SentenceSegmentation class.
sentence_segmenter = SentenceSegmentation()

# Split the example answer into sentences. The previous argument,
# `news_content`, is not defined anywhere in this notebook and raised NameError.
sentences = sentence_segmenter.get_sentences(sentence_example)