### Load file `input2.csv`

In [1]:
import pandas as pd

# Sentence-level input table; later cells read its `doc_id`, `sentence_id`
# and `sentence_text` columns.
df = pd.read_csv(filepath_or_buffer="input2.csv")

### Load pretrained model Word2Vec

In [4]:
from gensim.models import Word2Vec

# Pretrained embedding model loaded from disk.
# NOTE(review): the path suggests an Indonesian-Wikipedia model with
# 200-dimensional vectors -- confirm against the model's provenance.
model = Word2Vec.load(
    "idwiki_word2vec_200/idwiki_word2vec_200.model"
)

### Membuat WordVector dari model

Sumber: https://stackoverflow.com/questions/46885454/how-to-create-a-dataframe-with-the-word2ve-vectors-as-data-and-the-terms-as-row

In [5]:
# One (term, row position in the vector matrix, stored "count" attribute)
# record per vocabulary key.
ordered_vocab = [
    (
        term,
        model.wv.key_to_index[term],
        model.wv.get_vecattr(term, "count"),
    )
    for term in model.wv.index_to_key
]

# Stable in-place sort by ascending count (ties keep vocabulary order).
ordered_vocab.sort(key=lambda record: record[2])

# Transpose the records into three parallel tuples.
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# Word-vector table: one row per term (in the sorted order above),
# one column per embedding dimension, indexed by the term itself.
wordvec = pd.DataFrame(
    data=model.wv.vectors[term_indices, :],
    index=ordered_terms,
)

### Membuat CountVectorizer

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Fix the vectorizer's vocabulary to the rows of `wordvec`, so that column j
# of the count matrix corresponds to row j of the word-vector table and the
# two can be multiplied directly.
count_vec = CountVectorizer(
    vocabulary=wordvec.index,
)

### Segmentasi menggunakan algoritma optimal

In [23]:
from textsplit.tools import get_penalty, get_segments
from textsplit.algorithm import split_optimal

# Segment length handed to `get_penalty` when deriving the split penalty.
segment_len = 3

# NOTE(review): document 87 is excluded without a recorded reason in this
# notebook -- confirm why it has to be skipped and document it here.
SKIPPED_DOC_IDS = {87}

# Long-format result: one entry per sentence, tagged with its document and
# the segment it was assigned to.
out = {
    "doc_id": [],
    "segment_id": [],
    "sentence_id": []
}

# Segment every document independently.
# `groupby` visits each document once (instead of rescanning the whole frame
# per document), keeps the original row order inside each group, and yields
# the documents in sorted `doc_id` order so the output order is reproducible.
for doc_id, doc_rows in df.groupby("doc_id", sort=True):
    if doc_id in SKIPPED_DOC_IDS:
        continue

    # Parallel lists: position i in both refers to the same input row.
    sentence_text = doc_rows["sentence_text"].tolist()
    sentence_ids = doc_rows["sentence_id"].tolist()

    # Sentence vectors: term-count matrix times the word-vector table, i.e.
    # the count-weighted sum of the vectors of the vocabulary terms found in
    # each sentence.
    sentence_vec = count_vec.transform(sentence_text).dot(wordvec)
    penalty = get_penalty([sentence_vec], segment_len)

    # NOTE(review): seg_limit=250 is kept from the original; presumably an
    # upper bound on segment length used by the solver -- confirm.
    optimal_segmentation = split_optimal(sentence_vec, penalty, seg_limit=250)

    # Regroup the sentence *ids* by the computed split positions.
    # The previous version regrouped the texts and then looked each text up
    # in the whole frame, taking the first match; any sentence text occurring
    # more than once (in this or another document) received the wrong id.
    segment_sentence_ids = get_segments(sentence_ids, optimal_segmentation)

    for seg_id, seg in enumerate(segment_sentence_ids):
        for sentence_id in seg:
            out["doc_id"].append(doc_id)
            out["segment_id"].append(f"s{doc_id}_{seg_id}")
            out["sentence_id"].append(sentence_id)

### Ekspor hasil segmentasi ke CSV

In [24]:
from datetime import date

# Materialise the collected assignments as a table.
out_df = pd.DataFrame(data=out)

# The export file name carries the run date as DDMMYYYY.
today = date.today()
out_df.to_csv(
    f"textsplit_output_{today.strftime('%d%m%Y')}.csv",
    index=False,
)