# Import required libraries

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from textsplit.tools import get_penalty, get_segments
from textsplit.algorithm import split_optimal, split_greedy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nlp_id.lemmatizer import Lemmatizer

import pandas as pd
import re

# Load and preprocess data

In [2]:
# Read the sentence-level dataset, number each sentence within its document
# (IdKalimat = 0, 1, 2, ... per IdData), add an empty placeholder for the
# cleaned text, and keep only the columns used downstream, in a fixed order.
df = (
    pd.read_csv("Data.csv")
    .assign(
        IdKalimat=lambda frame: frame.groupby('IdData').cumcount(),
        DataCleaned='',
    )
    .loc[:, ['IdData', 'IdKalimat', 'Data', 'DataCleaned', 'Label']]
)

In [3]:
lemmatizer = Lemmatizer()
stop_words = stopwords.words("indonesian")
word_detokenizer = TreebankWordDetokenizer()

# `stop_words` is kept as the list returned by NLTK for any other consumer;
# the frozenset is used only for the per-token membership test, which would
# otherwise scan the whole list for every token of every sentence.
_stop_word_lookup = frozenset(stop_words)

# Compiled once instead of being looked up again for every row.
_DIGITS_RE = re.compile(r'\d+')
_PUNCTUATION_RE = re.compile(r'[^\w\s]')


def _clean_sentence(sentence):
    """Return the cleaned form of one already lower-cased sentence.

    Steps, in order: strip digit runs, strip every character that is neither
    a word character nor whitespace, lemmatize, drop stop words, and join the
    remaining tokens back into a single string.
    """
    without_digits = _DIGITS_RE.sub('', sentence)
    without_punctuation = _PUNCTUATION_RE.sub('', without_digits)
    lemmatized = lemmatizer.lemmatize(without_punctuation)
    kept_tokens = [
        token for token in lemmatized.split()
        if token not in _stop_word_lookup
    ]
    return word_detokenizer.detokenize(kept_tokens)


df['DataCleaned'] = df['Data'].str.lower().apply(_clean_sentence)

# Load Word2Vec model

In [4]:
# Location of the saved gensim Word2Vec model on disk.
word2vec_model_path = "idwiki_word2vec_200/idwiki_word2vec_200.model"
model = Word2Vec.load(word2vec_model_path)

# Create WordVector from model

Source: https://stackoverflow.com/questions/46885454/how-to-create-a-dataframe-with-the-word2ve-vectors-as-data-and-the-terms-as-row

In [5]:
# Build a DataFrame of word vectors: one row per vocabulary term (the term is
# the row label), rows ordered by ascending corpus count of the term.
keyed_vectors = model.wv

# (term, row index into the vector matrix, corpus count), sorted by count.
ordered_vocab = sorted(
    (
        (
            term,
            keyed_vectors.key_to_index[term],
            keyed_vectors.get_vecattr(term, "count"),
        )
        for term in keyed_vectors.index_to_key
    ),
    key=lambda entry: entry[2],
)

# Transpose the list of triples into three parallel tuples.
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# Pick the vector rows in the sorted order and label them with their terms.
reordered_vectors = keyed_vectors.vectors[term_indices, :]
wordvec = pd.DataFrame(reordered_vectors, index=ordered_terms)

# Create CountVectorizer

In [6]:
# Fix the vectorizer's vocabulary to the word-vector row labels so that the
# columns of its count matrix line up with the rows of `wordvec`.
embedding_vocabulary = wordvec.index
count_vec = CountVectorizer(vocabulary=embedding_vocabulary)

# Segmentation optimal & greedy

In [11]:
# Segment length passed to get_penalty() when deriving the split penalty.
segment_length = 3

# One segment id per row of `df`, addressed by row position. Every row starts
# in segment 0, so documents that cannot be segmented (too short) still get a
# label and both lists always have exactly len(df) entries.
segment_result = {
    'DataEditOptimal': [0] * len(df),
    'DataEditGreedy': [0] * len(df),
}


def _store_segment_ids(column, row_positions, segments):
    """Record the segment index of each sentence in `segment_result[column]`.

    `segments` is the list of consecutive sentence groups returned by
    get_segments(); the i-th sentence across all groups belongs to the row at
    `row_positions[i]`.
    """
    segment_ids = [
        seg_id
        for seg_id, segment in enumerate(segments)
        for _ in segment
    ]
    for row_position, seg_id in zip(row_positions, segment_ids):
        segment_result[column][row_position] = seg_id


for IdData in df['IdData'].unique():
    # Row positions of this document's sentences; writing results by position
    # keeps labels aligned with `df` even if a document's rows are not
    # contiguous in the file.
    in_document = (df['IdData'] == IdData).to_numpy()
    row_positions = in_document.nonzero()[0]

    # NOTE(review): vectors are built from the raw 'Data' column, so the
    # 'DataCleaned' column is not used for segmentation - confirm intended.
    sentence_text = df.loc[in_document, 'Data']
    # Sum of the word vectors of each sentence (term counts x embeddings).
    sentence_vec = count_vec.transform(sentence_text).dot(wordvec)

    try:
        penalty = get_penalty([sentence_vec], segment_length)
    except ValueError:
        # Treated as "document too short to split": its sentences keep the
        # default segment id 0 in both result columns.
        print(f"doc_id {IdData} is too short ({len(sentence_text)})")
        continue

    optimal_segmentation = split_optimal(sentence_vec, penalty, seg_limit=250)
    optimal_segment_text = get_segments(sentence_text, optimal_segmentation)

    # Ask the greedy splitter for as many splits as the optimal one produced
    # so the two segmentations are comparable.
    greedy_segmentation = split_greedy(
        sentence_vec, max_splits=len(optimal_segmentation.splits)
    )
    greedy_segment_text = get_segments(sentence_text, greedy_segmentation)

    _store_segment_ids('DataEditGreedy', row_positions, greedy_segment_text)
    _store_segment_ids('DataEditOptimal', row_positions, optimal_segment_text)

# Export result to CSV

In [12]:
# Attach the per-sentence segment ids (greedy first, then optimal, which
# fixes the column order in the output) and write the table without the
# DataFrame index.
for result_column in ('DataEditGreedy', 'DataEditOptimal'):
    df[result_column] = segment_result[result_column]

df.to_csv('OutputTextsplit.csv', index=False)