# Import required libraries

In [27]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

from collections import defaultdict

import pandas as pd
import numpy as np
import re
from nlp_id.lemmatizer import Lemmatizer

# Load and order data

In [5]:
# Read the raw sentence table; the column selection below requires it to
# provide IdData, Data, Label, DataEditGreedy and DataEditOptimal.
df = pd.read_csv('ProsesSimilarityGabung.csv')

# IdKalimat numbers the rows 0, 1, 2, ... within each IdData group, and
# DataCleaned starts out as an empty placeholder column.
df = df.assign(
    IdKalimat=df.groupby('IdData').cumcount(),
    DataCleaned='',
)

# Keep only the columns used downstream, in a fixed order.
column_order = [
    'IdData', 'IdKalimat', 'Data', 'DataCleaned',
    'Label', 'DataEditGreedy', 'DataEditOptimal',
]
df = df[column_order]

# Preprocess data

In [6]:
lemmatizer = Lemmatizer()
stop_words = stopwords.words("indonesian")
word_detokenizer = TreebankWordDetokenizer()

# Membership tests against a list scan the whole list for every token;
# a frozenset makes each lookup constant time. `stop_words` itself is left
# as the original list for any other cell that uses it.
stop_word_set = frozenset(stop_words)

# Compiled once instead of being looked up again for every row.
digit_pattern = re.compile(r'\d+')
punctuation_pattern = re.compile(r'[^\w\s]')


def cleanSentence(sentence: str) -> str:
    """Normalise one already-lowercased sentence for similarity scoring.

    Steps, in order: remove digit runs, remove every character that is
    neither a word character nor whitespace, lemmatize, drop stop words,
    and join the remaining tokens back into a single string.
    """
    stripped = punctuation_pattern.sub('', digit_pattern.sub('', sentence))
    lemmatized = lemmatizer.lemmatize(stripped)
    kept = [token for token in lemmatized.split() if token not in stop_word_set]
    return word_detokenizer.detokenize(kept)


# Single pass over the column instead of one pass per cleaning step.
df['DataCleaned'] = df['Data'].str.lower().apply(cleanSentence)

# Make some helper functions

#### `makeSimilarityMatrix`: for creating similarity matrix 

In [8]:
def makeSimilarityMatrix(data: list[str]):
    """Return the pairwise cosine similarity between the given texts.

    Each text is turned into a token-count vector with a fresh
    ``CountVectorizer``; the result is a square ``DataFrame`` whose row and
    column labels are the default integer positions of the inputs.
    """
    term_counts = CountVectorizer().fit_transform(data)
    pairwise = cosine_similarity(term_counts)
    return pd.DataFrame(pairwise)

#### `makeSegmentLabelMatrix()`: for making a matrix of segment-label combination

In [61]:
def makeSegmentLabelMatrix(data: pd.DataFrame, field: str):
    """Build a zero-filled score table with one row per segment and one column per label.

    Args:
        data: Frame containing a ``'Label'`` column and the ``field`` column.
        field: Name of the column holding the segment identifiers.

    Returns:
        A float ``DataFrame`` of zeros indexed by the sorted unique values of
        ``data[field]`` with the sorted unique labels as columns.
    """
    column_keys = sorted(data['Label'].unique())
    row_keys = sorted(data[field].unique())

    zeros = np.zeros((len(row_keys), len(column_keys)))
    return pd.DataFrame(zeros, index=row_keys, columns=column_keys)

#### `getLargestValueSum()`: for getting the sum of the largest values

The largest values must not share a row or a column.  
This function is only used inside the `calculateScore()` function.

In [10]:
def getLargestValueSum(matrix: pd.DataFrame):
    """Sum the best-matching values of a similarity sub-matrix.

    The single largest cell is taken first and its row and column are
    cleared. The remaining contribution is the sum of the per-row maxima
    (when there are fewer rows than columns) or the per-column maxima
    (otherwise) of what is left, so ``min(rows, columns)`` values are
    summed in total.

    Only the first largest cell (in row-major order) is cleared. Clearing
    every tied maximum, as a loop over all cells would, discards matches
    that were never counted; for example, a matrix with 1.0 on the whole
    diagonal would sum to 1.0 instead of its size.

    NOTE(review): after the first pick, the per-row / per-column maxima are
    not forced onto distinct columns / rows; confirm whether a strict
    one-to-one matching is required.

    Args:
        matrix: Numeric similarity values; assumed to contain no NaN.

    Returns:
        The summed value as a float, or ``0.0`` for an empty matrix.
    """
    if matrix.empty:
        return 0.0

    # Work on a private NumPy copy: the caller's frame is never modified,
    # and positional writes avoid pandas chained assignment, which silently
    # does nothing under Copy-on-Write.
    remaining = matrix.to_numpy(dtype=float, copy=True)

    topRow, topCol = np.unravel_index(np.argmax(remaining), remaining.shape)
    maxVal1 = remaining[topRow, topCol]

    remaining[topRow, :] = 0.0
    remaining[:, topCol] = 0.0

    # Reduce along the longer side so one value is taken per entry of the
    # shorter side.
    reduceAxis = 1 if remaining.shape[0] < remaining.shape[1] else 0
    maxVal2 = remaining.max(axis=reduceAxis).sum()

    return maxVal1 + maxVal2

#### `calculateScore()`: for calculating similarity greedy score

In [13]:
def calculateScore(segment: int, label: str, df: pd.DataFrame, field: str, cossim_matrix: pd.DataFrame):
    """Score how well one segment matches one label.

    The sentences whose ``field`` value equals ``segment`` form the rows and
    the sentences whose ``'Label'`` equals ``label`` form the columns of a
    sub-matrix of ``cossim_matrix``. The score is
    ``2 * getLargestValueSum(sub_matrix) / (n_rows + n_cols)``.

    Args:
        segment: Segment identifier to look up in ``df[field]``.
        label: Label to look up in ``df['Label']``.
        df: Sentences of one document; must contain ``'IdKalimat'``,
            ``'Label'`` and ``field``.
        field: Name of the column holding the segment identifiers.
        cossim_matrix: Pairwise sentence similarities whose row and column
            labels are the ``'IdKalimat'`` values of ``df``.

    Returns:
        The score as a float; ``0.0`` when either the segment or the label
        has no sentences.
    """
    sgRow = df.loc[df[field] == segment, 'IdKalimat'].to_list()
    sgCol = df.loc[df['Label'] == label, 'IdKalimat'].to_list()

    # Nothing to compare: avoid dividing by zero or scoring an empty matrix.
    if not sgRow or not sgCol:
        return 0.0

    # Slice the matrix that was passed in rather than relying on a global
    # `similarities` variable; a label-based slice also replaces the
    # cell-by-cell chained assignment, which is a no-op under pandas
    # Copy-on-Write.
    sgMatrix = cossim_matrix.loc[sgRow, sgCol]

    maxValTotal = getLargestValueSum(sgMatrix)

    return (2 * maxValTotal) / (len(sgRow) + len(sgCol))

#### `getSegmentLabelPair()`: for labelling a segment

Uses a `defaultdict` from the `collections` library,  
so that a segment without a label is mapped to `'Fx'`.

In [36]:
def getSegmentLabelPair(df: pd.DataFrame):
    """Greedily assign at most one label to each segment.

    Cells of the score table are visited from the highest score to the
    lowest; a cell is accepted when neither its row (segment) nor its column
    (label) has been used yet. Equal scores are visited in row-major order.

    Args:
        df: Score table with segments as the index and labels as columns.

    Returns:
        A ``defaultdict`` mapping segment identifiers (values of
        ``df.index``) to labels (values of ``df.columns``). Looking up a
        segment that received no label yields ``'Fx'``.
    """
    scores = df.to_numpy()
    columnCount = scores.shape[1]
    # No more pairs than this can exist, so the scan can stop there.
    maxPairs = min(scores.shape)

    usedRows = set()
    usedCols = set()
    pairs = {}

    # A stable sort keeps tied scores in row-major order, making the
    # assignment deterministic.
    for flatIndex in np.argsort(-scores.ravel(), kind="stable"):
        if len(pairs) == maxPairs:
            break
        row, col = divmod(int(flatIndex), columnCount)
        if row in usedRows or col in usedCols:
            continue
        usedRows.add(row)
        usedCols.add(col)
        # Translate positions to labels on BOTH axes: keys must be the
        # segment identifiers themselves so that mapping a segment column
        # through this dict works for any identifiers, not only 0..n-1.
        pairs[df.index[row]] = df.columns[col]

    return defaultdict(lambda: 'Fx', pairs)

# Processing greedy segment

In [62]:
# Assign a label to every greedy segment, one document at a time.
df['LabelGreedy'] = ''

for doc_id in df['IdData'].unique():
    mask = df['IdData'] == doc_id

    # Per-document working copy with a fresh 0..n-1 index, built with a
    # single filter instead of one filter per column.
    data = df.loc[
        mask, ['IdKalimat', 'Data', 'DataCleaned', 'Label', 'DataEditGreedy']
    ].reset_index(drop=True)

    # Kept under this exact module-level name in case other cells read
    # `similarities` as a global.
    similarities = makeSimilarityMatrix(data['DataCleaned'])

    slMatrix = makeSegmentLabelMatrix(data, 'DataEditGreedy')

    for label in slMatrix.columns:
        for index in slMatrix.index:
            # Single .loc write: chained `slMatrix[label][index] = ...`
            # leaves the matrix untouched under pandas Copy-on-Write.
            slMatrix.loc[index, label] = calculateScore(
                index, label, data, 'DataEditGreedy', similarities
            )

    segmentLabelPair = getSegmentLabelPair(slMatrix)

    df.loc[mask, 'LabelGreedy'] = df.loc[mask, 'DataEditGreedy'].map(segmentLabelPair)

# Processing optimal segment

In [63]:
# Assign a label to every optimal segment, one document at a time.
df['LabelOptimal'] = ''

for doc_id in df['IdData'].unique():
    mask = df['IdData'] == doc_id

    # Per-document working copy with a fresh 0..n-1 index, built with a
    # single filter instead of one filter per column.
    data = df.loc[
        mask, ['IdKalimat', 'Data', 'DataCleaned', 'Label', 'DataEditOptimal']
    ].reset_index(drop=True)

    # Kept under this exact module-level name in case other cells read
    # `similarities` as a global.
    similarities = makeSimilarityMatrix(data['DataCleaned'])

    slMatrix = makeSegmentLabelMatrix(data, 'DataEditOptimal')

    for label in slMatrix.columns:
        for index in slMatrix.index:
            # Single .loc write: chained `slMatrix[label][index] = ...`
            # leaves the matrix untouched under pandas Copy-on-Write.
            slMatrix.loc[index, label] = calculateScore(
                index, label, data, 'DataEditOptimal', similarities
            )

    segmentLabelPair = getSegmentLabelPair(slMatrix)

    df.loc[mask, 'LabelOptimal'] = df.loc[mask, 'DataEditOptimal'].map(segmentLabelPair)

# Export result to CSV

In [64]:
# Write the labelled table to disk without the DataFrame index column.
df.to_csv(path_or_buf='SimilarityGreedyResult.csv', index=False)