## Import required libraries

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

import pandas as pd
import numpy as np
import re
from nlp_id.lemmatizer import Lemmatizer

## Load data

In [2]:
# Read the source CSV and keep only the columns this notebook works with.
df = pd.read_csv('./proses_similarity1.csv')[
    ['IdData', 'Data', 'Label', 'DataEditGreedy', 'DataEditOptimal']
]

## Pre-processing

1. Lowercasing

In [3]:
# Lowercase every sentence in `Data`; the result goes into a new column so the raw text stays available.
df['DataLower'] = df['Data'].str.lower()

2. Removing punctuation and numbers

In [4]:
# Inner substitution drops digit runs; the outer one then drops every character
# that is neither a word character nor whitespace.
df['DataLowerPunct'] = df['DataLower'].map(
    lambda text: re.sub(r'[^\w\s]', '', re.sub(r'\d+', '', text))
)

3. Lemmatization

In [5]:
# One nlp_id Lemmatizer instance is created and reused for every sentence.
lemmatizer = Lemmatizer()

# The bound method is passed directly; each cleaned sentence is lemmatized as a whole string.
df['DataLemmatized'] = df['DataLowerPunct'].map(lemmatizer.lemmatize)

4. Stopwords removal

In [6]:
# NLTK's Indonesian stopword list; kept as the original list object.
stop_words = stopwords.words("indonesian")
# Set copy of the list so each token lookup is a hash check instead of a linear scan.
stop_word_set = set(stop_words)

# Split each lemmatized sentence on whitespace and keep the tokens that are not stopwords.
# The column holds token lists at this point.
df['DataStopwordsCleaned'] = df['DataLemmatized'].apply(
    lambda s: [v for v in s.split() if v not in stop_word_set]
)

In [7]:
word_detokenizer = TreebankWordDetokenizer()

# Join each token list back into a single string, overwriting the column in place.
# Values that are already strings are left untouched, so re-running this cell does
# not pass a string to the detokenizer (which would treat it as a sequence of
# single-character tokens).
df['DataStopwordsCleaned'] = df['DataStopwordsCleaned'].apply(
    lambda s: s if isinstance(s, str) else word_detokenizer.detokenize(s)
)

## Sample data for testing

In [8]:
# Value of `IdData` whose rows form the test sample.
SAMPLE_ID_DATA = 0

# Select the sample rows in one vectorized step instead of appending row by row.
# Column order matches the table built previously:
# IdKalimat, Data, DataCleaned, Label, DataEditGreedy, LabelGreedy.
data = (
    df.loc[df['IdData'] == SAMPLE_ID_DATA,
           ['Data', 'DataStopwordsCleaned', 'Label', 'DataEditGreedy']]
    .rename(columns={'DataStopwordsCleaned': 'DataCleaned'})
    .reset_index(drop=True)
)

# IdKalimat numbers the sampled sentences 0..n-1 in their original order.
data.insert(0, 'IdKalimat', list(range(len(data))))
# LabelGreedy starts empty and is filled in by the labelling step.
data['LabelGreedy'] = ''

data

Unnamed: 0,IdKalimat,Data,DataCleaned,Label,DataEditGreedy,LabelGreedy
0,0,"Halo yayan, Terima kasih atas pertanyaannya",halo yayan terima kasih,F1,0,
1,1,Berdasarkan riwayat keluhan istri Anda yang su...,dasar riwayat keluh istri muntah muntah diare ...,F4,1,
2,2,Gejala muntah dan diare pada dasarnya merupaka...,gejala muntah diare dasar rupa respon tubuh wa...,F3,1,
3,3,Kondisi ini dapat membaik dengan sendirinya da...,kondisi hitung obat khusus dokter,F3,2,
4,4,"Pantau terus gejala yang muncul, kembalikan ca...",pantau gejala muncul cair tubuh monitor tanda ...,F5,3,
5,5,"Selain itu, berikut anjuran yang dapat dilakuk...",anjur laku keluh hindar makan jam,F5,4,
6,6,"Minum air, sup, atau minuman elektrolit untuk ...",minum air sup minum elektrolit bantu cair tubu...,F5,4,
7,7,"Ketika makan nanti, makan sedikit - sedikit de...",makan makan makan tawar roti roti nasi bubur b...,F5,4,
8,8,Istirahat yang cukup,istirahat,F5,4,
9,9,"Menghindari susu dan produk susu, kopi, minuma...",hindar susu produk susu kopi minum soda makan ...,F5,5,


## Making cosine similarity matrix

In [9]:
# Turn each cleaned sentence into a vector of token counts.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['DataCleaned'])
# Pairwise cosine similarity between all sentence vectors, wrapped in a DataFrame
# with default integer labels, so row/column i corresponds to the i-th row of `data`.
similarities = pd.DataFrame(cosine_similarity(X))
similarities

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.387298,0.08165,0.182574,0.223607,0.110096,0.265684,0.0,0.101274,0.263523,0.0,0.0
2,0.0,0.387298,1.0,0.0,0.117851,0.096225,0.284268,0.171499,0.0,0.130744,0.204124,0.0,0.0
3,0.0,0.08165,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.182574,0.117851,0.0,1.0,0.068041,0.150756,0.0,0.0,0.0,0.096225,0.0,0.0
5,0.0,0.223607,0.096225,0.0,0.068041,1.0,0.0,0.297044,0.0,0.226455,0.0,0.0,0.0
6,0.0,0.110096,0.284268,0.0,0.150756,0.0,1.0,0.0,0.0,0.167248,0.0,0.1066,0.0
7,0.0,0.265684,0.171499,0.0,0.0,0.297044,0.0,1.0,0.0,0.201802,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,0.0,0.101274,0.130744,0.0,0.0,0.226455,0.167248,0.201802,0.0,1.0,0.0,0.0,0.0


## Determining segment combinations

In [10]:
# Distinct labels become the columns and distinct greedy segment ids the rows,
# both in sorted order.
labels = sorted(data['Label'].unique())
segmentId = sorted(data['DataEditGreedy'].unique())

# Score table with one cell per (segment, label) pair, initialised to 0.0.
segmentLabelCombination = pd.DataFrame(0.0, index=segmentId, columns=labels)

segmentLabelCombination

Unnamed: 0,F1,F3,F4,F5
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0


## Calculate similarity greedy score

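Function that takes the largest value in a matrix, blanks out that value's row and column, and adds the largest remaining value of each row (or of each column, whichever axis is shorter)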

In [11]:
def getMaxValueSum(matrix: pd.DataFrame):
    """Greedily sum the largest entries of a similarity matrix.

    The single largest entry is taken first, and its row and column are then
    set to 0.0. From what is left, the maximum of every row is added when the
    matrix has fewer rows than columns; otherwise the maximum of every column
    is added.

    Only the first occurrence of the maximum (row-major order) is removed.
    Exactly tied maxima in other rows/columns therefore remain available for
    the second step instead of being zeroed without being counted.

    Parameters
    ----------
    matrix : pd.DataFrame
        Numeric matrix of similarity values. It is not modified.

    Returns
    -------
    float
        The largest value plus the sum of the remaining per-row or per-column
        maxima; 0.0 when the matrix has no rows or no columns.
    """
    if matrix.empty:
        return 0.0

    # Work on a private positional copy: avoids chained-assignment pitfalls and
    # stays correct even if the frame has duplicate index/column labels.
    values = matrix.to_numpy(dtype=float, copy=True)

    # np.argmax returns the first maximum of the flattened (row-major) array.
    rowPos, colPos = np.unravel_index(np.argmax(values), values.shape)
    maxVal1 = values[rowPos, colPos]

    # Remove the chosen row and column from further consideration.
    values[rowPos, :] = 0.0
    values[:, colPos] = 0.0

    # Take one value per entry of the shorter axis.
    if values.shape[0] < values.shape[1]:
        maxValArr = values.max(axis=1)
    else:
        maxValArr = values.max(axis=0)

    maxVal2 = maxValArr.sum()
    maxValTotal = maxVal1 + maxVal2

    return maxValTotal

Function to calculate similarity greedy score

In [12]:
def calculateScore(segment: int, label: str, df: pd.DataFrame, cossim_matrix: pd.DataFrame):
    """Score how well one greedy segment matches one label.

    The sentences of the segment form the rows and the sentences carrying the
    label form the columns of a sub-matrix of ``cossim_matrix``. The score is
    ``2 * getMaxValueSum(sub_matrix) / (n_rows + n_cols)``.

    Parameters
    ----------
    segment : int
        Value of ``DataEditGreedy`` identifying the segment.
    label : str
        Value of ``Label`` to compare against.
    df : pd.DataFrame
        Frame with the columns ``IdKalimat``, ``DataEditGreedy`` and ``Label``.
    cossim_matrix : pd.DataFrame
        Pairwise similarity matrix whose row and column labels are
        ``IdKalimat`` values.

    Returns
    -------
    float
        The score, or 0.0 when the segment or the label has no sentences.
    """
    sgRow = df.loc[df['DataEditGreedy'] == segment, 'IdKalimat'].to_list()
    sgCol = df.loc[df['Label'] == label, 'IdKalimat'].to_list()

    # Nothing to compare: avoid a NaN score or a division by zero.
    if not sgRow or not sgCol:
        return 0.0

    # Read from the matrix that was passed in (not from a notebook global) and
    # slice it in one step instead of filling cells through chained indexing.
    sgMatrix = cossim_matrix.loc[sgRow, sgCol]

    maxValTotal = getMaxValueSum(sgMatrix)

    score = ((2 * maxValTotal) / (len(sgRow) + len(sgCol)))
    return score

Put back all values into the segment-label combination matrix

In [13]:
# Score every (segment, label) pair. Each cell is written with a single .loc
# call: chained indexing (frame[col][row] = ...) assigns through an
# intermediate object and does not update the frame under pandas Copy-on-Write.
for label in segmentLabelCombination.columns:
    for index in segmentLabelCombination.index:
        segmentLabelCombination.loc[index, label] = calculateScore(index, label, data, similarities)

segmentLabelCombination

Unnamed: 0,F1,F3,F4,F5
0,0.5,0.0,0.0,0.0
1,0.0,0.540825,0.666667,0.122212
2,0.0,0.666667,0.08165,0.0
3,0.0,0.078567,0.182574,0.25
4,0.030457,0.094756,0.106274,0.727273
5,0.571429,0.068041,0.105409,0.383018


## Get largest values not in the same row and column

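Repeat the same selection process as before, this time on the segment-label score matrix, returning the selected values themselves instead of their sum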

In [14]:
def getMaxValArr(segmentLabelCombination: pd.DataFrame):
    """Return the scores selected greedily from the segment-label matrix.

    The largest score is taken first, and its row and column are then set to
    0.0. From what is left, the maximum of every row is taken when there are
    fewer rows than columns; otherwise the maximum of every column is taken.

    Only the first occurrence of the largest score (row-major order) is
    removed, so an exactly tied score in another row/column is still returned
    instead of being zeroed and lost.

    Parameters
    ----------
    segmentLabelCombination : pd.DataFrame
        Numeric score matrix. It is not modified.

    Returns
    -------
    list of float
        The per-row or per-column maxima followed by the overall maximum, with
        zero values dropped. Empty list when the matrix has no rows or columns.
    """
    if segmentLabelCombination.empty:
        return []

    # Private positional copy; avoids chained assignment on the DataFrame.
    scores = segmentLabelCombination.to_numpy(dtype=float, copy=True)

    # np.argmax returns the first maximum of the flattened (row-major) array.
    rowPos, colPos = np.unravel_index(np.argmax(scores), scores.shape)
    maxVal = float(scores[rowPos, colPos])

    # Remove the chosen row and column from further consideration.
    scores[rowPos, :] = 0.0
    scores[:, colPos] = 0.0

    # Take one value per entry of the shorter axis.
    if scores.shape[0] < scores.shape[1]:
        maxValArr = scores.max(axis=1)
    else:
        maxValArr = scores.max(axis=0)

    maxValArr = maxValArr.tolist()
    maxValArr.append(maxVal)

    # A zero means nothing was left to select for that row/column.
    maxValArr = [val for val in maxValArr if val != 0]

    return maxValArr

## Labelling

Get the index (segment) and column (label) of each selected value,
then assign that label to every sentence in the segment.

Finally, fill the unlabelled segments with `Fx`

In [15]:
vals = getMaxValArr(segmentLabelCombination)

for v in vals:
    # Locate the cell holding this score.
    # NOTE(review): `.item()` requires exactly one matching column and one
    # matching row, so this raises ValueError when the same score value occurs
    # in more than one row or column of the matrix.
    hits = segmentLabelCombination.isin([v])
    label = segmentLabelCombination.columns[hits.any()].item()
    segment = segmentLabelCombination.index[hits.any(axis=1)].item()

    # Every sentence of the segment receives the label.
    data.loc[data['DataEditGreedy'] == segment, 'LabelGreedy'] = label

# Segments that received no label are marked 'Fx'. The result is assigned back
# explicitly: an in-place replace on `data['LabelGreedy']` is a chained call
# that does not update `data` under pandas Copy-on-Write.
data['LabelGreedy'] = data['LabelGreedy'].replace('', 'Fx')
data

Unnamed: 0,IdKalimat,Data,DataCleaned,Label,DataEditGreedy,LabelGreedy
0,0,"Halo yayan, Terima kasih atas pertanyaannya",halo yayan terima kasih,F1,0,Fx
1,1,Berdasarkan riwayat keluhan istri Anda yang su...,dasar riwayat keluh istri muntah muntah diare ...,F4,1,F4
2,2,Gejala muntah dan diare pada dasarnya merupaka...,gejala muntah diare dasar rupa respon tubuh wa...,F3,1,F4
3,3,Kondisi ini dapat membaik dengan sendirinya da...,kondisi hitung obat khusus dokter,F3,2,F3
4,4,"Pantau terus gejala yang muncul, kembalikan ca...",pantau gejala muncul cair tubuh monitor tanda ...,F5,3,Fx
5,5,"Selain itu, berikut anjuran yang dapat dilakuk...",anjur laku keluh hindar makan jam,F5,4,F5
6,6,"Minum air, sup, atau minuman elektrolit untuk ...",minum air sup minum elektrolit bantu cair tubu...,F5,4,F5
7,7,"Ketika makan nanti, makan sedikit - sedikit de...",makan makan makan tawar roti roti nasi bubur b...,F5,4,F5
8,8,Istirahat yang cukup,istirahat,F5,4,F5
9,9,"Menghindari susu dan produk susu, kopi, minuma...",hindar susu produk susu kopi minum soda makan ...,F5,5,F1
