Write a few sentences to encode (sentences **0** and **2** are similar to each other):

In [None]:
import numpy as np
import pandas as pd
import re

def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Every slice holds ``n`` items except possibly the last one, which holds
    whatever remains.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

        
# Load the corpus and keep only rows whose sentence is shorter than 512
# characters.
df = pd.read_csv('preprocessed.csv')
short_enough = df["sentence"].str.len() < 512
df = df[short_enough]

display(df)

sentences = list(df["sentence"])
precursors = list(df["precursor"])
# succesors = list(df["succesor"])

# ----------------
# preprocessing
# ----------------


def _strip_markers(texts):
    """Apply the three clean-up substitutions, in order, to every text.

    Each pattern match is replaced by a single space:
    a ``b``/``z``/``f``/``i`` followed by a space, ``sp<digits>`` followed by
    a space, and a run of newlines/carriage returns followed by a space.
    """
    # NOTE(review): the first pattern has no word boundary, so it also matches
    # the last letter of any word ending in b/z/f/i — confirm this is intended.
    cleaned = texts
    for pattern in ("(b|z|f|i) ", "sp[0-9]+ ", "(\n|\r)+ "):
        cleaned = [re.sub(pattern, " ", text) for text in cleaned]
    return cleaned


sentences = _strip_markers(sentences)

display(sentences[0:5])

# Keep the flat list for index-based lookups; the nested list holds batches of
# 256 texts.
flat_sentences = sentences
sentences = list(chunks(flat_sentences, 256))

print(len(sentences))

# -------------

precursors = _strip_markers(precursors)

display(precursors[0:5])

flat_precursors = precursors
precursors = list(chunks(flat_precursors, 256))

# -------------

# Label columns are every column from the fourth one onward.
cols = df.columns
label_cols = list(cols[3:])
num_labels = len(label_cols)
print('Label columns: ', label_cols)
classes = label_cols

# Store each row's label values as one array in a single 'labels' column.
df['labels'] = list(df[label_cols].values)
display(df.head())

# Per-row label vectors, in the same order as the flat sentence list.
labels = list(df["labels"].values)
display(labels[0:5])

Initialize our model:

In [None]:
!pip3 install sentence_transformers --upgrade

In [5]:
from sentence_transformers import SentenceTransformer

# Instantiate the sentence-embedding model by its model name.
# NOTE(review): the encoding cells allocate (0, 768) arrays, so they assume
# this model emits 768-dimensional embeddings — confirm if the model changes.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

Encode the sentences:

In [None]:
from tqdm import trange

# Encode the sentence batches one at a time.  The per-batch results are
# collected first and concatenated once: concatenating inside the loop
# re-copies every previously encoded row on each iteration.
sentence_batches = [model.encode(sentences[idx]) for idx in trange(len(sentences))]

# The empty (0, 768) float64 seed keeps the result a float64 (n, 768) matrix,
# and keeps it well-defined when there are no batches at all.
sentence_embeddings = np.concatenate(
    [np.zeros((0, 768)), *sentence_batches], axis=0
)

In [None]:
# Encode the precursor batches one at a time.  The loop is bounded by the
# number of precursor batches (not sentence batches), and the per-batch
# results are concatenated once instead of re-copying the growing matrix on
# every iteration.
precursor_batches = [model.encode(precursors[idx]) for idx in trange(len(precursors))]

# The empty (0, 768) float64 seed keeps the result a float64 (n, 768) matrix,
# and keeps it well-defined when there are no batches at all.
precursors_embeddings = np.concatenate(
    [np.zeros((0, 768)), *precursor_batches], axis=0
)

In [None]:
# Report both embedding matrices' shapes, one per line, before joining them.
print(sentence_embeddings.shape, precursors_embeddings.shape, sep="\n")

# Join column-wise: each row is a sentence embedding followed by the embedding
# of its precursor.
full_embeddings = np.hstack((sentence_embeddings, precursors_embeddings))
full_embeddings.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity of every row against every row of the same
# matrix, computed separately for the sentence, precursor and combined
# embeddings.  With a single argument the matrix is compared against itself.
sims = cosine_similarity(sentence_embeddings)
precursors_sims = cosine_similarity(precursors_embeddings)
full_sims = cosine_similarity(full_embeddings)

print(sims.shape)

# Similarity of the first sentence with itself.
print(sims[0, 0])

In [None]:
def n_most_similar(similarities, sentences, n):
    """Return the ``n`` highest-scoring entries, best first.

    Args:
        similarities: scores, one per entry of ``sentences``.
        sentences: sequence indexed in the same order as ``similarities``.
        n: maximum number of pairs to return.

    Returns:
        A list of ``(sentence, similarity)`` tuples ordered by descending
        similarity.
    """
    descending = np.argsort(similarities)[::-1]
    top_n = descending[:n]
    return [(sentences[pos], similarities[pos]) for pos in top_n]
    
# Show the first sentence, then its five highest-scoring entries according to
# row 0 of the sentence similarity matrix.
print(flat_sentences[0])
n_most_similar(sims[0], flat_sentences, 5)

In [None]:
def cutoff_most_similar(similarities, sentences, labels, cutoff):
    """Collect every entry whose similarity is strictly above ``cutoff``.

    Args:
        similarities: scores, one per entry of ``sentences`` / ``labels``.
        sentences: sequence indexed in the same order as ``similarities``.
        labels: sequence indexed in the same order as ``similarities``.
        cutoff: exclusive lower bound on the similarity.

    Returns:
        A list of ``[sentence, label, similarity]`` lists, in input order.
    """
    matches = []
    for position, score in enumerate(similarities):
        if score > cutoff:
            matches.append([sentences[position], labels[position], score])
    return matches
  
# Show the second sentence (index 1).
print(flat_sentences[1])

# Entries whose combined (sentence + precursor) similarity to sentence 1
# exceeds 0.8.
sim_result = cutoff_most_similar(full_sims[1], flat_sentences, labels, 0.8)

display(sim_result)

In [None]:
# Same cutoff query as for the combined similarities, but using the
# sentence-only similarity row of sentence 1.
sim_result = cutoff_most_similar(sims[1], flat_sentences, labels, 0.8)

display(sim_result)

In [None]:
from math import log, e

def entropy(labels, base=None):
    """Return the Shannon entropy of the values in ``labels``.

    Args:
        labels: sequence of label values.
        base: logarithm base; the natural logarithm is used when ``None``.

    Returns:
        ``0`` when there is at most one label or only one distinct value,
        otherwise the entropy of the empirical value distribution.
    """
    total = len(labels)
    if total <= 1:
        return 0

    _, occurrences = np.unique(labels, return_counts=True)
    probabilities = occurrences / total
    if np.count_nonzero(probabilities) <= 1:
        return 0

    log_base = e if base is None else base

    # Accumulate -sum(p * log(p)) one term at a time.
    result = 0.
    for probability in probabilities:
        result -= probability * log(probability, log_base)
    return result

def calc_error(label, other_labels):
    """Return the element-wise label difference, raised to the 100th power.

    ``label`` is compared against every column of ``other_labels``.  For
    differences of -1, 0 or 1 (e.g. 0/1 labels) the even power maps a match
    to 0 and a mismatch to 1.

    Args:
        label: reference label vector.
        other_labels: 2-D array whose columns are the label vectors to
            compare against ``label``.

    Returns:
        A float array with one row per column of ``other_labels``.
    """
    # Alternatives tried earlier: element-wise product (optimistic), logical
    # xor (pessimistic) and absolute difference.
    per_column = [
        np.power(np.subtract(label, column), 100)
        for column in other_labels.transpose()
    ]
    return np.array(per_column, dtype=float)


def entropy_cutoff_most_similar(similarities, sentences, labels, cutoff):
    """Score label purity among the entries more similar than ``cutoff``.

    Args:
        similarities: scores, one per entry of ``sentences`` / ``labels``.
        sentences: sequence indexed in the same order as ``similarities``.
        labels: per-entry label vectors, in the same order.
        cutoff: exclusive lower bound on the similarity.

    Returns:
        A pair ``(scores, count)``: ``scores`` holds ``1 - entropy`` for each
        row of the transposed label matrix of the selected entries, and
        ``count`` is how many entries were selected.
    """
    selected = [
        [sentences[position], labels[position], score]
        for position, score in enumerate(similarities)
        if score > cutoff
    ]

    # Transpose so that each row holds one label position across all selected
    # entries.
    label_rows = np.transpose(np.array([entry[1] for entry in selected]))

    purity = [1 - entropy(row) for row in label_rows]
    return purity, len(selected)

# rules
# the similarity of a text sentence should fall in the domain of [0, 1]
# any arbitrary pair of sentence s and p are always 100% similar to each other, iff s.text = p.text and s.label = p.label
# for any arbitrary pair of sentence s and p, similarity(s, p) = similarity(p, s)
# if s.label = p.label and text_similarity(s, p) = 0, then similarity(s, p) = 0
# if s.label != p.label and text_similarity(s, p) = 1, then similarity(s, p) = 0
# if s.label != p.label and text_similarity(s, p) = 0, then similarity(s, p) = 1
def error_similar(idx, similarities, sentences, labels):
    """Compare label disagreement with text dissimilarity for one reference.

    The labels of entry ``idx`` are compared against the labels of every
    entry via ``calc_error``; each resulting value is then replaced by its
    absolute distance to ``1 - similarity`` of the corresponding entry.

    Args:
        idx: index of the reference entry.
        similarities: array of similarities between the reference and every
            entry.
        sentences: sequence indexed in the same order as ``similarities``.
        labels: per-entry label vectors, in the same order.

    Returns:
        A float array with one row per entry and one column per label
        position.
    """
    entries = [
        [sentences[position], labels[position], score]
        for position, score in enumerate(similarities)
    ]

    # One column per entry, one row per label position.
    label_matrix = np.transpose(np.array([entry[1] for entry in entries]))

    reference = label_matrix[:, idx]
    error = calc_error(reference, label_matrix)

    # Broadcast the per-entry dissimilarity across all label columns at once.
    dissimilarity = 1 - similarities
    return np.abs(dissimilarity[:, np.newaxis] - error)


# Show the reference sentence (index 1) being analysed.
print(flat_sentences[1])

# Average the error over all entries (axis 0), leaving one value per label
# column.
sim_result = np.mean(error_similar(1, sims[1], flat_sentences, labels), axis=0)
# Fold around 0.5: keep whichever of x and 1 - x is larger.
sim_result_mirrored = np.maximum(sim_result, 1 - sim_result)
# sim_result = error_similar(sims[1], flat_sentences, labels)

display(sim_result_mirrored)

In [None]:
# Per-label agreement between label differences and text dissimilarity for
# the manual labels, accumulated with every sentence used as the reference in
# turn.
agreement = np.zeros((1, len(labels[0])))

for i in trange(len(sims)):
    # Row i of full_sims holds the similarities of sentence i, so sentence i
    # must also be the label reference.  (Index 0 was previously passed for
    # every row, comparing all rows against the labels of sentence 0.)
    error = np.mean(error_similar(i, full_sims[i], flat_sentences, labels), axis=0)
    # Fold around 0.5: keep whichever of x and 1 - x is larger.
    error_corrected = np.maximum(error, 1 - error)
    # Earlier rows carry a larger weight: len(sims) - i.
    agreement += error_corrected * (len(sims) - i)

# NOTE(review): the weights above sum to n * (n + 1) / 2 rather than
# n * n / 2 — confirm the intended normalisation.
agreement /= len(sims) * len(sims) / 2

display(agreement)
display(np.mean(agreement))

In [None]:
# Load the predicted labels from a comma-separated text file into an array.
# NOTE(review): presumably one row per sentence, in the same order and with
# the same label columns as `labels` — verify against the file's producer.
pred_labels = np.loadtxt("predictions.csv", delimiter = ",")
pred_labels

In [None]:
# Per-label agreement between label differences and text dissimilarity for
# the predicted labels, accumulated with every sentence used as the reference
# in turn.
pred_agreement = np.zeros((1, len(labels[0])))

for i in trange(len(sims)):
    # Row i of full_sims holds the similarities of sentence i, so sentence i
    # must also be the label reference.  (Index 0 was previously passed for
    # every row, comparing all rows against the labels of sentence 0.)
    error = np.mean(error_similar(i, full_sims[i], flat_sentences, pred_labels), axis=0)
    # Fold around 0.5: keep whichever of x and 1 - x is larger.
    error_corrected = np.maximum(error, 1 - error)
    # Earlier rows carry a larger weight: len(sims) - i.
    pred_agreement += error_corrected * (len(sims) - i)

# NOTE(review): the weights above sum to n * (n + 1) / 2 rather than
# n * n / 2 — confirm the intended normalisation.
pred_agreement /= len(sims) * len(sims) / 2

display(pred_agreement)
display(np.mean(pred_agreement))

In [None]:
from tabulate import tabulate

# Row names for the table: four theme names plus a trailing 'Average' row.
# NOTE(review): the four names are hard-coded, so this assumes exactly four
# label columns in the agreement arrays — confirm against label_cols.
table_classes = ["Experienced QoC", "Experiences", "Expectations", "Context"]
table_classes = np.append(table_classes, 'Average')
# Per-label agreement followed by its mean, scaled to percentages.
table_manual = np.append(agreement[0], np.mean(agreement[0])) * 100
table_digital = np.append(pred_agreement[0], np.mean(pred_agreement[0])) * 100

# Round to one decimal place.
table_manual = np.round(table_manual, 1)
table_digital = np.round(table_digital, 1)

print(table_manual)
print(table_digital)

# Stack the three columns and transpose so that each row is
# (theme, manual, digital).
table = np.transpose([table_classes, table_manual, table_digital])
print(table)

print(tabulate(table, headers=["Theme", 'Manual (%)', 'Digital (%)'], tablefmt='orgtbl'))