In [None]:
pip install --quiet --user --force-reinstall datasets==2.18.0 pandas==2.2.1 numpy==1.26.4 nltk==3.8.1 levenshtein==0.25.0 sentence-transformers==2.5.1 transformers==4.38.2 openai==1.14.1 gdown==5.1.0 torch==2.2.1 umap-learn==0.5.5

In [None]:
import os

# Create the output folders; exist_ok keeps the cell re-runnable once they exist.
for folder in ("benchmark", "data"):
    os.makedirs(folder, exist_ok=True)

In [None]:
# Reference keyphrases of each split, downloaded with gdown. The *.contr.json files are loaded
# later as the "controlled" keyphrases and the *.uncontr.json files as the "uncontrolled" ones.
!gdown "1Cfr6GZZps-HZtTqPRMAXPqzKqsGjUsNR" --output "data/test.contr.json" --quiet
!gdown "1xP35a1Y_S0-LBT_bg8aubSDShIIOp4Zg" --output "data/test.uncontr.json" --quiet
!gdown "1ouoV1-MBc5zxycrNcYKUwLfcF5EjYG__" --output "data/train.contr.json" --quiet
!gdown "14vcPWyyCt9zCWXJaONdizB7eJovE4EUY" --output "data/train.uncontr.json" --quiet
!gdown "1HSpe4ERtfpATjkff3E2lvXFneA-VUk5q" --output "data/validation.contr.json" --quiet
!gdown "18CnNKT2JZOoVYkxbDzShCYxB0u338UyM" --output "data/validation.uncontr.json" --quiet

# Prediction files, one per model; they are read line by line in the benchmark section.
!gdown "153AaDrTO3x354AjvLNeHtz-fojBioJ7m" --output "data/One2Set.txt" --quiet
!gdown "13HkAz5WItwVKbvdJekI2stkUdKoDfrTt" --output "data/CatSeqTG_2RF1.txt" --quiet
!gdown "1mDM1JGlsIndzbGpY6x5_5euLYWMXaG2J" --output "data/KG-KE-KR-M.out" --quiet

In [None]:
import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore")

# Make dataset

In [None]:
import nltk
# Fetch the "punkt" resource through NLTK's downloader (presumably what word_tokenize and
# sent_tokenize used below rely on — TODO confirm).
nltk.download("punkt")

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
# Shared Porter stemmer instance, used by lowercase_and_stem and by the vocabulary statistics.
stemmer = PorterStemmer()

def contains(subseq, inseq):
    """Return True when ``subseq`` occurs as a contiguous slice of ``inseq``.

    Every start position that leaves room for ``subseq`` is compared slice by
    slice; the search stops at the first match.
    """
    width = len(subseq)
    for start in range(len(inseq) - width + 1):
        if inseq[start:start + width] == subseq:
            return True
    return False

def tokenize(s):
    """Split the string ``s`` into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(s)
    return tokens

def lowercase_and_stem(_words):
    """Lower-case every word of ``_words`` and stem it with the shared ``stemmer``.

    Returns a new list with one stem per input word, in the same order.
    """
    stems = []
    for word in _words:
        stems.append(stemmer.stem(word.lower()))
    return stems

def pmru(tok_text, tok_kps):
    """Sort keyphrases into the P/R/M/U categories relative to a tokenized text.

    Args:
        tok_text: list of tokens of the document (tokens must be hashable, e.g. strings).
        tok_kps: list of keyphrases, each one a list of tokens.

    Returns:
        A dict with the keys "P", "R", "M" and "U"; each value is the list of
        positions in ``tok_kps`` that fall in that category:
        P - the keyphrase occurs as a contiguous token sequence in the text,
        R - it does not, but every one of its words occurs in the text,
        M - only some of its words occur in the text,
        U - none of its words occurs in the text.
    """
    # Built once: membership tests on a set avoid rescanning the text for every word.
    text_vocabulary = set(tok_text)
    p, r, m, u = [], [], [], []

    for j, kp in enumerate(tok_kps):
        # Contiguous match: the keyphrase is present.
        if contains(kp, tok_text):
            p.append(j)
            continue

        # Otherwise classify by how many of its words appear anywhere in the text.
        n_present = sum(1 for w in kp if w in text_vocabulary)
        if n_present == len(kp):
            r.append(j)
        elif n_present > 0:
            m.append(j)
        else:
            u.append(j)

    return {"P": p, "R": r, "M": m, "U": u}

In [None]:
from datasets import load_dataset
import pandas
import numpy
import json

controlled = ["controlled", "uncontrolled"]
prmu = ["P", "R", "M", "U"]
splits = ["train", "validation", "test"]
dataset = pandas.DataFrame()

for split in splits:
    # Reference keyphrases of this split, keyed by vocabulary type. The files are opened in
    # `with` blocks so that their handles are closed right after parsing.
    keyphrases = {}
    for vocabulary_type, suffix in [("controlled", "contr"), ("uncontrolled", "uncontr")]:
        with open(f"data/{split}.{suffix}.json") as keyphrase_file:
            keyphrases[vocabulary_type] = json.load(keyphrase_file)
    split_df = load_dataset("taln-ls2n/inspec", split=split, trust_remote_code=True).to_pandas()
    split_df["split"] = split
    # Input text: lower-cased title and abstract, joined by ". " when the title ends with a
    # letter and by " " otherwise. Slicing with [-1:] keeps an empty title from raising IndexError.
    split_df["input"] = split_df[["title", "abstract"]].apply(lambda x: (x["title"] + ". " + x["abstract"]).lower() if x["title"][-1:].isalpha() else (x["title"] + " " + x["abstract"]).lower(), axis=1)
    for contr in controlled:
        # Each stored keyphrase is a sequence of strings; join it into one space-separated string.
        split_df[f"keyphrases_{contr}"] = split_df[["id"]].apply(lambda x: [" ".join(keyphrase) for keyphrase in keyphrases[contr][x["id"]]], axis=1)
        # PRMU positions, computed on lower-cased and stemmed tokens of the input and the keyphrases.
        split_df[f"prmu_{contr}"] = split_df[["input", f"keyphrases_{contr}"]].apply(lambda x: pmru(tok_text=lowercase_and_stem(tokenize(x["input"])), tok_kps=[lowercase_and_stem(tokenize(keyphrase)) for keyphrase in x[f"keyphrases_{contr}"]]), axis=1)
        # Turn the positions back into keyphrase strings: {"P": [...], "R": [...], "M": [...], "U": [...]}.
        split_df[contr] = split_df[[f"keyphrases_{contr}", f"prmu_{contr}"]].apply(lambda x: { key: numpy.array(x[f"keyphrases_{contr}"])[x[f"prmu_{contr}"][key]] for key in prmu }, axis=1)
    dataset = pandas.concat([dataset, split_df]).sort_values("id").reset_index(drop=True)

dataset = dataset[["id", "split", "input"] + controlled]
dataset

### Statistics about each output vocabulary

In [None]:
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances

# Sentence-embedding model used by pairwise_distances.
model_name="all-mpnet-base-v2"
model = SentenceTransformer(model_name)

# Pairwise scoring functions, looked up by name. Going by the imported function names,
# "cosine" is a similarity while the other two are distances.
metrics = {
    "cosine": cosine_similarity, "euclidean": euclidean_distances, "manhattan": manhattan_distances
}

def pairwise_distances(index):
    """Average pairwise score between the embeddings of the items of ``index``.

    Args:
        index: list of strings, encoded with the notebook-level ``model``.

    Returns:
        A dict with the keys "euclidean", "manhattan" and "cosine". Each value is the
        mean of the full pairwise matrix returned by the matching entry of ``metrics``
        (every item against every item, itself included), rounded to 2 decimals.
    """
    # Converted once and reused for all metrics. `.cpu()` is needed before `.numpy()`
    # when the model runs on a GPU and does nothing for tensors already on the CPU.
    embeddings = model.encode(index, convert_to_tensor=True).cpu().numpy()
    return {
        distance: metrics[distance](embeddings, embeddings).mean().round(2)
        for distance in ["euclidean", "manhattan", "cosine"]
    }
    
# Vocabulary variants for the statistics below: first the raw keyphrases, then their stemmed form.
stemmed = [False, True]

In [None]:
tables = pandas.DataFrame()

for contr in controlled:
    for stem in stemmed:
        # One comma-separated string of keyphrases per document, flattened into one list
        # for the whole corpus by joining on "," and splitting again.
        per_document = dataset[contr].apply(lambda x: ",".join([",".join(x[key]) for key in prmu if len(x[key]) > 0]))
        flat_keyphrases = ",".join(per_document.values).split(",")
        if stem:
            vocabulary = [" ".join([stemmer.stem(word) for word in tokenize(keyphrase)]) for keyphrase in flat_keyphrases]
        else:
            vocabulary = [keyphrase for keyphrase in flat_keyphrases]
        ngrams = [len(tokenize(keyphrase)) for keyphrase in vocabulary]
        # Number of occurrences of each distinct keyphrase.
        occurrences = list(Counter(vocabulary).values())
        table = pandas.DataFrame({
            "thesaurus": contr + " (pre-stemmed)" if stem else contr,
            "n_unique_keyphrases": str(numpy.unique(vocabulary).size),
            "n_occurence_keyphrases": f"{numpy.mean(occurrences).round(2)} ± {numpy.std(occurrences).round(2)}",
            "n_grams_keyphrases": f"{numpy.mean(ngrams).round(2)} ± {numpy.std(ngrams).round(2)}",
        }, index=[0])

        tables = pandas.concat([tables, table]).reset_index(drop=True)

print(tables.to_latex())
tables

In [None]:
# Number of unique keyphrases per token count (columns), one row per vocabulary.
# The keyphrases of all documents are flattened by joining on "," and splitting again,
# so this relies on no keyphrase containing a comma.
pandas.concat([pandas.DataFrame(Counter([len(tokenize(keyphrase)) for keyphrase in numpy.unique([k for k in ",".join(dataset["controlled"].apply(lambda x: ",".join([",".join(x[key]) for key in prmu if len(x[key]) > 0])).values).split(",")])]).items()).set_index(0).sort_index().transpose(),
               pandas.DataFrame(Counter([len(tokenize(keyphrase)) for keyphrase in numpy.unique([k for k in ",".join(dataset["uncontrolled"].apply(lambda x: ",".join([",".join(x[key]) for key in prmu if len(x[key]) > 0])).values).split(",")])]).items()).set_index(0).sort_index().transpose()]).set_axis(["controlled", "uncontrolled"])

In [None]:
def _ngram_length_shares(contr):
    """One-row frame: share of the unique keyphrases of ``dataset[contr]`` per token count.

    The denominator is the number of unique keyphrases computed from the data. It replaces
    the hard-coded totals 2059 and 16916, which presumably were these counts in an earlier run.
    """
    joined = ",".join(dataset[contr].apply(lambda x: ",".join([",".join(x[key]) for key in prmu if len(x[key]) > 0])).values)
    unique_keyphrases = numpy.unique(joined.split(","))
    length_counts = Counter(len(tokenize(keyphrase)) for keyphrase in unique_keyphrases)
    row = pandas.DataFrame(length_counts.items()).set_index(0).sort_index().transpose()
    return row / len(unique_keyphrases)

pandas.concat([_ngram_length_shares(name) for name in ["controlled", "uncontrolled"]]).set_axis(["controlled", "uncontrolled"]).round(2)

In [None]:
# Occurrence count of every controlled keyphrase over all documents (the full Counter is displayed).
Counter([k for k in ",".join(dataset["controlled"].apply(lambda x: ",".join([",".join(x[key]) for key in prmu if len(x[key]) > 0])).values).split(",")])

In [None]:
# Occurrence count of every uncontrolled keyphrase over all documents (the full Counter is displayed).
Counter([k for k in ",".join(dataset["uncontrolled"].apply(lambda x: ",".join([",".join(x[key]) for key in prmu if len(x[key]) > 0])).values).split(",")])

In [None]:
tables = pandas.DataFrame()

for contr in controlled:
    # Number of keyphrases of each document, all PRMU categories together.
    n_keyphrases = dataset[contr].apply(lambda x: sum([len(x[key]) for key in prmu]))
    # For each document, the four PRMU groups (each joined into one string) are embedded
    # and compared with one another.
    distances = dataset[contr].apply(lambda x: pairwise_distances([",".join(x[key]) for key in prmu]))
    # Mean share of each PRMU category among the keyphrases of a document.
    distribution = {
        key: f"{dataset[contr].apply(lambda x: 100 * len(x[key]) / sum([len(x[k]) for k in prmu])).mean().round(2)}%" for key in prmu
    }
    distance_columns = {}
    for distance in ["euclidean", "manhattan", "cosine"]:
        per_document = distances.apply(lambda x: x[distance])
        # round(float(...), 2) rounds to two decimals. The former int(100 * x) / 100
        # truncated instead, and could lose 0.01 to floating-point error.
        distance_columns[distance] = f"{round(float(numpy.mean(per_document)), 2)} ± {round(float(numpy.std(per_document)), 2)}"
    table = pandas.DataFrame({
        "thesaurus": contr,
        "n_keyphrases": f"""{numpy.mean(n_keyphrases).round(2)} ± {numpy.std(n_keyphrases).round(2)}"""
    } | distance_columns | distribution, index=[0])

    tables = pandas.concat([tables, table]).reset_index(drop=True)

print(tables.to_latex())
tables

In [None]:
tables = pandas.DataFrame()

# Stemming is now requested explicitly. This cell used to read `stem` left over from the
# loop of an earlier cell (its last value was True), so it failed unless that cell had run.
stem = True
# NOTE(review): 2059 is presumably the number of unique controlled keyphrases from an
# earlier run — confirm, and check whether it should be counted on stemmed forms.
n_controlled_keyphrases = 2059

for split in splits:
    for contr in ["controlled"]:
        in_split = dataset[dataset["split"] == split]
        out_of_split = dataset[dataset["split"] != split]
        # Keyphrases of this split and of the other splits, flattened through a "," join/split.
        vocabulary = [" ".join([stemmer.stem(word) for word in tokenize(keyphrase)]) if stem else keyphrase for keyphrase in ",".join(in_split[contr].apply(lambda x: ",".join([",".join(x[key]) for key in prmu if len(x[key]) > 0])).values).split(",")]
        other_vocabulary = [" ".join([stemmer.stem(word) for word in tokenize(keyphrase)]) if stem else keyphrase for keyphrase in ",".join(out_of_split[contr].apply(lambda x: ",".join([",".join(x[key]) for key in prmu if len(x[key]) > 0])).values).split(",")]
        n_keyphrases = in_split[contr].apply(lambda x: sum([len(x[key]) for key in prmu]))
        sentences = in_split["input"].apply(lambda x: len(sent_tokenize(x)))
        size = len(in_split)
        # Mean share of each PRMU category among the keyphrases of a document.
        distribution = {
            key: f"""{in_split[contr].apply(lambda x: 100 * len(x[key]) / sum([len(x[k]) for k in prmu])).mean().round(2)}%""" for key in prmu
        }
        table = pandas.DataFrame({
            "split": split,
            "n_documents": size,
            "n_sentences": f"""{numpy.mean(sentences).round(2)} ± {numpy.std(sentences).round(2)}""",
            "n_keyphrases": f"""{numpy.mean(n_keyphrases).round(2)} ± {numpy.std(n_keyphrases).round(2)}""",
            "coverage_keyphrases": f"""{round(100 * len(numpy.unique(vocabulary)) / n_controlled_keyphrases, 2)}%""",
            "exclusive_keyphrases": f"""{round(100 * len(set(vocabulary) - set(other_vocabulary)) / n_controlled_keyphrases, 2)}%"""
        } | distribution, index=[0])

        tables = pandas.concat([tables, table]).reset_index(drop=True)

print(tables.to_latex())
tables

# Benchmarking models

In [None]:
from nltk.stem import PorterStemmer
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from datasets import load_dataset
from itertools import chain
import pandas
import numpy
import json
import time
import os

# The benchmark only uses the controlled keyphrases; this rebinds the notebook-level
# names defined in the dataset section (controlled, prmu, splits, dataset, stemmer).
controlled = ["controlled"]
prmu = ["P", "R", "M", "U"]
splits = ["train", "validation", "test"]
dataset = pandas.DataFrame()
stemmer = PorterStemmer()

def contains(subseq, inseq):
    """Return True when ``subseq`` occurs as a contiguous slice of ``inseq``.

    Every start position that leaves room for ``subseq`` is compared slice by
    slice; the search stops at the first match.
    """
    width = len(subseq)
    for start in range(len(inseq) - width + 1):
        if inseq[start:start + width] == subseq:
            return True
    return False

def tokenize(s):
    """Split the string ``s`` into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(s)
    return tokens

def lowercase_and_stem(_words):
    """Lower-case every word of ``_words`` and stem it with the shared ``stemmer``.

    Returns a new list with one stem per input word, in the same order.
    """
    stems = []
    for word in _words:
        stems.append(stemmer.stem(word.lower()))
    return stems

def pmru(tok_text, tok_kps):
    """Sort keyphrases into the P/R/M/U categories relative to a tokenized text.

    Args:
        tok_text: list of tokens of the document (tokens must be hashable, e.g. strings).
        tok_kps: list of keyphrases, each one a list of tokens.

    Returns:
        A dict with the keys "P", "R", "M" and "U"; each value is the list of
        positions in ``tok_kps`` that fall in that category:
        P - the keyphrase occurs as a contiguous token sequence in the text,
        R - it does not, but every one of its words occurs in the text,
        M - only some of its words occur in the text,
        U - none of its words occurs in the text.
    """
    # Built once: membership tests on a set avoid rescanning the text for every word.
    text_vocabulary = set(tok_text)
    p, r, m, u = [], [], [], []

    for j, kp in enumerate(tok_kps):
        # Contiguous match: the keyphrase is present.
        if contains(kp, tok_text):
            p.append(j)
            continue

        # Otherwise classify by how many of its words appear anywhere in the text.
        n_present = sum(1 for w in kp if w in text_vocabulary)
        if n_present == len(kp):
            r.append(j)
        elif n_present > 0:
            m.append(j)
        else:
            u.append(j)

    return {"P": p, "R": r, "M": m, "U": u}

for split in splits:
    # The `with` block closes the keyphrase file right after parsing.
    with open(f"data/{split}.contr.json") as keyphrase_file:
        keyphrases = { "controlled": json.load(keyphrase_file) }
    split_df = load_dataset("taln-ls2n/inspec", split=split, trust_remote_code=True).to_pandas()
    split_df["split"] = split
    # Input text: lower-cased title and abstract, joined by ". " when the title ends with a
    # letter and by " " otherwise. Slicing with [-1:] keeps an empty title from raising IndexError.
    split_df["input"] = split_df[["title", "abstract"]].apply(lambda x: (x["title"] + ". " + x["abstract"]).lower() if x["title"][-1:].isalpha() else (x["title"] + " " + x["abstract"]).lower(), axis=1)
    for contr in controlled:
        # Each stored keyphrase is a sequence of strings; join it into one space-separated string.
        split_df[f"keyphrases_{contr}"] = split_df[["id"]].apply(lambda x: [" ".join(keyphrase) for keyphrase in keyphrases[contr][x["id"]]], axis=1)
        # PRMU positions, computed on lower-cased and stemmed tokens of the input and the keyphrases.
        split_df[f"prmu_{contr}"] = split_df[["input", f"keyphrases_{contr}"]].apply(lambda x: pmru(tok_text=lowercase_and_stem(tokenize(x["input"])), tok_kps=[lowercase_and_stem(tokenize(keyphrase)) for keyphrase in x[f"keyphrases_{contr}"]]), axis=1)
        # Turn the positions back into keyphrase strings, grouped by PRMU category.
        split_df[contr] = split_df[[f"keyphrases_{contr}", f"prmu_{contr}"]].apply(lambda x: { key: numpy.array(x[f"keyphrases_{contr}"])[x[f"prmu_{contr}"][key]] for key in prmu }, axis=1)
    dataset = pandas.concat([dataset, split_df]).sort_values("id").reset_index(drop=True)

# Unique (unstemmed) keyphrases over all splits; computed before the test-split filter below.
# `contr` still holds the last value of the loop above.
indexes = numpy.unique(",".join(dataset[contr].apply(lambda x: ",".join([",".join(x[key]) for key in prmu if len(x[key]) > 0])).values).split(","))
# .copy() makes the column selection an independent frame, so the assignment below does not
# write into a slice of the previous frame (SettingWithCopyWarning).
dataset = dataset[["id", "split", "input", contr]].copy()
# References become sets of stemmed keyphrases, one set per PRMU category.
dataset[contr] = dataset[contr].apply(lambda x: {
    key: set([" ".join([stemmer.stem(word) for word in tokenize(keyphrase)]) for keyphrase in x[key]]) if len(x[key]) > 0 else set() for key in prmu
})
dataset = dataset[dataset["split"] == "test"]
indices = dataset.index
dataset

In [None]:
def evaluate(keyphrases, references, categories=None):
    """Precision, recall and F1 of ranked predictions, per reference category.

    Two cut-offs are scored: "5" keeps the first five predictions and "O" keeps as
    many predictions as there are references over all categories.

    Args:
        keyphrases: ranked list of predicted keyphrases (strings).
        references: dict mapping each category to its collection of reference keyphrases.
        categories: category names to score, in output order. Defaults to the
            notebook-level ``prmu`` list.

    Returns:
        A flat dict with the keys "{category}_{metric}_{cutoff}", metric being "P",
        "R" or "F": all precisions first, then recalls, then F-scores. A category
        without references scores NaN, so that it can be left out of averages; this
        now also holds when ``keyphrases`` is empty (such cases used to score 0).
        Otherwise empty predictions score 0.
    """
    categories = prmu if categories is None else categories
    n_references = sum(len(references[key]) for key in categories)
    cutoffs = {"5": keyphrases[:5], "O": keyphrases[:n_references]}

    P, R, F = {}, {}, {}
    for key in categories:
        gold = set(references[key])
        P[key], R[key], F[key] = {}, {}, {}
        for nkeys, top in cutoffs.items():
            if len(gold) == 0:
                # Nothing to find in this category: undefined rather than a failure.
                precision = recall = f_score = numpy.nan
            elif len(keyphrases) == 0:
                precision = recall = f_score = 0
            else:
                # `top` cannot be empty here: there is at least one prediction and
                # n_references >= len(gold) >= 1.
                correct = len(set(top) & gold)
                precision = correct / len(top)
                recall = correct / len(gold)
                if precision + recall > 0:
                    f_score = (2 * precision * recall) / (precision + recall)
                else:
                    f_score = 0
            P[key][nkeys], R[key][nkeys], F[key][nkeys] = precision, recall, f_score

    return {
        f"{key}_{metric}_{nkeys}": scores[key][nkeys]
        for metric, scores in (("P", P), ("R", R), ("F", F))
        for key in categories
        for nkeys in ("5", "O")
    }

### Extractive Methods

In [None]:
!pip install git+https://github.com/boudinfl/pke.git
!python -m spacy download en_core_web_sm

In [None]:
from pke.unsupervised import MultipartiteRank, PositionRank, YAKE, TopicRank, TfIdf, KPMiner

# Run each pke extractor on every test document and score its predictions.
# Note: the loop variable `model` rebinds the notebook-level name `model`.
for model in [MultipartiteRank, PositionRank, YAKE, TopicRank, TfIdf, KPMiner]:
    # Empty frame that fixes the column layout of the result file.
    results = pandas.DataFrame({
        "model": [],
        "embedded": [],
        "id": [],
        "keyphrases": []
    } | {
        f"{key}_{metric}_{nkeys}" : [] for metric in ["P", "R", "F"] for key in prmu for nkeys in ["5", "O"]
    } | {
        f"{key}_correct_O" : [] for key in prmu
    })
    extractor = model()

    # `id` is an index label of `dataset` (one of `indices`), not the value of the "id" column.
    for id in indices:
        references = dataset.loc[id, "controlled"]
        # O: number of reference keyphrases of the document, all categories together.
        O = sum([len(references[key]) for key in prmu])
        extractor.load_document(input=dataset.loc[id, "input"], language="en")
        extractor.candidate_selection()
        extractor.candidate_weighting()
        # Ask for enough candidates to cover both cut-offs (top 5 and top O).
        predictions = extractor.get_n_best(n=max(5, sum([len(references[key]) for key in prmu])))
        # Predictions are normalized (lower-cased, stemmed) like the references.
        keyphrases = [" ".join(lowercase_and_stem(tokenize(keyphrase[0]))) for keyphrase in predictions]

        results = pandas.concat([results, pandas.DataFrame({
            "model": [model.__name__],
            "embedded": ["False"],
            "id": [dataset.loc[id, "id"]],
            "keyphrases": [",".join([keyphrase[0] for keyphrase in predictions])]
        } | evaluate(keyphrases, references) | {
            f"{key}_correct_O" : [len(set(keyphrases[:O]) & set(references[key]))] for key in prmu
        })])
    
        # The result file is rewritten after every document.
        results.to_csv(f"benchmark/results_{model.__name__}.csv", sep=";", index=False)

### Transformer One2Seq-Paradigm

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Generate keyphrases with each sequence-to-sequence model for every test document and score them.
for model_name in ["beogradjanka/bart_multitask_finetuned_for_title_and_keyphrase_generation", "bloomberg/KeyBART"]:
    # Empty frame that fixes the column layout of the result file.
    results = pandas.DataFrame({
        "model": [],
        "embedded": [],
        "id": [],
        "keyphrases": []
    } | {
        f"{key}_{metric}_{nkeys}" : [] for metric in ["P", "R", "F"] for key in prmu for nkeys in ["5", "O"]
    } | {
        f"{key}_correct_O" : [] for key in prmu
    })
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    # Separator between the keyphrases in the generated text; it depends on the model.
    if "finetuned" in model_name:
        sep = "\n"
    else:
        sep = ";"

    # `id` is an index label of `dataset` (one of `indices`), not the value of the "id" column.
    for id in indices:
        references = dataset.loc[id, "controlled"]
        # O: number of reference keyphrases of the document, all categories together.
        O = sum([len(references[key]) for key in prmu])
        # Direct tokenizer call instead of the deprecated `prepare_seq2seq_batch`; padding and
        # truncation are spelled out to match the defaults of that helper.
        tokenized_text = tokenizer(["<|KEYPHRASES|> " + dataset.loc[id, "input"]], padding="longest", truncation=True, return_tensors="pt")
        translation = model.generate(**tokenized_text)
        translated_text = tokenizer.batch_decode(translation, skip_special_tokens=True)[0]
        predictions = translated_text.split(sep)
        # Predictions are normalized (lower-cased, stemmed) like the references.
        keyphrases = [" ".join(lowercase_and_stem(tokenize(keyphrase))) for keyphrase in predictions]

        results = pandas.concat([results, pandas.DataFrame({
            "model": [model_name.split("/")[-1].split("_")[0]],
            "embedded": ["False"],
            "id": [dataset.loc[id, "id"]],
            "keyphrases": [",".join([keyphrase for keyphrase in predictions])]
        } | evaluate(keyphrases, references) | {
            f"{key}_correct_O" : [len(set(keyphrases[:O]) & set(references[key]))] for key in prmu
        })])

        # The result file is rewritten after every document.
        results.to_csv(f"""benchmark/results_{model_name.split("/")[-1].split("_")[0]}.csv""", sep=";", index=False)

### ChatGPT

In [None]:
from openai import AzureOpenAI
import os

model_name = "ChatGPT"

# Endpoint, key and API version are read from the environment, so that no credential is
# stored in the notebook. A missing variable fails right here with a KeyError.
client = AzureOpenAI(
  azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
  api_key=os.environ["AZURE_OPENAI_API_KEY"],
  api_version=os.environ["OPENAI_API_VERSION"],
)

# Empty frame that fixes the column layout of the result file.
results = pandas.DataFrame({
    "model": [],
    "embedded": [],
    "id": [],
    "keyphrases": []
} | {
    f"{key}_{metric}_{nkeys}" : [] for metric in ["P", "R", "F"] for key in prmu for nkeys in ["5", "O"]
} | {
    f"{key}_correct_O" : [] for key in prmu
})
# Separator requested from the model in the system prompt.
sep=","

# Ask the chat model for the keywords of every test document and score them.
# `id` is an index label of `dataset` (one of `indices`), not the value of the "id" column.
for id in indices:
    references = dataset.loc[id, "controlled"]
    # O: number of reference keyphrases of the document, all categories together.
    O = sum([len(references[key]) for key in prmu])
    response = client.chat.completions.create(
      model="oa-coeml-gpt-35-us",
      messages=[
        {
          "role": "system",
          "content": "You will be provided with a block of text, and your task is to extract a list of keywords from it. Separate each keyword with a comma."
        },
        {
          "role": "user",
          "content": dataset.loc[id, "input"]
        }
      ],
      temperature=0,
      max_tokens=512,
      top_p=1
    )

    # The message content is optional in the API response; fall back to an empty string
    # instead of failing on None.
    content = response.choices[0].message.content or ""
    predictions = content.split(sep)
    # Predictions are normalized (lower-cased, stemmed) like the references.
    keyphrases = [" ".join(lowercase_and_stem(tokenize(keyphrase))) for keyphrase in predictions]

    results = pandas.concat([results, pandas.DataFrame({
        "model": [model_name],
        "embedded": ["False"],
        "id": [dataset.loc[id, "id"]],
        "keyphrases": [",".join([keyphrase for keyphrase in predictions])]
    } | evaluate(keyphrases, references) | {
        f"{key}_correct_O" : [len(set(keyphrases[:O]) & set(references[key]))] for key in prmu
    })])

    # The result file is rewritten after every document.
    results.to_csv(f"benchmark/results_{model_name}.csv", sep=";", index=False)



### One2Set and CatSeqTG_2RF1 and KG-KE-KR-M

We are using results from these repositories:

[One2Set repo](https://github.com/jiacheng-ye/kg_one2set) related to this [paper](https://arxiv.org/abs/2105.11134)

[CatSeqTG-2RF1 repo](https://github.com/kenchan0226/keyphrase-generation-rl?tab=readme-ov-file) related to this [paper](https://arxiv.org/abs/1906.04106)

[repo](https://github.com/Chen-Wang-CUHK/KG-KE-KR-M) related to this [paper](https://arxiv.org/pdf/1904.03454.pdf)

In [None]:
# Score the prediction files of the published models.
# NOTE(review): line i of a file is matched with the i-th test document in index order —
# confirm that the files follow that order.
for model in ["One2Set.txt", "CatSeqTG_2RF1.txt", "KG-KE-KR-M.out"]:
    with open(f"data/{model}") as predictions_file:
        # The line terminator is dropped so that the last keyphrase of a line carries no "\n".
        all_predictions = [line.rstrip("\n") for line in predictions_file]

    # Empty frame that fixes the column layout of the result file.
    results = pandas.DataFrame({
        "model": [],
        "embedded": [],
        "id": [],
        "keyphrases": []
    } | {
        f"{key}_{metric}_{nkeys}" : [] for metric in ["P", "R", "F"] for key in prmu for nkeys in ["5", "O"]
    } | {
        f"{key}_correct_O" : [] for key in prmu
    })

    for i in range(len(indices)):
        references = dataset.loc[indices[i], "controlled"]
        # O: number of reference keyphrases of the document, all categories together.
        O = sum([len(references[key]) for key in prmu])
        if model == "KG-KE-KR-M.out":
            predictions = all_predictions[i].split(" ; ")
        else:
            predictions = all_predictions[i].split(";")
        # Predictions are normalized (lower-cased, stemmed) like the references.
        keyphrases = [" ".join(lowercase_and_stem(tokenize(keyphrase))) for keyphrase in predictions]

        results = pandas.concat([results, pandas.DataFrame({
            "model": [model.split(".")[0]],
            "embedded": ["False"],
            # The document of this iteration. This used to read `id`, a variable left over
            # from the loop of an earlier cell, which gave every row the same document id.
            "id": [dataset.loc[indices[i], "id"]],
            "keyphrases": [",".join([keyphrase for keyphrase in predictions])]
        } | evaluate(keyphrases, references) | {
            f"{key}_correct_O" : [len(set(keyphrases[:O]) & set(references[key]))] for key in prmu
        })])

        # The result file is rewritten after every document.
        results.to_csv(f"""benchmark/results_{model.split(".")[0]}.csv""", sep=";", index=False)

### Embedding-based Approaches

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch
import umap

# Retrieval baseline: for each sentence of a document, the k closest entries of the
# keyphrase vocabulary (`indexes`) are taken as predictions.
corpus = indexes

for model in ["all-MiniLM-L12-v2", "multi-qa-mpnet-base-dot-v1", "all-mpnet-base-v2"]:
    embedder = SentenceTransformer(model)
    # `.cpu()` is needed before `.numpy()` when the embedder runs on a GPU and does nothing
    # for tensors already on the CPU.
    corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True).cpu().numpy()
    # Empty frame that fixes the column layout of the result file.
    results = pandas.DataFrame({
        "model": [],
        "embedded": [],
        "id": [],
        "keyphrases": []
    } | {
        f"{key}_{metric}_{nkeys}" : [] for metric in ["P", "R", "F"] for key in prmu for nkeys in ["5", "O"]
    } | {
        f"{key}_correct_O" : [] for key in prmu
    })

    for i in range(len(indices)):
        # One query per sentence of the document.
        queries = sent_tokenize(dataset.loc[indices[i], "input"])
        references = dataset.loc[indices[i], "controlled"]
        # O: number of reference keyphrases of the document, all categories together.
        O = sum([len(references[key]) for key in prmu])
        k = 5

        query_embeddings = embedder.encode(queries, convert_to_tensor=True).cpu().numpy()
        for score in ["cosine"]:
            predictions = []
            if score == "cosine":
                scores = torch.from_numpy(cosine_similarity(query_embeddings, corpus_embeddings))
            elif score == "manhattan":
                scores = torch.from_numpy(manhattan_distances(query_embeddings, corpus_embeddings))
            else:
                scores = torch.from_numpy(euclidean_distances(query_embeddings, corpus_embeddings))
            # Largest scores for the cosine similarity, smallest ones for the distances.
            top_results = torch.topk(scores, k=k, largest=(score=="cosine")).indices.numpy()
            predictions.append(list(corpus[top_results]))

            for top_k in [5]:
                # Stemmed predictions of all sentences, duplicates removed, first occurrence kept.
                keyphrases = list(Counter([" ".join(lowercase_and_stem(tokenize(keyphrase))) for keyphrase in list(chain(*[list(prediction[:top_k]) for prediction in predictions[0]]))]).keys())

                results = pandas.concat([results, pandas.DataFrame({
                    "model": [model],
                    "embedded": ["True"],
                    # The document of this iteration. This used to read `id`, a variable left
                    # over from the loop of an earlier cell, which gave every row the same id.
                    "id": [dataset.loc[indices[i], "id"]],
                    "keyphrases": [",".join(list(chain(*[list(prediction[:top_k]) for prediction in predictions[0]])))]
                } | evaluate(keyphrases, references) | {
                    f"{key}_correct_O" : [len(set(keyphrases[:O]) & set(references[key]))] for key in prmu
                })])

                # The result file is rewritten after every document.
                results.to_csv(f"benchmark/results_{model}.csv", sep=";", index=False)

# Results

In [None]:
import pandas
import os
# Gather every result file written to benchmark/ into one frame.
results = pandas.concat([pandas.read_csv(f"benchmark/{file}", sep=";") for file in os.listdir("benchmark/") if ".csv" in file])
# Mean of every score column per (model, embedded) pair, multiplied by 100 and rounded to 2 decimals.
(100 * results.drop(["id", "keyphrases"], axis=1).groupby(["model", "embedded"]).mean()).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,P_P_5,P_P_O,R_P_5,R_P_O,M_P_5,M_P_O,U_P_5,U_P_O,P_R_5,P_R_O,...,R_F_5,R_F_O,M_F_5,M_F_O,U_F_5,U_F_O,P_correct_O,R_correct_O,M_correct_O,U_correct_O
model,embedded,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
CatSeqTG_2RF1,False,12.2,13.78,1.02,1.11,0.89,0.84,0.71,0.66,38.88,36.72,...,1.44,1.7,1.19,1.12,0.96,0.86,36.8,2.2,3.8,2.0
ChatGPT,False,7.73,8.3,0.0,0.0,0.05,0.08,0.06,0.04,24.12,21.54,...,0.0,0.0,0.07,0.1,0.1,0.07,23.6,0.0,0.2,0.2
KG-KE-KR-M,False,15.85,16.91,0.89,1.17,1.14,1.19,0.53,0.16,49.37,47.8,...,1.3,1.71,1.59,1.62,0.82,0.25,50.0,2.2,4.4,0.8
KPMiner,False,7.24,7.96,0.0,0.0,0.0,0.0,0.06,0.04,17.78,17.91,...,0.0,0.0,0.0,0.0,0.1,0.06,19.6,0.0,0.0,0.2
KeyBART,False,8.13,9.6,1.55,1.45,0.94,0.99,0.38,0.4,23.23,22.43,...,2.31,2.06,1.26,1.27,0.53,0.51,22.2,1.8,3.4,1.0
MultipartiteRank,False,6.6,7.3,0.13,0.08,0.05,0.08,0.06,0.04,20.53,19.09,...,0.16,0.12,0.07,0.1,0.1,0.07,20.8,0.2,0.2,0.2
One2Set,False,14.0,15.81,1.53,1.65,1.31,1.26,0.53,0.53,43.05,42.06,...,2.23,2.49,1.79,1.66,0.77,0.72,42.6,3.0,4.6,1.4
PositionRank,False,6.45,7.12,0.0,0.08,0.0,0.0,0.0,0.0,20.26,19.48,...,0.0,0.12,0.0,0.0,0.0,0.0,21.2,0.2,0.0,0.0
TfIdf,False,7.92,8.06,0.0,0.0,0.0,0.0,0.12,0.18,25.22,22.8,...,0.0,0.0,0.0,0.0,0.2,0.26,24.6,0.0,0.0,0.4
TopicRank,False,6.21,6.4,0.0,0.0,0.0,0.0,0.06,0.04,19.04,17.66,...,0.0,0.0,0.0,0.0,0.1,0.07,19.4,0.0,0.0,0.2


In [None]:
# For the documents with at least one correct prediction among the top O, the share (in %)
# of the correct predictions that falls in each PRMU category, averaged per model.
correct_columns = [f"{key}_correct_O" for key in prmu]
with_correct = results.loc[results[correct_columns].sum(axis=1) != 0]
category_shares = 100 * with_correct[correct_columns].apply(lambda x: x/x.sum(), axis=1)
pandas.concat([with_correct[["model", "embedded"]], category_shares], axis=1).groupby(["model", "embedded"]).mean().round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,P_correct_O,R_correct_O,M_correct_O,U_correct_O
model,embedded,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CatSeqTG_2RF1,False,82.25,4.62,8.88,4.26
ChatGPT,False,98.68,0.0,0.99,0.33
KG-KE-KR-M,False,88.49,3.06,7.6,0.85
KPMiner,False,99.44,0.0,0.0,0.56
KeyBART,False,77.27,6.61,12.4,3.72
MultipartiteRank,False,98.03,0.54,1.08,0.36
One2Set,False,82.77,4.94,9.63,2.67
PositionRank,False,99.47,0.53,0.0,0.0
TfIdf,False,98.65,0.0,0.0,1.35
TopicRank,False,99.61,0.0,0.0,0.39
