In [None]:
!pip install git+https://github.com/boudinfl/pke.git
!pip install datasets
!python -m spacy download en_core_web_sm

In [None]:
from tqdm.notebook import tqdm
from datasets import load_dataset

# Benchmark name; it is reused further down to name the document-frequency
# file and to label the results table.
benchmark = "KDD"
# Load only a "test" split, read from a single parquet file hosted on the
# Hugging Face Hub (midas/kdd, "raw" configuration).
dataset = load_dataset(
    "parquet",
    data_files={
        "test":  "https://huggingface.co/datasets/midas/kdd/resolve/refs/convert/parquet/raw/test/0000.parquet"
    }
)

In [None]:
# Add a 'keyphrases' column that concatenates the extractive and the
# abstractive gold keyphrases of every sample.
for split in dataset.keys():
    dataset[split] = dataset[split].map(lambda x: {'keyphrases': x['extractive_keyphrases'] + x['abstractive_keyphrases']})

In [None]:
def replace_parentheses_before_detokenize(tokens):
    """Replace bracket placeholders such as ``-LRB-`` with the literal characters.

    Args:
        tokens: iterable of token strings.

    Returns:
        A new list with, in every token, each placeholder occurrence replaced
        by its bracket / angle character. The input is not modified.
    """
    # Applied in this order to every token.
    placeholder_pairs = (
        ('-LRB-', '('),
        ('-RRB-', ')'),
        ('-LSB-', '['),
        ('-RSB-', ']'),
        ('-LCB-', '{'),
        ('-RCB-', '}'),
        ('-LT-', '<'),
        ('-GT-', '>'),
    )

    restored = []
    for token in tokens:
        for placeholder, character in placeholder_pairs:
            token = token.replace(placeholder, character)
        restored.append(token)
    return restored

In [None]:
# Rewrite the 'document' token list of every sample with the bracket
# placeholders restored to literal characters.
for split in dataset.keys():
    dataset[split] = dataset[split].map(lambda x: {
        'document': replace_parentheses_before_detokenize(list(x['document']))
    })

In [None]:
import html

# Named HTML entity -> character (e.g. 'amp' -> '&').
html_char_map = {name: chr(code) for name, code in html.entities.name2codepoint.items()}
# Codepoint -> character for every codepoint that has a named entity.
# NOTE(review): not referenced anywhere in the visible part of the notebook.
html_codepoint_map = {code: chr(code) for code in html.entities.codepoint2name}

In [None]:
def replace_html_special_characters(tokens):
    """Decode HTML character references that appear in a token list.

    Two shapes are recognised:

    * a named entity split over three tokens, ``'&', name, ';'``, where
      ``name`` is a key of ``html_char_map``; the three tokens collapse into
      the single character;
    * a numeric reference held in one token, ``'&#NNN;'`` (decimal) or
      ``'&#xHHH;'`` (hexadecimal).

    Anything else is copied through unchanged. A numeric reference that cannot
    be decoded (empty, non-numeric, or outside the Unicode range) is also kept
    verbatim instead of aborting the whole dataset ``map``.

    Args:
        tokens: sequence of token strings.

    Returns:
        A new list of tokens with the recognised references decoded.
    """
    result = []
    n_tokens = len(tokens)
    i = 0
    while i < n_tokens:
        token = tokens[i]

        # Named entity spread over three consecutive tokens.
        if (i + 2 < n_tokens and token == '&'
                and tokens[i + 1] in html_char_map and tokens[i + 2] == ';'):
            result.append(html_char_map[tokens[i + 1]])
            i += 3
            continue

        # Numeric reference contained in a single token.
        if token.startswith('&#') and token.endswith(';'):
            digits = token[2:-1]
            try:
                if digits[:1] in ('x', 'X'):
                    codepoint = int(digits[1:], 16)
                else:
                    codepoint = int(digits)
                result.append(chr(codepoint))
            except (ValueError, OverflowError):
                # Malformed or out-of-range reference: keep the raw token.
                result.append(token)
            i += 1
            continue

        result.append(token)
        i += 1
    return result

In [None]:
for split in dataset.keys():
    dataset[split] = dataset[split].map(lambda x: {
        'document': replace_html_special_characters(list(x['document']))
    })

In [None]:
from transformers import AutoTokenizer
# In this cell the tokenizer is only used for `convert_tokens_to_string`,
# which turns each sample's token list into one document string.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# From here on 'document' is a single string instead of a list of tokens.
for split in dataset.keys():
    dataset[split] = dataset[split].map(lambda x: {
        'document': tokenizer.convert_tokens_to_string(list(x['document']))
    })

In [None]:
def clean_text_after_detokenize(text):
    """Tighten spacing around brackets, colons and semicolons.

    Removes the space that follows an opening bracket, the space that precedes
    a closing bracket, and the space that precedes ``:`` or ``;``.

    Args:
        text: the detokenized document string.

    Returns:
        The string with those spaces removed.
    """
    # (pattern, replacement) pairs, applied one after another in this order.
    spacing_fixes = (
        ('( ', '('),
        (' )', ')'),
        ('[ ', '['),
        (' ]', ']'),
        ('{ ', '{'),
        (' }', '}'),
        (' :', ':'),
        (' ;', ';'),
    )
    for pattern, replacement in spacing_fixes:
        text = text.replace(pattern, replacement)
    return text

In [None]:
# Apply the spacing clean-up to the detokenized 'document' string of every sample.
for split in dataset.keys():
    dataset[split] = dataset[split].map(lambda x: {
        'document': clean_text_after_detokenize(x['document'])
    })

In [None]:
def remove_empty_entries_from_dataset(data):
    """Drop samples whose id, document or keyphrases are empty.

    Every split in ``data`` is replaced in place by its filtered version, and
    the number of samples before and after filtering is printed per split.

    Args:
        data: mapping from split name to an object that supports ``len()`` and
            ``.filter(predicate)``.

    Returns:
        The same ``data`` mapping, with every split filtered.
    """
    def _has_required_fields(example):
        # Truthiness check: empty strings / empty lists / None all fail.
        return example['id'] and example['document'] and example['keyphrases']

    for split_name in data.keys():
        count_before = len(data[split_name])
        data[split_name] = data[split_name].filter(_has_required_fields)
        count_after = len(data[split_name])

        print(f"[{split_name}] Before data cleaning: {count_before}")
        print(f"[{split_name}] After data cleaning: {count_after}")

    return data

dataset = remove_empty_entries_from_dataset(dataset)

In [None]:
dataset

**NOTE:** Unless otherwise noted in this notebook, the implementations are based on the [Python-based Keyphrase Extraction toolkit (PKE)](https://github.com/boudinfl/pke) by [Boudin (2016)](https://aclanthology.org/C16-2015/), tailored to fit our work.

In [None]:
import spacy
from spacy.tokenizer import _get_regex_pattern

# Shared spaCy pipeline; it is reused below to parse documents, gold
# keyphrases and candidate phrases.
nlp = spacy.load("en_core_web_sm")

from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

# Custom infix patterns, i.e. the places inside a whitespace-delimited chunk
# where the tokenizer is allowed to split.
# NOTE(review): this looks like spaCy's default infix list without the rule
# that splits on hyphens between letters, so hyphenated words would stay a
# single token — confirm against the installed spaCy version.
infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        # arithmetic operator between digits
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        # period between a lowercase letter/quote and an uppercase letter/quote
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        # comma between two letters
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        # one of : < > = / between a letter/digit and a letter
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
)

# Install the compiled pattern on the shared pipeline's tokenizer.
infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

In [None]:
from nltk.stem.snowball import SnowballStemmer as Stemmer

# all_data: one parsed spaCy Doc per sample (the input of every extractor).
# references: per sample, the list of stemmed gold keyphrases.
all_data = []
references = []
stemmer = Stemmer('porter')

for split in dataset.keys():
    for sample in tqdm(dataset[split]):
        all_data.append(nlp(sample["document"]))

        # Tokenize, lowercase and stem every gold keyphrase so it can be
        # compared with the stemmed phrases produced by the extractors.
        sample_keyphrases = []
        for keyphrase in sample["keyphrases"]:
            tokens = [token.text for token in nlp(keyphrase)]
            stems = [stemmer.stem(tok.lower()) for tok in tokens]
            sample_keyphrases.append(" ".join(stems))
        references.append(sample_keyphrases)

In [None]:
from pke import compute_document_frequency
from string import punctuation
from pke import load_document_frequency_file

In [None]:
# Compute document-frequency counts over the benchmark documents themselves
# (stemming normalization, punctuation marks as stoplist, n=5) and write them
# to a gzip file named after the benchmark.
compute_document_frequency(
    documents=all_data,
    output_file='data/{}.df.gz'.format(benchmark),
    language='en',
    normalization='stemming',
    stoplist=list(punctuation),
    n=5
)

# Reload the counts as `df`; they are passed to the TfIdf and KPMiner
# weighting steps further down.
df = load_document_frequency_file(input_file='data/{}.df.gz'.format(benchmark))

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import logging

from pke.base import LoadFile

In [None]:
# @title TF

class TF(LoadFile):
    """Term-frequency baseline built on the pke ``LoadFile`` base class.

    Candidates are n-grams; a candidate's weight is the number of its surface
    forms plus a tiny offset-based term.
    """

    def candidate_selection(self, n=3):
        """Select n-grams of up to ``n`` words, then apply pke's candidate filtering."""
        self.ngram_selection(n=n)
        self.candidate_filtering()

    def candidate_weighting(self):
        """Weight every candidate by its number of occurrences."""
        for key, candidate in self.candidates.items():
            frequency = len(candidate.surface_forms)
            # The first offset scaled by 1e-8 separates candidates that have
            # the same frequency.
            self.weights[key] = frequency + (candidate.offsets[0] * 1e-8)

In [None]:
import networkx as nx

In [None]:
# @title TextRank

class TextRank1(LoadFile):
    """TextRank keyphrase extractor on top of the pke ``LoadFile`` base class.

    Words with an accepted part-of-speech tag become nodes of an undirected,
    unweighted co-occurrence graph; PageRank scores the words and a candidate's
    weight is the sum of the scores of its words.
    """

    def __init__(self):
        super().__init__()
        # Word co-occurrence graph, filled by build_word_graph().
        self.graph = nx.Graph()

    def candidate_selection(self, pos=None):
        """Select the longest sequences of words whose POS tag is in ``pos``.

        Args:
            pos: set of accepted POS tags; defaults to nouns, proper nouns and
                adjectives.
        """
        valid_pos = {'NOUN', 'PROPN', 'ADJ'} if pos is None else pos
        self.longest_pos_sequence_selection(valid_pos=valid_pos)

    def build_word_graph(self, window=2, pos=None):
        """Add nodes and co-occurrence edges to ``self.graph``.

        Args:
            window: two valid words are connected when the second one is
                fewer than ``window`` positions after the first.
            pos: set of accepted POS tags; defaults to nouns, proper nouns and
                adjectives.
        """
        if pos is None:
            pos = {'NOUN', 'PROPN', 'ADJ'}

        # Flatten the document into (stem, has_valid_pos) pairs.
        flagged_words = []
        for sentence in self.sentences:
            for position, stem in enumerate(sentence.stems):
                flagged_words.append((stem, sentence.pos[position] in pos))

        self.graph.add_nodes_from([stem for stem, keep in flagged_words if keep])

        word_count = len(flagged_words)
        for left, (source, source_is_valid) in enumerate(flagged_words):
            if not source_is_valid:
                continue

            for right in range(left + 1, min(left + window, word_count)):
                target, target_is_valid = flagged_words[right]
                if target_is_valid and source != target:
                    self.graph.add_edge(source, target)

    def candidate_weighting(self,
                            window=2,
                            pos=None,
                            top_percent=None,
                            normalized=False):
        """Score the candidates with PageRank over the word graph.

        Args:
            window: co-occurrence window passed to ``build_word_graph``.
            pos: set of accepted POS tags; defaults to nouns, proper nouns and
                adjectives.
            top_percent: when given, candidates are selected here as the
                longest sequences of the top-ranked words (this fraction of
                the graph's nodes).
            normalized: divide a candidate's score by its number of words.
        """
        if pos is None:
            pos = {'NOUN', 'PROPN', 'ADJ'}

        self.build_word_graph(window=window, pos=pos)

        word_scores = nx.pagerank(self.graph, alpha=0.85, tol=0.0001, weight=None)

        if top_percent is not None:
            node_count = self.graph.number_of_nodes()
            keep_count = min(math.floor(node_count * top_percent), node_count)

            ranked_words = sorted(word_scores, key=word_scores.get, reverse=True)

            self.longest_keyword_sequence_selection(ranked_words[:int(keep_count)])

        for key, candidate in self.candidates.items():
            lexical_form = candidate.lexical_form
            weight = sum([word_scores[word] for word in lexical_form])
            if normalized:
                weight /= len(lexical_form)

            # The first offset scaled by 1e-8 separates equal scores.
            self.weights[key] = weight + (candidate.offsets[0] * 1e-8)

**NOTE:** The implementation of KeyBERT [(Grootendorst, 2020)](https://github.com/MaartenGr/KeyBERT) is based on [this](https://github.com/MaartenGr/KeyBERT) and tailored to fit our work.

In [None]:
!pip install keybert

from collections import Counter
from keybert.backend._utils import select_backend
from sklearn.metrics.pairwise import cosine_similarity

# Embedding backend used as the default `model` of the KeyBERT class defined below.
KeyBERT_model = select_backend("all-distilroberta-v1")

In [None]:
# @title KeyBERT

class KeyBERT(LoadFile):
    """KeyBERT-style ranker on top of the pke ``LoadFile`` base class.

    Every candidate is represented by one surface form, and its weight is the
    cosine similarity between the embedding of that form and the embedding of
    the whole document.
    """

    def __init__(self, model=KeyBERT_model):
        super(KeyBERT, self).__init__()
        self.model = model

    def candidate_weighting(self, input):
        """Replace ``self.weights`` with document/candidate cosine similarities.

        Args:
            input: the document; it is converted with ``str()`` before being
                embedded.

        The resulting ``self.weights`` is keyed by the representative surface
        form of each candidate (not by the candidate key) and holds the
        similarity rounded to 4 decimals. When the document has no candidates
        the weights are set to an empty dict.
        """
        # Imported here so the class does not depend on a later notebook cell
        # having imported torch.
        import torch

        doc = str(input)

        words = []
        for candidate in self.candidates.values():
            combined_surface_forms = [' '.join(form) for form in candidate.surface_forms]
            if not combined_surface_forms:
                continue

            form_counter = Counter(combined_surface_forms)
            max_frequency = max(form_counter.values())
            most_frequent_forms = {
                form for form, freq in form_counter.items() if freq == max_frequency
            }

            # Among the most frequent surface forms, keep the one that occurs
            # first in the document.
            for _offset, form in sorted(zip(candidate.offsets, combined_surface_forms)):
                if form in most_frequent_forms:
                    words.append(str(form))
                    break

        if not words:
            # Nothing to embed; embedding / similarity on an empty list fails.
            self.weights = {}
            return

        doc_embedding = self.model.embed(doc).reshape(1, -1)
        word_embeddings = self.model.embed(words)
        similarities = cosine_similarity(doc_embedding, word_embeddings)[0]
        self.weights = {word: round(float(sim), 4) for word, sim in zip(words, similarities)}

        # Drop the large intermediates before asking torch to release cached GPU memory.
        del doc_embedding
        del word_embeddings
        del similarities
        torch.cuda.empty_cache()

**NOTE:** The implementation of MDERank [(Zhang et al., 2022)](https://aclanthology.org/2022.findings-acl.34/) is based on [this](https://github.com/LinhanZ/mderank) and tailored to fit our work.

In [None]:
import zipfile
import urllib.request

# Download the Stanford CoreNLP 2018-02-27 archive and unpack it.
# NOTE(review): the /content/ paths are hard-coded — presumably a Colab
# runtime; adjust when running elsewhere.
url = "https://nlp.stanford.edu/software/stanford-corenlp-full-2018-02-27.zip"
file_name = "/content/stanford-corenlp-full-2018-02-27.zip"

urllib.request.urlretrieve(url, file_name)
with zipfile.ZipFile(file_name, 'r') as zip_path:
    zip_path.extractall("/content/")

In [None]:
!pip install stanfordcorenlp

In [None]:
import re
import torch
from accelerate import Accelerator
from transformers import BertForMaskedLM, BertTokenizer

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords as nltk_stopwords
from stanfordcorenlp import StanfordCoreNLP

# Chunk grammar handed to nltk.RegexpParser: an NP is any run of noun/adjective
# tags that ends with a noun tag.
GRAMMAR1 = """  NP:
        {<NN.*|JJ>*<NN.*>}"""

# English stopwords and the WordNet lemmatizer used by the MDERANK class below.
stopword_dict = set(nltk_stopwords.words('english'))
wnl=nltk.WordNetLemmatizer()

# CoreNLP wrapper pointed at the unpacked distribution; provides
# word_tokenize / pos_tag for MDERANK.
en_model = StanfordCoreNLP(r'/content/stanford-corenlp-full-2018-02-27',quiet=True)

# BERT tokenizer and masked-LM model used to embed original and masked documents.
MDERank_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MDERank_model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Use the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

MDERank_model.to(device)

# Fixed sequence length used when encoding documents for MDERank.
MAX_LEN =512

In [None]:
# @title MDERank

class MDERANK():
    """MDERank keyphrase ranker.

    Noun-phrase candidates are extracted from POS-tagged text. For every
    candidate, a copy of the document with the candidate replaced by ``[MASK]``
    tokens is embedded with BERT and compared to the embedding of the original
    document. Candidates are returned in ascending order of that cosine
    similarity, i.e. the candidate whose masking changes the document most
    comes first.
    """

    def __init__(self):
        self.tokens = []
        self.tokens_tagged = []
        self.keyphrase_candidate = []

    @staticmethod
    def remove(text):
        """Return True when stripping punctuation changes the number of words in ``text``."""
        text_len = len(text.split())
        remove_chars = '[’!"#$%&\'()*+,./:;<=>?@，。?★、…【】《》？“”‘’！[\\]^_`{|}~]+'
        text = re.sub(remove_chars, '', text)
        return text_len != len(text.split())

    def extract_candidates(self, tokens_tagged, no_subset=False):
        """Chunk ``tokens_tagged`` with GRAMMAR1 and return the noun phrases.

        Args:
            tokens_tagged: list of ``(token, pos_tag)`` pairs.
            no_subset: unused; kept for interface compatibility.

        Returns:
            List of ``(phrase, (start, end))`` where ``start``/``end`` are
            token positions in ``tokens_tagged``.
        """
        np_parser = nltk.RegexpParser(GRAMMAR1)
        keyphrase_candidate = []
        np_pos_tag_tokens = np_parser.parse(tokens_tagged)
        count = 0
        for token in np_pos_tag_tokens:
            if isinstance(token, nltk.tree.Tree) and token._label == "NP":
                leaves = token.leaves()
                phrase = ' '.join(word for word, tag in leaves)
                length = len(leaves)
                keyphrase_candidate.append((phrase, (count, count + length)))
                count += length
            else:
                count += 1

        return keyphrase_candidate

    @staticmethod
    def generate_absent_doc(ori_encode_dict, candidates):
        """Build one masked copy of the encoded document per usable candidate.

        Args:
            ori_encode_dict: tokenizer output for the original document,
                padded/truncated to ``MAX_LEN``.
            candidates: candidate phrases (strings).

        Returns:
            ``(doc_pairs, count)``: ``doc_pairs`` is a list of
            ``[ori_encode_dict, masked_encode_dict]`` and ``count`` is the
            number of candidates that were skipped (they contain punctuation,
            produce an invalid pattern, do not occur in the encoded document,
            or yield a masked document whose length is not ``MAX_LEN``).
        """
        count = 0
        doc_pairs = []
        ori_input_ids = ori_encode_dict["input_ids"].squeeze()
        ori_tokens = MDERank_tokenizer.convert_ids_to_tokens(ori_input_ids)
        # The space-joined word pieces are the same for every candidate.
        ori_doc = ' '.join(ori_tokens)

        for candidate in candidates:
            if MDERANK.remove(candidate):
                count += 1
                continue

            tok_candidate = MDERank_tokenizer.tokenize(candidate)
            mask = ' '.join(['[MASK]'] * len(tok_candidate))
            can_token = ' '.join(tok_candidate)

            try:
                candidate_re = re.compile(r"\b" + can_token + r"\b")
            except re.error:
                # The candidate's word pieces do not form a valid pattern.
                count += 1
                continue

            match = candidate_re.findall(ori_doc)
            if len(match) == 0:
                count += 1
                continue

            masked_tokens = candidate_re.sub(mask, ori_doc).split()
            masked_input_ids = MDERank_tokenizer.convert_tokens_to_ids(masked_tokens)
            len_masked_tokens = len(masked_tokens) - masked_tokens.count('[PAD]')

            if len(masked_input_ids) != MAX_LEN:
                count += 1
                continue

            # Attend to everything except the trailing padding.
            masked_attention_mask = torch.zeros(MAX_LEN, dtype=torch.long)
            masked_attention_mask[:len_masked_tokens] = 1
            masked_encode_dict = {
                "input_ids": torch.tensor(masked_input_ids, dtype=torch.long),
                "token_type_ids": torch.zeros(MAX_LEN, dtype=torch.long),
                "attention_mask": masked_attention_mask,
                "candidate": candidate,
                "freq": len(match)
            }
            doc_pairs.append([ori_encode_dict, masked_encode_dict])

        return doc_pairs, count

    @staticmethod
    def mean_pooling(model_output, attention_mask):
        """Average the last hidden layer over the positions selected by ``attention_mask``."""
        token_embeddings = model_output.hidden_states[-1]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    @staticmethod
    def get_all_dist(dist_list):
        """Group scores by lowercased, lemmatized phrase.

        Args:
            dist_list: iterable of ``(phrase, score)`` pairs.

        Returns:
            Dict mapping each normalized phrase to the list of its scores.
        """
        dist_all = {}
        for phrase, score in dist_list:
            normalized_phrase = wnl.lemmatize(phrase.lower())
            dist_all.setdefault(normalized_phrase, []).append(score)
        return dist_all

    @staticmethod
    def get_final_dist(dist_all, method="average"):
        """Reduce each phrase's score list to a single score.

        Args:
            dist_all: dict mapping phrase to a list of scores.
            method: only ``"average"`` is supported.

        Returns:
            Dict mapping phrase to its mean score; a phrase that is itself a
            stopword gets 0.0.

        Raises:
            ValueError: for any other ``method``.
        """
        if method != "average":
            raise ValueError("Unsupported method: {!r}".format(method))

        final_dist = {}
        for phrase, dist_list in dist_all.items():
            sum_dist = 0.0
            if phrase not in stopword_dict:
                for dist in dist_list:
                    sum_dist += dist
            final_dist[phrase] = sum_dist / float(len(dist_list))
        return final_dist

    def MDERank(self, text):
        """Rank the noun-phrase candidates of ``text``.

        Args:
            text: the document as a plain string.

        Returns:
            List of ``(stemmed_phrase, score)`` sorted by ascending score.
            When several phrases share a stem the lowest score is kept. An
            empty list is returned when no candidate can be scored.
        """
        self.tokens = en_model.word_tokenize(text)
        self.tokens_tagged = en_model.pos_tag(text)
        assert len(self.tokens) == len(self.tokens_tagged)
        # Re-tag stopwords so the chunk grammar never includes them in an NP.
        for i, token in enumerate(self.tokens):
            if token.lower() in stopword_dict:
                self.tokens_tagged[i] = (token, "IN")
        self.keyphrase_candidate = self.extract_candidates(self.tokens_tagged)

        candidates = [phrase.lower() for phrase, _span in self.keyphrase_candidate]

        # Encode the `text` argument itself rather than any same-named global.
        ori_encode_dict = MDERank_tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        doc_pairs, _skipped = MDERANK.generate_absent_doc(ori_encode_dict, candidates)
        if not doc_pairs:
            return []

        MDERank_model.eval()

        # The original document is identical in every pair: embed it once.
        ori_input_ids = ori_encode_dict["input_ids"].squeeze(0).to(device)
        ori_token_type_ids = ori_encode_dict["token_type_ids"].squeeze(0).to(device)
        ori_attention_mask = ori_encode_dict["attention_mask"].squeeze(0).to(device)
        with torch.no_grad():
            ori_outputs = MDERank_model(
                input_ids=ori_input_ids.unsqueeze(0),
                attention_mask=ori_attention_mask.unsqueeze(0),
                token_type_ids=ori_token_type_ids.unsqueeze(0),
                output_hidden_states=True
            )
            ori_doc_embed = MDERANK.mean_pooling(ori_outputs, ori_attention_mask)

        dist_list = []
        for _original, masked in doc_pairs:
            masked_input_ids = masked["input_ids"].squeeze(0).to(device)
            masked_token_type_ids = masked["token_type_ids"].squeeze(0).to(device)
            masked_attention_mask = masked["attention_mask"].squeeze(0).to(device)

            with torch.no_grad():
                masked_outputs = MDERank_model(
                    input_ids=masked_input_ids.unsqueeze(0),
                    attention_mask=masked_attention_mask.unsqueeze(0),
                    token_type_ids=masked_token_type_ids.unsqueeze(0),
                    output_hidden_states=True
                )
                masked_doc_embed = MDERANK.mean_pooling(masked_outputs, masked_attention_mask)

            similarity = torch.cosine_similarity(ori_doc_embed, masked_doc_embed, dim=1).cpu()
            dist_list.append((masked["candidate"], similarity.item()))

        # Aggregate once, after every candidate has been scored.
        dist_all = MDERANK.get_all_dist(dist_list)
        dist_final = MDERANK.get_final_dist(dist_all, method='average')

        seen_stems = {}
        for phrase, score in dist_final.items():
            stems = [stemmer.stem(token.text.lower()) for token in nlp(phrase)]
            stemmed_phrase = " ".join(stems)

            # Lower similarity means more important, so keep the minimum.
            if stemmed_phrase not in seen_stems or score < seen_stems[stemmed_phrase]:
                seen_stems[stemmed_phrase] = score

        return sorted(seen_stems.items(), key=lambda x: x[1], reverse=False)

**NOTE:** The implementation of LMRank [(Giarelis and Karacapilidis, 2023)](https://ieeexplore.ieee.org/document/10179894) is based on the [authors' reference implementation](https://github.com/NC0DER/LMRank/blob/main/LMRank/model.py) and tailored to fit our work.

In [None]:
!pip install faiss-cpu

In [None]:
from __future__ import annotations
import torch
import numpy.typing
import faiss

from itertools import groupby
from operator import itemgetter
from difflib import get_close_matches
from sentence_transformers import SentenceTransformer
from typing import TypeVar, List, Tuple, Any

Model = TypeVar('Model')

In [None]:
# @title LMRank

class LMRANK():
    """LMRank keyphrase ranker.

    Candidates are spaCy noun chunks. They are ranked by the cosine similarity
    between their sentence-transformer embedding and the document embedding,
    optionally multiplied by a positional score that favours candidates
    appearing early in the document.
    """

    def __init__(self: LMRANK) -> None:

        self.model = SentenceTransformer('all-mpnet-base-v2')
        self.text = None
        self.doc = None

    @staticmethod
    def remove_last_seps(string: str, seps: str = '!?.') -> str:
        """Strip trailing separator characters from ``string``.

        Only characters at the end of the string are removed; separators
        inside the phrase are left untouched.
        """
        return string.rstrip(seps)

    @staticmethod
    def find_nth_occurence(string: str, substring: str, start: int, end: int, n: int) -> int:
        """Return the index of the n-th occurrence of ``substring``, or -1.

        The first occurrence is searched within ``string[start:end]``; the
        following ones are searched from the previous hit onwards without an
        upper bound.
        """
        i = string.find(substring, start, end)
        while i >= 0 and n > 1:
            i = string.find(substring, i + len(substring))
            n -= 1
        return i

    @staticmethod
    def create_chunks(string: str, max_token_length: int, token_sep: str = ' ') -> List[str]:
        """Cut ``string`` into consecutive chunks at every ``max_token_length``-th separator.

        Each chunk after the first starts at the separator that ended the
        previous chunk. The last chunk runs to the end of the string.

        NOTE(review): with ``max_token_length == 1`` the second chunk would
        start and end on the same separator, so the loop would not advance;
        callers here pass the model's maximum sequence length.
        """
        chunk_ranges = []
        chunk_end = 0

        while chunk_end < len(string):
            chunk_start = chunk_end

            next_sep_pos = LMRANK.find_nth_occurence(
                string, token_sep, chunk_start, len(string),
                max_token_length
            )

            chunk_end = len(string) if next_sep_pos == -1 else next_sep_pos
            chunk_ranges.append((chunk_start, chunk_end))

        return [string[i:j] for (i, j) in chunk_ranges]

    def extract_candidate_keyphrases(
            self: LMRANK, text: str, sentence_seps: str = '!?.',
            deduplicate: bool = True, keep_nouns_adjs: bool = True,
        ) -> List[Tuple[str, int]]:
        """Extract candidate keyphrases (noun chunks) from ``text``.

        Args:
            text: the raw document; whitespace is normalized first.
            sentence_seps: characters stripped from the end of each candidate.
            deduplicate: remove near-duplicate candidates by string similarity.
            keep_nouns_adjs: keep only chunks made solely of nouns/adjectives.

        Returns:
            List of ``(lowercased phrase, token index of its first occurrence)``.
        """
        self.text = ' '.join(text.split())

        self.doc = nlp(self.text)

        candidate_keyphrases = [
            (LMRANK.remove_last_seps(chunk.text.lower(), sentence_seps), chunk.start)
            for chunk in self.doc.noun_chunks
            if chunk.text.lower() not in nlp.Defaults.stop_words
            and chunk[0].pos_ not in {'PRON', 'PART'}
            and all(
                term.pos_ in {'NOUN', 'ADJ'}
                if keep_nouns_adjs else True for term in chunk
            )
            and len(chunk.text) > 2
            and not chunk.text[:1].isdigit()
            and not any(term.like_url or term.like_email for term in chunk)
        ]

        # Keep one entry per phrase; the stable sort leaves occurrences in
        # document order, so the first of each group is the earliest position.
        candidate_keyphrases = {
            key: next(group)[1]
            for key, group in groupby(
                sorted(candidate_keyphrases, key = itemgetter(0)),
                itemgetter(0))
        }

        if deduplicate:
            string_similarity = 0.65

            for item in list(candidate_keyphrases):
                # The best match is dropped on the assumption that it is `item` itself.
                close_matches = get_close_matches(item, candidate_keyphrases.keys(),
                                                  cutoff = string_similarity, n = 10)[1:]
                for close_match in close_matches:
                    if not item.count(' '):
                        # A single word that resembles another candidate is dropped.
                        candidate_keyphrases.pop(item, None)
                        break
                    elif (len(close_match) > len(item)
                            and len(get_close_matches(item, [close_match], n = 1, cutoff = string_similarity))):
                        candidate_keyphrases.pop(close_match, None)

        return list(candidate_keyphrases.items())

    def encode(
            self: LMRANK, string_list: List[str],
            multi_processing: bool = False, device: str = 'cuda'
        ) -> numpy.typing.NDArray[Any]:
        """Embed ``string_list`` with the sentence-transformer model.

        Args:
            string_list: the string(s) to embed.
            multi_processing: use the model's multi-process pool.
            device: target device; a CUDA device falls back to ``'cpu'`` when
                CUDA is not available.
        """
        # Local import: keeps this method independent of which notebook cell
        # imported torch.
        import torch

        if str(device).startswith('cuda') and not torch.cuda.is_available():
            device = 'cpu'

        model = self.model

        if multi_processing:
            pool = model.start_multi_process_pool(target_devices = [device])
            embeddings = model.encode_multi_process(string_list, pool)
            model.stop_multi_process_pool(pool)
        else:
            embeddings = model.encode(string_list, device = device)
        return embeddings

    def model_token_length(self: LMRANK) -> int:
        """Return the model's maximum sequence length."""
        return self.model.max_seq_length

    def get_keyphrases_embeddings(
            self: LMRANK, candidate_keyphrases: List[Tuple[str, int]]) -> numpy.typing.NDArray[Any]:
        """Embed the phrase part of every ``(phrase, position)`` candidate."""
        return self.encode([keyphrase for keyphrase, _ in candidate_keyphrases])

    def get_document_embedding(self: LMRANK) -> numpy.typing.NDArray[numpy.float32]:
        """Embed the current document.

        A document with more spaCy tokens than the model's maximum sequence
        length is split into chunks, and the chunk embeddings are averaged.
        """
        if len(self.doc) <= self.model_token_length():
            document_embedding = self.encode(self.text)
        else:
            chunks = LMRANK.create_chunks(self.text, self.model_token_length())
            document_embedding = numpy.mean(
                self.encode(chunks), axis = 0
            )

        return document_embedding

    def calculate_positional_scores(
            self: LMRANK, candidate_keyphrases: List[Tuple[str, int]]
        ) -> numpy.typing.NDArray[numpy.float32]:
        """Return a softmax over ``1 / (position + 1)``, in candidate order.

        Element ``j`` of the result belongs to ``candidate_keyphrases[j]``.
        """
        scores = numpy.array([
            1 / (position + 1)
            for _, position in candidate_keyphrases
        ])

        # Subtracting the maximum keeps the exponentials numerically stable.
        e_scores = numpy.exp(scores - numpy.max(scores))
        return e_scores / e_scores.sum(axis = 0)

    def LMRank(
            self: LMRANK, text: str, sentence_seps: str = '.?!',
            deduplicate: bool = False, keep_nouns_adjs: bool = True, positional_feature: bool = True
        ) -> List[Tuple[str, float]]:
        """Rank the candidate keyphrases of ``text``.

        Args:
            text: the raw document.
            sentence_seps: characters stripped from the end of each candidate.
            deduplicate: forwarded to ``extract_candidate_keyphrases``.
            keep_nouns_adjs: forwarded to ``extract_candidate_keyphrases``.
            positional_feature: multiply each similarity by the candidate's
                positional score.

        Returns:
            List of ``(stemmed phrase, score)`` sorted by descending score;
            when several phrases share a stem the highest score is kept.
            Empty when the document yields no candidates.
        """
        candidate_keyphrases = self.extract_candidate_keyphrases(
            text, sentence_seps, deduplicate, keep_nouns_adjs
        )

        if not candidate_keyphrases:
            return []

        embeddings = self.get_keyphrases_embeddings(candidate_keyphrases)
        document_embedding = numpy.atleast_2d(self.get_document_embedding())

        unranked_ids = numpy.arange(len(embeddings), dtype=numpy.int64)

        embedding_dim = len(embeddings[0])

        # Inner product over L2-normalized vectors equals cosine similarity.
        index = faiss.index_factory(embedding_dim, 'IDMap,Flat', faiss.METRIC_INNER_PRODUCT)

        faiss.normalize_L2(embeddings)

        index.add_with_ids(embeddings, unranked_ids)

        faiss.normalize_L2(document_embedding)

        similarities, ranked_ids = index.search(document_embedding, len(candidate_keyphrases))

        if positional_feature:
            scores = self.calculate_positional_scores(candidate_keyphrases)

            # `scores` is in candidate order while `ranked_ids` is in
            # similarity order, so each score is looked up by candidate id.
            ranked_list = [
                (candidate_keyphrases[key_id][0], float(sim * scores[key_id]))
                for key_id, sim in zip(ranked_ids[0], similarities[0])]

        else:
            ranked_list = [
                (candidate_keyphrases[key_id][0], float(sim))
                for key_id, sim in zip(ranked_ids[0], similarities[0])]

        seen_stems = {}
        for phrase, score in ranked_list:
            stems = [stemmer.stem(token.text.lower()) for token in nlp(phrase)]
            stemmed_phrase = " ".join(stems)

            # Higher is better here, so keep the maximum per stem.
            if stemmed_phrase not in seen_stems or score > seen_stems[stemmed_phrase]:
                seen_stems[stemmed_phrase] = score

        return sorted(seen_stems.items(), key=lambda x: x[1], reverse=True)

In [None]:
from pke.unsupervised import *
from timeit import default_timer as timer

In [None]:
# POS tags and chunk grammar shared by several extractors below.
_POS_TAGS = {'NOUN', 'PROPN', 'ADJ'}
_NP_GRAMMAR = "NP: {<ADJ>*<NOUN|PROPN>+}"


def _stem_phrase(phrase):
    """Tokenize `phrase` with spaCy and join the lowercased stems with spaces."""
    return " ".join(stemmer.stem(token.text.lower()) for token in nlp(phrase))


def _select_and_weight(model_name, extractor, doc):
    """Load `doc` into a pke-style extractor and run its selection/weighting steps."""
    extractor.load_document(input=doc, language='en')

    if model_name == "KPMiner":
        extractor.candidate_selection(lasf=3, cutoff=400)
        extractor.candidate_weighting(df=df)
    elif model_name == "YAKE":
        extractor.candidate_selection(n=3)
        extractor.candidate_weighting(use_stems=True)
    elif model_name == "TextRank1":
        # With top_percent set, candidates are selected inside candidate_weighting.
        extractor.candidate_weighting(window=2, pos=_POS_TAGS, top_percent=0.33)
    elif model_name == "SingleRank":
        extractor.candidate_selection(pos=_POS_TAGS)
        extractor.candidate_weighting(window=10, pos=_POS_TAGS)
    elif model_name == "PositionRank":
        extractor.candidate_selection(grammar=_NP_GRAMMAR, maximum_word_number=3)
        extractor.candidate_weighting(window=10, pos=_POS_TAGS)
    elif model_name == "KeyBERT":
        extractor.grammar_selection(grammar=_NP_GRAMMAR)
        extractor.candidate_weighting(input=doc)
    elif model_name == "TfIdf":
        extractor.candidate_selection(n=3)
        extractor.candidate_weighting(df=df)
    else:
        extractor.candidate_selection(n=3)
        extractor.candidate_weighting()


# outputs:  per model, per document, the top-n stemmed phrases
#           (n = number of gold keyphrases of that document).
# outputs2: per model, per document, every scored phrase with its score.
outputs = {}
outputs2 = {}
elapsed_times = {}
for model in [TF, TfIdf, KPMiner, YAKE, TextRank1, SingleRank, PositionRank, KeyBERT, MDERANK, LMRANK]:
    model_name = model.__name__
    outputs[model_name] = []
    outputs2[model_name] = []

    extractor = model()
    start = timer()

    for i, doc in enumerate(tqdm(all_data)):
        # Always derived from the current document, for every model, so no
        # model reuses a value left over from a previous loop.
        n_dynamic = len(references[i])

        if model_name == "MDERANK":
            # `doc` stays bound to the plain string while MDERank runs.
            doc = str(doc)
            ranked = extractor.MDERank(text=doc)
        elif model_name == "LMRANK":
            ranked = extractor.LMRank(text=str(doc))
        else:
            _select_and_weight(model_name, extractor, doc)
            outputs[model_name].append([u for u, v in extractor.get_n_best(n=n_dynamic, stemming=True)])
            outputs2[model_name].append(dict(extractor.weights))
            continue

        outputs[model_name].append([u for u, v in ranked[:n_dynamic]])
        outputs2[model_name].append({u: v for u, v in ranked})

    end = timer()
    elapsed_times[model_name] = end - start

    if model_name == "KeyBERT":
        # KeyBERT weights are keyed by surface form, so its phrases are
        # stemmed here (outside the timed section) to match the references.
        for i, doc_candidates in enumerate(outputs[model_name]):
            outputs[model_name][i] = [_stem_phrase(candidate) for candidate in doc_candidates]
        for i, doc_phrases in enumerate(outputs2[model_name]):
            stemmed_phrases = {}
            for phrase, score in doc_phrases.items():
                stemmed_phrases[_stem_phrase(phrase)] = score
            outputs2[model_name][i] = stemmed_phrases

In [None]:
def evaluate_exact(top_N_candidates, references, cutoff=None):
    """Score ranked candidates against references by exact phrase match.

    Args:
        top_N_candidates: Ranked sequence of predicted phrases.
        references: Sequence of reference phrases.
        cutoff: Number of leading candidates to score. ``None`` means
            ``len(references)``. The value is capped at the number of
            candidates available.

    Returns:
        Tuple ``(P, R, F)`` with precision, recall and F1. A component whose
        denominator would be zero (no candidates kept, or no references) is
        reported as 0 instead of raising ``ZeroDivisionError``.
    """
    if cutoff is None:
        cutoff = len(references)
    cutoff = min(cutoff, len(top_N_candidates))

    selected = top_N_candidates[:cutoff]
    # Duplicates collapse in the intersection, but precision is still divided
    # by the raw number of kept candidates.
    matches = len(set(selected) & set(references))

    # Guard both denominators the same way evaluate_partial does, so that
    # empty references (which force cutoff to 0) or an empty candidate list
    # yield zero scores rather than a crash.
    P = matches / len(selected) if len(selected) > 0 else 0
    R = matches / len(references) if len(references) > 0 else 0
    F = (2 * P * R) / (P + R) if (P + R) > 0 else 0
    return (P, R, F)

def split_into_tokens(phrases):
    """Return the set of whitespace-separated tokens found across ``phrases``."""
    return {token for phrase in phrases for token in phrase.split()}

def evaluate_partial(top_N_candidates, references, cutoff=None):
    """Score ranked candidates against references by token overlap.

    The leading ``cutoff`` candidates and all references are each reduced to
    a set of whitespace tokens via ``split_into_tokens``; precision, recall
    and F1 are computed on those two token sets.

    Args:
        top_N_candidates: Ranked sequence of predicted phrases.
        references: Sequence of reference phrases.
        cutoff: Number of leading candidates to score. ``None`` means
            ``len(references)``. The value is capped at the number of
            candidates available.

    Returns:
        Tuple ``(P, R, F)``; a component with an empty denominator is 0.
    """
    limit = len(references) if cutoff is None else cutoff
    limit = min(limit, len(top_N_candidates))

    pred_vocab = split_into_tokens(top_N_candidates[:limit])
    gold_vocab = split_into_tokens(references)
    shared = len(pred_vocab & gold_vocab)

    if pred_vocab:
        P = shared / len(pred_vocab)
    else:
        P = 0

    if gold_vocab:
        R = shared / len(gold_vocab)
    else:
        R = 0

    if (P + R) > 0:
        F = (2 * P * R) / (P + R)
    else:
        F = 0
    return (P, R, F)

def evaluate_harmonic(F1, pF1):
    """Return the harmonic mean of ``F1`` and ``pF1``, or 0 when their sum is not positive."""
    if (F1 + pF1) > 0:
        return (2 * F1 * pF1) / (F1 + pF1)
    return 0

In [None]:
import numpy as np

In [None]:
# Print a Markdown table with one row per model: throughput (documents per
# second of measured extraction time) and the mean exact, partial and
# harmonic F-scores over all documents. The same means are kept in `results`.
print("## Benchmarking on {}".format(benchmark))
print("| Model       | it/s |     F    |   pF    |    hF    |")
print("| :---------- | ----:| -------: | ------: | -------: |")

row_template = "| {}  | {:.5f} | {:.5f} | {:.5f} | {:.5f} |"
results = []

for model in outputs:
    scores_exact, scores_partial, scores_harmonic = [], [], []

    for i, output in enumerate(outputs[model]):
        if output:
            P_exact, R_exact, F_exact = evaluate_exact(output, references[i], cutoff=None)
            P_partial, R_partial, F_partial = evaluate_partial(output, references[i], cutoff=None)
            hF1 = evaluate_harmonic(F_exact, F_partial)
        else:
            # A document without any predicted phrase scores zero everywhere.
            P_exact = R_exact = F_exact = 0
            P_partial = R_partial = F_partial = 0
            hF1 = 0

        scores_exact.append((P_exact, R_exact, F_exact))
        scores_partial.append((P_partial, R_partial, F_partial))
        scores_harmonic.append(hF1)

    # Average each (P, R, F) column over the documents.
    P_exact, R_exact, F_exact = np.mean(scores_exact, axis=0)
    P_partial, R_partial, F_partial = np.mean(scores_partial, axis=0)
    hF1_mean = np.mean(scores_harmonic)

    docs_per_second = len(all_data) / elapsed_times[model]
    print(row_template.format(model, docs_per_second, F_exact, F_partial, hF1_mean))

    results.append({
        "Model": model,
        "F": F_exact,
        "pF": F_partial,
        "hF": hF1_mean
    })

In [None]:
import pandas as pd

In [None]:
# Collect the per-model mean scores gathered in `results` into a table
# with the columns Model, F, pF and hF.
df_results = pd.DataFrame(results)

In [None]:
# Show the results table as this cell's output.
df_results

In [None]:
import pickle

In [None]:
# Persist the results table to disk as a pickle file.
with open("KDD_results.pkl", "wb") as f:
    pickle.dump(df_results, f)

In [None]:
import scipy.stats

In [None]:
def normalize_outputs(outputs2):
    """Rank-normalize every document's candidate scores into [0, 1].

    For each document the raw scores are replaced by their dense ranks
    (``scipy.stats.rankdata(..., method="dense")``) and min-max scaled.
    Depending on the method, the scaled value either grows with the raw
    score or is inverted so that it shrinks with the raw score.

    Args:
        outputs2: Mapping ``method name -> list of {phrase: raw score}``
            dicts, one dict per document.

    Returns:
        Mapping with the same method names, each holding a list of
        ``{phrase: normalized score}`` dicts aligned with the input list.
        An empty document stays ``{}``; a document whose scores are all
        tied maps every phrase to ``0.0``.

    Raises:
        ValueError: If a method name is in neither direction group and one
            of its documents has at least two distinct scores, so a
            direction is required. Previously this case either raised
            ``UnboundLocalError`` or silently reused the preceding
            document's normalized dict.
    """
    # Methods whose normalized value increases with the raw score.
    direct_methods = {"TF", "TfIdf", "KPMiner", "TextRank1", "SingleRank",
                      "PositionRank", "KeyBERT", "LMRANK"}
    # Methods whose normalized value decreases with the raw score
    # (presumably because a lower raw score marks a better candidate for
    # these methods; confirm against the extractors).
    inverted_methods = {"YAKE", "MDERANK"}

    outputs3 = {}

    for method, documents in outputs2.items():
        normalized_documents = []

        for doc in documents:
            if not doc:
                normalized_documents.append({})
                continue

            ranks = scipy.stats.rankdata(list(doc.values()), method="dense")
            rank_dict = dict(zip(doc.keys(), ranks))

            x_min = min(ranks)
            x_max = max(ranks)

            if x_max == x_min:
                # All scores tied: no spread to scale, and no direction needed.
                norm_doc = {k: 0.0 for k in doc}
            elif method in direct_methods:
                norm_doc = {k: float((rank_dict[k] - x_min) / (x_max - x_min)) for k in doc}
            elif method in inverted_methods:
                norm_doc = {k: float((x_max - rank_dict[k]) / (x_max - x_min)) for k in doc}
            else:
                # Fail loudly instead of appending an unbound or stale norm_doc.
                raise ValueError(
                    "normalize_outputs: no score direction defined for method {!r}".format(method)
                )

            normalized_documents.append(norm_doc)

        outputs3[method] = normalized_documents

    return outputs3

In [None]:
# Rank-normalize the raw per-document candidate scores of every method.
outputs3 = normalize_outputs(outputs2)

In [None]:
from collections import defaultdict

In [None]:
def build_DTM(outputs, outputs3, method, zeta=0.5, top_k=50):
    """Build a document-term matrix over at most ``top_k`` phrases.

    Columns are chosen in two steps: first the ``top_k - round(top_k * zeta)``
    phrases that occur most often in ``outputs[method]``, then the remaining
    slots are filled with the phrases that have the highest summed score in
    ``outputs3[method]`` and are not already selected.

    Args:
        outputs: Mapping ``method -> list of phrase lists``, one list per
            document; used for the frequency counts.
        outputs3: Mapping ``method -> list of {phrase: score}`` dicts, one
            per document; used for score sums and for the matrix cells.
        method: Key selecting the method in both mappings.
        zeta: Share of the ``top_k`` columns reserved for score-ranked
            phrases; the rest go to frequency-ranked phrases.
        top_k: Maximum number of columns.

    Returns:
        ``pandas.DataFrame`` with one row per document of
        ``outputs3[method]`` and one column per selected phrase. Each cell
        is that document's score for the phrase, or 0 when it is absent.
    """
    docs_phrases = outputs[method]
    docs_scores = outputs3[method]

    # Note: round() rounds halves to the nearest even integer
    # (e.g. round(12.5) == 12), which decides the split for odd products.
    k_score = round(top_k * zeta)
    k_freq = top_k - k_score

    # Corpus-level score of each phrase: sum of its per-document scores.
    score_sum = defaultdict(float)
    for doc_scores in docs_scores:
        for phrase, score in doc_scores.items():
            score_sum[phrase] += score
    top_score_ranked = sorted(score_sum.items(), key=lambda x: x[1], reverse=True)

    freq_counter = Counter()
    for phrases in docs_phrases:
        freq_counter.update(phrases)
    most_common = freq_counter.most_common()

    # Take the k_freq most frequent phrases plus every phrase tied with the
    # last one taken, so the tie at the boundary can be resolved below.
    freq_terms = []
    last_freq = None
    for i, (phrase, freq) in enumerate(most_common):
        if i < k_freq:
            freq_terms.append((phrase, freq))
            last_freq = freq
        elif freq == last_freq:
            freq_terms.append((phrase, freq))
        else:
            break

    if len(freq_terms) > k_freq:
        # Too many phrases because of a tie at the boundary frequency:
        # keep everything above the tie and fill the remaining slots with
        # the tied phrases that have the highest summed score.
        target_freq = freq_terms[k_freq - 1][1]
        tie_start_idx = next(i for i, (_, freq) in enumerate(freq_terms) if freq == target_freq)
        tied_candidates = [x for x in freq_terms[tie_start_idx:] if x[1] == target_freq]

        tied_candidates_sorted = sorted(
            tied_candidates,
            key=lambda x: score_sum.get(x[0], 0),
            reverse=True
        )

        num_needed = k_freq - tie_start_idx
        freq_terms = freq_terms[:tie_start_idx] + tied_candidates_sorted[:num_needed]

    top_freq_terms = [phrase for phrase, _ in freq_terms]

    selected_terms = set(top_freq_terms)
    final_phrases = list(top_freq_terms)

    for phrase, _ in top_score_ranked:
        # Check the cap before appending. Checking `== top_k` only after an
        # append lets the list overshoot when the frequency-ranked phrases
        # already fill all top_k slots (k_score == 0), after which every
        # remaining phrase would be added.
        if len(final_phrases) >= top_k:
            break
        if phrase not in selected_terms:
            final_phrases.append(phrase)
            selected_terms.add(phrase)

    matrix = [
        [doc_scores.get(term, 0) for term in final_phrases]
        for doc_scores in docs_scores
    ]

    df_matrix = pd.DataFrame(matrix, columns=final_phrases)
    return df_matrix

In [None]:
def save_all_dtms(outputs, outputs3, dataset_name, method_list, zeta_list=[0.25, 0.5, 0.75], top_k=50):
    """Build one DTM per (method, zeta) pair, pickle them together and return them.

    Args:
        outputs: Passed through to ``build_DTM``.
        outputs3: Passed through to ``build_DTM``.
        dataset_name: Prefix for the dictionary keys and the pickle file name.
        method_list: Method names to build matrices for.
        zeta_list: ``zeta`` values to build a matrix for, per method.
        top_k: Passed through to ``build_DTM``.

    Returns:
        Dict mapping ``"{dataset_name}_{method}_zeta{zeta}"`` (with the
        decimal point of ``zeta`` replaced by ``_``) to the matrix returned
        by ``build_DTM``. The same dict is written to
        ``"{dataset_name}_dtm_dict.pkl"`` in the working directory.
    """
    dtm_dict = {}

    for method in method_list:
        for zeta in zeta_list:
            matrix = build_DTM(outputs, outputs3, method, zeta=zeta, top_k=top_k)
            # "0.25" -> "0_25" so the key contains no dot.
            zeta_tag = str(zeta).replace('.', '_')
            dtm_dict[f"{dataset_name}_{method}_zeta{zeta_tag}"] = matrix

    with open(f"{dataset_name}_dtm_dict.pkl", "wb") as handle:
        pickle.dump(dtm_dict, handle)

    return dtm_dict

In [None]:
# Build the document-term matrices for every listed method (using the
# default zeta values and top_k of save_all_dtms) and pickle them under
# the dataset name.
dataset_name = "KDD"
method_list = ["TF", "TfIdf", "KPMiner", "YAKE", "TextRank1", "SingleRank", "PositionRank", "KeyBERT", "MDERANK", "LMRANK"]
dtms = save_all_dtms(outputs, outputs3, dataset_name, method_list)

In [None]:
# Reload the pickled DTM dictionary from disk. The file name matches what
# save_all_dtms writes for dataset_name "KDD".
# NOTE: pickle.load can execute arbitrary code; only load files produced
# by this notebook or another trusted source.
with open("KDD_dtm_dict.pkl", "rb") as f:
    DTM_dict = pickle.load(f)

In [None]:
# List the keys of the reloaded dictionary.
print(DTM_dict.keys())