In [1]:
#!pip install keybert

Collecting transformers<3.6.0,>=3.1.0
  Downloading transformers-3.5.1-py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 2.9 MB/s 
Collecting tokenizers==0.9.3
  Downloading tokenizers-0.9.3-cp38-cp38-macosx_10_11_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 31.8 MB/s 
[31mERROR: tner 0.0.0 has requirement jinja2==2.11.2, but you'll have jinja2 2.11.3 which is incompatible.[0m
[31mERROR: textattack 0.2.14 has requirement numpy<1.19.0, but you'll have numpy 1.20.1 which is incompatible.[0m
[31mERROR: textattack 0.2.14 has requirement scipy==1.4.1, but you'll have scipy 1.6.0 which is incompatible.[0m
[31mERROR: textattack 0.2.14 has requirement tokenizers==0.8.1-rc2, but you'll have tokenizers 0.9.3 which is incompatible.[0m
[31mERROR: textattack 0.2.14 has requirement tqdm<4.50.0,>=4.27, but you'll have tqdm 4.56.0 which is incompatible.[0m
[31mERROR: textattack 0.2.14 has requirement transformers==3.3.0, but you'll have tra

In [5]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple


def mmr(doc_embedding: np.ndarray,
        word_embeddings: np.ndarray,
        words: List[str],
        top_n: int = 5,
        diversity: float = 0.8) -> List[Tuple[str, float]]:
    """ Calculate Maximal Marginal Relevance (MMR)
    between candidate keywords and the document.
    MMR considers the similarity of keywords/keyphrases with the
    document, along with the similarity of already selected
    keywords and keyphrases. This results in a selection of keywords
    that maximize their within diversity with respect to the document.
    Arguments:
        doc_embedding: The document embeddings
        word_embeddings: The embeddings of the selected candidate keywords/phrases
        words: The selected candidate keywords/keyphrases
        top_n: The number of keywords/keyhprases to return
        diversity: How diverse the select keywords/keyphrases are.
                   Values between 0 and 1 with 0 being not diverse at all
                   and 1 being most diverse.
    Returns:
         List[Tuple[str, float]]: The selected keywords/keyphrases with their distances
    """

    # Extract similarity within words, and between words and the document
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Initialize candidates and already choose best keyword/keyphras
    keywords_idx = [np.argmax(word_doc_similarity)]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(top_n - 1):
        # Extract similarities within candidates and
        # between candidates and selected keywords/phrases
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate MMR
        mmr = (1-diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr)]

        # Update keywords & candidates
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [(words[idx], round(float(word_doc_similarity.reshape(1, -1)[0][idx]), 4)) for idx in keywords_idx]

In [6]:
import numpy as np
import itertools
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple


def max_sum_similarity(doc_embedding: np.ndarray,
                       word_embeddings: np.ndarray,
                       words: List[str],
                       top_n: int,
                       nr_candidates: int) -> List[Tuple[str, float]]:
    """ Calculate Max Sum Distance for extraction of keywords
    We take the 2 x top_n most similar words/phrases to the document.
    Then, we take all top_n combinations from the 2 x top_n words and
    extract the combination that are the least similar to each other
    by cosine similarity.
    NOTE:
        This is O(n^2) and therefore not advised if you use a large top_n
    Arguments:
        doc_embedding: The document embeddings
        word_embeddings: The embeddings of the selected candidate keywords/phrases
        words: The selected candidate keywords/keyphrases
        top_n: The number of keywords/keyhprases to return
        nr_candidates: The number of candidates to consider
    Returns:
         List[Tuple[str, float]]: The selected keywords/keyphrases with their distances
    """
    if nr_candidates < top_n:
        raise Exception("Make sure that the number of candidates exceeds the number "
                        "of keywords to return.")

    # Calculate distances and extract keywords
    distances = cosine_similarity(doc_embedding, word_embeddings)
    distances_words = cosine_similarity(word_embeddings, word_embeddings)

    # Get 2*top_n words as candidates based on cosine similarity
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    words_vals = [words[index] for index in words_idx]
    candidates = distances_words[np.ix_(words_idx, words_idx)]

    # Calculate the combination of words that are the least similar to each other
    min_sim = 100_000
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum([candidates[i][j] for i in combination for j in combination if i != j])
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [(words_vals[idx], round(float(distances[0][idx]), 4)) for idx in candidate]


In [7]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import numpy as np
from tqdm import tqdm
from typing import List, Union, Tuple
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Flair
try:
    from flair.embeddings import DocumentEmbeddings, TokenEmbeddings, DocumentPoolEmbeddings
    from flair.data import Sentence
    _HAS_FLAIR = True
except ModuleNotFoundError as e:
    DocumentEmbeddings, TokenEmbeddings, DocumentPoolEmbeddings = None, None, None
    _HAS_FLAIR = False


class KeyBERT:
    """
    A minimal method for keyword extraction with BERT
    The keyword extraction is done by finding the sub-phrases in
    a document that are the most similar to the document itself.
    First, document embeddings are extracted with BERT to get a
    document-level representation. Then, word embeddings are extracted
    for N-gram words/phrases. Finally, we use cosine similarity to find the
    words/phrases that are the most similar to the document.
    The most similar words could then be identified as the words that
    best describe the entire document.
    """
    def __init__(self,
                 model: Union[str,
                              SentenceTransformer,
                              DocumentEmbeddings,
                              TokenEmbeddings] = 'distilbert-base-nli-mean-tokens'):
        """ KeyBERT initialization
        Arguments:
            model: Use a custom embedding model. You can pass in a string related
                   to one of the following models:
                   https://www.sbert.net/docs/pretrained_models.html
                   You can also pass in a SentenceTransformer() model or a Flair
                   DocumentEmbedding model.
        """
        self.model = self._select_embedding_model(model)

    def extract_keywords(self,
                         docs: Union[str, List[str]],
                         keyphrase_ngram_range: Tuple[int, int] = (1, 1),
                         stop_words: Union[str, List[str]] = 'english',
                         top_n: int = 5,
                         min_df: int = 1,
                         use_maxsum: bool = False,
                         use_mmr: bool = False,
                         diversity: float = 0.5,
                         nr_candidates: int = 20,
                         vectorizer: CountVectorizer = None) -> Union[List[Tuple[str, float]],
                                                                      List[List[Tuple[str, float]]]]:
        """ Extract keywords/keyphrases
        NOTE:
            I would advise you to iterate over single documents as they
            will need the least amount of memory. Even though this is slower,
            you are not likely to run into memory errors.
        Multiple Documents:
            There is an option to extract keywords for multiple documents
            that is faster than extraction for multiple single documents.
            However...this method assumes that you can keep the word embeddings
            for all words in the vocabulary in memory which might be troublesome.
            I would advise against using this option and simply iterating
            over documents instead if you have limited hardware.
        Arguments:
            docs: The document(s) for which to extract keywords/keyphrases
            keyphrase_ngram_range: Length, in words, of the extracted keywords/keyphrases
            stop_words: Stopwords to remove from the document
            top_n: Return the top n keywords/keyphrases
            min_df: Minimum document frequency of a word across all documents
                    if keywords for multiple documents need to be extracted
            use_maxsum: Whether to use Max Sum Similarity for the selection
                        of keywords/keyphrases
            use_mmr: Whether to use Maximal Marginal Relevance (MMR) for the
                     selection of keywords/keyphrases
            diversity: The diversity of the results between 0 and 1 if use_mmr
                       is set to True
            nr_candidates: The number of candidates to consider if use_maxsum is
                           set to True
            vectorizer: Pass in your own CountVectorizer from scikit-learn
        Returns:
            keywords: the top n keywords for a document with their respective distances
                      to the input document
        """

        if isinstance(docs, str):
            return self._extract_keywords_single_doc(docs,
                                                     keyphrase_ngram_range,
                                                     stop_words,
                                                     top_n,
                                                     use_maxsum,
                                                     use_mmr,
                                                     diversity,
                                                     nr_candidates,
                                                     vectorizer)
        elif isinstance(docs, list):
            warnings.warn("Although extracting keywords for multiple documents is faster "
                          "than iterating over single documents, it requires significantly more memory "
                          "to hold all word embeddings. Use this at your own discretion!")
            return self._extract_keywords_multiple_docs(docs,
                                                        keyphrase_ngram_range,
                                                        stop_words,
                                                        top_n,
                                                        min_df,
                                                        vectorizer)

    def _extract_keywords_single_doc(self,
                                     doc: str,
                                     keyphrase_ngram_range: Tuple[int, int] = (1, 1),
                                     stop_words: Union[str, List[str]] = 'english',
                                     top_n: int = 5,
                                     use_maxsum: bool = False,
                                     use_mmr: bool = False,
                                     diversity: float = 0.5,
                                     nr_candidates: int = 20,
                                     vectorizer: CountVectorizer = None) -> List[Tuple[str, float]]:
        """ Extract keywords/keyphrases for a single document
        Arguments:
            doc: The document for which to extract keywords/keyphrases
            keyphrase_ngram_range: Length, in words, of the extracted keywords/keyphrases
            stop_words: Stopwords to remove from the document
            top_n: Return the top n keywords/keyphrases
            use_mmr: Whether to use Max Sum Similarity
            use_mmr: Whether to use MMR
            diversity: The diversity of results between 0 and 1 if use_mmr is True
            nr_candidates: The number of candidates to consider if use_maxsum is set to True
            vectorizer: Pass in your own CountVectorizer from scikit-learn
        Returns:
            keywords: the top n keywords for a document with their respective distances
                      to the input document
        """
        try:
            # Extract Words
            if vectorizer:
                count = vectorizer.fit([doc])
            else:
                count = CountVectorizer(ngram_range=keyphrase_ngram_range, stop_words=stop_words).fit([doc])
            words = count.get_feature_names()

            # Extract Embeddings
            doc_embedding = self._extract_embeddings([doc])
            word_embeddings = self._extract_embeddings(words)
            # doc_embedding = self.model.encode([doc])
            # word_embeddings = self.model.encode(words)

            # Calculate distances and extract keywords
            if use_mmr:
                keywords = mmr(doc_embedding, word_embeddings, words, top_n, diversity)
            elif use_maxsum:
                keywords = max_sum_similarity(doc_embedding, word_embeddings, words, top_n, nr_candidates)
            else:
                distances = cosine_similarity(doc_embedding, word_embeddings)
                keywords = [(words[index], round(float(distances[0][index]), 4))
                            for index in distances.argsort()[0][-top_n:]][::-1]

            return keywords
        except ValueError:
            return []

    def _extract_keywords_multiple_docs(self,
                                        docs: List[str],
                                        keyphrase_ngram_range: Tuple[int, int] = (1, 1),
                                        stop_words: str = 'english',
                                        top_n: int = 5,
                                        min_df: int = 1,
                                        vectorizer: CountVectorizer = None) -> List[List[Tuple[str, float]]]:
        """ Extract keywords/keyphrases for a multiple documents
        This currently does not use MMR as
        Arguments:
            docs: The document for which to extract keywords/keyphrases
            keyphrase_ngram_range: Length, in words, of the extracted keywords/keyphrases
            stop_words: Stopwords to remove from the document
            top_n: Return the top n keywords/keyphrases
            min_df: The minimum frequency of words
            vectorizer: Pass in your own CountVectorizer from scikit-learn
        Returns:
            keywords: the top n keywords for a document with their respective distances
                      to the input document
        """
        # Extract words
        if vectorizer:
            count = vectorizer.fit(docs)
        else:
            count = CountVectorizer(ngram_range=keyphrase_ngram_range, stop_words=stop_words, min_df=min_df).fit(docs)
        words = count.get_feature_names()
        df = count.transform(docs)

        # Extract embeddings
        word_embeddings = self._extract_embeddings(words)
        doc_embeddings = self._extract_embeddings(docs)
        # word_embeddings = self.model.encode(words, show_progress_bar=True)
        # doc_embeddings = self.model.encode(docs, show_progress_bar=True)

        # Extract keywords
        keywords = []
        for index, doc in tqdm(enumerate(docs)):
            doc_words = [words[i] for i in df[index].nonzero()[1]]

            if doc_words:
                doc_word_embeddings = np.array([word_embeddings[i] for i in df[index].nonzero()[1]])
                distances = cosine_similarity([doc_embeddings[index]], doc_word_embeddings)[0]
                doc_keywords = [(doc_words[i], round(float(distances[i]), 4)) for i in distances.argsort()[-top_n:]]
                keywords.append(doc_keywords)
            else:
                keywords.append(["None Found"])

        return keywords

    def _extract_embeddings(self, documents: Union[List[str], str]) -> np.ndarray:
        """ Extract sentence/document embeddings through pre-trained embeddings
        For an overview of pre-trained models: https://www.sbert.net/docs/pretrained_models.html
        Arguments:
            documents: Dataframe with documents and their corresponding IDs
        Returns:
            embeddings: The extracted embeddings using the sentence transformer
                        module. Typically uses pre-trained huggingface models.
        """
        if isinstance(documents, str):
            documents = [documents]

        # Infer embeddings with SentenceTransformer
        if isinstance(self.model, SentenceTransformer):
            embeddings = self.model.encode(documents)

        # Infer embeddings with Flair
        elif isinstance(self.model, DocumentEmbeddings):
            embeddings = []
            for index, document in enumerate(documents):
                try:
                    sentence = Sentence(document) if document else Sentence("an empty document")
                    self.model.embed(sentence)
                except RuntimeError:
                    sentence = Sentence("an empty document")
                    self.model.embed(sentence)
                embedding = sentence.embedding.detach().cpu().numpy()
                embeddings.append(embedding)
            embeddings = np.asarray(embeddings)

        else:
            raise ValueError("An incorrect embedding model type was selected.")

        return embeddings

    def _select_embedding_model(self, model: Union[str,
                                                   SentenceTransformer,
                                                   DocumentEmbeddings,
                                                   TokenEmbeddings]) -> Union[SentenceTransformer,
                                                                              DocumentEmbeddings]:
        """ Select an embedding model based on language or a specific sentence transformer models.
        When selecting a language, we choose distilbert-base-nli-stsb-mean-tokens for English and
        xlm-r-bert-base-nli-stsb-mean-tokens for all other languages as it support 100+ languages.
        Arguments:
            model: Use a custom embedding model. You can pass in a string related
                   to one of the following models:
                   https://www.sbert.net/docs/pretrained_models.html
                   You can also pass in a SentenceTransformer() model or a Flair
                   DocumentEmbedding model.
        Returns:
            model: Either a Sentence-Transformer or Flair model
        """

        # Sentence Transformer embeddings
        if isinstance(model, SentenceTransformer):
            return model

        # Flair word embeddings
        elif _HAS_FLAIR and isinstance(model, TokenEmbeddings):
            return DocumentPoolEmbeddings([model])

        # Flair document embeddings + disable fine tune to prevent CUDA OOM
        # https://github.com/flairNLP/flair/issues/1719
        elif _HAS_FLAIR and isinstance(model, DocumentEmbeddings):
            if "fine_tune" in model.__dict__:
                model.fine_tune = False
            return model

        # Select embedding model based on specific sentence transformer model
        elif isinstance(model, str):
            return SentenceTransformer(model)

        return SentenceTransformer("xlm-r-bert-base-nli-stsb-mean-tokens")

In [8]:
doc = """
         Supervised learning is the machine learning task of learning a function that
         maps an input to an output based on example input-output pairs.[1] It infers a
         function from labeled training data consisting of a set of training examples.[2]
         In supervised learning, each example is a pair consisting of an input object
         (typically a vector) and a desired output value (also called the supervisory signal). 
         A supervised learning algorithm analyzes the training data and produces an inferred function, 
         which can be used for mapping new examples. An optimal scenario will allow for the 
         algorithm to correctly determine the class labels for unseen instances. This requires 
         the learning algorithm to generalize from the training data to unseen situations in a 
         'reasonable' way (see inductive bias).
      """
model = KeyBERT('distilbert-base-nli-mean-tokens')
keywords = model.extract_keywords(doc)

In [9]:
model.extract_keywords(doc, stop_words=None)

[('learning', 0.4762),
 ('training', 0.4679),
 ('algorithm', 0.4562),
 ('class', 0.4259),
 ('mapping', 0.3733)]

In [10]:
model.extract_keywords(doc, keyphrase_ngram_range=(1, 2), stop_words=None)

[('learning algorithm', 0.7061),
 ('machine learning', 0.6435),
 ('supervised learning', 0.6028),
 ('learning function', 0.5982),
 ('algorithm analyzes', 0.59)]

# Max Sum Similarity

In [11]:
model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english', use_maxsum=True, nr_candidates=20, top_n=5)

[('set training examples', 0.7632),
 ('generalize training data', 0.7825),
 ('requires learning algorithm', 0.2892),
 ('supervised learning algorithm', 0.3836),
 ('learning machine learning', 0.3873)]

# Maximal Marginal Relevance

In [12]:
model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english', use_mmr=True, diversity=0.7) # Diversity high 

[('algorithm generalize training', 0.7825),
 ('labels unseen instances', 0.1559),
 ('new examples optimal', 0.4188),
 ('determine class labels', 0.4855),
 ('supervised learning algorithm', 0.7513)]

In [13]:
model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english', use_mmr=True, diversity=0.2) # Diversity low

[('algorithm generalize training', 0.7825),
 ('learning machine learning', 0.7717),
 ('learning algorithm analyzes', 0.767),
 ('supervised learning algorithm', 0.7513),
 ('algorithm analyzes training', 0.7632)]

# Embedding Models 

In [14]:
from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer("distilbert-base-nli-mean-tokens", device="cpu")
model = KeyBERT(model=sentence_model)
model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english', use_mmr=True, diversity=0.7)

[('algorithm generalize training', 0.7825),
 ('labels unseen instances', 0.1559),
 ('new examples optimal', 0.4188),
 ('determine class labels', 0.4855),
 ('supervised learning algorithm', 0.7513)]

In [15]:
from flair.embeddings import TransformerDocumentEmbeddings

roberta = TransformerDocumentEmbeddings('roberta-base')
model = KeyBERT(model=roberta)
model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english', use_mmr=True, diversity=0.7)

Downloading: 100%|██████████| 481/481 [00:00<00:00, 133kB/s]
Downloading: 100%|██████████| 899k/899k [00:00<00:00, 1.51MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 922kB/s]
Downloading: 100%|██████████| 501M/501M [00:49<00:00, 10.1MB/s]


[('supervised learning algorithm', 0.9317),
 ('used mapping new', 0.9279),
 ('output pairs infers', 0.9302),
 ('examples optimal scenario', 0.9308),
 ('labeled training data', 0.9315)]