In [None]:
# One-time setup (uncomment to run; %pip targets the kernel's environment): %pip install -U sentence-transformers keybert

In [6]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

# Sample passage (a description of supervised learning) that serves as the
# input document for the keyword-extraction calls in this notebook.
doc = """
         Supervised learning is the machine learning task of learning a function that
         maps an input to an output based on example input-output pairs. It infers a
         function from labeled training data consisting of a set of training examples.
         In supervised learning, each example is a pair consisting of an input object
         (typically a vector) and a desired output value (also called the supervisory signal).
         A supervised learning algorithm analyzes the training data and produces an inferred function,
         which can be used for mapping new examples. An optimal scenario will allow for the
         algorithm to correctly determine the class labels for unseen instances. This requires
         the learning algorithm to generalize from the training data to unseen situations in a
         'reasonable' way (see inductive bias).
      """

# Load the "all-mpnet-base-v2" SentenceTransformer and pass it to KeyBERT as its model.
sentence_model = SentenceTransformer("all-mpnet-base-v2")
kw_model = KeyBERT(model=sentence_model)
# Baseline extraction with default arguments; the result is stored in
# `keywords` and is not displayed by this cell.
keywords = kw_model.extract_keywords(doc)

## Max sum distance vs Maximal Marginal Relevance

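To diversify the results, first score each candidate keyword by the cosine similarity between its SBERT embedding and the document embedding, then apply a diversification filter (Max Sum Distance or Maximal Marginal Relevance) to the scored candidates.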

### Max sum distance

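* The max sum distance is a measure of how far apart the selected keywords are from each other in embedding space (distance = 1 − cosine similarity).
* **The higher the max sum distance, the more the selected keywords are spread out.**
* A keyword set with a high max sum distance is less redundant, so it covers more of the distinct information contained in the document. In KeyBERT, the `nr_candidates` candidates most similar to the document are considered, and the `top_n` of them that are least similar to each other are returned.
* For a set of $n$ selected keywords $w_1, \dots, w_n$, the max sum distance is calculated as follows: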

$$\text{max sum distance} = \sum_{i=1}^{n} \sum_{j=1}^{n} \text{distance}(w_i, w_j)$$

### Maximal Marginal Relevance

* MMR tries to minimize redundancy and maximize the diversity of results in text summarization tasks.
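* A keyword extraction algorithm called EmbedRank implements a version of MMR that can be used to diversify keywords/keyphrases.
* The MMR score used to choose each next keyword is as follows: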

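$$\text{MMR}(w) = \lambda \, \text{score}(w) - (1 - \lambda) \max_{w' \in W} \text{similarity}(w, w')$$

Here $W$ is the set of keywords already selected, and the candidate $w$ with the highest MMR value is added next. The redundancy term is subtracted, so candidates that are similar to already-selected keywords are penalized. In KeyBERT, the `diversity` argument plays the role of $1 - \lambda$.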

In [10]:
# Max Sum Distance: consider the 20 candidates closest to the document and
# return the 5 of them that are least similar to one another.
kw_model.extract_keywords(
    doc,
    keyphrase_ngram_range=(2, 3),  # keyphrases of two to three words
    stop_words='english',
    use_maxsum=True,
    nr_candidates=20,
    top_n=5,
)

[('learning example', 0.5468),
 ('signal supervised', 0.5562),
 ('learning function', 0.5619),
 ('machine learning task', 0.5758),
 ('labeled training data', 0.5767)]

In [16]:
# Maximal Marginal Relevance: two-word keyphrases with diversity set to 0.5.
# highlight=True additionally prints the document with its keywords highlighted.
kw_model.extract_keywords(
    doc,
    keyphrase_ngram_range=(2, 2),
    stop_words='english',
    use_mmr=True,
    diversity=0.5,
    highlight=True,
)

[('supervised learning', 0.6538),
 ('learning function', 0.5619),
 ('function labeled', 0.384),
 ('supervisory signal', 0.3386),
 ('input object', 0.2421)]

In [14]:
from sentence_transformers import SentenceTransformer
# Load the 'all-mpnet-base-v2' SentenceTransformer checkpoint for a
# stand-alone embedding demo.
model = SentenceTransformer('all-mpnet-base-v2')

# Sentences to encode (kept in a list so that more can be added).
sentences = ['This framework generates embeddings for each input sentence']

# model.encode() takes the list of sentences and returns their embeddings.
embeddings = model.encode(sentences)

# Print each sentence next to its embedding, followed by a blank line.
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: This framework generates embeddings for each input sentence
Embedding: [ 6.41698064e-03  7.04135746e-03 -2.81442106e-02  5.12470901e-02
 -8.93961638e-03  2.12669447e-02  2.30778884e-02 -1.44860223e-02
 -5.55316685e-03 -2.49297079e-02  4.53493297e-02  2.48958822e-02
 -3.07578966e-02  5.66224083e-02  6.32021800e-02 -5.62527888e-02
  5.16509824e-02  5.78279095e-03 -2.62116212e-02  1.31874217e-03
  1.99272223e-02 -1.30594836e-03 -2.28708331e-03  4.72541526e-02
 -3.72494906e-02 -2.85245217e-02 -4.10240963e-02 -1.57976002e-02
  3.17328074e-03 -8.74146994e-04 -2.96459924e-02  3.21501493e-02
  3.51344012e-02  1.09738093e-02  9.16706938e-07 -1.18587702e-03
 -2.53640637e-02 -7.92881101e-03 -5.09481831e-03  7.40649505e-03
  2.80068330e-02  1.06995171e-02  1.07513331e-02  2.76827589e-02
 -5.19132540e-02 -4.98179495e-02  5.34075089e-02  5.79067133e-02
  7.86073431e-02  7.73014352e-02 -1.01112248e-02 -6.35446236e-02
 -1.71579625e-02 -6.77371165e-03 -2.45815422e-03  2.61346288e-02
 -5.38507