In [89]:
# Keyword extraction with BERT sentence embeddings, adapted from
# https://towardsdatascience.com/keyword-extraction-with-bert-724efca412ea
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# The single headline to summarise (surrounding whitespace is part of the literal).
doc = """
         Star Wars Fatigue Shouldn't Stop You From Watching 'Andor' - CNET
      """

# Candidate phrases: every 1- to 5-token n-gram the vectorizer finds in the
# headline, with scikit-learn's built-in English stop-word list applied.
n_gram_range = (1, 5)
stop_words = "english"
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words)
count.fit([doc])
candidates = count.get_feature_names_out()

# Embed the headline and every candidate phrase with the same encoder.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(candidates)

# Despite its name, `distances` holds cosine *similarities*; argsort is ascending,
# so the last `top_n` indices are the closest candidates, best one last.
top_n = 3
distances = cosine_similarity(doc_embedding, candidate_embeddings)
keywords = list(candidates[distances.argsort()[0][-top_n:]])

keywords

['star wars fatigue',
 'star wars fatigue shouldn',
 'star wars fatigue shouldn stop']

In [90]:
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def get_keywords(docs, n_gram_range=(3, 3), top_n=5, num_keywords=5,
                 model_name='distilbert-base-nli-mean-tokens'):
    """Return the candidate phrases most often nominated as keywords across ``docs``.

    Candidate phrases are the n-grams a ``CountVectorizer`` (English stop words)
    extracts from *all* documents together. Each document then nominates the
    ``top_n`` candidates whose sentence embedding has the highest cosine
    similarity to the document's own embedding, and the candidates nominated
    most often win.

    Args:
        docs: Iterable of document strings (e.g. headlines).
        n_gram_range: ``(min_n, max_n)`` n-gram sizes passed to ``CountVectorizer``.
        top_n: Number of candidates each document nominates.
        num_keywords: Maximum number of keywords returned.
        model_name: Name passed to ``SentenceTransformer`` to load the encoder.

    Returns:
        A set of at most ``num_keywords`` phrases. Being a set, it carries no
        ranking. An empty ``docs`` yields an empty set.

    Raises:
        ValueError: If ``top_n`` or ``num_keywords`` is smaller than 1. Errors
            raised by ``CountVectorizer.fit`` (for example when no candidate
            phrase can be extracted) propagate unchanged.
    """
    # A non-positive top_n would make the `[-top_n:]` slice below select
    # *every* candidate (x[-0:] == x[0:]), so reject it up front.
    if top_n < 1 or num_keywords < 1:
        raise ValueError("top_n and num_keywords must be positive integers")

    # docs is consumed twice (fit + loop), so materialise one-shot iterables.
    # A bare string is left alone so the vectorizer can reject it as before.
    if not isinstance(docs, str):
        docs = list(docs)
    if len(docs) == 0:
        return set()

    # Everything below up to the loop is independent of the individual
    # document, so it is done once instead of once per document.
    stop_words = "english"
    count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit(docs)
    candidates = count.get_feature_names_out()

    model = SentenceTransformer(model_name)
    candidate_embeddings = model.encode(candidates)

    # phrase -> number of documents that nominated it. Dict insertion order
    # records first nomination, which the stable sort below uses to break ties.
    nominations = {}
    for doc in docs:
        doc_embedding = model.encode([doc])
        similarities = cosine_similarity(doc_embedding, candidate_embeddings)
        # argsort is ascending: the last top_n indices are the closest candidates.
        for index in similarities.argsort()[0][-top_n:]:
            keyword = candidates[index]
            nominations[keyword] = nominations.get(keyword, 0) + 1

    ranked = sorted(nominations, key=nominations.get, reverse=True)
    return set(ranked[:num_keywords])

In [91]:
# Results for Star Wars 09/11/2022 -- one headline per list entry.
docs = [
    "Count Dooku Voice Actor Corey Burton Tried Something New on Star Wars: Tales of the Jedi",
    "'Andor' Episode 8 Explained: 'Rogue One' Cameos and a 'Star Wars' Hell Prison - CNET",
    "Andor Gave Us the Gayest Screen Fade in Star Wars History",
    "Ahsoka Tano Herself, Ashley Eckstein, Breaks Down Star Wars: Tales of the Jedi",
    "Star Wars: The Deckbuilding Game could rule the galaxy of 2-player card games",
    "There's a new Star Wars project from Damon Lindelof in the works",
    "Star Wars Fatigue Shouldn't Stop You From Watching 'Andor' - CNET",
    "'Andor' is the best 'Star Wars' show since 'The Mandalorian' — but the least popular",
    "Star Wars characters take over Mexico City - Reuters",
]

# Print the keyword set that get_keywords derives from all headlines.
print(get_keywords(docs))

{'star wars hell', 'breaks star wars', 'new star wars', 'wars tales jedi', 'star wars mandalorian'}
