# KeyBert

In [3]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple


def mmr(doc_embedding: np.ndarray,
        word_embeddings: np.ndarray,
        words: List[str],
        top_n: int = 5,
        diversity: float = 0.8) -> List[Tuple[str, float]]:
    """ Calculate Maximal Marginal Relevance (MMR)
    between candidate keywords and the document.
    MMR considers the similarity of keywords/keyphrases with the
    document, along with the similarity of already selected
    keywords and keyphrases. This results in a selection of keywords
    that maximize their within diversity with respect to the document.
    Arguments:
        doc_embedding: The document embeddings
        word_embeddings: The embeddings of the selected candidate keywords/phrases
        words: The selected candidate keywords/keyphrases
        top_n: The number of keywords/keyhprases to return
        diversity: How diverse the select keywords/keyphrases are.
                   Values between 0 and 1 with 0 being not diverse at all
                   and 1 being most diverse.
    Returns:
         List[Tuple[str, float]]: The selected keywords/keyphrases with their distances
    """

    # Extract similarity within words, and between words and the document
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Initialize candidates and already choose best keyword/keyphras
    keywords_idx = [np.argmax(word_doc_similarity)]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(top_n - 1):
        # Extract similarities within candidates and
        # between candidates and selected keywords/phrases
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate MMR
        mmr = (1-diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr)]

        # Update keywords & candidates
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [(words[idx], round(float(word_doc_similarity.reshape(1, -1)[0][idx]), 4)) for idx in keywords_idx]


In [2]:
import numpy as np
import itertools
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple


def max_sum_similarity(doc_embedding: np.ndarray,
                       word_embeddings: np.ndarray,
                       words: List[str],
                       top_n: int,
                       nr_candidates: int) -> List[Tuple[str, float]]:
    """ Calculate Max Sum Distance for extraction of keywords
    We take the 2 x top_n most similar words/phrases to the document.
    Then, we take all top_n combinations from the 2 x top_n words and
    extract the combination that are the least similar to each other
    by cosine similarity.
    NOTE:
        This is O(n^2) and therefore not advised if you use a large top_n
    Arguments:
        doc_embedding: The document embeddings
        word_embeddings: The embeddings of the selected candidate keywords/phrases
        words: The selected candidate keywords/keyphrases
        top_n: The number of keywords/keyhprases to return
        nr_candidates: The number of candidates to consider
    Returns:
         List[Tuple[str, float]]: The selected keywords/keyphrases with their distances
    """
    if nr_candidates < top_n:
        raise Exception("Make sure that the number of candidates exceeds the number "
                        "of keywords to return.")

    # Calculate distances and extract keywords
    distances = cosine_similarity(doc_embedding, word_embeddings)
    distances_words = cosine_similarity(word_embeddings, word_embeddings)

    # Get 2*top_n words as candidates based on cosine similarity
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    words_vals = [words[index] for index in words_idx]
    candidates = distances_words[np.ix_(words_idx, words_idx)]

    # Calculate the combination of words that are the least similar to each other
    min_sim = 100_000
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum([candidates[i][j] for i in combination for j in combination if i != j])
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [(words_vals[idx], round(float(distances[0][idx]), 4)) for idx in candidate]

In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import numpy as np
from tqdm import tqdm
from typing import List, Union, Tuple
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Flair
try:
    from flair.embeddings import DocumentEmbeddings, TokenEmbeddings, DocumentPoolEmbeddings
    from flair.data import Sentence
    _HAS_FLAIR = True
except ModuleNotFoundError as e:
    DocumentEmbeddings, TokenEmbeddings, DocumentPoolEmbeddings = None, None, None
    _HAS_FLAIR = False


class KeyBERT:
    """
    A minimal method for keyword extraction with BERT
    The keyword extraction is done by finding the sub-phrases in
    a document that are the most similar to the document itself.
    First, document embeddings are extracted with BERT to get a
    document-level representation. Then, word embeddings are extracted
    for N-gram words/phrases. Finally, we use cosine similarity to find the
    words/phrases that are the most similar to the document.
    The most similar words could then be identified as the words that
    best describe the entire document.
    """
    def __init__(self,
                 model: Union[str,
                              SentenceTransformer,
                              DocumentEmbeddings,
                              TokenEmbeddings] = 'distilbert-base-nli-mean-tokens'):
        """ KeyBERT initialization
        Arguments:
            model: Use a custom embedding model. You can pass in a string related
                   to one of the following models:
                   https://www.sbert.net/docs/pretrained_models.html
                   You can also pass in a SentenceTransformer() model or a Flair
                   DocumentEmbedding model.
        """
        self.model = self._select_embedding_model(model)

    def extract_keywords(self,
                         docs: Union[str, List[str]],
                         keyphrase_ngram_range: Tuple[int, int] = (1, 1),
                         stop_words: Union[str, List[str]] = 'english',
                         top_n: int = 5,
                         min_df: int = 1,
                         use_maxsum: bool = False,
                         use_mmr: bool = False,
                         diversity: float = 0.5,
                         nr_candidates: int = 20,
                         vectorizer: CountVectorizer = None) -> Union[List[Tuple[str, float]],
                                                                      List[List[Tuple[str, float]]]]:
        """ Extract keywords/keyphrases
        NOTE:
            I would advise you to iterate over single documents as they
            will need the least amount of memory. Even though this is slower,
            you are not likely to run into memory errors.
        Multiple Documents:
            There is an option to extract keywords for multiple documents
            that is faster than extraction for multiple single documents.
            However...this method assumes that you can keep the word embeddings
            for all words in the vocabulary in memory which might be troublesome.
            I would advise against using this option and simply iterating
            over documents instead if you have limited hardware.
        Arguments:
            docs: The document(s) for which to extract keywords/keyphrases
            keyphrase_ngram_range: Length, in words, of the extracted keywords/keyphrases
            stop_words: Stopwords to remove from the document
            top_n: Return the top n keywords/keyphrases
            min_df: Minimum document frequency of a word across all documents
                    if keywords for multiple documents need to be extracted
            use_maxsum: Whether to use Max Sum Similarity for the selection
                        of keywords/keyphrases
            use_mmr: Whether to use Maximal Marginal Relevance (MMR) for the
                     selection of keywords/keyphrases
            diversity: The diversity of the results between 0 and 1 if use_mmr
                       is set to True
            nr_candidates: The number of candidates to consider if use_maxsum is
                           set to True
            vectorizer: Pass in your own CountVectorizer from scikit-learn
        Returns:
            keywords: the top n keywords for a document with their respective distances
                      to the input document
        """

        if isinstance(docs, str):
            return self._extract_keywords_single_doc(docs,
                                                     keyphrase_ngram_range,
                                                     stop_words,
                                                     top_n,
                                                     use_maxsum,
                                                     use_mmr,
                                                     diversity,
                                                     nr_candidates,
                                                     vectorizer)
        elif isinstance(docs, list):
            warnings.warn("Although extracting keywords for multiple documents is faster "
                          "than iterating over single documents, it requires significantly more memory "
                          "to hold all word embeddings. Use this at your own discretion!")
            return self._extract_keywords_multiple_docs(docs,
                                                        keyphrase_ngram_range,
                                                        stop_words,
                                                        top_n,
                                                        min_df,
                                                        vectorizer)

    def _extract_keywords_single_doc(self,
                                     doc: str,
                                     keyphrase_ngram_range: Tuple[int, int] = (1, 1),
                                     stop_words: Union[str, List[str]] = 'english',
                                     top_n: int = 5,
                                     use_maxsum: bool = False,
                                     use_mmr: bool = False,
                                     diversity: float = 0.5,
                                     nr_candidates: int = 20,
                                     vectorizer: CountVectorizer = None) -> List[Tuple[str, float]]:
        """ Extract keywords/keyphrases for a single document
        Arguments:
            doc: The document for which to extract keywords/keyphrases
            keyphrase_ngram_range: Length, in words, of the extracted keywords/keyphrases
            stop_words: Stopwords to remove from the document
            top_n: Return the top n keywords/keyphrases
            use_mmr: Whether to use Max Sum Similarity
            use_mmr: Whether to use MMR
            diversity: The diversity of results between 0 and 1 if use_mmr is True
            nr_candidates: The number of candidates to consider if use_maxsum is set to True
            vectorizer: Pass in your own CountVectorizer from scikit-learn
        Returns:
            keywords: the top n keywords for a document with their respective distances
                      to the input document
        """
        try:
            # Extract Words
            if vectorizer:
                count = vectorizer.fit([doc])
            else:
                count = CountVectorizer(ngram_range=keyphrase_ngram_range, stop_words=stop_words).fit([doc])
            words = count.get_feature_names()

            # Extract Embeddings
            doc_embedding = self._extract_embeddings([doc])
            word_embeddings = self._extract_embeddings(words)
            # doc_embedding = self.model.encode([doc])
            # word_embeddings = self.model.encode(words)

            # Calculate distances and extract keywords
            if use_mmr:
                keywords = mmr(doc_embedding, word_embeddings, words, top_n, diversity)
            elif use_maxsum:
                keywords = max_sum_similarity(doc_embedding, word_embeddings, words, top_n, nr_candidates)
            else:
                distances = cosine_similarity(doc_embedding, word_embeddings)
                keywords = [(words[index], round(float(distances[0][index]), 4))
                            for index in distances.argsort()[0][-top_n:]][::-1]

            return keywords
        except ValueError:
            return []

    def _extract_keywords_multiple_docs(self,
                                        docs: List[str],
                                        keyphrase_ngram_range: Tuple[int, int] = (1, 1),
                                        stop_words: str = 'english',
                                        top_n: int = 5,
                                        min_df: int = 1,
                                        vectorizer: CountVectorizer = None) -> List[List[Tuple[str, float]]]:
        """ Extract keywords/keyphrases for a multiple documents
        This currently does not use MMR as
        Arguments:
            docs: The document for which to extract keywords/keyphrases
            keyphrase_ngram_range: Length, in words, of the extracted keywords/keyphrases
            stop_words: Stopwords to remove from the document
            top_n: Return the top n keywords/keyphrases
            min_df: The minimum frequency of words
            vectorizer: Pass in your own CountVectorizer from scikit-learn
        Returns:
            keywords: the top n keywords for a document with their respective distances
                      to the input document
        """
        # Extract words
        if vectorizer:
            count = vectorizer.fit(docs)
        else:
            count = CountVectorizer(ngram_range=keyphrase_ngram_range, stop_words=stop_words, min_df=min_df).fit(docs)
        words = count.get_feature_names()
        df = count.transform(docs)

        # Extract embeddings
        word_embeddings = self._extract_embeddings(words)
        doc_embeddings = self._extract_embeddings(docs)
        # word_embeddings = self.model.encode(words, show_progress_bar=True)
        # doc_embeddings = self.model.encode(docs, show_progress_bar=True)

        # Extract keywords
        keywords = []
        for index, doc in tqdm(enumerate(docs)):
            doc_words = [words[i] for i in df[index].nonzero()[1]]

            if doc_words:
                doc_word_embeddings = np.array([word_embeddings[i] for i in df[index].nonzero()[1]])
                distances = cosine_similarity([doc_embeddings[index]], doc_word_embeddings)[0]
                doc_keywords = [(doc_words[i], round(float(distances[i]), 4)) for i in distances.argsort()[-top_n:]]
                keywords.append(doc_keywords)
            else:
                keywords.append(["None Found"])

        return keywords

    def _extract_embeddings(self, documents: Union[List[str], str]) -> np.ndarray:
        """ Extract sentence/document embeddings through pre-trained embeddings
        For an overview of pre-trained models: https://www.sbert.net/docs/pretrained_models.html
        Arguments:
            documents: Dataframe with documents and their corresponding IDs
        Returns:
            embeddings: The extracted embeddings using the sentence transformer
                        module. Typically uses pre-trained huggingface models.
        """
        if isinstance(documents, str):
            documents = [documents]

        # Infer embeddings with SentenceTransformer
        if isinstance(self.model, SentenceTransformer):
            embeddings = self.model.encode(documents)

        # Infer embeddings with Flair
        elif isinstance(self.model, DocumentEmbeddings):
            embeddings = []
            for index, document in enumerate(documents):
                try:
                    sentence = Sentence(document) if document else Sentence("an empty document")
                    self.model.embed(sentence)
                except RuntimeError:
                    sentence = Sentence("an empty document")
                    self.model.embed(sentence)
                embedding = sentence.embedding.detach().cpu().numpy()
                embeddings.append(embedding)
            embeddings = np.asarray(embeddings)

        else:
            raise ValueError("An incorrect embedding model type was selected.")

        return embeddings

    def _select_embedding_model(self, model: Union[str,
                                                   SentenceTransformer,
                                                   DocumentEmbeddings,
                                                   TokenEmbeddings]) -> Union[SentenceTransformer,
                                                                              DocumentEmbeddings]:
        """ Select an embedding model based on language or a specific sentence transformer models.
        When selecting a language, we choose distilbert-base-nli-stsb-mean-tokens for English and
        xlm-r-bert-base-nli-stsb-mean-tokens for all other languages as it support 100+ languages.
        Arguments:
            model: Use a custom embedding model. You can pass in a string related
                   to one of the following models:
                   https://www.sbert.net/docs/pretrained_models.html
                   You can also pass in a SentenceTransformer() model or a Flair
                   DocumentEmbedding model.
        Returns:
            model: Either a Sentence-Transformer or Flair model
        """

        # Sentence Transformer embeddings
        if isinstance(model, SentenceTransformer):
            return model

        # Flair word embeddings
        elif _HAS_FLAIR and isinstance(model, TokenEmbeddings):
            return DocumentPoolEmbeddings([model])

        # Flair document embeddings + disable fine tune to prevent CUDA OOM
        # https://github.com/flairNLP/flair/issues/1719
        elif _HAS_FLAIR and isinstance(model, DocumentEmbeddings):
            if "fine_tune" in model.__dict__:
                model.fine_tune = False
            return model

        # Select embedding model based on specific sentence transformer model
        elif isinstance(model, str):
            return SentenceTransformer(model)

        return SentenceTransformer("xlm-r-bert-base-nli-stsb-mean-tokens")

# Yake keyword extraction

In [4]:
import numpy as np

class Levenshtein(object):

    @staticmethod
    def __ratio(distance, str_length):
        return 1 - float(distance) / float(str_length)

    @staticmethod
    def ratio(seq1, seq2):
        str_distance = Levenshtein.distance(seq1,seq2)
        str_length = max(len(seq1),len(seq2))
        return Levenshtein.__ratio(str_distance,str_length)

    @staticmethod
    def distance(seq1, seq2):  
        size_x = len(seq1) + 1
        size_y = len(seq2) + 1
        matrix = np.zeros ((size_x, size_y))
        for x in range(size_x):
            matrix [x, 0] = x
        for y in range(size_y):
            matrix [0, y] = y

        for x in range(1, size_x):
            for y in range(1, size_y):
                if seq1[x-1] == seq2[y-1]:
                    matrix [x,y] = min(
                        matrix[x-1, y] + 1,
                        matrix[x-1, y-1],
                        matrix[x, y-1] + 1
                    )
                else:
                    matrix [x,y] = min(
                        matrix[x-1,y] + 1,
                        matrix[x-1,y-1] + 1,
                        matrix[x,y-1] + 1
                    )
        return (matrix[size_x - 1, size_y - 1])

In [5]:
from segtok.segmenter import split_multi
from segtok.tokenizer import web_tokenizer, split_contractions

import networkx as nx
import numpy as np
import string
import os
import math
import jellyfish
import re

STOPWORD_WEIGHT = 'bi'

class DataCore(object):
    
    def __init__(self, text, stopword_set, windowsSize, n, tagsToDiscard = set(['u', 'd']), exclude = set(string.punctuation)):
        self.number_of_sentences = 0
        self.number_of_words = 0
        self.terms = {}
        self.candidates = {}
        self.sentences_obj = []
        self.sentences_str = []
        self.G = nx.DiGraph()
        self.exclude = exclude
        self.tagsToDiscard = tagsToDiscard
        self.freq_ns = {}
        for i in range(n):
            self.freq_ns[i+1] = 0.
        self.stopword_set = stopword_set
        self._build(text, windowsSize, n)

    def build_candidate(self, candidate_string):
        sentences_str = [w for w in split_contractions(web_tokenizer(candidate_string.lower())) if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
        candidate_terms = []
        for (i, word) in enumerate(sentences_str):
            tag = self.getTag(word, i)
            term_obj = self.getTerm(word, save_non_seen=False)
            if term_obj.tf == 0:
                term_obj = None
            candidate_terms.append( (tag, word, term_obj) )
        if len([cand for cand in candidate_terms if cand[2] != None]) == 0:
            invalid_virtual_cand = composed_word(None)
            return invalid_virtual_cand
        virtual_cand = composed_word(candidate_terms)
        return virtual_cand

    # Build the datacore features
    def _build(self, text, windowsSize, n):
        text = self.pre_filter(text)
        self.sentences_str = [ [w for w in split_contractions(web_tokenizer(s)) if not (w.startswith("'") and len(w) > 1) and len(w) > 0] for s in list(split_multi(text)) if len(s.strip()) > 0]
        self.number_of_sentences = len(self.sentences_str)
        pos_text = 0
        block_of_word_obj = []
        sentence_obj_aux = []
        for (sentence_id, sentence) in enumerate(self.sentences_str):
            sentence_obj_aux = []
            block_of_word_obj = []
            for (pos_sent, word) in enumerate(sentence):
                if len([c for c in word if c in self.exclude]) == len(word): # If the word is based on exclude chars
                    if len(block_of_word_obj) > 0:
                        sentence_obj_aux.append( block_of_word_obj )
                        block_of_word_obj = []
                else:
                    tag = self.getTag(word, pos_sent)
                    term_obj = self.getTerm(word)
                    term_obj.addOccur(tag, sentence_id, pos_sent, pos_text)
                    pos_text += 1

                    #Create co-occurrence matrix
                    if tag not in self.tagsToDiscard:
                        word_windows = list(range( max(0, len(block_of_word_obj)-windowsSize), len(block_of_word_obj) ))
                        for w in word_windows:
                            if block_of_word_obj[w][0] not in self.tagsToDiscard: 
                                self.addCooccur(block_of_word_obj[w][2], term_obj)
                    #Generate candidate keyphrase list
                    candidate = [ (tag, word, term_obj) ]
                    cand = composed_word(candidate)
                    self.addOrUpdateComposedWord(cand)
                    word_windows = list(range( max(0, len(block_of_word_obj)-(n-1)), len(block_of_word_obj) ))[::-1]
                    for w in word_windows:
                        candidate.append(block_of_word_obj[w])
                        self.freq_ns[len(candidate)] += 1.
                        cand = composed_word(candidate[::-1])
                        self.addOrUpdateComposedWord(cand)

                    # Add term to the block of words' buffer
                    block_of_word_obj.append( (tag, word, term_obj) )

            if len(block_of_word_obj) > 0:
                sentence_obj_aux.append( block_of_word_obj )

            if len(sentence_obj_aux) > 0:
                self.sentences_obj.append(sentence_obj_aux)

        if len(block_of_word_obj) > 0:
            sentence_obj_aux.append( block_of_word_obj )

        if len(sentence_obj_aux) > 0:
            self.sentences_obj.append(sentence_obj_aux)

        self.number_of_words = pos_text

    def build_single_terms_features(self, features=None):
        validTerms = [ term for term in self.terms.values() if not term.stopword ]
        validTFs = (np.array([ x.tf for x in validTerms ]))
        avgTF = validTFs.mean()
        stdTF = validTFs.std()
        maxTF = max([ x.tf for x in self.terms.values()])
        list(map(lambda x: x.updateH(maxTF=maxTF, avgTF=avgTF, stdTF=stdTF, number_of_sentences=self.number_of_sentences, features=features), self.terms.values()))

    def build_mult_terms_features(self, features=None):
        list(map(lambda x: x.updateH(features=features), [cand for cand in self.candidates.values() if cand.isValid()]))

    def pre_filter(self, text):
        prog = re.compile("^(\\s*([A-Z]))")
        parts = text.split('\n')
        buffer = ''
        for part in parts:
            sep = ' '
            if prog.match(part):
                sep = '\n\n'
            buffer += sep + part.replace('\t',' ')
        return buffer

    def getTag(self, word, i):
        try:
            w2 = word.replace(",","")
            float(w2)
            return "d"
        except:
            cdigit = len([c for c in word if c.isdigit()])
            calpha = len([c for c in word if c.isalpha()])
            if ( cdigit > 0 and calpha > 0 ) or (cdigit == 0 and calpha == 0) or len([c for c in word if c in self.exclude]) > 1:
                return "u"
            if len(word) == len([c for c in word if c.isupper()]):
                return "a"
            if len([c for c in word if c.isupper()]) == 1 and len(word) > 1 and word[0].isupper() and i > 0:
                return "n"
        return "p"

    def getTerm(self, str_word, save_non_seen=True):
        unique_term = str_word.lower()
        simples_sto = unique_term in self.stopword_set
        if unique_term.endswith('s') and len(unique_term) > 3:
            unique_term = unique_term[:-1]

        if unique_term in self.terms:
            return self.terms[unique_term]
                
        # Include this part
        simples_unique_term = unique_term
        for pontuation in self.exclude:
            simples_unique_term = simples_unique_term.replace(pontuation, '')
        # until here
        isstopword = simples_sto or unique_term in self.stopword_set or len(simples_unique_term) < 3
        
        term_id = len(self.terms)
        term_obj = single_word(unique_term, term_id, self.G)
        term_obj.stopword = isstopword

        if save_non_seen:
            self.G.add_node(term_id)
            self.terms[unique_term] = term_obj

        return term_obj

    def addCooccur(self, left_term, right_term):
        if right_term.id not in self.G[left_term.id]:
            self.G.add_edge(left_term.id, right_term.id, TF=0.)
        self.G[left_term.id][right_term.id]["TF"]+=1.
        
    def addOrUpdateComposedWord(self, cand):
        if cand.unique_kw not in self.candidates:
            self.candidates[cand.unique_kw] = cand
        else:
            self.candidates[cand.unique_kw].uptadeCand(cand)
        self.candidates[cand.unique_kw].tf += 1.


class composed_word(object):
    def __init__(self, terms): # [ (tag, word, term_obj) ]
        if terms == None:
             self.start_or_end_stopwords = True
             self.tags = set()
             return
        self.tags = set([''.join([ w[0] for w in terms ])])
        self.unique_kw = ' '.join( [ w[1].lower() for w in terms ] )
        self.size = len(terms)
        self.terms = [ w[2] for w in terms if w[2] != None ]
        self.tf = 0.
        self.integrity = 1.
        self.H = 1.
        self.start_or_end_stopwords = self.terms[0].stopword or self.terms[-1].stopword

    def uptadeCand(self, cand):
        for tag in cand.tags:
            self.tags.add( tag )

    def isValid(self):
        isValid = False
        for tag in self.tags:
            isValid = isValid or ( "u" not in tag and "d" not in tag )
        return isValid and not self.start_or_end_stopwords

    def get_composed_feature(self, feature_name, discart_stopword=True):
        list_of_features = [ getattr(term, feature_name) for term in self.terms if ( discart_stopword and not term.stopword ) or not discart_stopword ]
        sum_f  = sum(list_of_features)
        prod_f = np.prod(list_of_features)
        return ( sum_f, prod_f, prod_f /(sum_f + 1) )

    def build_features(self, doc_id=None, keys=None, rel=True, rel_approx=True, isVirtual=False, features=['WFreq', 'WRel', 'tf', 'WCase', 'WPos', 'WSpread'], _stopword=[True, False]):
        columns = []
        seen = set()
        features_cand = []

        if doc_id != None:
            columns.append('doc_id')
            features_cand.append(doc_id)

        if keys != None:
            if rel:
                columns.append('rel')
                if self.unique_kw in keys or isVirtual:
                    features_cand.append(1)
                    seen.add(self.unique_kw)
                else:
                    features_cand.append(0)

            if rel_approx:
                columns.append('rel_approx')
                max_gold_ = ('', 0.)
                for gold_key in keys:
                    dist = 1.-jellyfish.levenshtein_distance(gold_key, self.unique_kw ) / max(len(gold_key), len(self.unique_kw)) # _tL
                    if max_gold_[1] < dist:
                        max_gold_ = ( gold_key, dist )
                features_cand.append(max_gold_[1])

        columns.append('kw')
        features_cand.append(self.unique_kw)
        columns.append('h')
        features_cand.append(self.H)
        columns.append('tf')
        features_cand.append(self.tf)
        columns.append('size')
        features_cand.append(self.size)
        columns.append('isVirtual')
        features_cand.append(int(isVirtual))

        for feature_name in features:

            for discart_stopword in _stopword:
                (f_sum, f_prod, f_sum_prod) = self.get_composed_feature(feature_name, discart_stopword=discart_stopword)
                columns.append('%ss_sum_K%s' % ('n' if discart_stopword else '', feature_name) )
                features_cand.append(f_sum)

                columns.append('%ss_prod_K%s' % ('n' if discart_stopword else '', feature_name) )
                features_cand.append(f_prod)

                columns.append('%ss_sum_prod_K%s' % ('n' if discart_stopword else '', feature_name) )
                features_cand.append(f_sum_prod)

        return (features_cand, columns, seen)

    def updateH(self, features=None, isVirtual=False):
        sum_H  = 0.
        prod_H = 1.

        for (t, term_base) in enumerate(self.terms):
            if not term_base.stopword:
                sum_H += term_base.H
                prod_H *= term_base.H

            else:
                if STOPWORD_WEIGHT == 'bi':
                    prob_t1 = 0.
                    if term_base.G.has_edge(self.terms[t-1].id, self.terms[ t ].id):
                        prob_t1 = term_base.G[self.terms[t-1].id][self.terms[ t ].id]["TF"] / self.terms[t-1].tf

                    prob_t2 = 0.
                    if term_base.G.has_edge(self.terms[ t ].id, self.terms[t+1].id):
                        prob_t2 = term_base.G[self.terms[ t ].id][self.terms[t+1].id]["TF"] / self.terms[t+1].tf

                    prob = prob_t1 * prob_t2
                    prod_H *= (1 + (1 - prob ) )
                    sum_H -= (1 - prob)
                elif STOPWORD_WEIGHT == 'h':
                    sum_H += term_base.H
                    prod_H *= term_base.H
                elif STOPWORD_WEIGHT == 'none':
                    pass

        tf_used = 1.
        if features == None or "KPF" in features:
            tf_used = self.tf

        if isVirtual:
            tf_used = np.mean( [term_obj.tf for term_obj in self.terms] )

        self.H = prod_H / ( ( sum_H + 1 ) * tf_used )

    def updateH_old(self, features=None, isVirtual=False):
        sum_H  = 0.
        prod_H = 1.

        for (t, term_base) in enumerate(self.terms):
            if isVirtual and term_base.tf==0:
                continue

            if term_base.stopword:
                prob_t1 = 0.
                if term_base.G.has_edge(self.terms[t-1].id, self.terms[ t ].id):
                    prob_t1 = term_base.G[self.terms[t-1].id][self.terms[ t ].id]["TF"] / self.terms[t-1].tf

                prob_t2 = 0.
                if term_base.G.has_edge(self.terms[ t ].id, self.terms[t+1].id):
                    prob_t2 = term_base.G[self.terms[ t ].id][self.terms[t+1].id]["TF"] / self.terms[t+1].tf

                prob = prob_t1 * prob_t2
                prod_H *= (1 + (1 - prob ) )
                sum_H -= (1 - prob)
            else:
                sum_H += term_base.H
                prod_H *= term_base.H
        tf_used = 1.
        if features == None or "KPF" in features:
            tf_used = self.tf
        if isVirtual:
            tf_used = np.mean( [term_obj.tf for term_obj in self.terms] )
        self.H = prod_H / ( ( sum_H + 1 ) * tf_used )


class single_word(object):

    def __init__(self, unique, idx, graph):
        self.unique_term = unique
        self.id = idx
        self.tf = 0.
        self.WFreq = 0.0
        self.WCase = 0.0
        self.tf_a = 0.
        self.tf_n = 0.
        self.WRel = 1.0
        self.PL = 0.
        self.PR = 0.
        self.occurs = {}
        self.WPos = 1.0
        self.WSpread = 0.0
        self.H = 0.0
        self.stopword = False
        self.G = graph

        self.pagerank = 1.

    def updateH(self, maxTF, avgTF, stdTF, number_of_sentences, features=None):
        """if features == None or "WRel" in features:
            self.PL = self.WDL / maxTF
            self.PR = self.WDR / maxTF
            self.WRel = ( (0.5 + (self.PWL * (self.tf / maxTF) + self.PL)) + (0.5 + (self.PWR * (self.tf / maxTF) + self.PR)) )"""

        if features == None or "WRel" in features:
            self.PL = self.WDL / maxTF
            self.PR = self.WDR / maxTF
            self.WRel = ( (0.5 + (self.PWL * (self.tf / maxTF))) + (0.5 + (self.PWR * (self.tf / maxTF))) )

        if features == None or "WFreq" in features:
            self.WFreq = self.tf / (avgTF + stdTF)
        
        if features == None or "WSpread" in features:
            self.WSpread = len(self.occurs) / number_of_sentences
        
        if features == None or "WCase" in features:
            self.WCase = max(self.tf_a, self.tf_n) / (1. + math.log(self.tf))
        
        if features == None or "WPos" in features:
            self.WPos = math.log( math.log( 3. + np.median(list(self.occurs.keys())) ) )

        self.H = (self.WPos * self.WRel) / (self.WCase + (self.WFreq / self.WRel) + (self.WSpread / self.WRel))
        
    @property
    def WDR(self):
        return len( self.G.out_edges(self.id) )

    @property
    def WIR(self):
        return sum( [ d['TF'] for (u,v,d) in self.G.out_edges(self.id, data=True) ] )

    @property
    def PWR(self):
        wir = self.WIR
        if wir == 0:
            return 0
        return self.WDR / wir 
    
    @property
    def WDL(self):
        return len( self.G.in_edges(self.id) )

    @property
    def WIL(self):
        return sum( [ d['TF'] for (u,v,d) in self.G.in_edges(self.id, data=True) ] )

    @property
    def PWL(self):
        wil = self.WIL
        if wil == 0:
            return 0
        return self.WDL / wil 

    def addOccur(self, tag, sent_id, pos_sent, pos_text):
        if sent_id not in self.occurs:
            self.occurs[sent_id] = []

        self.occurs[sent_id].append( (pos_sent, pos_text) )
        self.tf += 1.

        if tag == "a":
            self.tf_a += 1.
        if tag == "n":
            self.tf_n += 1.

In [21]:
# -*- coding: utf-8 -*-

"""Main module."""

import string
import os
import jellyfish

class KeywordExtractor(object):

    def __init__(self, lan="en", n=3, dedupLim=0.9, dedupFunc='seqm', windowsSize=1, top=20, features=None):
        self.lan = lan

        # dir_path = os.path.dirname(os.path.realpath("/Users/sdeshpande/Desktop/bioinformatices/"))
        # print(dir_path)

        # local_path = os.path.join("StopwordsList", "stopwords_%s.txt" % lan[:2].lower())


        # if os.path.exists(os.path.join(dir_path,local_path)) == False:
        #     local_path = os.path.join("StopwordsList", "stopwords_noLang.txt")
        
        resource_path = "/Users/sdeshpande/Desktop/bioinformatices/StopwordsList/stopwords_noLang.txt"

        try:
            with open(resource_path, encoding='utf-8') as stop_fil:
                self.stopword_set = set( stop_fil.read().lower().split("\n") )
        except:
            print('Warning, read stopword list as ISO-8859-1')
            with open(resource_path, encoding='ISO-8859-1') as stop_fil:
                self.stopword_set = set( stop_fil.read().lower().split("\n") )

        self.n = n
        self.top = top
        self.dedupLim = dedupLim
        self.features = features
        self.windowsSize = windowsSize
        if dedupFunc == 'jaro_winkler' or dedupFunc == 'jaro':
            self.dedu_function = self.jaro
        elif dedupFunc.lower() == 'sequencematcher' or dedupFunc.lower() == 'seqm':
            self.dedu_function = self.seqm
        else:
            self.dedu_function = self.levs

    def jaro(self, cand1, cand2):
        return jellyfish.jaro_winkler(cand1, cand2 )

    def levs(self, cand1, cand2):
        return 1.-jellyfish.levenshtein_distance(cand1, cand2 ) / max(len(cand1),len(cand2))

    def seqm(self, cand1, cand2):
        return Levenshtein.ratio(cand1, cand2)

    def extract_keywords(self, text):
        text = text.replace('\n\t',' ')
        dc = DataCore(text=text, stopword_set=self.stopword_set, windowsSize=self.windowsSize, n=self.n)
        dc.build_single_terms_features(features=self.features)
        dc.build_mult_terms_features(features=self.features)
        resultSet = []
        todedup = sorted([cc for cc in dc.candidates.values() if cc.isValid()], key=lambda c: c.H)

        if self.dedupLim >= 1.:
            return ([ (cand.H, cand.unique_kw) for cand in todedup])[:self.top]

        for cand in todedup:
            toadd = True
            for (h, candResult) in resultSet:
                dist = self.dedu_function(cand.unique_kw, candResult.unique_kw)
                if dist > self.dedupLim:
                    toadd = False
                    break
            if toadd:
                resultSet.append( (cand.H, cand) )
            if len(resultSet) == self.top:
                break

        return [ (cand.unique_kw,h) for (h,cand) in resultSet]


https://github.com/NC0DER/KeyphraseExtraction/tree/main/KeyExt/yake/StopwordsList

# Rake keyword extraction

In [None]:
# -*- coding: utf-8 -*-
"""Implementation of Rapid Automatic Keyword Extraction algorithm.
As described in the paper `Automatic keyword extraction from individual
documents` by Stuart Rose, Dave Engel, Nick Cramer and Wendy Cowley.
"""

import string
from collections import Counter, defaultdict
from itertools import chain, groupby, product

import nltk
from enum import Enum
from nltk.tokenize import wordpunct_tokenize


class Metric(Enum):
    """Different metrics that can be used for ranking."""

    DEGREE_TO_FREQUENCY_RATIO = 0  # Uses d(w)/f(w) as the metric
    WORD_DEGREE = 1  # Uses d(w) alone as the metric
    WORD_FREQUENCY = 2  # Uses f(w) alone as the metric


class Rake(object):
    """Rapid Automatic Keyword Extraction Algorithm."""

    def __init__(
        self,
        stopwords=None,
        punctuations=None,
        language="english",
        ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO,
        max_length=100000,
        min_length=1,
    ):
        """Constructor.
        :param stopwords: List of Words to be ignored for keyword extraction.
        :param punctuations: Punctuations to be ignored for keyword extraction.
        :param language: Language to be used for stopwords
        :param max_length: Maximum limit on the number of words in a phrase
                           (Inclusive. Defaults to 100000)
        :param min_length: Minimum limit on the number of words in a phrase
                           (Inclusive. Defaults to 1)
        """
        # By default use degree to frequency ratio as the metric.
        if isinstance(ranking_metric, Metric):
            self.metric = ranking_metric
        else:
            self.metric = Metric.DEGREE_TO_FREQUENCY_RATIO

        # If stopwords not provided we use language stopwords by default.
        self.stopwords = stopwords
        if self.stopwords is None:
            self.stopwords = nltk.corpus.stopwords.words(language)

        # If punctuations are not provided we ignore all punctuation symbols.
        self.punctuations = punctuations
        if self.punctuations is None:
            self.punctuations = string.punctuation

        # All things which act as sentence breaks during keyword extraction.
        self.to_ignore = set(chain(self.stopwords, self.punctuations))

        # Assign min or max length to the attributes
        self.min_length = min_length
        self.max_length = max_length

        # Stuff to be extracted from the provided text.
        self.frequency_dist = None
        self.degree = None
        self.rank_list = None
        self.ranked_phrases = None

    def extract_keywords_from_text(self, text):
        """Method to extract keywords from the text provided.
        :param text: Text to extract keywords from, provided as a string.
        """
        sentences = nltk.tokenize.sent_tokenize(text)
        self.extract_keywords_from_sentences(sentences)

    def extract_keywords_from_sentences(self, sentences):
        """Method to extract keywords from the list of sentences provided.
        :param sentences: Text to extraxt keywords from, provided as a list
                          of strings, where each string is a sentence.
        """
        phrase_list = self._generate_phrases(sentences)
        self._build_frequency_dist(phrase_list)
        self._build_word_co_occurance_graph(phrase_list)
        self._build_ranklist(phrase_list)

    def get_ranked_phrases(self):
        """Method to fetch ranked keyword strings.
        :return: List of strings where each string represents an extracted
                 keyword string.
        """
        return self.ranked_phrases

    def get_ranked_phrases_with_scores(self):
        """Method to fetch ranked keyword strings along with their scores.
        :return: List of tuples where each tuple is formed of an extracted
                 keyword string and its score. Ex: (5.68, 'Four Scoures')
        """
        return self.rank_list

    def get_word_frequency_distribution(self):
        """Method to fetch the word frequency distribution in the given text.
        :return: Dictionary (defaultdict) of the format `word -> frequency`.
        """
        return self.frequency_dist

    def get_word_degrees(self):
        """Method to fetch the degree of words in the given text. Degree can be
        defined as sum of co-occurances of the word with other words in the
        given text.
        :return: Dictionary (defaultdict) of the format `word -> degree`.
        """
        return self.degree

    def _build_frequency_dist(self, phrase_list):
        """Builds frequency distribution of the words in the given body of text.
        :param phrase_list: List of List of strings where each sublist is a
                            collection of words which form a contender phrase.
        """
        self.frequency_dist = Counter(chain.from_iterable(phrase_list))

    def _build_word_co_occurance_graph(self, phrase_list):
        """Builds the co-occurance graph of words in the given body of text to
        compute degree of each word.
        :param phrase_list: List of List of strings where each sublist is a
                            collection of words which form a contender phrase.
        """
        co_occurance_graph = defaultdict(lambda: defaultdict(lambda: 0))
        for phrase in phrase_list:
            # For each phrase in the phrase list, count co-occurances of the
            # word with other words in the phrase.
            #
            # Note: Keep the co-occurances graph as is, to help facilitate its
            # use in other creative ways if required later.
            for (word, coword) in product(phrase, phrase):
                co_occurance_graph[word][coword] += 1
        self.degree = defaultdict(lambda: 0)
        for key in co_occurance_graph:
            self.degree[key] = sum(co_occurance_graph[key].values())

    def _build_ranklist(self, phrase_list):
        """Method to rank each contender phrase using the formula
              phrase_score = sum of scores of words in the phrase.
              word_score = d(w)/f(w) where d is degree and f is frequency.
        :param phrase_list: List of List of strings where each sublist is a
                            collection of words which form a contender phrase.
        """
        self.rank_list = []
        for phrase in phrase_list:
            rank = 0.0
            for word in phrase:
                if self.metric == Metric.DEGREE_TO_FREQUENCY_RATIO:
                    rank += 1.0 * self.degree[word] / self.frequency_dist[word]
                elif self.metric == Metric.WORD_DEGREE:
                    rank += 1.0 * self.degree[word]
                else:
                    rank += 1.0 * self.frequency_dist[word]
            self.rank_list.append((rank, " ".join(phrase)))
        self.rank_list.sort(reverse=True)
        self.ranked_phrases = [ph[1] for ph in self.rank_list]

    def _generate_phrases(self, sentences):
        """Method to generate contender phrases given the sentences of the text
        document.
        :param sentences: List of strings where each string represents a
                          sentence which forms the text.
        :return: Set of string tuples where each tuple is a collection
                 of words forming a contender phrase.
        """
        phrase_list = set()
        # Create contender phrases from sentences.
        for sentence in sentences:
            word_list = [word.lower() for word in wordpunct_tokenize(sentence)]
            phrase_list.update(self._get_phrase_list_from_words(word_list))
        return phrase_list

    def _get_phrase_list_from_words(self, word_list):
        """Method to create contender phrases from the list of words that form
        a sentence by dropping stopwords and punctuations and grouping the left
        words into phrases. Only phrases in the given length range (both limits
        inclusive) would be considered to build co-occurrence matrix. Ex:
        Sentence: Red apples, are good in flavour.
        List of words: ['red', 'apples', ",", 'are', 'good', 'in', 'flavour']
        List after dropping punctuations and stopwords.
        List of words: ['red', 'apples', *, *, good, *, 'flavour']
        List of phrases: [('red', 'apples'), ('good',), ('flavour',)]
        List of phrases with a correct length:
        For the range [1, 2]: [('red', 'apples'), ('good',), ('flavour',)]
        For the range [1, 1]: [('good',), ('flavour',)]
        For the range [2, 2]: [('red', 'apples')]
        :param word_list: List of words which form a sentence when joined in
                          the same order.
        :return: List of contender phrases that are formed after dropping
                 stopwords and punctuations.
        """
        groups = groupby(word_list, lambda x: x not in self.to_ignore)
        phrases = [tuple(group[1]) for group in groups if group[0]]
        return list(
            filter(
                lambda x: self.min_length <= len(x) <= self.max_length, phrases
            )
        )

# Utilities

In [7]:
import os
import platform
from string import punctuation
from nltk.stem import SnowballStemmer

# Initialize all required stemmers once.
stemmers = {
    'english': SnowballStemmer('english'),
    'french': SnowballStemmer('french'),
    'spanish': SnowballStemmer('spanish'),
    'portuguese': SnowballStemmer('portuguese')
}

def preprocess(lis, language):
    """
    Function which applies stemming to a 
    lowercase version of each string of the list,
    which has all punctuation removed.
    """
    return list(map(stemmers[language].stem, 
           map(lambda s: s.translate(str.maketrans('', '', punctuation)),
           map(str.lower, lis))))


def rreplace(s, old, new, occurrence):
    """
    Function which replaces a string occurence
    in a string from the end of the string.
    """
    return new.join(s.rsplit(old, occurrence))

def clear_screen():
    """
    Function which clears the output of the terminal 
    by using the platform specific system call.
    """
    if platform.system() == 'Windows':
        os.system('cls')
    else:
        os.system('clear') # Linux/OS X.
    return


# Keyphrase extraction models

In [12]:
!pip install rake-nltk



In [28]:
import pke
import spacy
import pytextrank
from string import printable
from statistics import mean
from operator import itemgetter
from itertools import islice, combinations
from nltk import sent_tokenize
from rake_nltk import Rake
from spacy.language import Language
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util


def tfidfvectorizer(text, ngram_range = (1, 3), top_n = 10):
    # Tokenize the text into sentences.
    sentences = sent_tokenize(text)
    vectorizer = TfidfVectorizer (
        stop_words = 'english', 
        ngram_range = ngram_range
    )
    # Vectorizer fits and transform the sentences.
    vectorizer.fit_transform(sentences)
    results = {
        key: val 
        for key, val in sorted (
            vectorizer.vocabulary_.items(), 
            key = lambda item: item[1],
            reverse = True
        )
    }
    return list(islice(results, top_n))

def keybert(text, ngram_range = (1, 3), top_n = 10, method = None, diversity = 0.5):
    # Initialize the keybert model using the pretrained sentence transformer model.
    # This call takes some time on the first execution.
    model = KeyBERT('distiluse-base-multilingual-cased-v2')

    # Returned the extracted keywords based on the specified arguments. 
    return [
        keyphrase for (keyphrase, _) in 
        model.extract_keywords (
            text, 
            keyphrase_ngram_range = ngram_range,
            stop_words = 'english',
            top_n = top_n,
            nr_candidates = 2 * top_n,
            use_maxsum = True if method == 'maxsum' else False,
            use_mmr = True if method == 'mmr' else False,
            diversity = diversity
    )]


def singlerank(text, top_n = 10):
    # Clean the text from non-printable characters.
    text = ''.join(word for word in text if word in printable)

    # Initialize the keyphrase extraction model.
    extractor = pke.unsupervised.SingleRank()

    # Load the content of the document and preprocess it with spacy.
    # Then, select the keyphrase candidates from the document,
    # and weight them using a random walk algorithm.
    extractor.load_document(input = text, language = 'en')
    extractor.candidate_selection()
    extractor.candidate_weighting()
    
    # Return the n-highest scored candidates.
    return [
        keyphrase for (keyphrase, score)
        in extractor.get_n_best(n = top_n, redundancy_removal = True)
    ]
    
def rake(text, top_n = 10):
    # Clean the text from non-printable characters.
    text = ''.join(word for word in text if word in printable)

    # Uses all english stopwords and punctuation from NLTK.
    r = Rake()
    r.extract_keywords_from_text(text)
    return [keyphrase for (score, keyphrase) in r.get_ranked_phrases_with_scores()[:top_n]]

def yake(text, top_n = 10, n = 3, dedupLim = 0.9, dedupFunc = 'seqm', windowsSize = 1):
    # Initialize the keyword extractor object and its parameters.
    kw_extractor = KeywordExtractor (
        top = top_n,
        n = n,
        dedupLim = dedupLim,
        dedupFunc = dedupFunc,
        windowsSize = windowsSize
    )
    # Return the extracted keywords, in a list.
    return [keyword for (keyword, score) in kw_extractor.extract_keywords(text)]

# Apply models on your own data

In [12]:
with open("/Users/sdeshpande/Desktop/bioinformatices/bioinformatics_title.txt", 'r') as f:
    bio_titles = f.readlines()

In [13]:
len(bio_titles)

20140

In [14]:
bio_titles_trial = bio_titles[0:10]

In [15]:
bio_titles_trial = [x.strip("\n") for x in bio_titles_trial]
text = ". ".join(bio_titles_trial)

In [16]:
text

"Absence of surface expression of feline infectious peritonitis virus (FIPV) antigens on infected cells isolated from cats with FIP. Correlation between antimicrobial consumption and incidence of health-care- associated infections due to methicillin- resistant Staphylococcus aureus and vancomycin-resistant enterococci at a university hospital in Taiwan from 2000 to 2010. Laboratory-based surveillance of hospital-acquired respiratory virus infection in a tertiary care hospital. Pneumonie virale sévère de l'immunocompétent Viral pneumonia in immunocompetent patients. Microheterogeneity of S-glycoprotein of mouse hepatitis virus temperature-sensitive mutants. immunity to pathogens taught by specialized human dendritic cell subsets. The RNA pseudoknots in foot-and-mouth disease virus are dispensable for genome replication but essential for the production of infectious virus. 2 3. Enhancement of feline infectious peritonitis virus Type I infection in cell cultures using low-speed centrifuga

In [29]:
def keyword_extraction(text):
    # Create all ngrams with range (1, 3) for the text.
    ngrams = {
        '1-tfidfvectorizer': tfidfvectorizer(text, top_n = 10),
        '2-keybert-maxsum': keybert(text, top_n = 10, method = 'maxsum', diversity = 0.7),
        '3-keybert-mmr': keybert(text, top_n = 10, method = 'mmr', diversity = 0.7),
        '4-singlerank': singlerank(text, top_n = 10),
        '5-rake': rake(text, top_n = 10),
        '6-yake-seqm': yake(text, top_n = 10, dedupFunc = 'seqm'),
    }

    return ngrams

    # # Write ngrams from each method to a json file.
    # with open(r'C:\Users\USER\Desktop\ngrams.json', 'w',
    #     encoding = 'utf-8-sig', errors = 'ignore') as file:
    #     file.write(json.dumps(texts, indent = 4, separators = (',', ':')))

In [30]:
keyword_extraction(text)

{'1-tfidfvectorizer': ['virus type infection',
  'virus type',
  'virus temperature sensitive',
  'virus temperature',
  'virus infection tertiary',
  'virus infection',
  'virus fipv antigens',
  'virus fipv',
  'virus dispensable genome',
  'virus dispensable'],
 '2-keybert-maxsum': ['cells isolated cats',
  'virus infection tertiary',
  'respiratory virus infection',
  'isolated cats',
  'immunocompétent viral pneumonia',
  'viral pneumonia immunocompetent',
  'fip correlation antimicrobial',
  'cats fip correlation',
  'expression feline infectious',
  'isolated cats fip'],
 '3-keybert-mmr': ['virus enhancement feline',
  '2000',
  'university hospital taiwan',
  'absence surface expression',
  'correlation',
  'cells isolated',
  'laboratory based surveillance',
  'using low speed',
  'antimicrobial consumption incidence',
  'case sentence reform'],
 '4-singlerank': ['feline infectious peritonitis virus type',
  'respiratory virus infection',
  'mouse hepatitis virus temperature',