In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import logging
# Route INFO-and-above records to a log file next to the notebook.
# filemode='w' truncates the file each time this configuration takes effect.
# NOTE: logging.basicConfig() does nothing if the root logger already has
# handlers, so on a re-run in a live kernel this call may be a no-op.
logging.basicConfig(
    level=logging.INFO,
    filename='keyword_extract_notebook.log',
    filemode='w',
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# The cell above imports the libraries (the old message said "exported").
logging.info('All libraries imported')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sample passage used as the input document for keyword extraction.
# The string is passed as-is to the vectorizer and the sentence encoder in the
# cells below, so its leading indentation, line breaks and the bracketed
# citation markers ([1], [2]) are all part of the text that gets processed.
data = """
         Supervised learning is the machine learning task of 
         learning a function that maps an input to an output based 
         on example input-output pairs.[1] It infers a function 
         from labeled training data consisting of a set of 
         training examples.[2] In supervised learning, each 
         example is a pair consisting of an input object 
         (typically a vector) and a desired output value (also 
         called the supervisory signal). A supervised learning 
         algorithm analyzes the training data and produces an 
         inferred function, which can be used for mapping new 
         examples. An optimal scenario will allow for the algorithm 
         to correctly determine the class labels for unseen 
         instances. This requires the learning algorithm to  
         generalize from the training data to unseen situations 
         in a 'reasonable' way (see inductive bias).
      """

In [3]:
# Build the candidate keyword list: every distinct unigram in `data` that is
# not a stop word.
#
# NLTK looks the stop-word list up by file id, and the file is named with a
# lowercase 'english'. Spelling it 'English' only works on case-insensitive
# filesystems and fails elsewhere.
allstopwords = stopwords.words('english')

try:
    # ngram_range=(1, 1) restricts candidates to single words.
    cvector = CountVectorizer(ngram_range=(1, 1), stop_words=allstopwords)
    # The count matrix itself is not used; fitting is what populates the
    # vocabulary read back below.
    cvector.fit_transform([data])
    logging.info('Data is vectorized and data is tranformed')
except Exception:
    print('Vectorized failed')
    # Log the traceback and stop. Carrying on would either raise a NameError
    # on `cvector` below or silently reuse a stale object left in the kernel.
    logging.exception('Vectorized failed')
    raise

# Vocabulary learned by the vectorizer; these are the candidate keywords.
keywords = cvector.get_feature_names_out()

In [4]:
# Embed the document and the candidate keywords with the same sentence
# encoder so that the two sets of vectors can be compared directly.
try:
    model = SentenceTransformer('distilbert-base-nli-mean-tokens')
    # The passage is wrapped in a list so it is encoded as a single document.
    data_embed = model.encode([data])
    # One embedding per candidate keyword.
    keyword_embed = model.encode(keywords)
    logging.info('Data is embedded using Distilbert model')
except Exception:
    print('Data embedding failed')
    # Log the traceback and re-raise: later cells need `data_embed` and
    # `keyword_embed`, so swallowing the error would only defer the failure
    # (or let stale values from a previous run be used).
    logging.exception('Data embedding failed')
    raise

Batches: 100%|██████████| 1/1 [00:00<00:00,  3.91it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  4.81it/s]


In [5]:
# Score every candidate keyword against the document and keep the best few.
try:
    # Number of keywords to keep.
    top_n = 5
    # Despite its name, `distances` holds cosine *similarities*: a higher
    # value means the keyword is closer to the document.
    distances = cosine_similarity(data_embed, keyword_embed)
    # argsort() is ascending, so the last `top_n` indices of the first (only)
    # row are the highest-scoring keywords, ordered from least to most similar.
    final_keywords = [keywords[index] for index in distances.argsort()[0][-top_n:]]
    logging.info('cosine similarity distance calculated')
except Exception:
    print('cosine similarity calculation failed')
    # Log the traceback and re-raise so the notebook does not continue with
    # missing or stale `distances` / `final_keywords`.
    logging.exception('cosine similarity calculation failed')
    raise

In [6]:
# Show, as this cell's output, the full candidate vocabulary, the similarity
# score of each candidate against the document, and the selected keywords.
keywords, distances, final_keywords

(array(['algorithm', 'allow', 'also', 'analyzes', 'based', 'bias',
        'called', 'class', 'consisting', 'correctly', 'data', 'desired',
        'determine', 'example', 'examples', 'function', 'generalize',
        'inductive', 'inferred', 'infers', 'input', 'instances', 'labeled',
        'labels', 'learning', 'machine', 'mapping', 'maps', 'new',
        'object', 'optimal', 'output', 'pair', 'pairs', 'produces',
        'reasonable', 'requires', 'scenario', 'see', 'set', 'signal',
        'situations', 'supervised', 'supervisory', 'task', 'training',
        'typically', 'unseen', 'used', 'value', 'vector', 'way'],
       dtype=object),
 array([[0.45560038, 0.1336013 , 0.07479402, 0.31460872, 0.13824819,
         0.09117435, 0.11441343, 0.4086999 , 0.09669925, 0.13623556,
         0.20845458, 0.18884563, 0.19503789, 0.16670373, 0.20907958,
         0.23956454, 0.21112514, 0.23319237, 0.20896897, 0.2012858 ,
         0.27665883, 0.12842578, 0.20875162, 0.18689686, 0.46048343,
     