In [1]:
from keybert import KeyBERT

  from tqdm.autonotebook import tqdm, trange





In [14]:
# Sample passage used to try out keyword extraction on a single document.
doc = """
         Supervised learning is the machine learning task of learning a function that
         maps an input to an output based on example input-output pairs. It infers a
         function from labeled training data consisting of a set of training examples.
         In supervised learning, each example is a pair consisting of an input object
         (typically a vector) and a desired output value (also called the supervisory signal).
         A supervised learning algorithm analyzes the training data and produces an inferred function,
         which can be used for mapping new examples. An optimal scenario will allow for the
         algorithm to correctly determine the class labels for unseen instances. This requires
         the learning algorithm to generalize from the training data to unseen situations in a
         'reasonable' way (see inductive bias).
      """

# Custom English stop-word list, passed to KeyBERT through `stop_words=`.
stop_words = [
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    "what", "which", "who", "whom", "this", "that", "these", "those",
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
    "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from",
    "up", "down", "in", "out", "on", "off", "over", "under",
    "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some", "such",
    "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
    "s", "t", "can", "will", "just", "don", "should", "now",
]

# Model instance shared by every extraction call in this notebook.
kw_model = KeyBERT()

# Single-word keyphrases only, selected with MMR at diversity 0.5.
keywords = kw_model.extract_keywords(
    doc,
    keyphrase_ngram_range=(1, 1),
    stop_words=stop_words,
    use_mmr=True,
    diversity=0.5,
)

In [15]:
# Show the (keyword, score) pairs extracted from the sample passage.
print(keywords)

[('supervised', 0.6676), ('training', 0.4134), ('function', 0.2658), ('bias', 0.2644), ('inductive', 0.2577)]


In [48]:
# Load a keyword-extraction dataset found on GitHub:
# https://github.com/SDuari/Keyword-Extraction-Datasets/tree/master
# Marujo2012 is a collection of online news articles.

import os
import pandas as pd

# Folder containing the paired .txt and .key files
folder_path = 'Training'

TXT_SUFFIX = '.txt'
KEY_SUFFIX = '.key'

# One dict per .txt file; converted to a DataFrame in a single step below
all_data = []

# os.listdir() returns entries in arbitrary order. Sorting keeps the row
# positions of `df` (and of anything indexed by position downstream) stable
# from one run / machine to the next.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(TXT_SUFFIX):
        continue

    txt_file_path = os.path.join(folder_path, filename)

    # Swap only the trailing extension: str.replace() would rewrite every
    # '.txt' occurrence in the name, not just the suffix.
    key_file_name = filename[:-len(TXT_SUFFIX)] + KEY_SUFFIX
    key_file_path = os.path.join(folder_path, key_file_name)

    # Keep only the second line of the .txt file. readline() returns '' at
    # end of file, so a file with fewer than two lines yields an empty string.
    # NOTE(review): presumably line 1 is a headline and line 2 the article
    # body -- confirm against the dataset layout.
    with open(txt_file_path, 'r', encoding='utf-8') as txt_file:
        txt_file.readline()
        txt_line = txt_file.readline().strip()

    # Every line of the matching .key file, whitespace-stripped;
    # an empty list when no .key file exists.
    if os.path.exists(key_file_path):
        with open(key_file_path, 'r', encoding='utf-8') as key_file:
            key_lines = [line.strip() for line in key_file]
    else:
        key_lines = []

    all_data.append({
        'filename': filename,
        'txt_lines': txt_line,   # single string: the stripped second line of the .txt file
        'key_lines': key_lines   # list of stripped lines from the .key file
    })

# Convert the collected records to a DataFrame
df = pd.DataFrame(all_data)

# Display the DataFrame
df

Unnamed: 0,filename,txt_lines,key_lines
0,art_and_culture-20893614.txt,Canadian pop star Michael Buble kisses his bri...,"[married Argentine, his Grammy, ceremony, kiss..."
1,art_and_culture-20900470.txt,"Daniel Craig will do Stieg Larsson justice, fi...","[role, American, board, Staermose, Daniel, ver..."
2,art_and_culture-20902975.txt,Will Kutcher and Bieber make as woeful a doubl...,"[production, coming, affinity, primarily, doub..."
3,art_and_culture-20904497.txt,"April 1, 2011 - """"Bones"""" actress Emily Descha...","[confirmed, year, pregnant, September, TV, emb..."
4,art_and_culture-20906308.txt,If I wasn't already totally pumped for Tim Bur...,"[would play Carolyn, join cast, 039Dark Shadow..."
...,...,...,...
445,tech-20950200.txt,Imagine you are a pro-democracy protester on t...,"[role, blackouts, wasnt, bloggers, names, acti..."
446,tech-20952351.txt,It had to happen. When Google showed off a new...,"[cooked up, make Gmail Motion work, Kinect cam..."
447,tech-20953586.txt,Franco poses with 826 student Angelita Bowden....,"[need, explained, black, organizers screened, ..."
448,tech-20954339.txt,Microsoft has made its first-ever formal compl...,"[understand, benefit, compete, search, forward..."


# Inspect the row belonging to one particular article file
specific_filename = 'art_and_culture-20906975.txt'

# Boolean mask: True for rows whose 'filename' equals the requested name
matches_requested_file = df['filename'].eq(specific_filename)
file_data = df.loc[matches_requested_file]

file_data

In [55]:
# Reference notes on the parameters accepted by KeyBERT's
# kw_model.extract_keywords (values as written in the original notes;
# NOTE(review): confirm the actual defaults against the KeyBERT documentation):
#
#   kw_model.extract_keywords(
#       docs,                          # Input text(s) from which to extract keywords
#       keyphrase_ngram_range=(1, 1),  # Tuple indicating min and max n-gram lengths for keywords
#       stop_words=None,               # Language for stop words (e.g., "english") or custom stop words list
#       top_n=5,                       # Number of keywords/phrases to return
#       use_maxsum=False,              # Whether to use Max Sum Similarity for keyword selection
#       use_mmr=False,                 # Whether to use Maximal Marginal Relevance for keyword selection
#       diversity=0.5,                 # Diversity parameter for MMR (only if use_mmr=True)
#       nr_candidates=20,              # Number of candidates considered for Max Sum Similarity (only if use_maxsum=True)
#       vectorizer=None,               # Custom vectorizer (e.g., TfidfVectorizer) to override the default
#       highlight=False,               # Whether to highlight keywords in the original text (only in Jupyter Notebooks)
#   )

# Extract up to 10 single-word keywords per article, using MMR with diversity 0.2.
# `outputs[i]` is the extract_keywords result for row i of `df`.
# A comprehension is used so the notebook-level `keywords` from the
# single-document demo is not overwritten by the last article's result.
outputs = [
    kw_model.extract_keywords(
        article_text,
        keyphrase_ngram_range=(1, 1),
        stop_words=stop_words,
        top_n=10,
        use_mmr=True,
        diversity=0.2,
    )
    for article_text in df['txt_lines']
]

In [56]:
# Spot-check one result: the keywords extracted for the article at position 321
# of `outputs` (same order as the rows of `df`).
outputs[321]

[('obesity', 0.4363),
 ('oestrogen', 0.3312),
 ('waistline', 0.2984),
 ('blindness', 0.2955),
 ('macular', 0.2953),
 ('diabetes', 0.2808),
 ('inflammation', 0.2485),
 ('risk', 0.2412),
 ('aged', 0.2322),
 ('women', 0.2123)]