# DATABASE EXTRACTION and PROCESSING


In [1]:
import os
import requests
import zipfile
import pandas as pd

In [2]:
# Define the URL for the dataset repository and the local storage directory
data_url = "https://github.com/LIAAD/KeywordExtractor-Datasets/archive/refs/heads/master.zip"
local_zip_path = "datasets.zip"
unzip_dir = "KeywordExtractor-Datasets"

In [3]:
# Step 1: Download the dataset repository
def download_datasets():
    print("Downloading datasets...")
    response = requests.get(data_url)
    if response.status_code == 200:
        with open(local_zip_path, "wb") as file:
            file.write(response.content)
        print("Datasets downloaded successfully.")
    else:
        print(f"Failed to download datasets. Status code: {response.status_code}")
        exit(1)

In [4]:
# Step 2: Extract the downloaded zip file
def extract_datasets():
    print("Extracting datasets...")
    with zipfile.ZipFile(local_zip_path, 'r') as zip_ref:
        zip_ref.extractall(unzip_dir)
    print("Datasets extracted successfully.")

In [5]:
# Step 3: Extract individual dataset zips
def extract_inner_zips():
    datasets_path = os.path.join(unzip_dir, "KeywordExtractor-Datasets-master/datasets")
    for file in os.listdir(datasets_path):
        if file.endswith(".zip"):
            zip_path = os.path.join(datasets_path, file)
            extract_path = os.path.join(datasets_path, file.replace(".zip", ""))
            if not os.path.exists(extract_path):
                print(f"Extracting {file}...")
                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                    zip_ref.extractall(extract_path)
                print(f"Extracted {file} to {extract_path}.")

In [6]:
# Step 4: Process a specific dataset and convert it into a usable format
def process_dataset(dataset_name):
    dataset_path = os.path.join(unzip_dir, f"KeywordExtractor-Datasets-master/datasets/{dataset_name}/{dataset_name}")

    # Check if dataset directory exists
    if not os.path.exists(dataset_path):
        print(f"Dataset {dataset_name} not found.")
        return None

    docs_folder = os.path.join(dataset_path, "docsutf8")
    keys_folder = os.path.join(dataset_path, "keys")

    if not os.path.exists(docs_folder) or not os.path.exists(keys_folder):
        print(f"Required folders (docsutf8, keys) are missing in {dataset_name}.")
        return None

    # Load documents and keywords
    print(f"Processing dataset: {dataset_name}")
    documents = []
    keywords = []

    for doc_file in sorted(os.listdir(docs_folder)):
        doc_path = os.path.join(docs_folder, doc_file)
        if doc_file.endswith(".txt"):
            with open(doc_path, "r", encoding="utf-8") as f:
                documents.append(f.read().strip())

    for key_file in sorted(os.listdir(keys_folder)):
        key_path = os.path.join(keys_folder, key_file)
        if key_file.endswith(".key"):
            with open(key_path, "r", encoding="utf-8") as f:
                keywords.append(f.read().strip().split(','))

    # Combine documents and keywords into a DataFrame
    data = pd.DataFrame({"document": documents, "keywords": keywords})
    return data

In [7]:
# Step 4: Save processed data to a CSV file
def save_to_csv(data, output_path):
    print(f"Saving processed data to {output_path}...")
    data.to_csv(output_path, index=False, encoding="utf-8")
    print("Data saved successfully.")

In [8]:
# Example: Process the "Nguyen2007" dataset
download_datasets()
extract_datasets()
extract_inner_zips()

dataset_name = "Nguyen2007"
processed_data = process_dataset(dataset_name)

if processed_data is not None:
  output_csv = f"{dataset_name}_processed.csv"
  save_to_csv(processed_data, output_csv)
  #in the format documents,keywords
  #keywords are separated either by commas or \n

  # Clean up downloaded zip file
if os.path.exists(local_zip_path):
  os.remove(local_zip_path)
  print("Cleaned up temporary files.")

Downloading datasets...
Datasets downloaded successfully.
Extracting datasets...
Datasets extracted successfully.
Extracting Schutz2008.zip...
Extracted Schutz2008.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/datasets/Schutz2008.
Extracting WikiNews.zip...
Extracted WikiNews.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/datasets/WikiNews.
Extracting Nguyen2007.zip...
Extracted Nguyen2007.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/datasets/Nguyen2007.
Extracting fao780.zip...
Extracted fao780.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/datasets/fao780.
Extracting www.zip...
Extracted www.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/datasets/www.
Extracting theses100.zip...
Extracted theses100.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/datasets/theses100.
Extracting Inspec.zip...
Extracted Inspec.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-maste

In [9]:
import pandas as pd

# Load the CSV file
file_path = "./Nguyen2007_processed.csv"  # Update with the correct path
data = pd.read_csv(file_path)

# Extract documents and keywords
documents = data['document']  # This is a pandas Series of text documents
keywords = data['keywords']  # This is a pandas Series of keyword strings

# Convert keywords from string representation to Python lists
keywords = keywords.apply(lambda x: eval(x))  # Use `eval` to parse strings into lists if necessary
#Now it is a list containing only one long string in the format ['Keyword1\nKeyword2\nKeyword3....']
# keywords = keywords.apply(lambda x: x[0].lower()) #only one element in the list
#Now x is a string lowercase
keywords = keywords.apply(lambda x: x[0].split('\n'))
#now keywords is a list of lowercase strings

# KeyBERT

In [10]:
! pip install keybert

Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvi

In [11]:
import torch
print(torch.cuda.is_available())  # Should return True if GPU is enabled.
print(torch.cuda.get_device_name(0))  # Prints the name of the GPU.

True
Tesla T4


In [12]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')  # Use GPU for embeddings

# kw_model = KeyBERT(model='all-mpnet-base-v2') #remove argument to use base model "all-MiniLM-L6-v2".
kw_model = KeyBERT(model=model)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
from time import time

# extract 30 keyword to be able to select the preferred one in term of most frequent pos
start = time()
cosine_keywords = kw_model.extract_keywords(documents, keyphrase_ngram_range=(1,3), top_n=30)
end = time()
cosine_time = end-start
print(f"Total time to extract keywords from Nguyen2007 (cosine similarity, n_gram=(1,3), top_n=30): {cosine_time:.3f}")
#remember that doing it all together passing a list of documents is faster than iterating on one document at a time

# Example KeyBERT output
# keybert_output = [[('ramsey', 0.3427),
#   ('extractors', 0.3425),
#   ('subgraph', 0.3406),
#   ('derandomization', 0.3376),
#   ('entropy', 0.3142)],
#  [('retrieval', 0.5007),
#   ('relevance', 0.4068),
#   ('occurrence', 0.3692),
#   ('ranking', 0.3454),
#   ('idf', 0.333)],
#  [('crawled', 0.6136),
#   ('crawler', 0.6018),
#   ('crawlers', 0.5513),
#   ('crawling', 0.551),
#   ('crawl', 0.5431)]]

extracted_keywords_cosine = []
#for each list in the list of lists -> lower, strip and keep only literal part (no score) of each element
for kws in cosine_keywords:
    extracted_keywords_cosine.append([kw[0].lower().strip() for kw in kws])

Total time to extract keywords from Nguyen2007 (cosine similarity, n_gram=(1,3), top_n=30): 221.922


In [14]:
print(extracted_keywords_cosine[0:10])
print(len(extracted_keywords_cosine))

[['entropy ramsey graphs', 'ramsey graphs entropy', 'polynomial entropy ramsey', 'built ramsey graphs', 'ramsey bipartite graphs', 'bipartite ramsey graphs', 'entropy ramsey', 'ramsey graphs frankl', 'extractors entropy note', 'extractors entropy', 'source extractors entropy', '2n ramsey graphs', 'ordinary ramsey graphs', 'sources bits entropy', 'entropy number sources', 'ramsey graphs', 'bipartite graphs ramsey', 'informally ramsey graphs', 'graphs log ramsey', 'entropy sufficient extractor', 'entropy independent sources', 'case ramsey graphs', 'ramsey bipartite graph', 'context ramsey graphs', 'bipartite ramsey construction', 'ramsey graphs beating', 'source extractor entropy', 'ramsey graph construction', 'generators derandomization extractors', 'graphs entropy'], ['probabilistic retrieval model', 'probabilistic retrieval models', 'information retrieval models', 'probabilistic retrieval', 'retrieval models', 'retrieval models general', 'probabilities probabilistic retrieval', 'retri

# PART-OF-SPEECH TAGGING (of the document)


In [15]:
# the function compute part-of-speech tagging using spacy on a text and return a list of pair (word, pos) = (str, str)

import spacy

def pos_tag_document(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    # Store and return tagged words with their parts of speech
    # each token object has a token.text and a token.pos_ attribute here are saved in a list of pairs
    tagged_words = [(token.text, token.pos_) for token in doc]
    return tagged_words


In [16]:
# EXECUTE ON KeyBERT EXTRACTION

# place documents and keywords in a pandas DataFrame for structure and semplicty
documents_df = pd.DataFrame({'text': documents})
keywords_df = pd.DataFrame({'text': keywords})

# the function is executed on each document in documents (actualy in documents_df['text'])
# and the results are saved in document_df['pos'] as a list of pair (word, pos) for each doc.
# [7m17s]
start = time()
documents_df['pos'] = documents_df['text'].apply(pos_tag_document)
end = time()
print(f'part-of-speech tagging of the documents {end-start}')
# print(documents_df.head())
# print(documents_df.iloc[0])
# print(documents_df.shape)

part-of-speech tagging of the documents 336.48897790908813


# PART-OF-SPEECH ASSOCIATION (for the keyword)

In [17]:
from collections import Counter
from typing import List, Tuple
import re

In [18]:
nlp = spacy.load("en_core_web_sm")  # Load Spacy model for POS tagging and lemmatization

def pos_of_keywords_method_one(keywords, text_pos, window_size=5):
    """
    Identify the most common Part-of-Speech (POS) tag for each word in a multi-word keyword
    by analyzing its occurrences in a given text. The function checks if all lemmatized parts of a
    keyword appear within a dynamically adjusted window in the text and assigns the most
    frequent POS tags accordingly.
    If a keyword or part of it is not found in a relevant context, the corresponding POS
    position is left empty (None).

    Args:
    - keywords (list of str): The keywords to find in the text.
    - text_pos (list of tuples): A list of (word, POS tag) tuples.
    - window_size (int): The default context window size around words.

    Returns:
    - list of tuples: Each tuple contains the most frequent POS tag(s)
                      for each word in a keyword. If no match is found, returns None.

    """
    key_pos = []

    # extract words and their POS tags separately, converting words to lowercase
    text_tokens = [token.lower() for token, _ in text_pos]
    text_pos_tags = [pos for _, pos in text_pos]

    # Precompute lemmas for the entire text (word-by-word approach)
    text_lemmas = {word: nlp(word)[0].lemma_ for word in set(text_tokens)}
    lemmatized_text = [text_lemmas[word] for word in text_tokens] # lemmatized version of text tokens

    for key in keywords:
        key_tokens = re.findall(r'\w+|[-/()]', key.lower())  # tokenize the keyword, keeping hyphens (-) and slashes (/) as separate tokens
        key_lemmas = [text_lemmas.get(word, word) for word in key_tokens]  # lemmatize keyword tokens

        dynamic_window_size = max(window_size, len(key_tokens) * 2) # adjust window size based on keyword length
        keyword_word_pos = [] #store POS tags for each word in the keyword

        # Store word positions for faster lookup
        word_positions = {word: [] for word in key_tokens}
        for i, word in enumerate(text_tokens):
            if word in word_positions:
                word_positions[word].append(i)

        # Search for each word in the keyword separately
        for token in key_tokens:
            token_pos_counts = Counter()
            for i in word_positions.get(token, []):  # direct lookup of positions
                # Look at words in the surrounding context window
                start, end = max(0, i - dynamic_window_size), min(len(text_tokens), i + dynamic_window_size + 1)

                context_lemmas = set(lemmatized_text[start:end])  # convert the context words to their lemmas for matching

                # Get remaining lemmas of the keyword (excluding the current word)
                current_lemma = text_lemmas.get(token, token)
                remaining_keyword_lemmas = [kw_lemma for kw_lemma in key_lemmas if kw_lemma != current_lemma]

                # Check if all other keyword lemmas exist in the context window
                if all(kw_lemma in context_lemmas for kw_lemma in remaining_keyword_lemmas):
                    token_pos_counts[text_pos_tags[i]] += 1  # record the POS tag for this token

            # Determine the most frequent POS tag for the token
            if token_pos_counts:
                most_common_pos = token_pos_counts.most_common(1)[0][0]
                keyword_word_pos.append(most_common_pos)
            else:
                keyword_word_pos.append(None) #no match found

        key_pos.append(tuple(keyword_word_pos))


    return (key_pos)



In [19]:
def pos_of_keywords_method_two(keywords, text_pos):

    """
    Identify the most common POS tag for each word in a multi-word keyword
    based on its occurrences in a given text with POS tagging.

    Args:
        - keywords (list of str): The keywords to find in the text.
        - text_pos (list of tuples): A list of (word, POS tag) tuples.

    Returns:
        - list of tuples: Each tuple contains the most frequent POS tag(s)
          for each word in a keyword. If no match is found, returns None.
    """

    key_pos = []

    # extract lowercase words and POS tags from text for comparison
    text = [text.lower() for text, _ in text_pos]
    pos = [pos for _, pos in text_pos]


    for key in keywords:
        key_tokens = key.lower().split()
        word_pos = []

        for word in key_tokens: #process each word separately

            # Find all occurrences of the word
            occurrences = []
            for i in range(len(text)):
              if text[i] == word:
                occurrences.append(pos[i]) #store corresponding POS tag

            # Determine the most common POS tag for this word
            if occurrences:
                most_common_pos = Counter(occurrences).most_common(1)[0][0]
                word_pos.append(most_common_pos)
            else:
                word_pos.append(None)

        key_pos.append(tuple(word_pos))

    return(key_pos)

In [28]:
#METHOD ONE
# EXECUTE ON KeyBERT EXTRACTION

extracted_keywords_one_df = pd.DataFrame({'text':extracted_keywords_cosine})

# execute pos_of_keywords on every row of the dataframes
start = time()
# sostituire con keywords_df for ground truth
extracted_keywords_one_df['pos'] = extracted_keywords_one_df.apply(lambda x: pos_of_keywords_method_one(x['text'], documents_df['pos'][x.name]), axis=1)
end = time()
print(f'part-of-speech tagging with method one of the keywords {end-start}') # by associaton of the keywords to their occurrences in the text

part-of-speech tagging with method one of the keywords 1049.8712611198425


In [29]:
#METHOD TWO
# EXECUTE ON KeyBERT EXTRACTION

extracted_keywords_two_df = pd.DataFrame({'text':extracted_keywords_cosine})

# execute pos_of_keywords on every row of the dataframes
start = time()
extracted_keywords_two_df['pos'] = extracted_keywords_two_df.apply(lambda x: pos_of_keywords_method_two(x['text'], documents_df['pos'][x.name]), axis=1)
end = time()
print(f'part-of-speech tagging with method two of the keywords {end-start}') # by associaton of the keywords to their occurrences in the text

part-of-speech tagging with method two of the keywords 5.2613525390625


# KEYWORD SELECTION (based on frequent POS sequences)

In [22]:
import requests
frequent_pos_one_path="https://raw.githubusercontent.com/s319848/DNLP-project-2025/main/notebooks/Second-extension/keywords_combinations_method_one.csv"
response_one = requests.get(frequent_pos_one_path)

frequent_pos_one=[]

if response_one.status_code == 200:
    # Split the content into lines and remove \r
    lines = response_one.text.strip().split("\n")

    for line in lines:
        # Clean the line and split into a tuple, removing any extra whitespace
        line = line.strip().replace("\r", "")
        # Convert the line to a tuple of elements
        frequent_pos_one.append(tuple(line.split(',')))

frequent_pos_one

[('NOUN', 'NOUN'),
 ('ADJ', 'NOUN'),
 ('NOUN',),
 ('PROPN', 'NOUN'),
 ('PROPN', 'PROPN'),
 ('ADJ', 'NOUN', 'NOUN'),
 ('PROPN',),
 ('VERB', 'NOUN'),
 ('NOUN', 'VERB'),
 ('NOUN', 'NOUN', 'NOUN'),
 ('VERB',),
 ('ADJ', 'ADJ', 'NOUN'),
 ('PROPN', 'PROPN', 'PROPN'),
 ('NOUN', 'PROPN'),
 ('VERB', 'NOUN', 'NOUN'),
 ('ADJ', 'PROPN'),
 ('ADJ',),
 ('NOUN', 'ADJ', 'NOUN'),
 ('NOUN', 'ADP', 'NOUN'),
 ('NOUN', 'PUNCT', 'NOUN'),
 ('PROPN', 'NOUN', 'NOUN'),
 ('ADJ', 'NOUN', 'VERB'),
 ('PROPN', 'PROPN', 'NOUN'),
 ('ADJ', 'PROPN', 'NOUN'),
 ('NOUN', 'PUNCT', 'VERB'),
 ('PROPN', 'VERB'),
 ('PROPN', 'ADJ', 'NOUN'),
 ('NOUN', 'PROPN', 'NOUN'),
 ('ADJ', 'VERB'),
 ('PROPN', 'ADP', 'PROPN')]

In [23]:
frequent_pos_two_path="https://raw.githubusercontent.com/s319848/DNLP-project-2025/main/notebooks/Second-extension/keywords_combinations_method_two.csv"
response_two = requests.get(frequent_pos_two_path)

frequent_pos_two=[]

if response_two.status_code == 200:
    # Split the content into lines and remove \r
    lines = response_two.text.strip().split("\n")

    for line in lines:
        # Clean the line and split into a tuple, removing any extra whitespace
        line = line.strip().replace("\r", "")
        # Convert the line to a tuple of elements
        frequent_pos_two.append(tuple(line.split(',')))

frequent_pos_two

[('NOUN', 'NOUN'),
 ('ADJ', 'NOUN'),
 ('NOUN',),
 ('ADJ', 'NOUN', 'NOUN'),
 ('PROPN', 'NOUN'),
 ('PROPN',),
 ('VERB', 'NOUN'),
 ('NOUN', 'VERB'),
 ('NOUN', 'NOUN', 'NOUN'),
 ('NOUN', 'PROPN'),
 ('VERB',),
 ('ADJ', 'ADJ', 'NOUN'),
 ('PROPN', 'PROPN'),
 ('ADJ', 'PROPN'),
 ('VERB', 'NOUN', 'NOUN'),
 ('NOUN', 'ADP', 'NOUN'),
 ('ADJ',),
 ('NOUN', 'ADJ', 'NOUN'),
 ('ADJ', 'VERB'),
 ('PROPN', 'NOUN', 'NOUN'),
 ('ADJ', 'PROPN', 'NOUN'),
 ('PROPN', 'ADJ', 'NOUN'),
 ('NOUN', 'VERB', 'NOUN'),
 ('ADJ', 'NOUN', 'VERB'),
 ('NOUN', 'PROPN', 'NOUN'),
 ('PROPN', 'VERB'),
 ('VERB', 'PROPN'),
 ('VERB', 'ADJ', 'NOUN'),
 ('ADJ', 'ADJ'),
 ('NOUN', 'ADJ'),
 ('PROPN', 'NOUN', 'PROPN'),
 ('X', 'X', 'NOUN'),
 ('NOUN', 'NOUN', 'VERB')]

In [26]:
# We want to generate a certain amount of keyword so we fill/truncate if the one
# corresponding to a desired pos sequence are not enough/too many
def filter_and_truncate_lists(keywords, frequent_pos, target_size=10):

    key_pos = list(keywords['pos'])
    key_text = list(keywords['text'])

    # Convert to set for O(1) lookup
    frequent_set = set(frequent_pos)

    # Initialize result lists
    matches = []  # Keyword whose pos is frequen
    non_matches = []  # Keyword whose pos is not frequent

    for i, pos in enumerate(key_pos):
      if pos in frequent_set:
        matches.append(key_text[i])
      else:
        non_matches.append(key_text[i])

    # Truncate matches to target_size

    # If matches list is exactly target_size, return it
    if len(matches) == target_size:
        return matches

    # If matches list is too long, truncate it
    elif len(matches) > target_size:
        return matches[:target_size]

    # If matches list is too short, add elements from non_matches
    else:
        remaining_slots = target_size - len(matches)
        return matches + non_matches[:remaining_slots]

In [30]:
#METHOD ONE
selected_keywords_method_one = pd.DataFrame()
selected_keywords_method_one['text'] = extracted_keywords_one_df.apply(lambda x: filter_and_truncate_lists(x, frequent_pos_one, 10), axis=1)

print(selected_keywords_method_one[0:10])
print(len(selected_keywords_method_one))

                                                text
0  [entropy ramsey graphs, ramsey graphs entropy,...
1  [probabilistic retrieval model, probabilistic ...
2  [crawling systems, high performance crawling, ...
3  [mobile access network, hiperlan access networ...
4  [graphics schematic depictions, schematic depi...
5  [targeted advertising keywords, match ads keyw...
6  [information technology curriculum, informatio...
7  [modeling personalized search, personalized se...
8  [nearest neighbor search, nearest neighbour se...
9  [embedded interpreters, embedded interpreters ...
209


In [32]:
#METHOD TWO
selected_keywords_method_two = pd.DataFrame()
selected_keywords_method_two['text'] = extracted_keywords_two_df.apply(lambda x: filter_and_truncate_lists(x, frequent_pos_two, 10), axis=1)

print(selected_keywords_method_two[0:10])
print(len(selected_keywords_method_two))

                                                text
0  [entropy ramsey graphs, ramsey graphs entropy,...
1  [probabilistic retrieval model, probabilistic ...
2  [crawling systems, high performance crawling, ...
3  [hiperlan mobile access, hiperlan access netwo...
4  [graphics schematic depictions, schematic depi...
5  [targeted advertising keywords, content target...
6  [information technology curriculum, technology...
7  [modeling personalized search, personalized se...
8  [nearest neighbor search, nearest neighbour se...
9  [evaluation embedded interpreters, embedded in...
209


In [33]:
# SAVE ON FILE FOR DELAYED EVALUATION

# With additional options for more control
selected_keywords_method_one.to_csv('extension2_extraction_Nguyen2007_method_one.csv',
          index=False,
          sep=',',
          encoding='utf-8',
          decimal='.',
          header=None
)
selected_keywords_method_two.to_csv('extension2_extraction_Nguyen2007_method_two.csv',
          index=False,
          sep=',',
          encoding='utf-8',
          decimal='.',
          header=None

)
keywords_df.to_csv('extension2_groundtruth_Nguyen2007.csv',
          index=False,
          sep=',',
          encoding='utf-8',
          decimal='.',
          header=None

)

In [34]:
from google.colab import drive
drive.mount('/content/drive')
from google.colab import files
files.download('extension2_extraction_Nguyen2007_method_one.csv')
files.download('extension2_extraction_Nguyen2007_method_two.csv')
files.download('extension2_groundtruth_Nguyen2007.csv')

Mounted at /content/drive


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#Now do the same process for SemEval dataset

In [35]:
#DATABASE EXTRACTION and PROCESSING
dataset_name = "SemEval2010"
processed_data = process_dataset(dataset_name)

if processed_data is not None:
  output_csv = f"{dataset_name}_processed.csv"
  save_to_csv(processed_data, output_csv)
if os.path.exists(local_zip_path):
  os.remove(local_zip_path)
  print("Cleaned up temporary files.")
file_path = "./SemEval2010_processed.csv"
data = pd.read_csv(file_path)

documents = data['document']
keywords = data['keywords']

keywords = keywords.apply(lambda x: eval(x))
keywords = keywords.apply(lambda x: x[0].split('\n'))

Processing dataset: SemEval2010
Saving processed data to SemEval2010_processed.csv...
Data saved successfully.


In [36]:
#KeyBERT
start = time()
cosine_keywords = kw_model.extract_keywords(documents, keyphrase_ngram_range=(1,3), top_n=30)
end = time()
cosine_time = end-start
print(f"Total time to extract keywords from SemEval (cosine similarity, n_gram=(1,3), top_n=30): {cosine_time:.3f}")
extracted_keywords_cosine = []
for kws in cosine_keywords:
    extracted_keywords_cosine.append([kw[0].lower().strip() for kw in kws])
print(extracted_keywords_cosine[0:10])
print(len(extracted_keywords_cosine))

Total time to extract keywords from SemEval (cosine similarity, n_gram=(1,3), top_n=30): 385.972
[['grid service discovery', 'service discovery grids', 'service discovery grid', 'discovery grid services', 'grid discovery services', 'discovery grid service', 'scalable grid service', 'services grids', 'specialized grid service', 'grid services', 'grid service providers', 'grid service', 'uddi grid service', 'grid information services', 'services grids provide', 'grid services architecture', 'grid services essential', 'discovery grid computing', 'service discovery supporting', 'service discovery solution', 'service discovery', 'service discovery mechanisms', 'service discovery based', 'service discovery using', 'services discovery', 'service discovery needs', 'based service discovery', 'service apis grid', 'service discovery mechanism', 'service discovery architecture'], ['deployment sensor networks', 'sensors observed deployment', 'sensor deployment strategy', 'detection sensor network',

In [37]:
#PART-OF-SPEECH TAGGING (of the document)
documents_df = pd.DataFrame({'text': documents})
keywords_df = pd.DataFrame({'text': keywords})
start = time()
documents_df['pos'] = documents_df['text'].apply(pos_tag_document)
end = time()
print(f'part-of-speech tagging of the documents {end-start}')



part-of-speech tagging of the documents 532.4184370040894


In [38]:
#PART-OF-SPEECH ASSOCIATION (for the keyword)
extracted_keywords_one_df = pd.DataFrame({'text':extracted_keywords_cosine})
start = time()
extracted_keywords_one_df['pos'] = extracted_keywords_one_df.apply(lambda x: pos_of_keywords_method_one(x['text'], documents_df['pos'][x.name]), axis=1)
end = time()
print(f'part-of-speech tagging with method one of the keywords {end-start}')

extracted_keywords_two_df = pd.DataFrame({'text':extracted_keywords_cosine})
start = time()
extracted_keywords_two_df['pos'] = extracted_keywords_two_df.apply(lambda x: pos_of_keywords_method_two(x['text'], documents_df['pos'][x.name]), axis=1)
end = time()
print(f'part-of-speech tagging with method two of the keywords {end-start}')

part-of-speech tagging with method one of the keywords 1663.4572792053223
part-of-speech tagging with method two of the keywords 11.06962537765503


In [39]:
#KEYWORD SELECTION (based on frequent POS sequences)
selected_keywords_method_one = pd.DataFrame()
selected_keywords_method_one['text'] = extracted_keywords_one_df.apply(lambda x: filter_and_truncate_lists(x, frequent_pos_one, 10), axis=1)

print(selected_keywords_method_one[0:10])
print(len(selected_keywords_method_one))

selected_keywords_method_two = pd.DataFrame()
selected_keywords_method_two['text'] = extracted_keywords_two_df.apply(lambda x: filter_and_truncate_lists(x, frequent_pos_two, 10), axis=1)

print(selected_keywords_method_two[0:10])
print(len(selected_keywords_method_two))

                                                text
0  [grid service discovery, service discovery gri...
1  [deployment sensor networks, sensor deployment...
2  [voip audio conferencing, distributed voip con...
3  [swarm worm details, malware exhibiting swarm,...
4  [modular protocols based, modular protocols, m...
5  [availability data center, outages data center...
6  [metrics mobile object, runtimes mobile object...
7  [download data allocation, servers allocation ...
8  [localizing sensor network, localization senso...
9  [throughput blast grid, sequence comparison bl...
243
                                                text
0  [grid service discovery, service discovery gri...
1  [deployment sensor networks, sensors observed ...
2  [voip audio conferencing, distributed voip con...
3  [swarm worm details, host swarm worm, swarm wo...
4  [modular protocols based, building modular pro...
5  [availability data center, outages data center...
6  [metrics mobile object, runtimes mobile

In [40]:
# SAVE ON FILE FOR DELAYED EVALUATION
selected_keywords_method_one.to_csv('extension2_extraction_Semeval2010_method_one.csv',
          index=False,
          sep=',',
          encoding='utf-8',
          decimal='.',
          header=None
)
selected_keywords_method_two.to_csv('extension2_extraction_Semeval2010_method_two.csv',
          index=False,
          sep=',',
          encoding='utf-8',
          decimal='.',
          header=None

)
keywords_df.to_csv('extension2_groundtruth_Semeval2010.csv',
          index=False,
          sep=',',
          encoding='utf-8',
          decimal='.',
          header=None

)
files.download('extension2_extraction_Semeval2010_method_one.csv')
files.download('extension2_extraction_Semeval2010_method_two.csv')
files.download('extension2_groundtruth_Semeval2010.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>