# DATABASE EXTRACTION and PROCESSING


In [1]:
import os
import requests
import zipfile
import pandas as pd

In [2]:
# Define the URL for the dataset repository and the local storage directory
# data_url: zip archive of the repository's master branch.
data_url = "https://github.com/LIAAD/KeywordExtractor-Datasets/archive/refs/heads/master.zip"
# local_zip_path: file the archive is downloaded to (removed again by the clean-up step).
local_zip_path = "datasets.zip"
# unzip_dir: directory the downloaded archive is extracted into.
unzip_dir = "KeywordExtractor-Datasets"

In [3]:
# Step 1: Download the dataset repository
def download_datasets():
    """Download the repository archive from ``data_url`` to ``local_zip_path``.

    The response is streamed to disk in chunks so the whole archive is never
    held in memory, and a timeout keeps the call from hanging forever on a
    stalled connection.

    Raises:
        SystemExit: with exit code 1 when the server does not answer with
            HTTP 200 (the status code is printed first).
    """
    print("Downloading datasets...")
    with requests.get(data_url, stream=True, timeout=60) as response:
        if response.status_code != 200:
            print(f"Failed to download datasets. Status code: {response.status_code}")
            # `exit` is an interactive helper injected by `site`/IPython and is
            # not guaranteed to exist or to stop execution; raising SystemExit
            # directly is the dependable equivalent.
            raise SystemExit(1)
        with open(local_zip_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=1 << 20):
                file.write(chunk)
    print("Datasets downloaded successfully.")

In [4]:
# Step 2: Extract the downloaded zip file
def extract_datasets():
    """Unpack the archive stored at ``local_zip_path`` into ``unzip_dir``."""
    print("Extracting datasets...")
    archive = zipfile.ZipFile(local_zip_path, 'r')
    with archive:
        archive.extractall(path=unzip_dir)
    print("Datasets extracted successfully.")

In [5]:
# Step 3: Extract individual dataset zips
def extract_inner_zips():
    """Extract every ``*.zip`` found in the repository's ``datasets`` folder.

    Each archive ``<name>.zip`` is unpacked into a sibling directory
    ``<name>``. Archives whose target directory already exists are skipped,
    so the function can be re-run without extracting everything again.
    """
    suffix = ".zip"
    datasets_path = os.path.join(unzip_dir, "KeywordExtractor-Datasets-master/datasets")
    for file in os.listdir(datasets_path):
        if not file.endswith(suffix):
            continue
        zip_path = os.path.join(datasets_path, file)
        # Strip only the trailing extension: str.replace would also remove any
        # ".zip" occurring inside the name and produce a wrong directory.
        extract_path = os.path.join(datasets_path, file[:-len(suffix)])
        if os.path.exists(extract_path):
            continue
        print(f"Extracting {file}...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
        print(f"Extracted {file} to {extract_path}.")

In [6]:
# Step 4: Process a specific dataset and convert it into a usable format
def process_dataset(dataset_name):
    """Load one extracted dataset into a DataFrame of documents and keywords.

    Documents are read from ``docsutf8/*.txt`` and keywords from
    ``keys/*.key``. Every document is paired with the key file that shares its
    base name, so a missing or extra file cannot shift keywords onto the wrong
    document. Documents without a key file are skipped and reported.

    Args:
        dataset_name: Name of the dataset folder inside the extracted
            repository's ``datasets`` directory.

    Returns:
        A DataFrame with the columns ``document`` (stripped text) and
        ``keywords`` (the stripped key file split on commas), ordered by
        document file name, or ``None`` when the dataset directory or one of
        its ``docsutf8`` / ``keys`` folders does not exist.
    """
    dataset_path = os.path.join(unzip_dir, f"KeywordExtractor-Datasets-master/datasets/{dataset_name}/{dataset_name}")

    # Check if dataset directory exists
    if not os.path.exists(dataset_path):
        print(f"Dataset {dataset_name} not found.")
        return None

    docs_folder = os.path.join(dataset_path, "docsutf8")
    keys_folder = os.path.join(dataset_path, "keys")

    if not os.path.exists(docs_folder) or not os.path.exists(keys_folder):
        print(f"Required folders (docsutf8, keys) are missing in {dataset_name}.")
        return None

    # Load documents and keywords
    print(f"Processing dataset: {dataset_name}")
    documents = []
    keywords = []
    unmatched = []

    for doc_file in sorted(os.listdir(docs_folder)):
        if not doc_file.endswith(".txt"):
            continue
        stem = doc_file[:-len(".txt")]
        key_path = os.path.join(keys_folder, stem + ".key")
        if not os.path.isfile(key_path):
            unmatched.append(doc_file)
            continue

        with open(os.path.join(docs_folder, doc_file), "r", encoding="utf-8") as f:
            documents.append(f.read().strip())
        with open(key_path, "r", encoding="utf-8") as f:
            # Only commas are split here; keywords separated by newlines stay
            # together in one string and must be split on "\n" by the caller.
            keywords.append(f.read().strip().split(','))

    if unmatched:
        print(f"Skipped {len(unmatched)} document(s) without a matching .key file: {unmatched}")

    # Combine documents and keywords into a DataFrame
    data = pd.DataFrame({"document": documents, "keywords": keywords})
    return data

In [7]:
# Step 5: Save processed data to a CSV file
def save_to_csv(data, output_path):
    """Write ``data`` to ``output_path`` as a UTF-8 CSV file without the index column."""
    print(f"Saving processed data to {output_path}...")
    csv_options = {"index": False, "encoding": "utf-8"}
    data.to_csv(output_path, **csv_options)
    print("Data saved successfully.")

In [8]:
# Example: run the whole pipeline for the "Nguyen2007" dataset
download_datasets()
extract_datasets()
extract_inner_zips()

dataset_name = "Nguyen2007"
processed_data = process_dataset(dataset_name)

if processed_data is not None:
    # The CSV has the columns document,keywords; the keywords of a document
    # are separated either by commas or by "\n".
    output_csv = f"{dataset_name}_processed.csv"
    save_to_csv(processed_data, output_csv)

# Clean up the downloaded zip file (runs whether or not processing succeeded)
if os.path.exists(local_zip_path):
    os.remove(local_zip_path)
    print("Cleaned up temporary files.")

Downloading datasets...
Datasets downloaded successfully.
Extracting datasets...
Datasets extracted successfully.
Extracting Inspec.zip...
Extracted Inspec.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/datasets/Inspec.
Extracting PubMed.zip...
Extracted PubMed.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/datasets/PubMed.
Extracting citeulike180.zip...
Extracted citeulike180.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/datasets/citeulike180.
Extracting pak2018.zip...
Extracted pak2018.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/datasets/pak2018.
Extracting fao780.zip...
Extracted fao780.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/datasets/fao780.
Extracting wicc.zip...
Extracted wicc.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/datasets/wicc.
Extracting theses100.zip...
Extracted theses100.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/dataset

In [9]:
import ast

import pandas as pd

# Load the CSV file
file_path = "./Nguyen2007_processed.csv"  # Update with the correct path
data = pd.read_csv(file_path)

# Extract documents and keywords
documents = data['document']  # This is a pandas Series of text documents
keywords = data['keywords']  # This is a pandas Series of keyword strings

# Each CSV cell holds the string form of a Python list, e.g.
# "['Keyword1\nKeyword2\nKeyword3']". ast.literal_eval parses such literals
# without executing arbitrary code, which eval would do.
keywords = keywords.apply(ast.literal_eval)

# The list was produced by splitting the key file on commas, so it normally
# contains one long newline-separated string. Split every piece on "\n" and
# keep them all, so keywords after a comma are not lost, and drop blanks.
# Case is left unchanged.
keywords = keywords.apply(
    lambda parts: [kw.strip() for part in parts for kw in part.split('\n') if kw.strip()]
)
# Now each entry of `keywords` is a list with one string per keyword.

# KeyBERT

In [10]:
! pip install keybert

Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvi

In [11]:
import torch

# Report whether PyTorch can see a CUDA GPU. The device name is only queried
# when a GPU exists, because get_device_name raises on a machine without CUDA.
cuda_available = torch.cuda.is_available()
print(cuda_available)  # Should return True if GPU is enabled.
if cuda_available:
    print(torch.cuda.get_device_name(0))  # Prints the name of the GPU.
else:
    print("No CUDA device available; running on CPU.")

True
Tesla T4


In [12]:
import torch
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

# Compute embeddings on the GPU when one is available; otherwise fall back to
# the CPU so the notebook still runs (more slowly) without CUDA.
embedding_device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2', device=embedding_device)

# Alternative: kw_model = KeyBERT(model='all-mpnet-base-v2')
# The smaller "all-MiniLM-L6-v2" model used here is simpler and faster but, at
# least in theory, less accurate; so far no quality difference was observed.
kw_model = KeyBERT(model=model)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
from time import time

# Extract 20 candidate keywords per document, so that the preferred ones can
# later be selected according to the most frequent POS sequences.
# Passing the whole collection of documents at once is faster than calling
# extract_keywords on one document at a time.
start = time()
cosine_keywords = kw_model.extract_keywords(
    documents,
    keyphrase_ngram_range=(1,3),
    top_n=20,
)
end = time()
cosine_time = end-start
print(f"Total time to extract keywords from Nguyen2007 (cosine similarity, n_gram=(1,3), top_n=20): {cosine_time:.3f}")

# Example KeyBERT output: one list of (keyword, score) pairs per document
# keybert_output = [[('ramsey', 0.3427),
#   ('extractors', 0.3425),
#   ('subgraph', 0.3406),
#   ('derandomization', 0.3376),
#   ('entropy', 0.3142)],
#  [('retrieval', 0.5007),
#   ('relevance', 0.4068),
#   ('occurrence', 0.3692),
#   ('ranking', 0.3454),
#   ('idf', 0.333)],
#  [('crawled', 0.6136),
#   ('crawler', 0.6018),
#   ('crawlers', 0.5513),
#   ('crawling', 0.551),
#   ('crawl', 0.5431)]]

# For every document keep only the keyword text (no score), lowercased and stripped.
extracted_keywords_cosine = [
    [candidate[0].lower().strip() for candidate in doc_candidates]
    for doc_candidates in cosine_keywords
]

Total time to extract keywords from Nguyen2007 (cosine similarity, n_gram=(1,3), top_n=20): 391.698


In [14]:
# Preview the keyword lists of the first ten documents, then the number of documents.
print(extracted_keywords_cosine[:10])
print(len(extracted_keywords_cosine))

[['grid service discovery', 'service discovery grids', 'service discovery grid', 'discovery grid services', 'grid discovery services', 'discovery grid service', 'scalable grid service', 'services grids', 'specialized grid service', 'grid services', 'grid service providers', 'grid service', 'uddi grid service', 'grid information services', 'services grids provide', 'grid services architecture', 'grid services essential', 'discovery grid computing', 'service discovery supporting', 'service discovery solution'], ['deployment sensor networks', 'sensors observed deployment', 'sensor deployment strategy', 'detection sensor network', 'infrastructures deployment sensor', 'sensor network deployed', 'observe sensors deployed', 'sensor deployment', 'sensor networks performing', 'deployment strategies sensor', 'detection sensor networks', 'sensors deployment optimal', 'sensor network target', 'sensor network achieves', 'traffic traversal sensors', 'sensor network', 'sensor networks', 'sensors depl

# PART-OF-SPEECH TAGGING (of the document)


In [15]:
# Part-of-speech tagging with spaCy: a text is turned into a list of
# (word, pos) pairs = (str, str).

from functools import lru_cache

import spacy


@lru_cache(maxsize=1)
def _load_spacy_pipeline():
    """Load the "en_core_web_sm" pipeline once and reuse it on later calls."""
    return spacy.load("en_core_web_sm")


def pos_tag_document(text):
    """Tag every token of ``text`` with its part of speech.

    The spaCy pipeline is loaded on the first call only; reloading it for
    every document is pure overhead when the function is applied to a whole
    collection.

    Args:
        text: The document text to analyse.

    Returns:
        A list of ``(token.text, token.pos_)`` pairs, one per token, in
        document order.
    """
    doc = _load_spacy_pipeline()(text)

    # Each token object exposes token.text and token.pos_; keep them as pairs.
    return [(token.text, token.pos_) for token in doc]


In [None]:
# EXECUTE ON KeyBERT EXTRACTION

# Hold the documents and the ground-truth keywords in pandas DataFrames for
# structure and simplicity.
documents_df = pd.DataFrame({'text': documents})
keywords_df = pd.DataFrame({'text': keywords})

# Run pos_tag_document on every document (documents_df['text']) and store the
# resulting list of (word, pos) pairs in documents_df['pos'].
# Previously measured run time: [7m17s]
start = time()
documents_df['pos'] = documents_df['text'].apply(pos_tag_document)
end = time()
elapsed = end - start
print(f'part-of-speech tagging of the documents {elapsed}')

# PART-OF-SPEECH ASSOCIATION (for the keyword)

In [17]:
# DISCARDED VERSION

# It receives the POS tagging of a document and the list of keywords of that document, and returns
# the POS of each keyword, obtained by collecting the POS tags of all the occurrences of the keyword
# (as an exact sequence of words) in the text and keeping the most common one.

from collections import Counter
from typing import List, Tuple
import re

def pos_of_keywords_discarded(keywords, text_pos):
    """Return the most common POS sequence of each keyword within a tagged text.

    A keyword is tokenized into word runs plus the single characters
    ``-``, ``/``, ``(`` and ``)``, because the tagged text lists those as
    separate tokens. Every position where the lowercased text tokens equal the
    keyword tokens counts as one occurrence.

    Args:
        keywords: Iterable of keyword strings.
        text_pos: List of ``(word, pos)`` pairs describing the document.

    Returns:
        A list with one entry per keyword: the tuple of POS tags of its most
        common occurrence, or ``None`` when the keyword never occurs.
    """
    # Compare in lowercase so matching is case-insensitive.
    words = [token.lower() for token, _ in text_pos]
    tags = [tag for _, tag in text_pos]
    total = len(text_pos)

    key_pos = []
    for key in keywords:
        key_tokens = re.findall(r'\w+|[-/()]', key.lower())
        width = len(key_tokens)

        # Tuples are used because Counter needs hashable keys.
        sequences = Counter(
            tuple(tags[begin:begin + width])
            for begin in range(total - width + 1)
            if words[begin:begin + width] == key_tokens
        )

        key_pos.append(sequences.most_common(1)[0][0] if sequences else None)

    return key_pos

In [18]:
# NEW VERSION

# It receives the POS tagging of a document and the list of keywords of that document, and returns
# the POS of each keyword, obtained by collecting the POS tags of all the occurrences of each word
# of the keyword in the text and keeping the most common one.

from collections import Counter
from typing import List, Tuple
import re

def pos_of_keywords(keywords, text_pos):
    """Return, per keyword, the most common POS tag of each of its words.

    A keyword is lowercased and split on whitespace; every resulting word is
    looked up independently among the lowercased tokens of the tagged text.

    Args:
        keywords: Iterable of keyword strings.
        text_pos: List of ``(word, pos)`` pairs describing the document.

    Returns:
        A list with one tuple per keyword. The tuple holds, for each word of
        the keyword, the POS tag that word carries most often in the text, or
        ``None`` when the word never occurs. On a tie, the tag seen first in
        the text wins.
    """
    # Count the tags of every distinct token once, instead of rescanning the
    # whole document for each word of each keyword.
    tag_counts = {}
    for token, tag in text_pos:
        tag_counts.setdefault(token.lower(), Counter())[tag] += 1

    key_pos = []
    for key in keywords:
        word_pos = []
        for word in key.lower().split():
            counts = tag_counts.get(word)
            word_pos.append(counts.most_common(1)[0][0] if counts else None)
        key_pos.append(tuple(word_pos))

    return key_pos

In [19]:
# EXECUTE ON KeyBERT EXTRACTION

# For convenience, hold the extracted keywords in a pandas DataFrame
extracted_keywords_df = pd.DataFrame({'text':extracted_keywords_cosine})

# Run pos_of_keywords on every row, pairing each keyword list with the POS
# tagging of the document that has the same index.
# (Replace extracted_keywords_df with keywords_df to tag the ground truth.)
start = time()
extracted_keywords_df['pos'] = extracted_keywords_df.apply(
    lambda row: pos_of_keywords(row['text'], documents_df.at[row.name, 'pos']),
    axis=1,
)
end = time()
elapsed = end - start
# Tags are obtained by associating the keywords with their occurrences in the text.
print(f'part-of-speech tagging of the keywords {elapsed}')

# ATTENTION: many of the keywords extracted with KeyBERT are not found in the
# text. A solution is still needed (e.g. run POS tagging directly on the keywords).

part-of-speech tagging of the keywords 12.074630737304688


# KEYWORD SELECTION (based on frequent POS sequences)

In [20]:
# improve the selection in the other file
frequent_pos_results = [(('NOUN', 'NOUN'), 220), (None, 219), (('ADJ', 'NOUN'), 183), (('NOUN',), 132), (('PROPN', 'PROPN'), 55), (('ADJ', 'NOUN', 'NOUN'), 40), (('PROPN',), 27), (('VERB', 'NOUN'), 25), (('NOUN', 'VERB'), 22), (('PROPN', 'NOUN'), 19), (('NOUN', 'NOUN', 'NOUN'), 17), (('VERB',), 17), (('PROPN', 'PROPN', 'PROPN'), 14), (('NOUN', 'PUNCT', 'VERB', 'NOUN'), 12), (('ADJ', 'ADJ', 'NOUN'), 9), (('ADJ', 'PUNCT', 'NOUN', 'NOUN'), 9), (('VERB', 'NOUN', 'NOUN'), 7), (('ADJ', 'ADJ', 'ADJ', 'NOUN'), 6), (('ADJ',), 5), (('NOUN', 'PUNCT', 'NOUN'), 5), (('PROPN', 'VERB'), 5), (('PROPN', 'NOUN', 'NOUN'), 4), (('PROPN', 'PUNCT', 'PROPN', 'NOUN'), 4), (('ADJ', 'PROPN'), 4), (('NOUN', 'PUNCT', 'NOUN', 'NOUN'), 4), (('NOUN', 'PROPN', 'NOUN'), 4), (('NOUN', 'PROPN'), 3), (('PROPN', 'ADJ', 'NOUN'), 3), (('NOUN', 'VERB', 'NOUN'), 3), (('NOUN', 'PUNCT', 'ADJ', 'NOUN'), 3), (('NOUN', 'PUNCT', 'ADP', 'PUNCT', 'NOUN'), 3), (('NOUN', 'ADP', 'NOUN'), 3), (('PROPN', 'SYM', 'PROPN'), 3), (('ADP', 'PUNCT', 'NOUN', 'NOUN'), 2), (('ADJ', 'ADJ', 'PROPN'), 2), (('PROPN', 'PROPN', 'NOUN'), 2), (('ADJ', 'NOUN', 'VERB'), 2), (('ADJ', 'ADJ', 'NOUN', 'NOUN'), 2), (('NOUN', 'PUNCT', 'VERB'), 2), (('PROPN', 'ADP', 'PROPN'), 2), (('VERB', 'PROPN'), 2), (('VERB', 'PUNCT', 'NOUN', 'NOUN'), 2), (('VERB', 'ADJ', 'NOUN'), 2), (('NOUN', 'ADJ', 'NOUN'), 2), (('NOUN', 'PUNCT', 'VERB', 'NOUN', 'NOUN'), 2), (('ADV', 'ADJ', 'NOUN'), 2), (('ADJ', 'VERB'), 2), (('PROPN', 'PUNCT', 'NOUN'), 2), (('ADJ', 'PUNCT', 'NOUN', 'NOUN', 'NOUN'), 2), (('NOUN', 'ADP', 'DET', 'NOUN'), 1)]
# Keep only the POS sequences, dropping their occurrence counts.
frequent_pos = [pos_sequence for pos_sequence, _ in frequent_pos_results]

In [21]:
# A fixed number of keywords is wanted per document: when too few keywords have
# a desired POS sequence the result is filled with the other ones, and when too
# many do the result is truncated.
def filter_and_truncate_lists(keywords, frequent_pos, target_size=10):
    """Select up to ``target_size`` keywords, preferring frequent POS sequences.

    Args:
        keywords: Mapping-like row with a ``'pos'`` entry (POS sequence of
            each keyword) and a ``'text'`` entry (the keywords themselves).
        frequent_pos: Iterable of POS sequences regarded as frequent.
        target_size: Number of keywords wanted in the result.

    Returns:
        The keywords whose POS sequence is frequent, in their original order
        and cut to ``target_size``; when there are fewer than that, the
        remaining slots are filled with the other keywords in order.
    """
    key_pos = list(keywords['pos'])
    key_text = list(keywords['text'])

    # A set gives O(1) membership tests.
    frequent_set = set(frequent_pos)

    matches = []      # keywords whose POS sequence is frequent
    non_matches = []  # keywords whose POS sequence is not frequent
    for idx, pos in enumerate(key_pos):
        bucket = matches if pos in frequent_set else non_matches
        bucket.append(key_text[idx])

    remaining_slots = target_size - len(matches)
    if remaining_slots <= 0:
        # Enough (or too many) matches: cut down to the requested size.
        return matches[:target_size]

    # Too few matches: top up with keywords that did not match.
    return matches + non_matches[:remaining_slots]

In [22]:
# Example usage
if __name__ == "__main__":

    # Keep 10 keywords per document, preferring those whose POS sequence is frequent.
    selected_keywords = pd.DataFrame()
    selected_keywords['text'] = extracted_keywords_df.apply(
        lambda row: filter_and_truncate_lists(row, frequent_pos, 10),
        axis=1,
    )

    # Preview the first ten rows, then the number of documents.
    print(selected_keywords[:10])
    print(len(selected_keywords))

                                                text
0  [grid service discovery, service discovery gri...
1  [deployment sensor networks, sensors observed ...
2  [voip audio conferencing, distributed voip con...
3  [swarm worm details, host swarm worm, malware ...
4  [modular protocols based, building modular pro...
5  [availability data center, outages data center...
6  [metrics mobile object, runtimes mobile object...
7  [servers allocation strategies, distributed co...
8  [localizing sensor network, localization senso...
9  [throughput blast grid, sequence comparison bl...
243


In [23]:
# SAVE ON FILE FOR DELAYED EVALUATION
# Each file is written exactly once. The options below spell out the format
# explicitly, so no separate "basic" write of the same file is needed first.
csv_options = dict(
    index=False,       # Don't save row numbers
    sep=',',           # Use comma as separator
    encoding='utf-8',  # Specify encoding for special characters
    decimal='.',       # Use period for decimal points
)

# Keywords selected from the KeyBERT extraction
selected_keywords.to_csv('extension2_extraction.csv', **csv_options)

# Ground-truth keywords
keywords_df.to_csv('extension2_groundtruth.csv', **csv_options)

In [24]:
# Mount Google Drive at /content/drive; needs the google.colab package (a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful