# DATABASE EXTRACTION and PROCESSING

In [28]:
import os
import requests
import zipfile
import time
import numpy as np
import pandas as pd

In [1]:
# Define the URL for the dataset repository and the local storage directory
data_url = "https://github.com/LIAAD/KeywordExtractor-Datasets/archive/refs/heads/master.zip"
local_zip_path = "datasets.zip"
unzip_dir = "KeywordExtractor-Datasets"

In [6]:
# Step 1: Download the dataset repository
def download_datasets():
    print("Downloading datasets...")
    response = requests.get(data_url)
    if response.status_code == 200:
        with open(local_zip_path, "wb") as file:
            file.write(response.content)
        print("Datasets downloaded successfully.")
    else:
        print(f"Failed to download datasets. Status code: {response.status_code}")
        exit(1)

In [3]:
# Step 2: Extract the downloaded zip file
def extract_datasets():
    print("Extracting datasets...")
    with zipfile.ZipFile(local_zip_path, 'r') as zip_ref:
        zip_ref.extractall(unzip_dir)
    print("Datasets extracted successfully.")

In [7]:
# Step 3: Extract individual dataset zips
def extract_inner_zips():
    datasets_path = os.path.join(unzip_dir, "KeywordExtractor-Datasets-master/datasets")
    for file in os.listdir(datasets_path):
        if file.endswith(".zip"):
            zip_path = os.path.join(datasets_path, file)
            extract_path = os.path.join(datasets_path, file.replace(".zip", ""))
            if not os.path.exists(extract_path):
                print(f"Extracting {file}...")
                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                    zip_ref.extractall(extract_path)
                print(f"Extracted {file} to {extract_path}.")

In [None]:
# PROCESSING WITHOUT SAMPLING
# Step 4: Process a specific dataset and convert it into a usable format
def process_dataset(dataset_name):
    dataset_path = os.path.join(unzip_dir, f"KeywordExtractor-Datasets-master/datasets/{dataset_name}/{dataset_name}")

    # Check if dataset directory exists
    if not os.path.exists(dataset_path):
        print(f"Dataset {dataset_name} not found.")
        return None

    docs_folder = os.path.join(dataset_path, "docsutf8")
    keys_folder = os.path.join(dataset_path, "keys")

    if not os.path.exists(docs_folder) or not os.path.exists(keys_folder):
        print(f"Required folders (docsutf8, keys) are missing in {dataset_name}.")
        return None

    # Load documents and keywords
    print(f"Processing dataset: {dataset_name}")
    documents = []
    keywords = []

    for doc_file in sorted(os.listdir(docs_folder)):
        doc_path = os.path.join(docs_folder, doc_file)
        if doc_file.endswith(".txt"):
            with open(doc_path, "r", encoding="utf-8") as f:
                documents.append(f.read().strip())

    for key_file in sorted(os.listdir(keys_folder)):
        key_path = os.path.join(keys_folder, key_file)
        if key_file.endswith(".key"):
            with open(key_path, "r", encoding="utf-8") as f:
                keywords.append(f.read().strip().split(','))

    # Combine documents and keywords into a DataFrame
    data = pd.DataFrame({"document": documents, "keywords": keywords})
    return data

In [13]:
# PROCESSING WITH SAMPLING

def process_dataset(dataset_name, sample_fraction=0.1, random_seed=64):
    dataset_path = os.path.join(unzip_dir, f"KeywordExtractor-Datasets-master/datasets/{dataset_name}/{dataset_name}")

    # Check if dataset directory exists
    if not os.path.exists(dataset_path):
        print(f"Dataset {dataset_name} not found.")
        return None

    docs_folder = os.path.join(dataset_path, "docsutf8")
    keys_folder = os.path.join(dataset_path, "keys")

    if not os.path.exists(docs_folder) or not os.path.exists(keys_folder):
        print(f"Required folders (docsutf8, keys) are missing in {dataset_name}.")
        return None

    # Get list of all document files
    doc_files = sorted([f for f in os.listdir(docs_folder) if f.endswith(".txt")])

    # Set random seed and sample file indices
    np.random.seed(random_seed)
    sample_size = int(len(doc_files) * sample_fraction)
    sampled_indices = np.random.choice(len(doc_files), size=sample_size, replace=False)

    print(f"Original dataset size: {len(doc_files)}")
    print(f"Sampled dataset size: {sample_size}")

    # Load only sampled documents and their corresponding keywords
    documents = []
    keywords = []

    for idx in sorted(sampled_indices):
        # Get document
        doc_file = doc_files[idx]
        doc_path = os.path.join(docs_folder, doc_file)
        with open(doc_path, "r", encoding="utf-8") as f:
            documents.append(f.read().strip())

        # Get corresponding keywords
        key_file = doc_file.replace('.txt', '.key')
        key_path = os.path.join(keys_folder, key_file)
        with open(key_path, "r", encoding="utf-8") as f:
            keywords.append(f.read().strip().split(','))

    # Combine documents and keywords into a DataFrame
    data = pd.DataFrame({"document": documents, "keywords": keywords})
    return data

In [10]:
# Step 4: Save processed data to a CSV file
def save_to_csv(data, output_path):
    print(f"Saving processed data to {output_path}...")
    data.to_csv(output_path, index=False, encoding="utf-8")
    print("Data saved successfully.")

In [14]:
# Main execution
if __name__ == "__main__":
    download_datasets()
    extract_datasets()
    extract_inner_zips()

    # Example: Process the "Krapivin2009" dataset
    dataset_name = "Krapivin2009"
    processed_data = process_dataset(dataset_name)

    if processed_data is not None:
        output_csv = f"{dataset_name}_processed.csv"
        save_to_csv(processed_data, output_csv)
        #in the format documents,keywords
        #keywords are separated either by commas or \n

    # Clean up downloaded zip file
    if os.path.exists(local_zip_path):
        os.remove(local_zip_path)
        print("Cleaned up temporary files.")

Downloading datasets...
Datasets downloaded successfully.
Extracting datasets...
Datasets extracted successfully.
Original dataset size: 2304
Sampled dataset size: 230
Saving processed data to Krapivin2009_processed.csv...
Data saved successfully.
Cleaned up temporary files.


In [20]:
# Load the CSV file
file_path = "./Krapivin2009_processed.csv"  # Update with the correct path
data = pd.read_csv(file_path)

"""
# Extract documents and keywords
documents = data['document']  # This is a pandas Series of text documents
keywords = data['keywords']  # This is a pandas Series of keyword strings
"""

# Extract documents and keywords
# save them in pandas dataframe in the 'text' column
documents = pd.DataFrame({'text': data['document']})
keywords = pd.DataFrame({'text': data['keywords']})

# Convert keywords from string representation to Python lists
keywords['text'] = keywords['text'].apply(lambda x: eval(x))  # Use `eval` to parse strings into lists if necessary
#Now it is a list containing only one long string in the format ['Keyword1\nKeyword2\nKeyword3....']
# keywords = keywords.apply(lambda x: x[0].lower()) #only one element in the list
#Now x is a string lowercase
keywords['text'] = keywords['text'].apply(lambda x: x[0].split('\n'))
#now keywords is a list of lowercase strings

# PART-OF-SPEECH TAGGING [documents]

In [23]:
import spacy

In [22]:
# the function compiute part-of-speech tagging using spacy on a text and return a list of pair (word, pos)

def pos_tag_document(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    # Store and return tagged words with their parts of speech
    # each token object has a token.texta and a token.pos_ attribute here are saved in a list of pairs
    tagged_words = [(token.text, token.pos_) for token in doc]

    return tagged_words

In [29]:
# main

start = time.time()
documents['pos'] = documents['text'].apply(pos_tag_document)
end = time.time()
print(f'part-of-speech tagging of the documents {end-start}')

part-of-speech tagging of the documents 596.9443778991699


# PART-OF-SPEECH ASSOCIATION [keywords]

In [37]:
from collections import Counter
from typing import List, Tuple
import re

In [38]:
# it recive the pos tagging of a document and the list of keyword of that document and return
# the pos of the keywords obtained by collecting the pos taggs of all the occurrence of the keyword in the
# text and keeping the most common

def pos_of_keywords(keywords, text_pos):

  key_pos = []
  # lower the word of the text to have better comparison
  text = [text.lower() for text, _ in text_pos]
  pos = [pos for _, pos in text_pos]


  for key in keywords:
        # PREVIOUS : Split the keywords in single words and lower them for better comparison
        # key_tokens = key.lower().split()

        # using findall regular exression I split the string and consider "-" and "/" as a string
        # as well sonce the POS includes them
        # - and / because are the recurring one in the keyword based on a sample of the kewords not found
        key_tokens = re.findall(r'\w+|[-/()]', key.lower())

        # Find all occurrences of the keyword
        occurrences = []
        for i in range(len(text_pos) - len(key_tokens) + 1):
            if text[i:i+len(key_tokens)] == key_tokens:
                # Extract POS sequence for this occurrence
                pos_sequence = [pos[j] for j in range(i, i+len(key_tokens))]
                occurrences.append(tuple(pos_sequence)) # tuple because counter require hushable type

        # If occurrences found, use the most common POS sequence
        if occurrences:
            # Counter collect occurrences and # of occurrence ((('ADJ','ADJ'), 3), ('NOUN', 'ADJ'), 1)
            # most_common(n) selects a list of the n most common occurrences
            most_common_pos = Counter(occurrences).most_common(1)[0][0]
            key_pos.append(most_common_pos)
        else:
            key_pos.append(None)

  return(key_pos)

In [40]:
# execute pos_of_keywords on every row of the dataframes

start = time.time()
keywords['pos'] = keywords.apply(lambda x: pos_of_keywords(x['text'], documents['pos'][x.name]), axis=1)
end = time.time()
print(f'part-of-speech tagging of the keywords {end-start}') # by associaton of the keywords to their occurrences in the text

part-of-speech tagging of the keywords 3.2202203273773193


In [None]:
# TO DO : decide how many pos sequence to select

def most_common_pos_sequences(pos_list, n=50):
  most_common_pos = Counter(pos_list).most_common(n)
  # print(most_common_pos)
  # print(len(most_common_pos))
  return [mcs[0] for mcs in most_common_pos if mcs[0] is not None]

# have all the keyword pos sequence in a single list
flattened = tuple(keywords['pos'].explode().tolist())

accepted_pos_sequences = most_common_pos_sequences(flattened)
print(accepted_pos_sequences)

# Forse poche poarole chiave
# Da eliminare quelle da 4
# Tutte ragionevoli e ben distribuite da 50