<a href="https://colab.research.google.com/github/s319848/DNLP-project-2025/blob/main/notebooks/Second-extension/Frequent_POS_Sequence_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DATABASE EXTRACTION and PROCESSING

In [18]:
import os
import requests
import zipfile
import time
import numpy as np
import pandas as pd

In [19]:
# Define the URL for the dataset repository and the local storage directory
data_url = "https://github.com/LIAAD/KeywordExtractor-Datasets/archive/refs/heads/master.zip"
local_zip_path = "datasets.zip"
unzip_dir = "KeywordExtractor-Datasets"

In [20]:
# Step 1: Download the dataset repository
def download_datasets():
    print("Downloading datasets...")
    response = requests.get(data_url)
    if response.status_code == 200:
        with open(local_zip_path, "wb") as file:
            file.write(response.content)
        print("Datasets downloaded successfully.")
    else:
        print(f"Failed to download datasets. Status code: {response.status_code}")
        exit(1)

In [21]:
# Step 2: Extract the downloaded zip file
def extract_datasets():
    print("Extracting datasets...")
    with zipfile.ZipFile(local_zip_path, 'r') as zip_ref:
        zip_ref.extractall(unzip_dir)
    print("Datasets extracted successfully.")

In [22]:
# Step 3: Extract individual dataset zips
def extract_inner_zips():
    datasets_path = os.path.join(unzip_dir, "KeywordExtractor-Datasets-master/datasets")
    for file in os.listdir(datasets_path):
        if file.endswith(".zip"):
            zip_path = os.path.join(datasets_path, file)
            extract_path = os.path.join(datasets_path, file.replace(".zip", ""))
            if not os.path.exists(extract_path):
                print(f"Extracting {file}...")
                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                    zip_ref.extractall(extract_path)
                print(f"Extracted {file} to {extract_path}.")

In [23]:
# PROCESSING WITHOUT SAMPLING
# Step 4: Process a specific dataset and convert it into a usable format
def process_dataset(dataset_name):
    dataset_path = os.path.join(unzip_dir, f"KeywordExtractor-Datasets-master/datasets/{dataset_name}/{dataset_name}")

    # Check if dataset directory exists
    if not os.path.exists(dataset_path):
        print(f"Dataset {dataset_name} not found.")
        return None

    docs_folder = os.path.join(dataset_path, "docsutf8")
    keys_folder = os.path.join(dataset_path, "keys")

    if not os.path.exists(docs_folder) or not os.path.exists(keys_folder):
        print(f"Required folders (docsutf8, keys) are missing in {dataset_name}.")
        return None

    # Load documents and keywords
    print(f"Processing dataset: {dataset_name}")
    documents = []
    keywords = []

    for doc_file in sorted(os.listdir(docs_folder)):
        doc_path = os.path.join(docs_folder, doc_file)
        if doc_file.endswith(".txt"):
            with open(doc_path, "r", encoding="utf-8") as f:
                documents.append(f.read().strip())

    for key_file in sorted(os.listdir(keys_folder)):
        key_path = os.path.join(keys_folder, key_file)
        if key_file.endswith(".key"):
            with open(key_path, "r", encoding="utf-8") as f:
                keywords.append(f.read().strip().split(','))

    # Combine documents and keywords into a DataFrame
    data = pd.DataFrame({"document": documents, "keywords": keywords})
    return data

In [24]:
# PROCESSING WITH SAMPLING

def process_dataset(dataset_name, sample_fraction=0.2, random_seed=64):
    dataset_path = os.path.join(unzip_dir, f"KeywordExtractor-Datasets-master/datasets/{dataset_name}/{dataset_name}")

    # Check if dataset directory exists
    if not os.path.exists(dataset_path):
        print(f"Dataset {dataset_name} not found.")
        return None

    docs_folder = os.path.join(dataset_path, "docsutf8")
    keys_folder = os.path.join(dataset_path, "keys")

    if not os.path.exists(docs_folder) or not os.path.exists(keys_folder):
        print(f"Required folders (docsutf8, keys) are missing in {dataset_name}.")
        return None

    # Get list of all document files
    doc_files = sorted([f for f in os.listdir(docs_folder) if f.endswith(".txt")])

    # Set random seed and sample file indices
    np.random.seed(random_seed)
    sample_size = int(len(doc_files) * sample_fraction)
    sampled_indices = np.random.choice(len(doc_files), size=sample_size, replace=False)

    print(f"Original dataset size: {len(doc_files)}")
    print(f"Sampled dataset size: {sample_size}")

    # Load only sampled documents and their corresponding keywords
    documents = []
    keywords = []

    for idx in sorted(sampled_indices):
        # Get document
        doc_file = doc_files[idx]
        doc_path = os.path.join(docs_folder, doc_file)
        with open(doc_path, "r", encoding="utf-8") as f:
            documents.append(f.read().strip())

        # Get corresponding keywords
        key_file = doc_file.replace('.txt', '.key')
        key_path = os.path.join(keys_folder, key_file)
        with open(key_path, "r", encoding="utf-8") as f:
            keywords.append(f.read().strip().split(','))

    # Combine documents and keywords into a DataFrame
    data = pd.DataFrame({"document": documents, "keywords": keywords})
    return data

In [25]:
# Step 4: Save processed data to a CSV file
def save_to_csv(data, output_path):
    print(f"Saving processed data to {output_path}...")
    data.to_csv(output_path, index=False, encoding="utf-8")
    print("Data saved successfully.")

In [26]:
# Main execution
if __name__ == "__main__":
    download_datasets()
    extract_datasets()
    extract_inner_zips()

    # Example: Process the "Krapivin2009" dataset
    dataset_name = "Krapivin2009"
    processed_data = process_dataset(dataset_name)

    if processed_data is not None:
        output_csv = f"{dataset_name}_processed.csv"
        save_to_csv(processed_data, output_csv)
        #in the format documents,keywords
        #keywords are separated either by commas or \n

    # Clean up downloaded zip file
    if os.path.exists(local_zip_path):
        os.remove(local_zip_path)
        print("Cleaned up temporary files.")

Downloading datasets...
Datasets downloaded successfully.
Extracting datasets...
Datasets extracted successfully.
Original dataset size: 2304
Sampled dataset size: 460
Saving processed data to Krapivin2009_processed.csv...
Data saved successfully.
Cleaned up temporary files.


In [27]:
# Load the CSV file
file_path = "./Krapivin2009_processed.csv"
data = pd.read_csv(file_path)

"""
# Extract documents and keywords
documents = data['document']  # This is a pandas Series of text documents
keywords = data['keywords']  # This is a pandas Series of keyword strings
"""

# Extract documents and keywords
# save them in pandas dataframe in the 'text' column
documents = pd.DataFrame({'text': data['document']})
keywords = pd.DataFrame({'text': data['keywords']})

# Convert keywords from string representation to Python lists
keywords['text'] = keywords['text'].apply(lambda x: eval(x))  # Use `eval` to parse strings into lists if necessary
#Now it is a list containing only one long string in the format ['Keyword1\nKeyword2\nKeyword3....']
keywords['text'] = keywords['text'].apply(lambda x: x[0].split('\n'))
#now keywords is a list of lowercase strings

# PART-OF-SPEECH TAGGING [documents]

In [28]:
import spacy

In [29]:
# the function compiute part-of-speech tagging using spacy on a text and return a list of pair (word, pos)

def pos_tag_document(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    # Store and return tagged words with their parts of speech
    # each token object has a token.texta and a token.pos_ attribute here are saved in a list of pairs
    tagged_words = [(token.text, token.pos_) for token in doc]

    return tagged_words

In [30]:
# main

start = time.time()
documents['pos'] = documents['text'].apply(pos_tag_document)
end = time.time()
print(f'part-of-speech tagging of the documents {end-start}')

"""Time to tag documents: 909.2577395439148"""



part-of-speech tagging of the documents 954.178055524826


'Time to tag documents: 909.2577395439148'

# PART-OF-SPEECH ASSOCIATION [keywords]

In [31]:
from collections import Counter
from typing import List, Tuple
import re

In [32]:
nlp = spacy.load("en_core_web_sm")  # Load Spacy model for POS tagging and lemmatization

def pos_of_keywords_method_one(keywords, text_pos, window_size=5):
    """
    Identify the most common Part-of-Speech (POS) tag for each word in a multi-word keyword
    by analyzing its occurrences in a given text. The function checks if all lemmatized parts of a
    keyword appear within a dynamically adjusted window in the text and assigns the most
    frequent POS tags accordingly.
    If a keyword or part of it is not found in a relevant context, the corresponding POS
    position is left empty (None).

    Args:
    - keywords (list of str): The keywords to find in the text.
    - text_pos (list of tuples): A list of (word, POS tag) tuples.
    - window_size (int): The default context window size around words.

    Returns:
    - list of tuples: Each tuple contains the most frequent POS tag(s)
                      for each word in a keyword. If no match is found, returns None.

    """
    key_pos = []

    # extract words and their POS tags separately, converting words to lowercase
    text_tokens = [token.lower() for token, _ in text_pos]
    text_pos_tags = [pos for _, pos in text_pos]

    # Precompute lemmas for the entire text (word-by-word approach)
    text_lemmas = {word: nlp(word)[0].lemma_ for word in set(text_tokens)}
    lemmatized_text = [text_lemmas[word] for word in text_tokens] # lemmatized version of text tokens

    for key in keywords:
        key_tokens = re.findall(r'\w+|[-/()]', key.lower())  # tokenize the keyword, keeping hyphens (-) and slashes (/) as separate tokens
        key_lemmas = [text_lemmas.get(word, word) for word in key_tokens]  # lemmatize keyword tokens

        dynamic_window_size = max(window_size, len(key_tokens) * 2) # adjust window size based on keyword length
        keyword_word_pos = [] #store POS tags for each word in the keyword

        # Store word positions for faster lookup
        word_positions = {word: [] for word in key_tokens}
        for i, word in enumerate(text_tokens):
            if word in word_positions:
                word_positions[word].append(i)

        # Search for each word in the keyword separately
        for token in key_tokens:
            token_pos_counts = Counter()
            for i in word_positions.get(token, []):  # direct lookup of positions
                # Look at words in the surrounding context window
                start, end = max(0, i - dynamic_window_size), min(len(text_tokens), i + dynamic_window_size + 1)

                context_lemmas = set(lemmatized_text[start:end])  # convert the context words to their lemmas for matching

                # Get remaining lemmas of the keyword (excluding the current word)
                current_lemma = text_lemmas.get(token, token)
                remaining_keyword_lemmas = [kw_lemma for kw_lemma in key_lemmas if kw_lemma != current_lemma]

                # Check if all other keyword lemmas exist in the context window
                if all(kw_lemma in context_lemmas for kw_lemma in remaining_keyword_lemmas):
                    token_pos_counts[text_pos_tags[i]] += 1  # record the POS tag for this token

            # Determine the most frequent POS tag for the token
            if token_pos_counts:
                most_common_pos = token_pos_counts.most_common(1)[0][0]
                keyword_word_pos.append(most_common_pos)
            else:
                keyword_word_pos.append(None) #no match found

        key_pos.append(tuple(keyword_word_pos))


    return key_pos



In [33]:
def pos_of_keywords_method_two(keywords, text_pos):

    """
    Identify the most common POS tag for each word in a multi-word keyword
    based on its occurrences in a given text with POS tagging.

    Args:
        - keywords (list of str): The keywords to find in the text.
        - text_pos (list of tuples): A list of (word, POS tag) tuples.

    Returns:
        - list of tuples: Each tuple contains the most frequent POS tag(s)
          for each word in a keyword. If no match is found, returns None.
    """

    key_pos = []

    # extract lowercase words and POS tags from text for comparison
    text = [text.lower() for text, _ in text_pos]
    pos = [pos for _, pos in text_pos]


    for key in keywords:
        key_tokens = key.lower().split()
        word_pos = []

        for word in key_tokens: #process each word separately

            # Find all occurrences of the word
            occurrences = []
            for i in range(len(text)):
              if text[i] == word:
                occurrences.append(pos[i]) #store corresponding POS tag

            # Determine the most common POS tag for this word
            if occurrences:
                most_common_pos = Counter(occurrences).most_common(1)[0][0]
                word_pos.append(most_common_pos)
            else:
                word_pos.append(None)

        key_pos.append(tuple(word_pos))

    return(key_pos)

In [34]:
#METHOD ONE
# execute pos_of_keywords on every row of the dataframes using method one

start = time.time()
keywords['pos_one'] = keywords.apply(lambda x: pos_of_keywords_method_one(x['text'], documents['pos'][x.name]), axis=1)
end = time.time()
print(f'part-of-speech tagging with method one of the keywords {end-start}')
"""Time to execute method one: 2768s"""

part-of-speech tagging with method one of the keywords 2768.3002519607544


'Time to execute method one: '

In [35]:
#METHOD TWO
# execute pos_of_keywords on every row of the dataframes using method two

start = time.time()
keywords['pos_two'] = keywords.apply(lambda x: pos_of_keywords_method_two(x['text'], documents['pos'][x.name]), axis=1)
end = time.time()
print(f'part-of-speech tagging with method two of the keywords {end-start}')
"""Time to execute method two: 3s"""

part-of-speech tagging with method two of the keywords 3.185098171234131


'Time to execute method two: '

In [36]:
def most_common_pos_sequences(pos_list, n=50):
  most_common_pos = Counter(pos_list).most_common(n)
  #discard those with None elements and those with lenght > 3
  return [mcs[0] for mcs in most_common_pos if all(item is not None for item in mcs[0]) and len(mcs[0]) <= 3]


In [37]:
# have all the keyword pos sequence in a single list
flattened_method_one = tuple(keywords['pos_one'].explode().tolist())

accepted_pos_sequences_method_one = most_common_pos_sequences(flattened_method_one)
print(len(accepted_pos_sequences_method_one))
accepted_pos_sequences_method_one

30


[('NOUN', 'NOUN'),
 ('ADJ', 'NOUN'),
 ('NOUN',),
 ('PROPN', 'NOUN'),
 ('PROPN', 'PROPN'),
 ('ADJ', 'NOUN', 'NOUN'),
 ('PROPN',),
 ('VERB', 'NOUN'),
 ('NOUN', 'VERB'),
 ('NOUN', 'NOUN', 'NOUN'),
 ('VERB',),
 ('ADJ', 'ADJ', 'NOUN'),
 ('PROPN', 'PROPN', 'PROPN'),
 ('NOUN', 'PROPN'),
 ('VERB', 'NOUN', 'NOUN'),
 ('ADJ', 'PROPN'),
 ('ADJ',),
 ('NOUN', 'ADJ', 'NOUN'),
 ('NOUN', 'ADP', 'NOUN'),
 ('NOUN', 'PUNCT', 'NOUN'),
 ('PROPN', 'NOUN', 'NOUN'),
 ('ADJ', 'NOUN', 'VERB'),
 ('PROPN', 'PROPN', 'NOUN'),
 ('ADJ', 'PROPN', 'NOUN'),
 ('NOUN', 'PUNCT', 'VERB'),
 ('PROPN', 'VERB'),
 ('PROPN', 'ADJ', 'NOUN'),
 ('NOUN', 'PROPN', 'NOUN'),
 ('ADJ', 'VERB'),
 ('PROPN', 'ADP', 'PROPN')]

In [38]:
flattened_method_two = tuple(keywords['pos_two'].explode().tolist())

accepted_pos_sequences_method_two = most_common_pos_sequences(flattened_method_two)
accepted_pos_sequences_method_two

[('NOUN', 'NOUN'),
 ('ADJ', 'NOUN'),
 ('NOUN',),
 ('ADJ', 'NOUN', 'NOUN'),
 ('PROPN', 'NOUN'),
 ('PROPN',),
 ('VERB', 'NOUN'),
 ('NOUN', 'VERB'),
 ('NOUN', 'NOUN', 'NOUN'),
 ('NOUN', 'PROPN'),
 ('VERB',),
 ('ADJ', 'ADJ', 'NOUN'),
 ('PROPN', 'PROPN'),
 ('ADJ', 'PROPN'),
 ('VERB', 'NOUN', 'NOUN'),
 ('NOUN', 'ADP', 'NOUN'),
 ('ADJ',),
 ('NOUN', 'ADJ', 'NOUN'),
 ('ADJ', 'VERB'),
 ('PROPN', 'NOUN', 'NOUN'),
 ('ADJ', 'PROPN', 'NOUN'),
 ('PROPN', 'ADJ', 'NOUN'),
 ('NOUN', 'VERB', 'NOUN'),
 ('ADJ', 'NOUN', 'VERB'),
 ('NOUN', 'PROPN', 'NOUN'),
 ('PROPN', 'VERB'),
 ('VERB', 'PROPN'),
 ('VERB', 'ADJ', 'NOUN'),
 ('ADJ', 'ADJ'),
 ('NOUN', 'ADJ'),
 ('PROPN', 'NOUN', 'PROPN'),
 ('X', 'X', 'NOUN'),
 ('NOUN', 'NOUN', 'VERB')]

In [41]:
selection1 = accepted_pos_sequences_method_one[:20]
print(selection1)

selection2 = accepted_pos_sequences_method_two[:20]
print(selection2)

[('NOUN', 'NOUN'), ('ADJ', 'NOUN'), ('NOUN',), ('PROPN', 'NOUN'), ('PROPN', 'PROPN'), ('ADJ', 'NOUN', 'NOUN'), ('PROPN',), ('VERB', 'NOUN'), ('NOUN', 'VERB'), ('NOUN', 'NOUN', 'NOUN'), ('VERB',), ('ADJ', 'ADJ', 'NOUN'), ('PROPN', 'PROPN', 'PROPN'), ('NOUN', 'PROPN'), ('VERB', 'NOUN', 'NOUN'), ('ADJ', 'PROPN'), ('ADJ',), ('NOUN', 'ADJ', 'NOUN'), ('NOUN', 'ADP', 'NOUN'), ('NOUN', 'PUNCT', 'NOUN')]
[('NOUN', 'NOUN'), ('ADJ', 'NOUN'), ('NOUN',), ('ADJ', 'NOUN', 'NOUN'), ('PROPN', 'NOUN'), ('PROPN',), ('VERB', 'NOUN'), ('NOUN', 'VERB'), ('NOUN', 'NOUN', 'NOUN'), ('NOUN', 'PROPN'), ('VERB',), ('ADJ', 'ADJ', 'NOUN'), ('PROPN', 'PROPN'), ('ADJ', 'PROPN'), ('VERB', 'NOUN', 'NOUN'), ('NOUN', 'ADP', 'NOUN'), ('ADJ',), ('NOUN', 'ADJ', 'NOUN'), ('ADJ', 'VERB'), ('PROPN', 'NOUN', 'NOUN')]


#Save combinations

In [42]:
import csv
from google.colab import drive
drive.mount('/content/drive')
from google.colab import files

Mounted at /content/drive


In [43]:
output_dir_one = "keywords_combinations_method_one.csv"
with open(output_dir_one, "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerows(accepted_pos_sequences_method_one)

%cd  /content/

files.download(output_dir_one)

/content


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [44]:
output_dir_two = "keywords_combinations_method_two.csv"
with open(output_dir_two, "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerows(accepted_pos_sequences_method_two)

%cd  /content/

files.download(output_dir_two)

/content


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>