<a href="https://colab.research.google.com/github/s319848/DNLP-project-2025/blob/main/notebooks/First-extension/extension1_fine_tune_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install keybert transformers datasets spacy bert-score rouge


Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.wh

In [2]:
import torch
# Report whether a CUDA device is visible, and its name when there is one.
# get_device_name(0) raises when no GPU is available, so it is only called
# after the availability check succeeds.
cuda_available = torch.cuda.is_available()
print(cuda_available)  # Should return True if GPU is enabled.
if cuda_available:
    print(torch.cuda.get_device_name(0))  # Prints the name of the GPU.
else:
    print("No CUDA device detected.")

True
Tesla T4


In [3]:
import random
import os
from datasets import Dataset
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from keybert import KeyBERT
import torch
from tqdm import tqdm
import requests
import zipfile
from time import time
import wandb
# Initialise Weights & Biases in disabled mode.
# NOTE(review): presumably this keeps the training run below from logging to,
# or asking to authenticate with, W&B — confirm.
wandb.init(mode='disabled')

In [4]:
def prepare_dataset(dataset_name, sample_fraction, seed=None):
    """
    Download the LIAAD keyword-extraction corpora and build labelled
    (document, keyword) pairs for one of them.

    Each sampled document is paired with each of its gold keywords (label 1.0)
    and with up to the same number of keywords drawn from up to three other
    sampled documents (label 0.0).

    Args:
        dataset_name: Name of the dataset folder inside the repository
            archive (e.g. "Krapivin2009").
        sample_fraction: Fraction of the dataset's documents to sample.
            At least one document is used and never more than are available.
        seed: Optional seed for the document and negative-keyword sampling.
            When None, the global ``random`` module state is used.

    Returns:
        A ``datasets.Dataset`` with columns ``text1`` (document text),
        ``text2`` (keyword) and ``label`` (1.0 or 0.0), or None when the
        dataset folder, its ``docsutf8``/``keys`` sub-folders, or any ``.txt``
        document cannot be found.

    Raises:
        RuntimeError: If the archive download does not return HTTP 200.
    """
    data_url = "https://github.com/LIAAD/KeywordExtractor-Datasets/archive/refs/heads/master.zip"
    local_zip_path = "datasets.zip"
    unzip_dir = "KeywordExtractor-Datasets"

    # A private generator makes sampling reproducible when a seed is given;
    # otherwise fall back to the module-level functions so an earlier
    # random.seed(...) call keeps its effect.
    rng = random.Random(seed) if seed is not None else random

    def read_keywords(path):
        """Return the non-empty, stripped lines of a .key file."""
        with open(path, "r", encoding="utf-8") as handle:
            return [line.strip() for line in handle.read().strip().split("\n") if line.strip()]

    # Download the dataset repository
    def download_datasets():
        # Reuse an archive left by a previous run so re-running the notebook
        # does not fetch the whole repository again.
        if os.path.exists(local_zip_path) and zipfile.is_zipfile(local_zip_path):
            print("Datasets archive already present, skipping download.")
            return
        print("Downloading datasets...")
        response = requests.get(data_url, timeout=60)
        if response.status_code != 200:
            # Raise instead of exit(1): exiting would shut down the notebook kernel.
            raise RuntimeError(f"Failed to download datasets. Status code: {response.status_code}")
        with open(local_zip_path, "wb") as file:
            file.write(response.content)
        print("Datasets downloaded successfully.")

    # Extract the downloaded zip file
    def extract_datasets():
        print("Extracting datasets...")
        with zipfile.ZipFile(local_zip_path, 'r') as zip_ref:
            zip_ref.extractall(unzip_dir)
        print("Datasets extracted successfully.")

    # Extract individual dataset zips
    def extract_inner_zips():
        datasets_path = os.path.join(unzip_dir, "KeywordExtractor-Datasets-master/datasets")
        for file in os.listdir(datasets_path):
            if file.endswith(".zip"):
                zip_path = os.path.join(datasets_path, file)
                extract_path = os.path.join(datasets_path, file.replace(".zip", ""))
                # Skip datasets already unpacked by an earlier run.
                if not os.path.exists(extract_path):
                    print(f"Extracting {file}...")
                    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                        zip_ref.extractall(extract_path)
                    print(f"Extracted {file} to {extract_path}.")

    # Process a specific dataset and convert it into a usable format
    def process_dataset():
        dataset_path = os.path.join(unzip_dir, f"KeywordExtractor-Datasets-master/datasets/{dataset_name}/{dataset_name}")

        # Check if dataset directory exists
        if not os.path.exists(dataset_path):
            print(f"Dataset {dataset_name} not found.")
            return None

        docs_folder = os.path.join(dataset_path, "docsutf8")
        keys_folder = os.path.join(dataset_path, "keys")

        if not os.path.exists(docs_folder) or not os.path.exists(keys_folder):
            print(f"Required folders (docsutf8, keys) are missing in {dataset_name}.")
            return None

        # Get all document IDs. splitext keeps dots inside the file stem, and
        # sorting removes the dependence on os.listdir's arbitrary order.
        doc_ids = sorted(os.path.splitext(f)[0] for f in os.listdir(docs_folder) if f.endswith(".txt"))
        if not doc_ids:
            print(f"No documents found in {docs_folder}.")
            return None

        # Sample the dataset: at least one document, never more than exist.
        sample_size = min(len(doc_ids), max(1, int(sample_fraction * len(doc_ids))))
        sampled_ids = rng.sample(doc_ids, sample_size)

        data = []
        for doc_id in tqdm(sampled_ids, desc="Loading documents"):
            doc_file = os.path.join(docs_folder, f"{doc_id}.txt")
            key_file = os.path.join(keys_folder, f"{doc_id}.key")

            if not (os.path.exists(doc_file) and os.path.exists(key_file)):
                continue

            with open(doc_file, "r", encoding="utf-8") as df:
                text = df.read().strip()
            keywords = read_keywords(key_file)

            # Create positive pairs: each keyword with the document
            for keyword in keywords:
                data.append({
                    "text1": text,
                    "text2": keyword,
                    "label": 1.0  # positive pair
                })

            # Collect candidate negatives from the keywords of other documents
            other_ids = [i for i in sampled_ids if i != doc_id]
            other_docs_keywords = []
            for other_id in rng.sample(other_ids, min(3, len(other_ids))):
                other_key_file = os.path.join(keys_folder, f"{other_id}.key")
                # A sampled document is not guaranteed to have a key file.
                if os.path.exists(other_key_file):
                    other_docs_keywords.extend(read_keywords(other_key_file))

            # Drop candidates that are also gold keywords of this document
            # (they would get contradictory labels) and remove duplicates
            # while keeping first-seen order.
            own_keywords = {kw.lower() for kw in keywords}
            negative_pool = list(dict.fromkeys(
                kw for kw in other_docs_keywords if kw.lower() not in own_keywords
            ))

            # Add negative examples (at most as many as there are positives)
            for neg_keyword in rng.sample(negative_pool, min(len(keywords), len(negative_pool))):
                data.append({
                    "text1": text,
                    "text2": neg_keyword,
                    "label": 0.0  # negative pair
                })

        return Dataset.from_list(data)

    download_datasets()
    extract_datasets()
    extract_inner_zips()

    dataset = process_dataset()

    return dataset


In [5]:
def create_dataloaders(dataset, train_ratio=0.8, batch_size=8):
    """
    Split the pair dataset in order and wrap both parts in DataLoaders.

    The first ``int(train_ratio * len(dataset))`` rows become the training
    part and the remaining rows the evaluation part; rows are not shuffled
    before the split.

    Args:
        dataset: Dataset whose rows expose "text1", "text2" and "label".
        train_ratio: Share of rows assigned to the training part.
        batch_size: Batch size used for both DataLoaders.

    Returns:
        Tuple ``(train_dataloader, eval_dataloader, eval_samples)`` where
        ``eval_samples`` is the list of evaluation InputExample objects.
    """
    total_rows = len(dataset)
    cut = int(train_ratio * total_rows)

    def as_examples(rows):
        # Convert dataset rows to the SBERT InputExample format.
        examples = []
        for row in rows:
            pair = [row["text1"], row["text2"]]
            examples.append(InputExample(texts=pair, label=float(row["label"])))
        return examples

    train_samples = as_examples(dataset.select(range(cut)))
    eval_samples = as_examples(dataset.select(range(cut, total_rows)))

    # Only the training loader shuffles; evaluation keeps dataset order.
    train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=batch_size)
    eval_dataloader = DataLoader(eval_samples, shuffle=False, batch_size=batch_size)

    return train_dataloader, eval_dataloader, eval_samples

def fine_tune_model(dataset_name, sample_fraction, output_dir="fine_tuned_keybert_model", epochs=3, batch_size=8):
    """
    Fine-tune the all-MiniLM-L6-v2 sentence encoder on document/keyword pairs.

    Args:
        dataset_name: Name of the dataset passed to ``prepare_dataset``
            (e.g. "Krapivin2009").
        sample_fraction: Fraction of the dataset's documents to sample,
            passed to ``prepare_dataset``.
        output_dir: Directory to save the fine-tuned model
        epochs: Number of training epochs
        batch_size: Batch size for training and evaluation

    Returns:
        model: the fine-tuned all-MiniLM-L6-v2 ``SentenceTransformer``.
        Wrap it with ``KeyBERT(model=...)`` to use it for keyword extraction.

    Raises:
        ValueError: If ``prepare_dataset`` returns no dataset or an empty one.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Prepare dataset
    dataset = prepare_dataset(dataset_name, sample_fraction)
    if dataset is None or len(dataset) == 0:
        # prepare_dataset signals a missing dataset with None; fail with a
        # clear message instead of a TypeError further down.
        raise ValueError(f"No training pairs could be built for dataset {dataset_name!r}.")

    train_dataloader, eval_dataloader, eval_samples = create_dataloaders(
        dataset, batch_size=batch_size
    )

    # Initialize model on the GPU when one is available, otherwise on the CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

    # Define loss function
    train_loss = losses.CosineSimilarityLoss(model)

    # Create evaluator
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        eval_samples, batch_size=batch_size
    )

    # Train the model
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=evaluator,
        epochs=epochs,
        evaluation_steps=100,
        warmup_steps=100,
        output_path=output_dir,
        show_progress_bar=True
    )

    # Write the model as it stands after the last training step to output_dir
    model.save(output_dir)

    return model

# Usage example
if __name__ == "__main__":
    # Settings for this run: the dataset to fine-tune on, the fraction of its
    # documents to sample, and the directory the fine-tuned model is saved to.
    DATASET_NAME = "Krapivin2009"
    SAMPLE_FRACTION = 0.2
    output_dir = "./fine_tuned_keybert_model_0_2"

    # Fine-tune the model and report the wall-clock time it took
    ft_start = time()
    fine_tuned_model = fine_tune_model(DATASET_NAME, SAMPLE_FRACTION, output_dir=output_dir)
    ft_end = time()
    print(f"Time to fine tune the model: {ft_end-ft_start:.3f}")

    # fine_tune_model saves the model to output_dir.
    # You can later load it using:
    # loaded_model = SentenceTransformer(output_dir)
    # keybert_model = KeyBERT(model=loaded_model)

    # Recorded output of a previous run: "Time to fine tune the model: 1148.749"

Downloading datasets...
Datasets downloaded successfully.
Extracting datasets...
Datasets extracted successfully.
Extracting wicc.zip...
Extracted wicc.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/datasets/wicc.
Extracting Nguyen2007.zip...
Extracted Nguyen2007.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/datasets/Nguyen2007.
Extracting WikiNews.zip...
Extracted WikiNews.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/datasets/WikiNews.
Extracting wiki20.zip...
Extracted wiki20.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/datasets/wiki20.
Extracting www.zip...
Extracted www.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/datasets/www.
Extracting citeulike180.zip...
Extracted citeulike180.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/datasets/citeulike180.
Extracting kdd.zip...
Extracted kdd.zip to KeywordExtractor-Datasets/KeywordExtractor-Datasets-master/datasets/kdd.

Loading documents: 100%|██████████| 460/460 [00:00<00:00, 5900.44it/s]


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



Step,Training Loss,Validation Loss,Pearson Cosine,Spearman Cosine
100,No log,No log,0.705555,0.714801
200,No log,No log,0.706084,0.718904
300,No log,No log,0.716201,0.724057
400,No log,No log,0.695838,0.709328
488,No log,No log,0.686119,0.697944
500,0.156300,No log,0.689902,0.700954
600,0.156300,No log,0.688661,0.700138
700,0.156300,No log,0.697787,0.706507
800,0.156300,No log,0.698205,0.706004
900,0.156300,No log,0.697987,0.705779


Time to fine tune the model: 1148.749


# Saving the model manually

In [6]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): none of the commands below in this cell read from or write to
# /content/drive — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

# Switch to /content so the relative names used below resolve there
# (assumes the training cell wrote the model directory under /content).
%cd  /content/

# Recursively zip the fine-tuned model directory.
!zip -r fine_tuned_keybert_model_0_2.zip fine_tuned_keybert_model_0_2/

# Hand the archive to google.colab's files.download
# (presumably this triggers a browser download — confirm).
from google.colab import files
files.download('fine_tuned_keybert_model_0_2.zip')

Mounted at /content/drive
/content
  adding: fine_tuned_keybert_model_0_2/ (stored 0%)
  adding: fine_tuned_keybert_model_0_2/tokenizer.json (deflated 71%)
  adding: fine_tuned_keybert_model_0_2/model.safetensors (deflated 8%)
  adding: fine_tuned_keybert_model_0_2/README.md (deflated 70%)
  adding: fine_tuned_keybert_model_0_2/config_sentence_transformers.json (deflated 34%)
  adding: fine_tuned_keybert_model_0_2/2_Normalize/ (stored 0%)
  adding: fine_tuned_keybert_model_0_2/eval/ (stored 0%)
  adding: fine_tuned_keybert_model_0_2/eval/similarity_evaluation_results.csv (deflated 31%)
  adding: fine_tuned_keybert_model_0_2/modules.json (deflated 62%)
  adding: fine_tuned_keybert_model_0_2/tokenizer_config.json (deflated 73%)
  adding: fine_tuned_keybert_model_0_2/1_Pooling/ (stored 0%)
  adding: fine_tuned_keybert_model_0_2/1_Pooling/config.json (deflated 57%)
  adding: fine_tuned_keybert_model_0_2/vocab.txt (deflated 53%)
  adding: fine_tuned_keybert_model_0_2/sentence_bert_config.js

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>