In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re

# Constants
BASE_URL = "https://www.emsa.europa.eu/publications.html"
DOWNLOAD_PREFIX = "/publications/download"
OUTPUT_DIR = "data/emsa_texts"
HEADERS = {"User-Agent": "Mozilla/5.0"}
VERIFY_SSL = False

os.makedirs(OUTPUT_DIR, exist_ok=True)

def sanitize_filename(name):
    return re.sub(r"[\\/*?\"<>|]", "_", name)[:80]

def find_html_pages():
    print("🔍 Scraping EMSA index for publication links...")
    try:
        res = requests.get(BASE_URL, headers=HEADERS, verify=VERIFY_SSL)
        res.raise_for_status()
    except Exception as e:
        print(f"❌ Failed to load index page: {e}")
        return []

    soup = BeautifulSoup(res.content, "html.parser")
    links = []

    for a in soup.find_all("a", href=True):
        href = a["href"]
        if href.startswith(DOWNLOAD_PREFIX) and href.endswith(".html"):
            full_url = urljoin(BASE_URL, href)
            title = sanitize_filename(a.get_text(strip=True) or "EMSA_Document")
            links.append((full_url, title))

    return links

def extract_and_save_text(url, title, index):
    filename = f"{index:03d}_{title}.txt"
    path = os.path.join(OUTPUT_DIR, filename)

    if os.path.exists(path):
        print(f"✅ Already extracted: {filename}")
        return

    try:
        print(f"📝 Extracting: {filename}")
        res = requests.get(url, headers=HEADERS, verify=VERIFY_SSL)
        res.raise_for_status()
        soup = BeautifulSoup(res.content, "html.parser")

        # Extract text from all relevant tags
        content = []
        for tag in soup.find_all(['h1', 'h2', 'h3', 'p', 'li']):
            text = tag.get_text(strip=True)
            if text:
                content.append(text)

        full_text = "\n".join(content)

        with open(path, "w", encoding="utf-8") as f:
            f.write(full_text)

    except Exception as e:
        print(f"❌ Failed to extract from {url}: {e}")

def main():
    pages = find_html_pages()
    print(f"\n🔗 Found {len(pages)} document-like HTML pages.\n")

    for i, (url, title) in enumerate(pages):
        extract_and_save_text(url, title, i)

if __name__ == "__main__":
    main()


🔍 Scraping EMSA index for publication links...





🔗 Found 33 document-like HTML pages.

📝 Extracting: 000_EMSA Catalogue 2025 v26.06.pdf.txt




📝 Extracting: 001_EMSA CAAR2024.pdf.txt




📝 Extracting: 002_EMSA_FACTS_FIGURES_2024.pdf.txt




📝 Extracting: 003_Seafarers Statistics in the EU 2023 data report.pdf.txt




📝 Extracting: 004_AFVs Guidance 1.2 2025.pdf.txt




📝 Extracting: 005_ADER 2024.pdf.txt




📝 Extracting: 006_emsa ipa-enp newsletter issue 2.pdf.txt




📝 Extracting: 007_EMTER_F&F_2025_EN.pdf.txt




📝 Extracting: 008_EMTER_F&F_2025_BG.pdf.txt




📝 Extracting: 009_EMTER_F&F_2025_CS.pdf.txt




📝 Extracting: 010_EMTER_F&F_2025_DA.pdf.txt




📝 Extracting: 011_EMTER_F&F_2025_DE.pdf.txt




📝 Extracting: 012_EMTER_F&F_2025_EL.pdf.txt




📝 Extracting: 013_EMTER_F&F_2025_ES.pdf.txt




📝 Extracting: 014_EMTER_F&F_2025_ET.pdf.txt




📝 Extracting: 015_EMTER_F&F_2025_FI.pdf.txt




📝 Extracting: 016_EMTER_F&F_2025_FR.pdf.txt




📝 Extracting: 017_EMTER_F&F_2025_GA.pdf.txt




📝 Extracting: 018_EMTER_F&F_2025_HR.pdf.txt




📝 Extracting: 019_EMTER_F&F_2025_HU.pdf.txt




📝 Extracting: 020_EMTER_F&F_2025_IT.pdf.txt




📝 Extracting: 021_EMTER_F&F_2025_LT.pdf.txt




📝 Extracting: 022_EMTER_F&F_2025_LV.pdf.txt




📝 Extracting: 023_EMTER_F&F_2025_MT.pdf.txt




📝 Extracting: 024_EMTER_F&F_2025_NL.pdf.txt




📝 Extracting: 025_EMTER_F&F_2025_PL.pdf.txt




📝 Extracting: 026_EMTER_F&F_2025_PT.pdf.txt




📝 Extracting: 027_EMTER_F&F_2025_RO.pdf.txt




📝 Extracting: 028_EMTER_F&F_2025_SK.pdf.txt




📝 Extracting: 029_EMTER_F&F_2025_SL.pdf.txt




📝 Extracting: 030_EMTER_F&F_2025_SV.pdf.txt




📝 Extracting: 031_European Maritime Transport Environmental Report 2025.pdf.txt




📝 Extracting: 032_EMSA Outlook 2025_v3.0.pdf.txt




In [3]:
import os
import re

INPUT_DIR = "data/emsa_texts"
OUTPUT_DIR = "data/emsa_cleaned"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def clean_text(text):
    # Basic cleaning: remove multiple line breaks, weird whitespace, control characters
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[\x00-\x1f\x7f-\x9f]", "", text)
    return text.strip()

def process_file(path):
    with open(path, "r", encoding="utf-8") as f:
        raw_text = f.read()
    return clean_text(raw_text)

def process_all_files():
    files = [f for f in os.listdir(INPUT_DIR) if f.endswith(".txt")]
    print(f"🧹 Cleaning {len(files)} files...")

    for f in files:
        raw_path = os.path.join(INPUT_DIR, f)
        clean_path = os.path.join(OUTPUT_DIR, f.replace(".txt", "_cleaned.txt"))

        cleaned = process_file(raw_path)

        with open(clean_path, "w", encoding="utf-8") as out:
            out.write(cleaned)

        print(f"✅ Cleaned: {f}")

process_all_files()


🧹 Cleaning 33 files...
✅ Cleaned: 028_EMTER_F&F_2025_SK.pdf.txt
✅ Cleaned: 008_EMTER_F&F_2025_BG.pdf.txt
✅ Cleaned: 022_EMTER_F&F_2025_LV.pdf.txt
✅ Cleaned: 000_EMSA Catalogue 2025 v26.06.pdf.txt
✅ Cleaned: 031_European Maritime Transport Environmental Report 2025.pdf.txt
✅ Cleaned: 001_EMSA CAAR2024.pdf.txt
✅ Cleaned: 030_EMTER_F&F_2025_SV.pdf.txt
✅ Cleaned: 003_Seafarers Statistics in the EU 2023 data report.pdf.txt
✅ Cleaned: 024_EMTER_F&F_2025_NL.pdf.txt
✅ Cleaned: 023_EMTER_F&F_2025_MT.pdf.txt
✅ Cleaned: 018_EMTER_F&F_2025_HR.pdf.txt
✅ Cleaned: 011_EMTER_F&F_2025_DE.pdf.txt
✅ Cleaned: 005_ADER 2024.pdf.txt
✅ Cleaned: 004_AFVs Guidance 1.2 2025.pdf.txt
✅ Cleaned: 026_EMTER_F&F_2025_PT.pdf.txt
✅ Cleaned: 016_EMTER_F&F_2025_FR.pdf.txt
✅ Cleaned: 029_EMTER_F&F_2025_SL.pdf.txt
✅ Cleaned: 002_EMSA_FACTS_FIGURES_2024.pdf.txt
✅ Cleaned: 006_emsa ipa-enp newsletter issue 2.pdf.txt
✅ Cleaned: 025_EMTER_F&F_2025_PL.pdf.txt
✅ Cleaned: 020_EMTER_F&F_2025_IT.pdf.txt
✅ Cleaned: 014_EMTER_F&F_202

In [7]:
# 🚢 03_dataset_formatting.ipynb

import os
import json
from pathlib import Path

# 📂 Input & Output Paths
CLEANED_DIR = Path("data/emsa_cleaned")
OUTPUT_FILE = Path("data/emsa_dataset.jsonl")

# ⚙️ Config
SYSTEM_PROMPT = (
    "You are a maritime compliance assistant. "
    "Answer questions based on maritime safety, regulatory, and operational documents. "
    "Use precise and factual tone."
)

def read_cleaned_files(directory):
    """
    Load all cleaned text files and return as list of strings.
    """
    files = sorted(directory.glob("*.txt"))
    docs = []

    for f in files:
        try:
            text = f.read_text(encoding='utf-8').strip()
            if len(text) < 100:
                continue  # skip tiny fragments
            docs.append({
                "filename": f.name,
                "text": text
            })
        except Exception as e:
            print(f"⚠️ Skipping {f.name}: {e}")

    return docs


def convert_to_chat_format(text, filename):
    """
    Wrap each document as a prompt-response chat pair (instruction-tuning style).
    """
    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"Summarize the key points of the following maritime document:\n\n{text[:2000]}"},
            {"role": "assistant", "content": f"This is a regulatory excerpt from file '{filename}'. It includes technical maritime details. [PLACEHOLDER RESPONSE]"}
        ]
    }

# 📥 Load cleaned docs
documents = read_cleaned_files(CLEANED_DIR)
print(f"📄 Loaded {len(documents)} documents.")

# 💬 Format to chat-style dataset
chat_data = [convert_to_chat_format(d["text"], d["filename"]) for d in documents]

# 💾 Save to JSONL
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for item in chat_data:
        json.dump(item, f, ensure_ascii=False)
        f.write("\n")

print(f"✅ Saved JSONL dataset: {OUTPUT_FILE} ({len(chat_data)} samples)")


📄 Loaded 1 documents.
✅ Saved JSONL dataset: data/emsa_dataset.jsonl (1 samples)


In [8]:
!pip install -q bitsandbytes accelerate peft transformers datasets


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [10]:
#🧱 2. Load Dataset (from JSONL)
from datasets import load_dataset

dataset = load_dataset("json", data_files="data/emsa_dataset.jsonl", split="train")
print(f"✅ Loaded {len(dataset)} examples")
#🧱 3. Load Tokenizer & Apply Chat Template
from transformers import AutoTokenizer

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token  # avoid tokenizer errors

def apply_chat_template(example):
    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}

dataset = dataset.map(apply_chat_template)
#🧱 4. Tokenize Dataset
def tokenize(example):
    tokens = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors=None  # important!
    )
    tokens["labels"] = tokens["input_ids"].copy()  # now copy() will work!
    return tokens


tokenized_dataset = dataset.map(tokenize, remove_columns=dataset.column_names)
#5. Load Model with LoRA
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"
)
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
#🧱 6. Training Setup

from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="outputs/tinyllama-emsa",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=True if torch.cuda.is_available() else False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)


trainer.train()


model.save_pretrained("outputs/final_tinyllama_emsa")
tokenizer.save_pretrained("outputs/final_tinyllama_emsa")


✅ Loaded 1 examples


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

2025-07-04 10:34:33.983205: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751625274.197444      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751625274.259536      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

trainable params: 2,252,800 || all params: 1,102,301,184 || trainable%: 0.2044


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


('outputs/final_tinyllama_emsa/tokenizer_config.json',
 'outputs/final_tinyllama_emsa/special_tokens_map.json',
 'outputs/final_tinyllama_emsa/tokenizer.model',
 'outputs/final_tinyllama_emsa/added_tokens.json',
 'outputs/final_tinyllama_emsa/tokenizer.json')

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_path = "outputs/final_tinyllama_emsa"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
from datasets import load_dataset

eval_dataset = load_dataset("json", data_files="data/emsa_dataset.jsonl", split="train[:100%]")  # adjust slice
eval_dataset = eval_dataset.map(lambda x: {"text": tokenizer.apply_chat_template(x["messages"], tokenize=False)})
def tokenize_eval(example):
    tokens = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_eval = eval_dataset.map(tokenize_eval, remove_columns=eval_dataset.column_names)
from transformers import Trainer, DataCollatorForLanguageModeling, TrainingArguments

eval_args = TrainingArguments(
    output_dir="eval-outputs",
    per_device_eval_batch_size=2,
    report_to="none"
)

eval_trainer = Trainer(
    model=model,
    args=eval_args,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

results = eval_trainer.evaluate()
print("📊 Evaluation Results:", results)


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  eval_trainer = Trainer(


📊 Evaluation Results: {'eval_loss': 3.705432415008545, 'eval_model_preparation_time': 0.016, 'eval_runtime': 0.3492, 'eval_samples_per_second': 2.864, 'eval_steps_per_second': 2.864}


In [12]:
import math
print("🧠 Perplexity:", math.exp(results["eval_loss"]))


🧠 Perplexity: 40.667628808413106
