<a href="https://colab.research.google.com/github/Reennon/acter-ner/blob/colab-dry-run/notebooks/dry-run" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the project repository to the absolute path the later cells rely on.
# The `test -d` guard skips the clone when the checkout already exists, so the cell can be re-run.
!test -d /content/acter-ner || git clone https://github.com/Reennon/acter-ner /content/acter-ner

In [None]:
# Clone the ACTER corpus repository to the absolute path the later cells rely on.
# The `test -d` guard skips the clone when the checkout already exists, so the cell can be re-run.
!test -d /content/ACTER || git clone https://github.com/AylaRT/ACTER /content/ACTER

In [None]:
# Switch the working directory to the ACTER checkout.
# NOTE(review): the next cell changes directory again before anything is run here,
# so this cell appears to have no lasting effect — confirm whether it is still needed.
%cd /content/ACTER

In [None]:
# Work from the term_extractor folder of the acter-ner checkout; the relative paths used by
# the following cells (TSV files, output/, input_data/, configs/) are resolved against it.
%cd /content/acter-ner/term_extractor

In [None]:
# List the working directory as a quick sanity check.
# The explicit %ls magic is used because bare `ls` only works through automagic,
# which IPython applies to single-line cells only.
%ls

In [None]:
# Run the repository's combine_corpora.sh from the current working directory.
# NOTE(review): presumably this produces the train_full.tsv / test_full.tsv files consumed
# by the conversion cell below — confirm against the script.
!bash combine_corpora.sh

In [None]:
# Mount Google Drive at /content/drive; later cells read a dataset from it and write the
# training output to it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Replace any existing train_full.tsv with the train_cvalue_original.tsv dataset stored on Drive.
# `cp -f` overwrites the target whether or not it exists and leaves the Drive copy in place,
# so the cell keeps working on re-runs and in fresh runtimes.
!cp -f /content/drive/MyDrive/ucu/ner/datasets/train_cvalue_original.tsv train_full.tsv

In [None]:
# Convert token-per-line IOB TSV files into spaCy DocBin (.spacy) files.
#
# Input format: one "token<TAB>tag" pair per line, with blank lines separating sentences.
# Every sentence becomes its own Doc, and every B/I run becomes one "TERM" entity span.
import spacy
from spacy.tokens import Doc, DocBin, Span
from pathlib import Path

# Blank English pipeline: only its vocab is needed to build Docs (no models loaded).
nlp = spacy.blank("en")

# Mapping of input TSV file -> output .spacy file name.
splits = {
    "train_full.tsv":   "train_full.spacy",
    "test_full.tsv": "test_full.spacy",
}

# Directory that receives the serialized DocBin files.
out_dir = Path("output")
out_dir.mkdir(exist_ok=True)

# Single entity label assigned to every recovered span.
ENTITY_LABEL = "TERM"


def _normalize_tag(tag):
    """Collapse a tag such as "B-TERM" / "I-TERM" / "O" to plain "B", "I" or "O".

    Anything that starts with neither "B" nor "I" is treated as "O".
    """
    if tag.startswith("B"):
        return "B"
    if tag.startswith("I"):
        return "I"
    return "O"


def _iob_to_offsets(labels):
    """Return the (start, end) token offsets, end exclusive, of each entity in `labels`.

    `labels` is a list of normalized "B" / "I" / "O" tags for one sentence.
    A "B" opens a new entity (closing any open one), an "O" closes the open entity,
    and an "I" extends the open entity. An "I" with no open entity is ignored.
    """
    offsets = []
    start = None
    for i, tag in enumerate(labels):
        if tag == "I":
            continue
        # Both "B" and "O" terminate an entity that is currently open.
        if start is not None:
            offsets.append((start, i))
            start = None
        if tag == "B":
            start = i
    # An entity that runs to the end of the sentence is closed here.
    if start is not None:
        offsets.append((start, len(labels)))
    return offsets


def _read_sentences(tsv_path):
    """Read `tsv_path` and return `(sentences, skipped)`.

    `sentences` is a list of `(words, labels)` pairs, one per non-empty sentence, with labels
    already normalized. `skipped` counts the non-blank lines that did not contain exactly
    two tab-separated fields; those lines are left out of the sentence they occur in.
    A final sentence without a trailing blank line is still returned.
    """
    sentences = []
    skipped = 0
    words, labels = [], []
    with tsv_path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank line: sentence boundary.
                if words:
                    sentences.append((words, labels))
                    words, labels = [], []
                continue

            parts = line.split("\t")
            if len(parts) != 2:
                skipped += 1
                continue
            tok, tag = parts
            words.append(tok)
            labels.append(_normalize_tag(tag))

    # Flush the last sentence if the file does not end with a blank line.
    if words:
        sentences.append((words, labels))
    return sentences, skipped


def _build_doc(words, labels):
    """Create a Doc from pre-tokenized `words` and set its entities from the IOB `labels`."""
    doc = Doc(nlp.vocab, words=words)
    doc.ents = [
        Span(doc, start, end, label=ENTITY_LABEL)
        for start, end in _iob_to_offsets(labels)
    ]
    return doc


# Conversion loop: one DocBin per split.
for tsv_name, spacy_name in splits.items():
    tsv_path = Path(tsv_name)
    if not tsv_path.exists():
        print(f"⚠️  Skipping missing {tsv_name}")
        continue

    sentences, skipped = _read_sentences(tsv_path)

    docbin = DocBin(store_user_data=True)
    doc_count = ent_count = 0
    for words, labels in sentences:
        doc = _build_doc(words, labels)
        docbin.add(doc)
        doc_count += 1
        ent_count += len(doc.ents)

    # Surface dropped lines instead of discarding them silently.
    if skipped:
        print(f"⚠️  {tsv_name}: ignored {skipped} malformed line(s) "
              f"(expected exactly two tab-separated fields)")

    # Write out the DocBin.
    out_path = out_dir / spacy_name
    docbin.to_disk(out_path)
    print(f"✅ Converted {tsv_name} → {out_path} "
          f"({doc_count} docs, {ent_count} entities)")


In [None]:
# Move the converted corpora into input_data/ and make sure the Drive output folder exists.
# `mkdir -p` keeps the && chain from aborting when input_data/ already exists (e.g. on a re-run).
# NOTE(review): presumably input_data/*.spacy are the paths referenced by configs/config_base.cfg — confirm.
!mkdir -p input_data && \
  mv output/train_full.spacy input_data/train_full.spacy && \
  mv output/test_full.spacy input_data/test_full.spacy && \
  mkdir -p /content/drive/MyDrive/ucu/ner/output/acter-cvalue-conf2-25-roberta-large-en

In [None]:
# Install spaCy with its transformers extra into the environment of the running kernel
# (%pip targets the kernel's interpreter, unlike a bare shell `pip`).
# NOTE(review): the version is unpinned — pin it once the known-good version is confirmed.
%pip install 'spacy[transformers]' -q

In [None]:
# Train the pipeline described by configs/config_base.cfg on GPU 0 and write the training
# output straight to the Drive folder.
# NOTE(review): the train/dev corpus paths are not passed here, so they presumably come from
# the config file — confirm they point at input_data/*.spacy.
!spacy train configs/config_base.cfg  --gpu-id 0 --output /content/drive/MyDrive/ucu/ner/output/acter-cvalue-conf2-25-roberta-large-en

In [None]:
# Unassign the Colab runtime once the preceding cells have finished.
# NOTE(review): presumably this is here so the GPU is not held after training — confirm.
from google.colab import runtime
runtime.unassign()