In [2]:
# Clone CRAFT repository
!git clone https://github.com/UCDenver-ccp/CRAFT.git
# Install spaCy
!pip install -q spacy
!python3 -m spacy download en_core_web_sm

Cloning into 'CRAFT'...
remote: Enumerating objects: 17965, done.[K
remote: Counting objects: 100% (235/235), done.[K
remote: Compressing objects: 100% (103/103), done.[K
remote: Total 17965 (delta 152), reused 197 (delta 127), pack-reused 17730 (from 1)[K
Receiving objects: 100% (17965/17965), 258.71 MiB | 51.96 MiB/s, done.
Resolving deltas: 100% (15311/15311), done.
Updating files: 100% (3078/3078), done.
Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
import os
import xml.etree.ElementTree as ET

def parse_knowtator(xml_path):
    tree = ET.parse(xml_path)
    root = tree.getroot()

    spans = []
    for annot in root.findall(".//annotation"):
        spans_xml = annot.findall("span")
        if spans_xml:
            for span in spans_xml:
                start = int(span.attrib['start'])
                end = int(span.attrib['end'])
                spans.append((start, end))

    return spans

In [4]:
import re
# Not used because it causes difference in span position counts
def preprocess_text(text):
    lines = text.split("\n")
    processed = []
    for line in lines:
        stripped = line.strip()

        if not stripped:
            processed.append("")  # preserve blank lines
            continue

        is_likely_title = (
            (stripped.isupper() or stripped.istitle() or stripped[0].isupper())
            and len(stripped.split()) < 20
        )

        if is_likely_title and not re.match(r'.*[\.\!\?]$', stripped):
            stripped += "."

        processed.append(stripped)

    return "\n".join(processed)

In [5]:
import os
import json
import string

PUNCTUATION = set(string.punctuation)

def should_keep_token(token):
    # return True
    token_stripped = token.strip()
    # Remove white spaces and new lines
    if not token_stripped:
        return False
    # if token_stripped in PUNCTUATION:
    #     return False
    return True
def save_iob_as_tuples(file_id, sentence_tags, output_dir="iob"):
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{file_id}.txt")

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(sentence_tags, f, ensure_ascii=False, indent=2)

# def save_iob_as_tuples(file_id, sentence_tags, output_dir="iob"):
#     os.makedirs(output_dir, exist_ok=True)
#     output_path = os.path.join(output_dir, f"{file_id}.txt")

#     with open(output_path, "w", encoding="utf-8") as f:
#         f.write("[\n")  # Open outer list
#         for i, sentence in enumerate(sentence_tags):
#             f.write("  [\n")
#             for j, (token, tag) in enumerate(sentence):
#                 comma = "," if j < len(sentence) - 1 else ""
#                 f.write(f"    ({repr(token)}, {repr(tag)}){comma}\n")
#             f.write("  ]" + ("," if i < len(sentence_tags) - 1 else "") + "\n")
#         f.write("]\n")  # Close outer list

In [6]:

import spacy
nlp = spacy.load("en_core_web_sm")

def iob_tag_tokens(text, spans):
    doc = nlp(text)
    spans = sorted(spans, key=lambda x: x[0])
    sentences = []

    for sent in doc.sents:
        sentence_tags = []
        for token in sent:
            tok_start = token.idx
            tag = "O"
            for span_start, span_end in spans:
                if span_start <= tok_start < span_end:
                    tag = "B" if tok_start == span_start else "I"
                    break  # Take the first match (in case of overlap)
            if should_keep_token(token.text):
                sentence_tags.append((token.text, tag))
        if sentence_tags:
            sentences.append(sentence_tags)

    return sentences

In [7]:
import glob

TEXT_DIR = "CRAFT/articles/txt/"
ANNOT_DIRS = {
    "GO_BP": "CRAFT/concept-annotation/GO_BP/GO_BP/knowtator/",
    "GO_CC": "CRAFT/concept-annotation/GO_CC/GO_CC/knowtator/",
    "GO_MF": "CRAFT/concept-annotation/GO_MF/GO_MF/knowtator/",
}
OUTPUT_DIR = "iob/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

processed = 0
for text_file in glob.glob(os.path.join(TEXT_DIR, "*.txt")):
    base = os.path.basename(text_file)
    file_id = base.replace(".txt", "")

    with open(text_file, "r", encoding="utf-8") as f:
        text = f.read()

    all_spans = []
    for path in ANNOT_DIRS.values():
        xml_file = os.path.join(path, base + ".knowtator.xml")
        if os.path.exists(xml_file):
            spans = parse_knowtator(xml_file)
            all_spans.extend(spans)

    if not all_spans:
        continue

    tagged = iob_tag_tokens(text, all_spans)
    save_iob_as_tuples(file_id, tagged, OUTPUT_DIR)

    processed += 1
    print(f"[✓] {base} → {OUTPUT_DIR}{file_id}.txt")

print(f"\nProcessed {processed} files with annotations.")

[✓] 12925238.txt → iob/12925238.txt
[✓] 16098226.txt → iob/16098226.txt
[✓] 15550985.txt → iob/15550985.txt
[✓] 17194222.txt → iob/17194222.txt
[✓] 15018652.txt → iob/15018652.txt
[✓] 17083276.txt → iob/17083276.txt
[✓] 15061865.txt → iob/15061865.txt
[✓] 16121256.txt → iob/16121256.txt
[✓] 15615595.txt → iob/15615595.txt
[✓] 16121255.txt → iob/16121255.txt
[✓] 16462940.txt → iob/16462940.txt
[✓] 11532192.txt → iob/11532192.txt
[✓] 17608565.txt → iob/17608565.txt
[✓] 15588329.txt → iob/15588329.txt
[✓] 12585968.txt → iob/12585968.txt
[✓] 15345036.txt → iob/15345036.txt
[✓] 16221973.txt → iob/16221973.txt
[✓] 15850489.txt → iob/15850489.txt
[✓] 16968134.txt → iob/16968134.txt
[✓] 15328538.txt → iob/15328538.txt
[✓] 16216087.txt → iob/16216087.txt
[✓] 16870721.txt → iob/16870721.txt
[✓] 12546709.txt → iob/12546709.txt
[✓] 15676071.txt → iob/15676071.txt
[✓] 11597317.txt → iob/11597317.txt
[✓] 16027110.txt → iob/16027110.txt
[✓] 16579849.txt → iob/16579849.txt
[✓] 14723793.txt → iob/14723

In [None]:
import shutil

# Zip all IOB files
shutil.make_archive("iob_output", 'zip', OUTPUT_DIR)

# If in Google Colab, download the zip file
# from google.colab import files
# files.download("iob_output.zip")

In [9]:
def print_json_iob_sentences(file_id, output_dir="iob"):
    file_path = os.path.join(output_dir, f"{file_id}.txt")

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    with open(file_path, "r", encoding="utf-8") as f:
        content = json.load(f)

    for sentence in content:
        print([tuple(pair) for pair in sentence])
# Sample result
print_json_iob_sentences("15492776")

[('BMP', 'B'), ('Receptor', 'I'), ('Signaling', 'I'), ('Is', 'O'), ('Required', 'O'), ('for', 'O'), ('Postnatal', 'O'), ('Maintenance', 'O'), ('of', 'O'), ('Articular', 'O'), ('Cartilage', 'O'), ('Abstract', 'O'), ('Articular', 'O'), ('cartilage', 'O'), ('plays', 'O'), ('an', 'O'), ('essential', 'O'), ('role', 'O'), ('in', 'O'), ('health', 'O'), ('and', 'O'), ('mobility', 'O'), (',', 'O'), ('but', 'O'), ('is', 'O'), ('frequently', 'O'), ('damaged', 'O'), ('or', 'O'), ('lost', 'O'), ('in', 'O'), ('millions', 'O'), ('of', 'O'), ('people', 'O'), ('that', 'O'), ('develop', 'O'), ('arthritis', 'O'), ('.', 'O')]
[('The', 'O'), ('molecular', 'O'), ('mechanisms', 'O'), ('that', 'O'), ('create', 'O'), ('and', 'O'), ('maintain', 'O'), ('this', 'O'), ('thin', 'O'), ('layer', 'O'), ('of', 'O'), ('cartilage', 'O'), ('that', 'O'), ('covers', 'O'), ('the', 'O'), ('surface', 'O'), ('of', 'O'), ('bones', 'O'), ('in', 'O'), ('joint', 'O'), ('regions', 'O'), ('are', 'O'), ('poorly', 'O'), ('understood', 