In [None]:
!pip install spacy spacy-layout sentence-transformers
!python -m spacy download en_core_web_sm

In [None]:
import csv
from pathlib import Path

import spacy
from spacy_layout import spaCyLayout

# import ExtractoMAT code (adjust path as needed)
import sys
sys.path.append(str(Path.cwd() / "extractomat-master"))
from matcha import basic, cvalue
from tester import TermEvaluator
from sbert_reranker import SentenceSimilarityCalculator

# -- CONFIGURATION -----------------------------------------------------------
# Feature files to evaluate. Each is a TSV holding one token and its BIO
# label per line; extend the list with further files as needed.
test_files = [
    "../data/train_rerank_original.tsv",
    "../data/train_rerank_conf0.4.tsv",
    "../data/train_cvalue_original.tsv",
    "../data/train_cvalue_conf2.25.tsv",
]

# Ground-truth sources: TSVs covering the full datasets. Extend as needed.
gt_sources = [
    "train_full.tsv",
    "test_full.tsv",
]

# spaCy pipeline used for parsing the reconstructed text.
SPACY_MODEL = "en_core_web_sm"

# Length adjustment applied by the reranker.
# Options: none, legacy, median, modified_z_score
RERANK_ADJUSTMENT = "none"

# Score threshold per feature file. Keys are bare file names (no directory
# part); add an entry for every additional feature file.
thresholds = {
    "train_rerank_original.tsv": 0.0,
    "train_rerank_conf0.4.tsv": 0.4,
    "train_cvalue_original.tsv": 0.0,
    "train_cvalue_conf2.25.tsv": 2.25,
}

# -- UTILITY FUNCTIONS ------------------------------------------------------
def read_tsv(path):
    """Parse a BIO-tagged TSV file into sentences.

    Every non-blank line must consist of exactly two tab-separated fields
    (token, label); blank or whitespace-only lines separate sentences.

    Returns:
        A list of ``(tokens, labels)`` tuples, one per sentence, where both
        members are lists of strings of equal length.

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            tab-separated fields.
    """
    sentences = []
    current = ([], [])
    raw_lines = Path(path).read_text(encoding='utf-8').splitlines()
    # A trailing sentinel blank line flushes the last sentence, so no
    # separate end-of-file handling is required.
    for raw in raw_lines + ['']:
        if raw.strip():
            token, label = raw.split('\t')
            current[0].append(token)
            current[1].append(label)
            continue
        if current[0]:
            sentences.append(current)
            current = ([], [])
    return sentences


def build_full_map(source_paths):
    """Merge several full BIO-tagged TSVs into one sentence lookup.

    The key is the sentence's tokens joined by single spaces and the value
    is that sentence's list of BIO labels. When the same sentence occurs more
    than once, the labels read last win.
    """
    sentence_to_labels = {}
    for path in source_paths:
        sentence_to_labels.update(
            (' '.join(tokens), tags) for tokens, tags in read_tsv(path)
        )
    return sentence_to_labels


def extract_gt_terms(tokens, labels):
    """Collect the distinct ground-truth terms of one sentence.

    A term starts at a label beginning with ``B-`` and extends over the
    directly following labels beginning with ``I-``. An ``I-`` label that
    does not continue an open term is ignored.

    Returns:
        The lowercased, space-joined terms, de-duplicated and sorted.
    """
    unique_terms = set()
    open_span = None  # tokens of the term currently being built, if any
    for pos, tag in enumerate(labels):
        if tag.startswith('B-'):
            # A new term begins; close the previous one first.
            if open_span is not None:
                unique_terms.add(' '.join(open_span).lower())
            open_span = [tokens[pos]]
        elif open_span is not None and tag.startswith('I-'):
            open_span.append(tokens[pos])
        elif open_span is not None:
            # Any other label terminates the open term.
            unique_terms.add(' '.join(open_span).lower())
            open_span = None
    if open_span is not None:
        unique_terms.add(' '.join(open_span).lower())
    return sorted(unique_terms)


def write_text_file(sentences, out_path):
    """Write each sentence to ``out_path`` followed by one empty line.

    The file is created or overwritten and encoded as UTF-8.
    """
    with open(out_path, 'w', encoding='utf-8') as handle:
        handle.writelines(sentence + '\n\n' for sentence in sentences)


def write_gt_csv(sentences, full_map, out_csv):
    """Write the ground-truth terms of ``sentences`` to a one-column CSV.

    Each sentence is split on single spaces to recover its tokens and looked
    up in ``full_map`` to obtain its BIO labels; every term extracted from
    the sentence becomes one CSV row. Terms are de-duplicated per sentence
    only, so a term may appear again for a later sentence.

    Raises:
        KeyError: if a sentence is not a key of ``full_map``.
    """
    with open(out_csv, 'w', encoding='utf-8', newline='') as handle:
        out = csv.writer(handle)
        for sentence in sentences:
            gold_terms = extract_gt_terms(
                sentence.split(' '), full_map[sentence]
            )
            out.writerows([term] for term in gold_terms)

# -- MAIN EVALUATION --------------------------------------------------------

# Load the spaCy pipeline once and reuse it for every feature file.
eval_nlp = spacy.load(SPACY_MODEL)
# NOTE(review): `layout` is not referenced again in this section; it is kept
# in case other cells rely on it.
layout = spaCyLayout(eval_nlp)

# Build the sentence -> gold BIO labels lookup from the full datasets.
gt_map = build_full_map(gt_sources)

# One (file, method, threshold, recall, precision, f1) tuple per feature file.
results = []
for feature_file in test_files:
    feature_path = Path(feature_file)

    # Reconstruct sentences from the feature file; its own labels are ignored
    # because the gold labels come from gt_map.
    sents = [' '.join(toks) for toks, _ in read_tsv(feature_file)]

    # Write temporary text and GT CSV next to the feature file. The names are
    # derived from the path's suffix/stem instead of str.replace('.tsv', ...),
    # which would rewrite a '.tsv' occurring anywhere in the path and, for a
    # path without '.tsv', would make the text file overwrite the input.
    txt_path = str(feature_path.with_suffix('.txt'))
    gt_csv_path = str(feature_path.with_name(feature_path.stem + '_gt.csv'))
    write_text_file(sents, txt_path)
    write_gt_csv(sents, gt_map, gt_csv_path)

    # Run ExtractoMAT on the lowercased text (the GT terms are lowercased by
    # extract_gt_terms as well).
    raw_text = Path(txt_path).read_text(encoding='utf-8').lower()
    doc = eval_nlp(raw_text)

    # C-value scoring is needed for both methods; the results are passed on as
    # term scores and term occurrences.
    ts, toc = cvalue(doc, n_min=2, smoothing=0.1, n_max=4)
    if 'rerank' in feature_file:
        # Replace the C-value scores with the SBERT-based reranked scores.
        reranker = SentenceSimilarityCalculator()
        ts = reranker.rerank_terms_in_doc(
            doc, toc,
            context_len=3,
            pooling='max',
            length_adjustment=RERANK_ADJUSTMENT,
        )
        method = 'rerank'
    else:
        method = 'cvalue'

    # Evaluate
    evaluator = TermEvaluator(
        gt_path=gt_csv_path,
        term_scores=ts,
        term_occurrences=toc,
        filter_single_word=True,
        method=method,
        language=eval_nlp.lang_,
    )
    # `thresholds` is keyed by bare file name while `test_files` holds
    # relative paths, so fall back to the basename; looking up only the full
    # path would silently yield 0.0 for every file.
    thr = thresholds.get(feature_file, thresholds.get(feature_path.name, 0.0))
    prec, rec, f1 = evaluator.calculate_metrics(threshold=thr, verbose=False)
    results.append((feature_file, method, thr, rec, prec, f1))

# Print summary
def print_results(res):
    """Print an aligned summary table of evaluation results.

    Args:
        res: iterable of ``(file, method, threshold, recall, precision, f1)``
            tuples.

    The file column is at least 30 characters wide and grows to fit the
    longest file name, so the numeric columns stay aligned with the header
    even for long paths.
    """
    rows = list(res)
    longest = max((len(str(row[0])) for row in rows), default=0)
    name_w = max(30, longest)
    header = (
        f"{'file':<{name_w}} {'method':<8} {'thr':>5} "
        f"{'recall':>8} {'prec':>8} {'f1':>8}"
    )
    print(header)
    print('-' * len(header))
    for fn, m, thr, rec, prec, f1 in rows:
        print(
            f"{fn:<{name_w}} {m:<8} {thr:5.2f} "
            f"{rec:8.3f} {prec:8.3f} {f1:8.3f}"
        )

# Emit the summary table for the per-file results gathered in `results`.
print_results(results)
