# Import Required Libraries
This section imports the libraries needed to work with spaCy and its displaCy visualizer, along with pandas and the test-data conversion utilities.

In [7]:
import spacy
from spacy.tokens import DocBin
from spacy import displacy
import pandas as pd
import os
from pathlib import Path

# Converter class used by the optional conversion example further down;
# the webanno_spacy_converter package must be importable for this cell to run
from webanno_spacy_converter.converters.webanno_to_spacy import AnnotationSentencesToDocBinConverterV2

# Load Test Data
Load a .spacy file generated from your test data. This file contains serialized spaCy Doc objects. The cell also loads a spaCy pipeline, because its vocab is required to deserialize the Docs.

In [8]:
# Locations of the serialized test corpus and of the spaCy pipeline (adjust as needed)
docbin_path = Path("..") / "test_data" / "examplev2.spacy"
model_dir = Path("..") / "my_nlp_el_cnn1"

# Load the pipeline first: its vocab is what get_docs() needs to rebuild the Docs
nlp = spacy.load(model_dir)
doc_bin = DocBin().from_disk(str(docbin_path))
docs = [loaded_doc for loaded_doc in doc_bin.get_docs(nlp.vocab)]
print(f"Loaded {len(docs)} SpaCy Doc objects from {docbin_path}")



Loaded 5 SpaCy Doc objects from ..\test_data\examplev2.spacy


# Convert Test Data to SpaCy Docs
If you have annotated sentences and want to convert them to SpaCy Docs, use the converter here. (This is optional if you already have a .spacy file.)

In [9]:
# Optional example, left disabled because the notebook already loads a
# ready-made .spacy file. To run the conversion, uncomment the lines below and
# point the parser at your own TSV file. Note that running it rebinds
# `doc_bin` and `docs`, replacing the objects loaded from disk.
# from webanno_spacy_converter.parsers.tsv_parser_v3 import WebAnnoNELParser
# parser = WebAnnoNELParser("../test_data/output.tsv")
# parser.parse()
# converter = AnnotationSentencesToDocBinConverterV2(nlp)
# doc_bin = converter.convert(parser.sentences)
# docs = list(doc_bin.get_docs(nlp.vocab))

# Visualize SpaCy Docs
Use spaCy's displaCy to visualize the named entities in the first loaded document. (displaCy can also render dependency parses with `style="dep"`; this notebook only uses the entity view.)

In [10]:
# Render the entities of the first document inline; fall back to a message when
# there is nothing to show (no documents loaded, or the first one has no entities)
if not docs or not docs[0].ents:
    print("No named entities found in the first document.")
else:
    displacy.render(docs[0], style="ent", jupyter=True)

# Inspect Named Entities
Programmatically inspect the named entities of the first loaded Doc: their text, their labels, and any linked knowledge base IDs.

In [11]:
# Tabulate text, label and knowledge-base ID for every entity of the first document
if not docs:
    print("No documents loaded.")
else:
    ents_data = [
        {
            "Text": span.text,
            "Label": span.label_,
            # getattr keeps the row buildable even if the attribute is absent
            "KB_ID": getattr(span, "kb_id_", None),
        }
        for span in docs[0].ents
    ]
    if not ents_data:
        print("No entities found in the first document.")
    else:
        df = pd.DataFrame(ents_data)
        display(df)

Unnamed: 0,Text,Label,KB_ID
0,Alžir,LOC,Q262
1,Africi,LOC,Q15
2,Alžir,LOC,Q3561
3,Sredozemnog mora,LOC,Q4918
4,Severnoj Africi,LOC,Q27381
5,Alžira,LOC,Q262
6,Alžira,LOC,Q3561
7,Alžiru,LOC,Q262
8,ambasadori,ROLE,NIL
9,Alžir,LOC,Q3561


In [12]:
# Show the first document's text, tokens, lemmas, POS tags, and sentences.
# Nothing is printed when no documents were loaded.
if docs:
    doc = docs[0]
    print(f"Text: {doc.text}")
    print(f"Tokens: {[token.text for token in doc]}")
    print(f"Lemmas: {[token.lemma_ for token in doc]}")
    print(f"POS Tags: {[token.pos_ for token in doc]}")
    # NOTE(review): doc.sents presumably requires sentence boundaries to be
    # stored in the Doc — confirm this holds for other .spacy files.
    print(f"Sentences: {[sent.text for sent in doc.sents]}")

Text: Alžir, kao druga po veličini zemlja u Africi, ima bogatu istoriju koja se proteže od drevnih vremena, kada su se njome kretali nomadi, sve do modernog doba, kada je postao važan igrač u globalnoj trgovini naftom. Alžir, koji se prostire duž obale Sredozemnog mora, važi za jedan od najstarijih gradova u Severnoj Africi, poznat po svojim belim zgradama i živopisnim bazarima. Ekonomija Alžira se oslanja na bogate rezerve nafte i gasa, što čini ovu severnoafričku zemlju jednim od ključnih izvoznika energije na svetskom tržištu. Stari delovi Alžira, sa uskim ulicama i tradicionalnim kućama, pružaju uvid u bogatu istoriju i kulturni život koji je oblikovao ovaj glavni grad vekovima. Međunarodni stručnjaci pružaju pomoć Alžiru kako bi unapredili svoje obrazovne i zdravstvene sisteme, osiguravajući bolji standard života za stanovništvo cele zemlje. Povodom kulturnog festivala, ambasadori iz različitih zemalja došli su u Alžir kako bi prisustvovali događajima koji prikazuju bogatu umetnič