In [None]:
import requests
import json
from gatenlp import Document
import docx2txt

In [None]:
# Load every .txt file found directly inside a folder as a gatenlp Document.
def read_txt_files(folder_path):
    """Read the ``.txt`` files in ``folder_path`` into Documents.

    Each file is decoded as UTF-8 and wrapped in a ``Document`` whose ``name``
    is the file name without its final extension.

    Args:
        folder_path: Directory to scan (sub-directories are not visited).

    Returns:
        A list of Documents, in the order ``os.listdir`` reports the files.
    """
    import os

    loaded = []
    for entry in os.listdir(folder_path):
        if not entry.endswith('.txt'):
            continue
        stem = os.path.splitext(entry)[0]
        with open(os.path.join(folder_path, entry), 'r', encoding='utf-8') as handle:
            text = handle.read()
        doc = Document(text)
        doc.name = stem
        loaded.append(doc)
    return loaded

In [None]:
def read_docx(file_path):
    """Return whatever ``docx2txt.process`` extracts from the file at ``file_path``."""
    return docx2txt.process(file_path)

# Read all .docx files from a folder and wrap their extracted text in Documents.
def read_docs_from_folder(folder_path):
    """Load every ``.docx`` file directly inside ``folder_path``.

    The text of each file is obtained through ``read_docx`` and wrapped in a
    ``Document``.

    Args:
        folder_path: Directory to scan (sub-directories are not visited).

    Returns:
        A list of Documents, in the order ``os.listdir`` reports the files.
    """
    import os

    docs = []
    for file in os.listdir(folder_path):
        if not file.endswith('.docx'):
            continue
        # os.path.join instead of manual "/" concatenation: no doubled separator
        # when folder_path already ends with one, and consistent with read_txt_files.
        text = read_docx(os.path.join(folder_path, file))
        gateNlpDoc = Document(text)
        # The name keeps the ".docx" extension (read_txt_files strips the extension).
        gateNlpDoc.name = file
        docs.append(gateNlpDoc)
    return docs

# Load the .docx corpus into all_docs.
# NOTE(review): a later cell assigns all_docs again from a different folder, so
# only one of the two loading cells should be run for a given pipeline run.
all_docs = read_docs_from_folder("./DocumentsStore/batini3")

In [None]:
# Load the .txt corpus into all_docs (this replaces any value assigned by an
# earlier cell) and display how many documents were read.
all_docs = read_txt_files("./DocumentsStore/bologna_final")
len(all_docs)

In [None]:
#read all json files in folder
import os
import glob
import json
import math 
def read_json_files(folder):
    """Load every ``*.json`` file in ``folder`` as a gatenlp Document.

    Args:
        folder: Directory to scan with ``glob`` (sub-directories are not visited).

    Returns:
        A list with one ``Document.from_dict`` result per file, in the order
        ``glob.glob`` yields the paths.
    """
    data = []
    for filename in glob.glob(os.path.join(folder, '*.json')):
        # Decode explicitly as UTF-8, like the other readers in this notebook,
        # instead of relying on the platform's default encoding.
        with open(filename, 'r', encoding='utf-8') as f:
            jsonDoc = json.load(f)
        # Feature values are passed through untouched; an earlier, disabled step
        # replaced float NaN values in jsonDoc['features'] with the string "NaN".
        data.append(Document.from_dict(jsonDoc))
    return data

In [None]:
# Load previously exported JSON documents from the "missing" folder into nerDocs.
# NOTE(review): the NER cell below assigns nerDocs again, discarding this value.
nerDocs = read_json_files(
    "./DocumentsStore/missing"
)

In [None]:
# Number of documents currently held in nerDocs.
len(nerDocs)

In [None]:
# Display the first document in nerDocs as a sanity check.
nerDocs[0]

In [None]:
# Character length of the text of the document at index 5 of all_docs
# (hard-coded index: raises IndexError when fewer than six documents are loaded).
len(all_docs[5].text)

# NER processing

In [None]:
# NER: post every document in all_docs to the /api/spacyner endpoint and collect
# the annotated documents the service returns.
import time
from tqdm.notebook import tqdm

now = time.time()
nerDocs = []
# (index, name, error) for each document that could not be processed, so that
# dropped documents are visible instead of silently missing from nerDocs.
ner_failed = []
for idx, document in enumerate(tqdm(all_docs)):
    try:
        res = requests.post(
            "http://vm.chronos.disco.unimib.it:10881/api/spacyner",
            json=document.to_dict(),
        )
        # raise_for_status() reports the status code and URL; the previous bare
        # `assert res.ok` raised an AssertionError with no message, which the
        # handler below printed as an empty line.
        res.raise_for_status()
        nerDocs.append(Document.from_dict(res.json()))
    except Exception as e:
        doc_name = getattr(document, "name", None)
        ner_failed.append((idx, doc_name, e))
        print(f"error on document {idx} ({doc_name}): {e!r}")
elapsed = time.time() - now
print(elapsed)
if ner_failed:
    print(f"{len(ner_failed)} of {len(all_docs)} documents failed and are missing from nerDocs")


In [None]:
# Compare input and output counts: the two differ whenever a request in the NER
# loop raised, because failed documents are not appended to nerDocs.
len(all_docs), len(nerDocs)

In [None]:
# Persist the NER-annotated documents with joblib.
# NOTE(review): this writes 'nerDocsBologna.joblib', while the load cell in the
# linking section reads 'nerDocsBatini.joblib' — confirm which file is intended.
import joblib
joblib.dump(nerDocs, 'nerDocsBologna.joblib')

In [None]:
# Display the document at index 15 of nerDocs for manual inspection
# (hard-coded index: raises IndexError when fewer than sixteen documents exist).
nerDocs[15]

# Bi Encoder Linking

In [None]:
def read_jsons_from_folder(folder_path, clean_linking_keys=True):
    """Load every ``.json`` file directly inside ``folder_path`` as a Document.

    A second definition with the same name appears later in the notebook and
    shadows this one once its cell has run.

    Args:
        folder_path: Directory to scan (sub-directories are not visited).
        clean_linking_keys: When true, the linking / NIL feature keys listed in
            ``linking_keys`` are removed from every annotation's ``features``
            before the Document is built.

    Returns:
        A list of Documents, in the order ``os.listdir`` reports the files.
    """
    import os

    # Keys to remove from annotations
    linking_keys = (
        "linking_probability", "linking_score", "candidates", "linking",
        "nil_score", "nil_probability", "nil_label", "nil_prediction",
    )

    documents = []
    for file in os.listdir(folder_path):
        if not file.endswith(".json"):
            continue
        with open(os.path.join(folder_path, file), "r", encoding="utf-8") as f:
            content = json.load(f)

        # Clean the annotations by removing linking keys if requested
        if clean_linking_keys and "annotation_sets" in content:
            for ann_set in (content["annotation_sets"] or {}).values():
                for annotation in ann_set.get("annotations") or []:
                    # Look the features up once per annotation; a missing, null or
                    # empty "features" entry has nothing to clean and is skipped
                    # (a null value used to raise TypeError).
                    features = annotation.get("features")
                    if not features:
                        continue
                    for key in linking_keys:
                        features.pop(key, None)

        # Convert to GateNLP document
        documents.append(Document.from_dict(content))

    print(f"Loaded {len(documents)} documents from {folder_path}")
    return documents


# Reload documents from the output folder into nerDocs with the linking / NIL
# feature keys removed, so the linking step below starts from clean annotations.
# NOTE(review): the next cell assigns nerDocs again from a joblib file.
nerDocs = read_jsons_from_folder(
    "./DocumentsStore/output",
    clean_linking_keys=True,
)

In [None]:
# Load NER-annotated documents from a joblib file (replaces the current nerDocs).
# joblib.load unpickles the file, so only load files from a trusted source.
# NOTE(review): the dump cell above writes 'nerDocsBologna.joblib', not this
# file name — confirm which dataset is intended.
import joblib
nerDocs = joblib.load('nerDocsBatini.joblib')

In [None]:
# Bi-encoder linking: post every document in nerDocs to the
# /api/blink/biencoder/mention/doc endpoint and collect the returned documents.
import time
from tqdm import tqdm

now = time.time()
linkedDocs = []
# (index, name, error) for each document that could not be linked.
linking_failed = []
for idx, document in enumerate(tqdm(nerDocs)):
    try:
        res = requests.post(
            "http://vm.chronos.disco.unimib.it:10881/api/blink/biencoder/mention/doc",
            json=document.to_dict(),
        )
        # raise_for_status() carries the status code and URL; the previous bare
        # `assert res.ok` produced an exception with an empty message.
        res.raise_for_status()
        linkedDocs.append(Document.from_dict(res.json()))
    except Exception as e:
        doc_name = getattr(document, "name", None)
        linking_failed.append((idx, doc_name, e))
        print(f"error on document {idx} ({doc_name}): {e!r}")
elapsed = time.time() - now
print(elapsed)
# Every response is checked inside the loop, so the old trailing `assert res.ok`
# (last response only, NameError when nerDocs is empty) is replaced by a summary.
if linking_failed:
    print(f"{len(linking_failed)} of {len(nerDocs)} documents failed and are missing from linkedDocs")

# Indexing

In [None]:
# Indexing: post every document in linkedDocs to the /api/indexer/search/doc
# endpoint and collect the returned documents.
import time
from tqdm import tqdm

now = time.time()
indexed = []
# (index, name, error) for each document that could not be processed.
indexing_failed = []
for idx, document in enumerate(tqdm(linkedDocs)):
    try:
        res = requests.post(
            "http://vm.chronos.disco.unimib.it:10881/api/indexer/search/doc",
            json=document.to_dict(),
        )
        # Reject HTTP error responses instead of trying to parse their body as a
        # document.
        res.raise_for_status()
        indexed.append(Document.from_dict(res.json()))
    except Exception as e:
        doc_name = getattr(document, "name", None)
        indexing_failed.append((idx, doc_name, e))
        print(f"error on document {idx} ({doc_name}): {e!r}")
elapsed = time.time() - now
print(elapsed)
if indexing_failed:
    print(f"{len(indexing_failed)} of {len(linkedDocs)} documents failed and are missing from indexed")

# Nil prediction

In [None]:
# NIL prediction: post every document in indexed to the /api/nilprediction/doc
# endpoint and collect the returned documents.
import time
from tqdm import tqdm

now = time.time()
nilDocs = []
# (index, name, error) for each document that could not be processed.
nil_failed = []
for idx, document in enumerate(tqdm(indexed)):
    try:
        res = requests.post(
            "http://vm.chronos.disco.unimib.it:10881/api/nilprediction/doc",
            json=document.to_dict(),
        )
        # Reject HTTP error responses instead of trying to parse their body as a
        # document.
        res.raise_for_status()
        nilDocs.append(Document.from_dict(res.json()))
    except Exception as e:
        doc_name = getattr(document, "name", None)
        nil_failed.append((idx, doc_name, e))
        print(f"error on document {idx} ({doc_name}): {e!r}")
elapsed = time.time() - now
print(elapsed)
if nil_failed:
    print(f"{len(nil_failed)} of {len(indexed)} documents failed and are missing from nilDocs")

In [None]:
# Number of documents that came back from the NIL prediction step.
len(nilDocs)

In [None]:
# Persist the NIL-prediction output so later steps can be resumed without
# calling the services again.
import joblib 
joblib.dump(nilDocs, 'nilDocsBatini.joblib')

# Clustering

In [None]:
# read jsons from folder
def read_jsons_from_folder(folder_path, clean_linking_keys=True):
    """Load every ``.json`` file directly inside ``folder_path`` as a Document.

    This re-defines a function of the same name that already exists earlier in
    the notebook (bi-encoder linking section); running this cell shadows the
    earlier definition, so the two should be kept in sync or merged.

    Args:
        folder_path: Directory to scan (sub-directories are not visited).
        clean_linking_keys: When true, the linking / NIL feature keys listed in
            ``linking_keys`` are removed from every annotation's ``features``
            before the Document is built.

    Returns:
        A list of Documents, in the order ``os.listdir`` reports the files.
    """
    import os

    # Keys to remove from annotations
    linking_keys = (
        "linking_probability", "linking_score", "candidates", "linking",
        "nil_score", "nil_probability", "nil_label", "nil_prediction",
    )

    documents = []
    for file in os.listdir(folder_path):
        if not file.endswith('.json'):
            continue
        with open(os.path.join(folder_path, file), 'r', encoding='utf-8') as f:
            content = json.load(f)

        # Clean the annotations by removing linking keys if requested
        if clean_linking_keys and "annotation_sets" in content:
            for ann_set in (content["annotation_sets"] or {}).values():
                for annotation in ann_set.get("annotations") or []:
                    # Look the features up once per annotation; a missing, null or
                    # empty "features" entry has nothing to clean and is skipped
                    # (a null value used to raise TypeError).
                    features = annotation.get("features")
                    if not features:
                        continue
                    for key in linking_keys:
                        features.pop(key, None)

        # Convert to GateNLP document
        documents.append(Document.from_dict(content))

    print(f"Loaded {len(documents)} documents from {folder_path}")
    return documents

# Reload documents from the output folder into nilDocs (replacing the value
# produced by the NIL prediction cell).
# NOTE(review): clean_linking_keys=True also strips the nil_* features
# (nil_score, nil_probability, nil_label, nil_prediction) — confirm that the
# clustering service does not need them.
nilDocs = read_jsons_from_folder(
    "./DocumentsStore/output",
    clean_linking_keys=True
)

In [None]:
# Clustering: post every document in nilDocs to the /api/clustering endpoint and
# collect the returned documents.
import time
from tqdm import tqdm

now = time.time()
clustering = []
# (index, name, error) for each document that could not be processed.
clustering_failed = []

for idx, document in enumerate(tqdm(nilDocs)):
    try:
        res = requests.post(
            "http://vm.chronos.disco.unimib.it:10881/api/clustering",
            json=document.to_dict(),
        )
        # Check every response, not only the last one, and never parse the body
        # of an HTTP error response as a document.
        res.raise_for_status()
        clustering.append(Document.from_dict(res.json()))
    except Exception as e:
        doc_name = getattr(document, "name", None)
        clustering_failed.append((idx, doc_name, e))
        print(f"error on document {idx} ({doc_name}): {e!r}")
elapsed = time.time() - now
print(elapsed)
# Replaces the old trailing `assert res.ok`, which inspected only the final
# response and raised NameError when nilDocs was empty in a fresh kernel.
if clustering_failed:
    print(f"{len(clustering_failed)} of {len(nilDocs)} documents failed and are missing from clustering")

In [None]:
# Number of documents that came back from the clustering step.
len(clustering)

In [None]:
# Persist the clustering output so the consolidation step can be resumed later.
import joblib

joblib.dump(clustering, 'clusteringBatini.joblib')

In [None]:
# Reload the clustering output written by the previous cell.
# joblib.load unpickles the file, so only load files from a trusted source.
import joblib 
clustering = joblib.load('clusteringBatini.joblib')

In [None]:
# Consolidation: post every document in clustering to the /api/consolidation
# endpoint and collect the returned documents.
import time
from tqdm.notebook import tqdm

now = time.time()
nilConsDocs = []
# (index, name, error) for each document that could not be processed.
consolidation_failed = []
for idx, document in enumerate(tqdm(clustering)):
    try:
        res = requests.post(
            "http://vm.chronos.disco.unimib.it:10881/api/consolidation",
            json=document.to_dict(),
        )
        # Reject HTTP error responses instead of trying to parse their body as a
        # document.
        res.raise_for_status()
        nilConsDocs.append(Document.from_dict(res.json()))
    except Exception as e:
        doc_name = getattr(document, "name", None)
        consolidation_failed.append((idx, doc_name, e))
        print(f"error on document {idx} ({doc_name}): {e!r}")
elapsed = time.time() - now
print(elapsed)
if consolidation_failed:
    print(f"{len(consolidation_failed)} of {len(clustering)} documents failed and are missing from nilConsDocs")

In [None]:
# Display the first consolidated document as a sanity check.
nilConsDocs[0]

In [None]:
# Create mapping between documents and their original filenames
import os

# Get the original filenames from the source directory
source_folder = "./DocumentsStore/bologna_final"
original_filenames = [f for f in os.listdir(source_folder) if f.endswith('.txt')]

# NOTE(review): documents and filenames are paired purely by position. That is
# only correct when nilConsDocs still has one entry per .txt file in the same
# order as this listing; any document dropped by an earlier pipeline step shifts
# every later pairing. Surface a count mismatch before assigning names.
if len(original_filenames) != len(nilConsDocs):
    print(
        f"Warning: {len(nilConsDocs)} documents but {len(original_filenames)} .txt files; "
        "positional matching may be misaligned"
    )

# Create a mapping dictionary
file_to_document_mapping = {}

# Match documents with their original filenames
for i, document in enumerate(nilConsDocs):
    if i < len(original_filenames):
        # Get the original filename without extension. splitext removes only the
        # trailing extension, matching how read_txt_files names documents;
        # str.replace would also delete ".txt" occurring inside the name.
        original_filename = original_filenames[i]
        doc_name = os.path.splitext(original_filename)[0]

        # Set the document name to the original filename (without extension)
        document.name = doc_name

        # Add to mapping
        file_to_document_mapping[doc_name] = document

        print(f"Document {i}: {doc_name}")
    else:
        print(f"Warning: Document {i} has no corresponding filename")

print(f"\nCreated mapping for {len(file_to_document_mapping)} documents")
print("Sample document names:", list(file_to_document_mapping.keys())[:5])

In [None]:
# Write each document in nilDocs to disk as JSON, one file per document, named
# after the document.
# NOTE(review): the preceding cell assigns names to the documents in nilConsDocs,
# but this loop serializes nilDocs — confirm which collection should be saved.

for document in nilDocs:
    target_path = f"DocumentsStore/bologna_final/{document.name}.json.annotated"
    with open(target_path, "w") as sink:
        json.dump(document.to_dict(), sink)

In [None]:
# Persist the consolidated documents. joblib is not imported in this cell, so
# one of the earlier cells containing `import joblib` must have run first.
joblib.dump(nilConsDocs, './DocumentsStore/finalDocsBatini.joblib')