In [11]:
from pathlib import Path
import os
import json
from typing import List

# Load every JSON-annotated document found in the ./out_anon folder.
documents_path = Path("./out_anon")
# iterdir() yields entries in arbitrary, filesystem-dependent order, so the list is sorted
# to make the document order (and anything derived from list position) reproducible.
# Directories whose name happens to end in ".json" are skipped: they cannot be opened.
json_files: List[Path] = sorted(
    p for p in documents_path.iterdir() if p.is_file() and p.name.endswith('.json')
)

data = []
for json_file in json_files:
    with json_file.open('r', encoding='utf-8') as f:
        fileObject = json.load(f)
    data.append(fileObject)

print(f'Loaded {len(data)} documents from {documents_path}')

Loaded 1 documents from out_anon


In [4]:
# Number of loaded documents (sanity check on `data`).
# NOTE(review): the saved output of this cell (48) disagrees with the load cell's
# "Loaded 1 documents" message, so the saved outputs come from different runs;
# re-run the notebook top to bottom to refresh them.
len(data)

48

In [59]:
# Write each document's `text` field to DocumentsStore/bolognaSource/<index>.txt.
# The whole text is written, not a truncated preview.
# NOTE(review): file names are positional indexes into `data`, so they are only stable
# if `data` is loaded in the same order on every run; confirm against the load cell.
from pathlib import Path
out_dir = Path('DocumentsStore/bolognaSource')
out_dir.mkdir(parents=True, exist_ok=True)
for i, fileObject in enumerate(data):
    text = fileObject.get('text', '')  # a document without a 'text' key yields an empty file
    path = out_dir / f'{i}.txt'
    path.write_text(text, encoding='utf-8')
    # avoid noisy per-file prints in bulk operations
print(f'Wrote {len(data)} text files to {out_dir}')

Wrote 48 text files to DocumentsStore/bolognaSource


In [None]:
# Example: load a single indexed.json file if present.
# `data` is only replaced when indexed.json is actually read. Resetting it to None up
# front would throw away the documents loaded earlier, even though the fallback message
# promises to keep using them.
from pathlib import Path
indexed_file = Path('./DocumentsStore/indexed.json')
if indexed_file.exists():
    with indexed_file.open('r', encoding='utf-8') as f:
        data = json.load(f)
    print('Loaded indexed.json')
else:
    # Keep whatever an earlier cell loaded; fall back to None so `data` is always defined.
    data = globals().get('data')
    print('indexed.json not found; using previously loaded data if any')

In [None]:
# Alternative input: restore a previously serialized document collection with joblib.
# SECURITY: joblib files are pickle-based, so loading one can execute arbitrary code.
# Only point this at files you produced yourself.
import joblib
joblib_path = Path('./DocumentsStore/finalDocsBatini.joblib')
if not joblib_path.exists():
    print('joblib file not found; skipping')
else:
    data = joblib.load(str(joblib_path))
    print('Loaded joblib data')

In [12]:
import hashlib
from typing import Dict, Any
import logging

# Logger used by the helper functions and processing cells of this notebook.
logger = logging.getLogger(__name__)
if not logger.handlers:
    # No handler is attached to this logger yet: apply the default logging
    # configuration at INFO level so messages show up during notebook runs.
    logging.basicConfig(level=logging.INFO)

def get_string_hash(input_string: str) -> str:
    """Return the SHA-256 hex digest of ``input_string``.

    The string is encoded as UTF-8 with the ``surrogatepass`` error handler, so text
    that contains lone surrogate code points is hashed instead of raising
    ``UnicodeEncodeError``. Strings without surrogates yield exactly the same digest
    as a plain UTF-8 encoding would.

    Args:
        input_string: The text to hash.

    Returns:
        The 64-character lowercase hexadecimal digest.
    """
    return hashlib.sha256(input_string.encode('utf-8', 'surrogatepass')).hexdigest()

def clean_doc(document: Dict[str, Any]) -> Dict[str, Any]:
    """Strip MongoDB bookkeeping fields from a document in place and return it.

    Removes ``_id``, ``inc_id``, ``__v`` and ``edited`` from the document itself,
    ``_id`` from every annotation set, and ``_id`` / ``annotationSetId`` from every
    annotation. Missing fields are ignored. Malformed parts (``annotation_sets`` or
    ``annotations`` set to ``None``, non-dict sets or annotations) are left untouched
    instead of raising.

    Args:
        document: The document dictionary to clean; it is modified in place.

    Returns:
        The same ``document`` object.
    """
    for field in ('_id', 'inc_id', '__v', 'edited'):
        document.pop(field, None)

    # `or {}` also covers an explicit null value, which .get(..., {}) would pass through.
    ann_sets = document.get('annotation_sets') or {}
    if not isinstance(ann_sets, dict):
        return document

    for annset in ann_sets.values():
        if not isinstance(annset, dict):
            continue
        annset.pop('_id', None)
        for annotation in annset.get('annotations') or []:
            if isinstance(annotation, dict):
                annotation.pop('_id', None)
                annotation.pop('annotationSetId', None)
    return document

def remove_surrogates(text: str) -> str:
    """Return ``text`` with unpaired surrogate code points dropped.

    The string is round-tripped through UTF-16: surrogates are passed through on
    encoding and anything that cannot be decoded again is ignored. Values that are
    not strings are returned unchanged.
    """
    if isinstance(text, str):
        utf16_bytes = text.encode('utf-16', 'surrogatepass')
        return utf16_bytes.decode('utf-16', errors='ignore')
    return text

def process_doc_for_mongo(obj: Dict[str, Any], database) -> None:
    """Insert a document, its annotation sets and their annotations into MongoDB.

    One record goes to ``database['documents']``, one per annotation set to
    ``database['annotationSets']`` and one per annotation to
    ``database['annotations']``. The document id is the SHA-256 digest of the
    sanitized text; each annotation is linked to its set through
    ``annotationSetId``.

    Args:
        obj: Source document with ``text``, ``name``, ``features``, ``offset_type``
            and ``annotation_sets`` entries (all optional). It is not modified.
        database: Mapping-like handle giving access to the three collections by name.

    Returns:
        None. Failures are logged and never raised; a failing annotation does not
        stop the remaining annotations from being inserted.
    """
    try:
        # Sanitize before hashing: hashing raw text that contains lone surrogates raises
        # UnicodeEncodeError, which would drop the whole document. Text without
        # surrogates is unchanged by the sanitizer, so existing ids stay the same.
        text = remove_surrogates(obj.get('text') or '')
        doc_id = get_string_hash(text)
        database['documents'].insert_one({
            'text': text,
            'preview': text[:100] + '...',
            'name': remove_surrogates(obj.get('name', '')),
            'features': obj.get('features', {}),
            'offset_type': obj.get('offset_type'),
            'id': doc_id,
        })

        annotation_sets = obj.get('annotation_sets') or {}
        annset_collection = database['annotationSets']
        annset_id_map = {}
        for name, annset in annotation_sets.items():
            annset_fields = annset if isinstance(annset, dict) else {}
            inserted = annset_collection.insert_one({
                'name': name,
                'docId': doc_id,
                'next_annid': annset_fields.get('next_annid', 1),
            })
            annset_id_map[name] = inserted.inserted_id

        annotation_collection = database['annotations']
        for name, annset in annotation_sets.items():
            annset_fields = annset if isinstance(annset, dict) else {}
            for annotation in annset_fields.get('annotations') or []:
                try:
                    # Insert a copy: insert_one() adds an `_id` to the dict it is given,
                    # which would leak driver-specific values into the caller's document.
                    record = dict(annotation)
                    record.pop('_id', None)
                    record['annotationSetId'] = annset_id_map[name]
                    features = record.get('features')
                    if isinstance(features, dict) and 'mention' in features:
                        record['features'] = {**features, 'mention': remove_surrogates(features['mention'])}
                    annotation_collection.insert_one(record)
                except Exception as e:
                    logger.exception('Error inserting annotation for doc %s: %s', doc_id, e)
    except Exception as e:
        logger.exception('Error processing document to Mongo: %s', e)

In [13]:
# --- Entity preprocessing switches ---
ENABLE_ENTITY_PREPROCESSING = False  # True -> automatically detect additional entity mentions
MIN_MENTION_LENGTH = 2  # mentions shorter than this are not used for matching
CASE_SENSITIVE = False  # True -> match mentions case-sensitively
# --- Cluster merging switch ---
ENABLE_CLUSTER_MERGING = False  # True -> merge duplicate annotation sets
# Dry-run (no writes to Mongo) stays on unless the DRY_RUN environment variable is exactly '0'
DRY_RUN = not (os.environ.get('DRY_RUN', '1') == '0')
print(f'ENTITY_PREPROCESSING={ENABLE_ENTITY_PREPROCESSING}, CLUSTER_MERGING={ENABLE_CLUSTER_MERGING}, DRY_RUN={DRY_RUN}')

ENTITY_PREPROCESSING=False, CLUSTER_MERGING=False, DRY_RUN=True


In [14]:
from pathlib import Path

# Switch and destination for persisting enhanced documents as JSON files
SAVE_ENHANCED_DOCUMENTS = False  # True -> enhanced documents are written to disk
ENHANCED_DOCUMENTS_FOLDER = './DocumentsStore/output_enhanced'  # destination folder
ENHANCED_DOCS_PATH = Path(ENHANCED_DOCUMENTS_FOLDER)
# Create the destination (and any missing parents) up front; a no-op when it already exists
ENHANCED_DOCS_PATH.mkdir(exist_ok=True, parents=True)

In [15]:
import re
from collections import defaultdict

def find_missing_entities(document):
    """Annotate un-annotated repeats of mentions that are already annotated.

    For every annotation set, each distinct annotated surface string is searched for
    in the document text as a whole token. A match that overlaps an existing (or
    newly added) annotation of the same set is skipped. Every accepted match becomes
    a new annotation that copies ``type`` and ``features`` from the first annotation
    of that mention and receives the next free id of the set.

    Matching honours the module-level ``MIN_MENTION_LENGTH`` and ``CASE_SENSITIVE``
    settings.

    Args:
        document: Dictionary with ``text`` and ``annotation_sets``; the annotation
            sets are extended in place.

    Returns:
        The same ``document`` object.
    """
    text = document.get('text', '')
    annotation_sets = document.get('annotation_sets', {})
    flags = 0 if CASE_SENSITIVE else re.IGNORECASE
    total_added = 0

    for ann_set in annotation_sets.values():
        if not isinstance(ann_set, dict):
            continue
        annotations = ann_set.get('annotations', [])

        # First annotation seen for each surface string serves as the template.
        templates = {}
        covered = set()
        for annotation in annotations:
            start = annotation.get('start')
            end = annotation.get('end')
            if start is None or end is None:
                continue
            templates.setdefault(text[start:end], annotation)
            covered.update(range(start, end))

        new_annotations = []
        next_annid = ann_set.get('next_annid', len(annotations) + 1)
        for mention, template in templates.items():
            core = mention.strip()
            # An empty mention would match everywhere and create zero-length annotations.
            if not core or len(core) < MIN_MENTION_LENGTH:
                continue
            # `\b` only works next to a word character: around a mention that starts or
            # ends with punctuation (e.g. "U.S.") it demands a word character on the
            # OUTSIDE, so the mention is never found in ordinary text. Lookarounds that
            # merely forbid an adjacent word character behave like `\b` for word-edged
            # mentions and correctly for punctuation-edged ones. No assertion is added
            # next to whitespace that belongs to the mention itself.
            left = '' if mention[0].isspace() else r'(?<!\w)'
            right = '' if mention[-1].isspace() else r'(?!\w)'
            pattern = re.compile(left + re.escape(mention) + right, flags)
            for m in pattern.finditer(text):
                s, e = m.span()
                if not covered.isdisjoint(range(s, e)):
                    continue
                new_annotations.append({
                    'start': s,
                    'end': e,
                    'type': template.get('type', 'Unknown'),
                    # `or {}` tolerates templates whose features are None
                    'features': dict(template.get('features') or {}),
                    'id': next_annid,
                })
                covered.update(range(s, e))
                next_annid += 1

        if new_annotations:
            annotations.extend(new_annotations)
            ann_set['next_annid'] = next_annid
            total_added += len(new_annotations)

    logger.info('Document %s: added %s new entities', document.get('name', 'unknown'), total_added)
    return document

In [16]:
def merge_duplicate_annotation_sets(document):
    """Merge clusters whose titles are equal ignoring case and surrounding whitespace.

    Operates on ``document['features']['clusters']``, a mapping from annotation-set
    name to a list of cluster dictionaries. Within each list the first cluster with a
    given title is kept; the ``mentions`` of later clusters with the same title are
    appended to it and those later clusters are removed. Clusters without a usable
    title are kept as they are.

    Args:
        document: Document dictionary; its cluster lists are replaced in place.

    Returns:
        The same ``document`` object.
    """
    features = document.get('features') or {}  # tolerate an explicit None
    clusters = features.get('clusters')
    if not clusters or not isinstance(clusters, dict):
        logger.debug('No clusters structure found; skipping merge')
        return document

    total_clusters_merged = 0
    total_mentions_moved = 0
    for ann_set_name, cluster_list in clusters.items():
        if not isinstance(cluster_list, list):
            continue
        title_map = {}
        new_clusters = []
        for cluster in cluster_list:
            title = cluster.get('title') if isinstance(cluster, dict) else None
            if not title or not isinstance(title, str):
                # Nothing to compare on: keep the entry untouched.
                new_clusters.append(cluster)
                continue
            key = title.lower().strip()
            if key not in title_map:
                title_map[key] = cluster
                new_clusters.append(cluster)
                continue
            # A duplicate is always dropped from the list, so it is always counted,
            # including when it carries no mentions to move.
            total_clusters_merged += 1
            mentions = cluster.get('mentions') or []
            if mentions:
                title_map[key].setdefault('mentions', []).extend(mentions)
                total_mentions_moved += len(mentions)
        clusters[ann_set_name] = new_clusters
        logger.info('After merging %s clusters in %s', len(new_clusters), ann_set_name)

    if total_clusters_merged:
        logger.info('Merged %s clusters and moved %s mentions in total', total_clusters_merged, total_mentions_moved)
    return document

In [17]:
import os
import json
from bson import ObjectId
from datetime import datetime
from pathlib import Path

def clean_for_json(obj):
    """Return a copy of ``obj`` that the ``json`` module can serialize.

    Dictionaries and lists are rebuilt recursively. Dictionary entries keyed ``_id``
    or ``annotationSetId``, or whose key starts with a double underscore, are left
    out. ``ObjectId`` values become strings and ``datetime`` values become ISO-8601
    strings. Every other value is returned as is.
    """
    if isinstance(obj, dict):
        cleaned = {}
        for field, value in obj.items():
            if field in ('_id', 'annotationSetId') or field.startswith('__'):
                continue
            cleaned[field] = clean_for_json(value)
        return cleaned
    if isinstance(obj, list):
        return list(map(clean_for_json, obj))
    if isinstance(obj, ObjectId):
        return str(obj)
    return obj.isoformat() if isinstance(obj, datetime) else obj

def save_enhanced_document(document, output_folder: Path):
    """Write ``document`` as pretty-printed JSON into ``output_folder``.

    Does nothing unless ``SAVE_ENHANCED_DOCUMENTS`` is true. The file name is the
    document's ``name`` reduced to alphanumerics, spaces, ``-``, ``_`` and ``.``;
    documents without a name get ``document_<hash of the first 64 text characters>``.
    If writing under that name fails, a second attempt is made under a
    ``document_<digest>.json`` fallback name.

    Args:
        document: The document dictionary to serialize.
        output_folder: Destination directory; created if missing.

    Returns:
        None. Serialization and write failures are logged, not raised.
    """
    import hashlib

    if not SAVE_ENHANCED_DOCUMENTS:
        return
    output_folder.mkdir(parents=True, exist_ok=True)
    doc_name = document.get('name') or ('document_' + get_string_hash((document.get('text') or '')[:64]))
    safe_name = ''.join(c for c in doc_name if c.isalnum() or c in (' ', '-', '_', '.')).strip() or 'document'
    filename = f'{safe_name}.json'

    # Build the payload before any write attempt so the fallback below can always reuse
    # it; if cleaning itself fails there is nothing to write under any name.
    try:
        clean_document = clean_for_json(document)
    except Exception as e:
        logger.exception('Error saving enhanced document %s: %s', filename, e)
        return

    try:
        with (output_folder / filename).open('w', encoding='utf-8') as f:
            json.dump(clean_document, f, ensure_ascii=False, indent=2)
        logger.info('Enhanced document saved: %s', filename)
        return
    except Exception as e:
        logger.exception('Error saving enhanced document %s: %s', filename, e)

    try:
        # Content digest instead of the built-in hash(): hash() of a string is randomized
        # per process, so the fallback name would change on every kernel restart.
        digest = hashlib.sha256(str(document).encode('utf-8', errors='replace')).hexdigest()[:8]
        fallback = output_folder / f'document_{digest}.json'
        with fallback.open('w', encoding='utf-8') as f:
            json.dump(clean_document, f, ensure_ascii=False, indent=2)
        logger.info('Enhanced document saved with fallback name: %s', fallback.name)
    except Exception as e2:
        logger.exception('Failed to save enhanced document even with fallback: %s', e2)

In [13]:
# Show the connection target without echoing the credentials into the saved notebook output.
# NOTE(review): MONGO_URI is assigned in the MongoDB connection cell, which must run first.
import re
print(re.sub(r'(?<=://).*@', '***:***@', MONGO_URI, count=1) if MONGO_URI else MONGO_URI)


mongodb://***:***@127.0.0.1:27018/


In [18]:
from pymongo import MongoClient
from tqdm.notebook import tqdm
import os
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get MongoDB credentials from environment variables
MONGO_USER = os.environ.get('MONGO_USER')
MONGO_PASSWORD = os.environ.get('MONGO_PASSWORD')

# Construct the Mongo URI. User name and password are percent-encoded because characters
# such as '@', ':' or '/' inside them would otherwise corrupt the URI. Purely alphanumeric
# credentials are unchanged by the encoding.
from urllib.parse import quote_plus
MONGO_URI = (
    f"mongodb://{quote_plus(MONGO_USER)}:{quote_plus(MONGO_PASSWORD)}@127.0.0.1:27018/"
    if MONGO_USER and MONGO_PASSWORD else None
)

DATABASE_NAME = os.environ.get('MONGO_DB', 'anonymized')
# NOTE(review): this hard-coded value replaces any DRY_RUN set earlier in the notebook
# (including one derived from the DRY_RUN environment variable). Confirm this is intended.
DRY_RUN = False

# Configuration for database deletion
DELETE_EXISTING_DATABASE = True  # Set to True to delete existing database before processing
CONFIRM_DELETE = True  # Set to False to skip confirmation prompt

# Without credentials MONGO_URI is None, and MongoClient(None) falls back to the driver's
# default host instead of the server configured above. Fail loudly rather than reading,
# writing or dropping a database on an unintended server.
if not DRY_RUN and MONGO_URI is None:
    raise RuntimeError(
        'MONGO_USER and MONGO_PASSWORD must be set (environment or .env file) when DRY_RUN is disabled'
    )

client = MongoClient(MONGO_URI) if not DRY_RUN else None
db = client[DATABASE_NAME] if client is not None else None

if DRY_RUN:
    logger.info('DRY_RUN is enabled; no writes to MongoDB will be performed')

# Optionally drop the target database before loading. This only happens on a real
# (non dry-run) run with a live client, and asks for confirmation unless CONFIRM_DELETE is off.
if DELETE_EXISTING_DATABASE and client is not None and not DRY_RUN:
    if not CONFIRM_DELETE:
        client.drop_database(DATABASE_NAME)
        logger.info(f'Database "{DATABASE_NAME}" has been deleted automatically')
        # Fetch the handle again; the database is recreated on first use
        db = client[DATABASE_NAME]
    else:
        response = input(f"Are you sure you want to delete the database '{DATABASE_NAME}'? (yes/no): ")
        if response.lower() not in ('yes', 'y'):
            logger.info('Database deletion cancelled')
        else:
            client.drop_database(DATABASE_NAME)
            logger.info(f'Database "{DATABASE_NAME}" has been deleted')
            # Fetch the handle again; the database is recreated on first use
            db = client[DATABASE_NAME]

# Run every loaded document through the pipeline:
# clean -> optional cluster merge -> optional entity preprocessing -> Mongo insert.
if data is None:
    # The optional indexed.json loader cell can leave `data` as None; iterating
    # tqdm(None) would raise a TypeError, so report it and process nothing.
    logger.warning('No documents loaded; nothing to process')

for doc in tqdm(data if data is not None else []):
    try:
        doc = clean_doc(doc)
    except Exception as e:
        logger.exception('Error cleaning document %s: %s', doc.get('id', 'unknown'), e)
        continue

    if ENABLE_CLUSTER_MERGING:
        try:
            doc = merge_duplicate_annotation_sets(doc)
        except Exception as e:
            logger.exception('Error merging duplicate annotation sets for doc %s: %s', doc.get('id', 'unknown'), e)

    if ENABLE_ENTITY_PREPROCESSING:
        try:
            doc = find_missing_entities(doc)
            save_enhanced_document(doc, ENHANCED_DOCS_PATH)
        except Exception as e:
            logger.exception('Error in entity preprocessing for doc %s: %s', doc.get('id', 'unknown'), e)

    try:
        if not DRY_RUN and db is not None:
            process_doc_for_mongo(doc, db)
        else:
            # In dry-run mode, just validate the document processing path
            logger.debug('Dry-run: would process doc %s', doc.get('id', 'unknown'))
    except Exception as e:
        logger.exception('Error processing doc to Mongo %s: %s', doc.get('id', 'unknown'), e)

INFO:__main__:Database deletion cancelled


  0%|          | 0/1 [00:00<?, ?it/s]