Library Installation

In [None]:
!pip install neo4j python-dotenv
!pip install git+https://github.com/vidraj/derinet.git#subdirectory=tools/data-api/derinet2

print("Installation complete and verified.")

Collecting git+https://github.com/vidraj/derinet.git#subdirectory=tools/data-api/derinet2
  Cloning https://github.com/vidraj/derinet.git to /tmp/pip-req-build-l2qgsarm
  Running command git clone --filter=blob:none --quiet https://github.com/vidraj/derinet.git /tmp/pip-req-build-l2qgsarm
  Resolved https://github.com/vidraj/derinet.git to commit cc8af7386a80456ae2d731eac8fd34bff6ae49a0
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Installation complete and verified.


Configuration

In [None]:
import os
import json
from dotenv import load_dotenv
from neo4j import GraphDatabase
import derinet.lexicon as dlex
import math
# --- Configuration ---
# Load a local .env file (if one exists) so real credentials never have to be
# typed into the notebook. By default load_dotenv() does not override variables
# that are already set in the process environment.
load_dotenv()

# Values from the environment / .env win; the literals below are only
# placeholders used when nothing else is configured.
NEO4J_URI = os.getenv("NEO4J_URI", "XXXX")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "XXXX")

# Path of the extracted DeriNet TSV file and the number of items sent to
# Neo4j per query.
DERINET_DATA_PATH = "derinet-2-3.tsv"
BATCH_SIZE = 5000

# Publish the effective settings: convert_derinet_to_neo4j() reads them back
# with os.getenv().
os.environ['NEO4J_URI'] = NEO4J_URI
os.environ['NEO4J_USERNAME'] = NEO4J_USERNAME
os.environ['NEO4J_PASSWORD'] = NEO4J_PASSWORD

print("Configuration variables set.")

Configuration variables set.


Downloading DeriNet 2.3 Data

In [None]:
DERINET_ZIP_URL = "https://lindat.mff.cuni.cz/repository/server/api/core/items/62540779-b206-4cf7-ac33-399ce68e35e6/allzip?handleId=11234/1-5846"
LOCAL_ZIP_FILENAME = "derinet-2-3-all.zip"
DERINET_DATA_PATH = "derinet-2-3.tsv"

print(f"Downloading DeriNet 2.3 data package via curl...")
!curl -o $LOCAL_ZIP_FILENAME $DERINET_ZIP_URL

print(f"✅ Data package downloaded as {LOCAL_ZIP_FILENAME}.")
!unzip -o $LOCAL_ZIP_FILENAME -d .

print(f"✅ Data extracted. The target file path is set to: {DERINET_DATA_PATH}")

Downloading DeriNet 2.3 data package via curl...
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  418M    0  418M    0     0  27.3M      0 --:--:--  0:00:15 --:--:-- 31.0M
✅ Data package downloaded as derinet-2-3-all.zip.
Archive:  derinet-2-3-all.zip
  inflating: ./derinet-2-3.tsv       
✅ Data extracted. The target file path is set to: derinet-2-3.tsv


Queries and Node Creation

In [None]:
# Create a uniqueness constraint on Word.id (no-op if it already exists).
CONSTRAINT_QUERY = "CREATE CONSTRAINT word_id_unique IF NOT EXISTS FOR (w:Word) REQUIRE w.id IS UNIQUE;"

# MERGE one :Word node per entry of the $words list parameter.
# Properties are only written when the node is first created (ON CREATE SET),
# so re-running the import does not touch nodes that already exist.
# Note: Complex dicts/lists are stored as JSON strings because Neo4j properties must be primitives.
# corpus_log_count is stored as well: prepare_data() computes it for every node.
NODE_MERGE_QUERY = """
UNWIND $words AS word_data
MERGE (w:Word {id: word_data.id})
ON CREATE SET
    w.lemma = word_data.lemma,
    w.pos = word_data.pos,
    w.language = word_data.lang,
    w.is_root = word_data.is_root,
    w.features = word_data.features,
    w.morphology = word_data.morphology,
    w.misc = word_data.misc,
    w.corpus_stats = word_data.corpus_stats,
    w.corpus_absolute_count = word_data.corpus_absolute_count,
    w.corpus_log_count = word_data.corpus_log_count
"""

# MERGE one (parent)-[:COMPONENT {type}]->(child) relationship per entry of the
# $relations list parameter. Both endpoints are looked up with MATCH, so an
# entry whose parent or child node does not exist creates nothing.
REL_MERGE_QUERY = """
UNWIND $relations AS rel_data
MATCH (parent:Word {id: rel_data.parent_id})
MATCH (child:Word {id: rel_data.child_id})
MERGE (parent)-[r:COMPONENT {type: rel_data.type}]->(child)
"""

def create_index(tx):
    """Run CONSTRAINT_QUERY on the given transaction.

    Despite the name, the statement executed is the uniqueness constraint on
    ``Word.id`` defined in CONSTRAINT_QUERY; it uses ``IF NOT EXISTS``, so
    calling this again when the constraint is already present is harmless.

    Args:
        tx: Transaction object exposing ``run(query)``; the import routine
            passes this function to ``session.execute_write``.
    """
    tx.run(CONSTRAINT_QUERY)

def batch_insert(driver, query, data_list, name="Items", batch_size=None):
    """Send ``data_list`` to Neo4j in fixed-size batches, one query run per batch.

    Args:
        driver: Object exposing ``session()`` that returns a context manager
            with a ``run(query, parameters)`` method (a Neo4j driver).
        query: Cypher text. It must read its input from a list parameter named
            after ``name.lower()`` (e.g. ``$words`` for ``name="Words"``).
        data_list: Sequence of items to insert; sliced into batches.
        name: Label used in progress messages and, lower-cased, as the query
            parameter name.
        batch_size: Items per batch. Defaults to the module-level BATCH_SIZE.

    Raises:
        ValueError: If the effective batch size is not positive.
        Exception: Any error raised while running a batch is reported and
            re-raised, stopping the insertion.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    print(f"\nStarting {name} insertion...")
    total_count = len(data_list)

    if total_count == 0:
        print(f"No {name} to insert.")
        return

    # Ceiling division: `total // size + 1` reports one batch too many whenever
    # the total is an exact multiple of the batch size.
    total_batches = (total_count + batch_size - 1) // batch_size
    param_name = name.lower()

    for batch_number, start in enumerate(range(0, total_count, batch_size), start=1):
        batch = data_list[start:start + batch_size]
        try:
            with driver.session() as session:
                # consume() drains the result inside the try block, so a
                # failure reported while the result is fetched is raised here
                # rather than being left to session clean-up.
                session.run(query, {param_name: batch}).consume()
            print(f"-> Batch {batch_number} of {total_batches} processed ({len(batch)} {name}).")
        except Exception as e:
            print(f"!!! Error in {name} batch starting at index {start}. Stopping: {e}")
            raise

def _extract_corpus_stats(lexeme):
    """Return the lexeme's ``corpus_stats`` dict, or None if it has none.

    ``extra_data`` is checked first, then ``misc``.
    NOTE(review): the DeriNet 2.x API appears to expose the JSON column
    (including corpus_stats) as ``lexeme.misc`` -- confirm whether
    ``extra_data`` exists at all. Without the ``misc`` fallback every count
    would be 0 when ``extra_data`` is missing.
    """
    for attr_name in ("extra_data", "misc"):
        container = getattr(lexeme, attr_name, None)
        if container and "corpus_stats" in container:
            stats = container["corpus_stats"]
            if isinstance(stats, dict):
                return stats
    return None


def prepare_data(lexicon):
    """Flatten a lexicon into node and relationship dicts for the import queries.

    Args:
        lexicon: Object exposing ``iter_lexemes()``. Each lexeme must provide
            ``lemid``, ``lemma``, ``pos``, ``parent``, ``feats`` and ``misc``;
            ``lang``, ``segmentation``, ``parent_relations`` and the corpus
            statistics are optional.

    Returns:
        tuple: ``(word_nodes, word_relations)``.
            * ``word_nodes`` -- one dict per lexeme with the keys ``id``,
              ``lemma``, ``pos``, ``lang``, ``is_root``, ``features``,
              ``misc``, ``morphology``, ``corpus_stats``,
              ``corpus_absolute_count`` and ``corpus_log_count``. Structured
              values are serialised to JSON strings.
            * ``word_relations`` -- one dict per (relation, source) pair with
              the keys ``child_id``, ``parent_id`` and ``type``.
    """
    word_nodes = []
    word_relations = []

    print("Iterating over lexicon to prepare data...")

    for lexeme in lexicon.iter_lexemes():

        primary_parent_id = lexeme.parent.lemid if lexeme.parent else None

        # Corpus statistics: kept as a JSON string plus a numeric count.
        stats = _extract_corpus_stats(lexeme)
        if stats is None:
            corpus_stats_str = "{}"
            absolute_count = 0
        else:
            corpus_stats_str = json.dumps(stats)
            # A missing or null count is treated as 0 so the log below cannot fail on None.
            absolute_count = stats.get('absolute_count') or 0

        morph_json = json.dumps(lexeme.segmentation) if hasattr(lexeme, 'segmentation') else "[]"

        word_nodes.append({
            "id": lexeme.lemid,
            "lemma": lexeme.lemma,
            "pos": str(lexeme.pos),
            "lang": getattr(lexeme, 'lang', 'cs'),
            # A lexeme without a primary parent is a root.
            "is_root": primary_parent_id is None,
            "features": json.dumps(lexeme.feats),
            "misc": json.dumps(lexeme.misc),
            "morphology": morph_json,
            "corpus_stats": corpus_stats_str,
            "corpus_absolute_count": absolute_count,
            # +1 keeps the logarithm defined for a count of 0.
            "corpus_log_count": math.log(absolute_count + 1)
        })

        if hasattr(lexeme, 'parent_relations'):
            for rel in lexeme.parent_relations:
                rel_type = getattr(rel, 'type', 'Derivation')

                # A single relation (like Compounding) can have multiple sources.
                # Emit one edge per source of this specific relation.
                for source_lexeme in rel.sources:
                    word_relations.append({
                        "child_id": lexeme.lemid,
                        "parent_id": source_lexeme.lemid,
                        "type": rel_type
                    })

    return word_nodes, word_relations

Batch Uploading/Execution

In [None]:
def convert_derinet_to_neo4j(max_nodes=75000):
    """Load the DeriNet file, keep the most frequent words and import them into Neo4j.

    Steps: load the lexicon from DERINET_DATA_PATH, flatten it with
    prepare_data(), keep the ``max_nodes`` nodes with the highest
    ``corpus_absolute_count`` (and only relationships between kept nodes),
    then create the uniqueness constraint and insert nodes and relationships
    in batches.

    Connection settings are read from the NEO4J_URI, NEO4J_USERNAME and
    NEO4J_PASSWORD environment variables. Load and database errors are
    reported with print() rather than propagated.

    Args:
        max_nodes: Maximum number of word nodes to import. ``None`` disables
            the frequency filter and imports everything.

    Raises:
        ValueError: If ``max_nodes`` is negative.
    """
    if max_nodes is not None and max_nodes < 0:
        raise ValueError(f"max_nodes must be non-negative or None, got {max_nodes}")

    # Connection settings come from the environment (set by the configuration cell).
    URI = os.getenv("NEO4J_URI")
    AUTH = (os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD"))

    print(f"Loading DeriNet data from {DERINET_DATA_PATH}...")
    try:
        lexicon = dlex.Lexicon()
        lexicon.load(data_source=DERINET_DATA_PATH, fmt=dlex.Format.DERINET_V2)
        print("✅ DeriNet file loaded successfully.")
    except Exception as e:
        print(f"❌ Failed to load DeriNet data: {e}")
        return

    # Flatten the lexicon into plain dicts that can be sent as query parameters.
    word_nodes, word_relations = prepare_data(lexicon)

    num_words = len(word_nodes)
    num_rels = len(word_relations)
    if num_words == 0:
        print("❌ Error: Lexicon yielded 0 nodes.")
        return

    print(f"✅ Data prepared. Nodes: {num_words}, Relationships: {num_rels}")

    if max_nodes is not None:
        # --- Filter Word Nodes by Frequency ---
        print(f"Filtering word nodes by corpus_absolute_count. Total nodes: {len(word_nodes)}")
        # sorted() is stable, so nodes with equal counts keep their lexicon order.
        word_nodes = sorted(
            word_nodes, key=lambda node: node['corpus_absolute_count'], reverse=True
        )[:max_nodes]
        kept_word_ids = {node['id'] for node in word_nodes}

        # Keep only relationships whose parent AND child both survived the cut.
        original_relations_count = len(word_relations)
        word_relations = [
            rel for rel in word_relations
            if rel['parent_id'] in kept_word_ids and rel['child_id'] in kept_word_ids
        ]

        print(f"✅ Filtered nodes to top {max_nodes:,} based on frequency. Remaining nodes: {len(word_nodes)}")
        print(f"✅ Filtered relationships. Original: {original_relations_count}, Remaining: {len(word_relations)}")

    # Connect to Neo4j and Import
    driver = None
    try:
        driver = GraphDatabase.driver(URI, auth=AUTH)
        # Verify connection before starting large transactions
        driver.verify_connectivity()
        print("\n✅ Connection to Neo4j established.")

        # Create the uniqueness constraint before inserting any nodes.
        with driver.session() as session:
            session.execute_write(create_index)
        print("✅ Unique constraint on :Word(id) created.")

        # Nodes first: the relationship query MATCHes existing nodes.
        batch_insert(driver, NODE_MERGE_QUERY, word_nodes, name="Words")

        # Insert Relationships
        batch_insert(driver, REL_MERGE_QUERY, word_relations, name="Relations")

        print("\n✨ Data import complete.")

    except Exception as e:
        print(f"\n❌ A database error occurred: {e}")
    finally:
        # Always release the driver's connections, even after a failure.
        if driver:
            driver.close()

if __name__ == "__main__":
    convert_derinet_to_neo4j()

Loading DeriNet data from derinet-2-3.tsv...
✅ DeriNet file loaded successfully.
Iterating over lexicon to prepare data...
✅ Data prepared. Nodes: 1042751, Relationships: 857538
Filtering word nodes by corpus_absolute_count. Total nodes: 1042751
✅ Filtered nodes to top 75,000 based on frequency. Remaining nodes: 75000
✅ Filtered relationships. Original: 857538, Remaining: 35272

✅ Connection to Neo4j established.
✅ Unique constraint on :Word(id) created.

Starting Words insertion...
-> Batch 1 of 16 processed (5000 Words).
-> Batch 2 of 16 processed (5000 Words).
-> Batch 3 of 16 processed (5000 Words).
-> Batch 4 of 16 processed (5000 Words).
-> Batch 5 of 16 processed (5000 Words).
-> Batch 6 of 16 processed (5000 Words).
-> Batch 7 of 16 processed (5000 Words).
-> Batch 8 of 16 processed (5000 Words).
-> Batch 9 of 16 processed (5000 Words).
-> Batch 10 of 16 processed (5000 Words).
-> Batch 11 of 16 processed (5000 Words).
-> Batch 12 of 16 processed (5000 Words).
-> Batch 13 of 16

In [None]:
"""
Some useful neo4j commands:

This finds nodes which have more than 2 unique relation types
MATCH (w:Word)-[r:COMPONENT]->(child)
WITH w, count(DISTINCT r.type) AS unique_types
WHERE unique_types > 2
RETURN w.lemma, unique_types

This finds nodes with the corresponding lemma
MATCH (n:Word {lemma: 'kilogram'})
RETURN n
"""

"\nSome useful neo4j commands:\n\nThis finds nodes which have more than 2 unique relation types\nMATCH (w:Word)-[r:DERIVED_FROM]->(child)\nWITH w, count(DISTINCT r.type) AS unique_types\nWHERE unique_types > 2\nRETURN w.lemma, unique_types\n\nThis finds nodes with the corresponding lemma\nMATCH (n:Word {lemma: 'kilogram'})\nRETURN n\n"