In [None]:
!pip install pandas neo4j



In [None]:
import math
import os
import time
from collections import defaultdict

import pandas as pd
from neo4j import GraphDatabase

# ==========================================
# 1. CONFIGURATION & DATABASE CONNECTION
# ==========================================

# Connection settings are taken from the environment (NEO4J_URI, NEO4J_USER,
# NEO4J_PASSWORD) so real credentials never have to be written into the
# notebook. The literals are only fallbacks used when a variable is not set.
NEO4J_URI = os.environ.get("NEO4J_URI", "XXXX")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "XXXX")

# Shared driver used by the graph-building cell further down.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

Note: the code below reads the files `InterCorp_v16ud_100k.csv` and `any_corp_AND_any_dict.tsv`.

Please make sure both files are in the same directory as this notebook.

In [None]:
def load_and_prep_data(intercorp_path='InterCorp_v16ud_100k.csv',
                       corpus_path='any_corp_AND_any_dict.tsv'):
    """
    Load word statistics and the corpus word list, then derive per-character
    frequency totals.

    Args:
        intercorp_path: Path to a ';'-separated CSV that provides the columns
            'word', 'upos' and 'freq' (quotes and surrounding whitespace in the
            header names are stripped). Malformed rows are skipped by pandas.
        corpus_path: Path to a UTF-8 text file. Every non-blank line, stripped
            of surrounding whitespace, is taken whole as one word.

    Returns:
        A tuple ``(corpus_words, stats_map, char_freq_map)``:
        - corpus_words: list of words in file order (duplicates are kept).
        - stats_map: ``{word: {'upos': ..., 'freq': ...}}`` holding, for each
          word, the row with the highest frequency.
        - char_freq_map: ``defaultdict(float)`` mapping each character to the
          summed frequency of the distinct corpus words that contain it.
    """
    print("Loading InterCorp data...")
    inter_df = pd.read_csv(intercorp_path, sep=';', on_bad_lines='skip')

    # Clean columns and data; non-numeric frequencies are treated as 0.
    inter_df.columns = [c.strip().replace('"', '') for c in inter_df.columns]
    inter_df['freq'] = pd.to_numeric(inter_df['freq'], errors='coerce').fillna(0)

    # Get highest frequency entry for each word
    inter_df = inter_df.sort_values('freq', ascending=False).drop_duplicates(subset=['word'])
    stats_map = inter_df.set_index('word')[['upos', 'freq']].to_dict('index')

    print("Loading Corpus words...")
    with open(corpus_path, 'r', encoding='utf-8') as f:
        corpus_words = [line.strip() for line in f if line.strip()]

    # --- PRE-CALCULATION ---
    # Sum the frequency of all words connected to a character
    print("Calculating character frequencies...")
    char_freq_map = defaultdict(float)

    # A word listed several times in the file still becomes a single Word node
    # (it is MERGEd by id), so it must contribute to its characters only once.
    # dict.fromkeys de-duplicates while keeping the original file order.
    for word in dict.fromkeys(corpus_words):
        # Get frequency of the word (default to 0 if not found)
        w_freq = stats_map.get(word, {}).get('freq', 0)

        # Add word's frequency to each of its UNIQUE characters
        # Using set(word) means if a word is "food" (freq 100), 'o' gets +100 (not +200)
        # This aligns with "sum of the number of times the connected WORDS appear"
        for char in set(word):
            char_freq_map[char] += w_freq

    return corpus_words, stats_map, char_freq_map

def create_chinese_nodes(tx, batch):
    """
    Creates Word and Character nodes with consistent property naming.

    Args:
        tx: Transaction object exposing ``run(query, **parameters)``.
        batch: List of dicts, one per word, with the keys 'word', 'pos',
            'lang', 'corpus_absolute_count', 'corpus_log_count' and 'chars'.
            Each entry of 'chars' is a dict with the keys 'char', 'order',
            'corpus_absolute_count' and 'corpus_log_count'.

    The whole batch is sent as the ``$batch`` parameter of a single statement.
    Nodes are MERGEd on ``id``, so re-running the same batch does not create
    duplicates.
    """
    # The position is part of the relationship MERGE pattern on purpose: when a
    # character occurs more than once in a word, a pattern without `order`
    # would match the relationship created for the earlier position and merely
    # overwrite its `order`, losing that occurrence.
    query = """
    UNWIND $batch AS item

    // 1. Create Word Node
    MERGE (w:Word {id: item.word})
    SET w.text = item.word,
        w.pos = item.pos,
        w.corpus_absolute_count = item.corpus_absolute_count,
        w.corpus_log_count = item.corpus_log_count,
        w.lang = item.lang

    // 2. Process Characters
    FOREACH (char_data IN item.chars |
        MERGE (c:Character {id: char_data.char})
        // Use the pre-calculated sums from the batch
        SET c.text = char_data.char,
            c.lang = item.lang,
            c.corpus_absolute_count = char_data.corpus_absolute_count,
            c.corpus_log_count = char_data.corpus_log_count

        // Link Character -> Word (one relationship per position in the word)
        MERGE (c)-[r:COMPONENT {order: char_data.order}]->(w)
        SET r.type = "Compounding"
    )
    """
    tx.run(query, batch=batch)

# Indexes on the properties that the load query MERGEs on. Without them each
# MERGE has to search all nodes of the label, so the time per batch keeps
# growing as the graph fills up.
_ID_INDEX_STATEMENTS = (
    "CREATE INDEX IF NOT EXISTS FOR (w:Word) ON (w.id)",
    "CREATE INDEX IF NOT EXISTS FOR (c:Character) ON (c.id)",
)


def _word_record(word, stats_map, char_freq_map):
    """Build the batch item (word stats plus per-character data) for one word."""
    # 1. Word Stats (words missing from stats_map get POS 'UNKNOWN', frequency 0)
    w_stats = stats_map.get(word, {'upos': 'UNKNOWN', 'freq': 0})
    w_freq = float(w_stats['freq'])

    # 2. Character Data (Enriched)
    chars_data = []
    for idx, char in enumerate(word):
        # Lookup the pre-summed frequency for this character
        c_freq = char_freq_map.get(char, 0)
        chars_data.append({
            'char': char,
            'order': idx,
            # STRICT NAMING: Matches the node properties
            'corpus_absolute_count': c_freq,
            'corpus_log_count': math.log(c_freq + 1),
        })

    return {
        'word': word,
        'pos': w_stats['upos'],
        'lang': 'zh',
        # STRICT NAMING: Matches the node properties
        'corpus_absolute_count': w_freq,
        'corpus_log_count': math.log(w_freq + 1),
        'chars': chars_data,
    }


def build_graph(driver, words, stats_map, char_freq_map, batch_size=2000,
                progress_every=10000, ensure_indexes=True):
    """
    Write all words and their characters to the database in batches.

    Args:
        driver: Object whose ``session()`` returns a context manager exposing
            ``run(query)`` and ``execute_write(fn, batch)``.
        words: Sequence of words to load.
        stats_map: ``{word: {'upos': ..., 'freq': ...}}``; missing words are
            stored with POS 'UNKNOWN' and frequency 0.
        char_freq_map: ``{char: summed frequency}``; missing characters count
            as 0.
        batch_size: Number of words sent per write transaction (>= 1).
        progress_every: Print a progress line every this many words; a falsy
            value disables progress output.
        ensure_indexes: When true, create indexes on ``Word.id`` and
            ``Character.id`` (if they do not exist yet) before loading.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    total = len(words)
    start_time = time.time()

    print(f"Starting graph construction for {total} words...")

    with driver.session() as session:
        if ensure_indexes:
            # Schema changes run in their own auto-commit transactions, before
            # any data is written.
            for statement in _ID_INDEX_STATEMENTS:
                session.run(statement).consume()

        batch = []
        for i, word in enumerate(words):
            # 3. Add to Batch
            batch.append(_word_record(word, stats_map, char_freq_map))

            if len(batch) >= batch_size:
                session.execute_write(create_chinese_nodes, batch)
                batch = []

            # Checked independently of the flush so progress is still reported
            # when progress_every is not a multiple of batch_size.
            if progress_every and (i + 1) % progress_every == 0:
                print(f"Processed {i + 1}/{total} words... ({time.time() - start_time:.2f}s)")

        # Flush the final, partially filled batch.
        if batch:
            session.execute_write(create_chinese_nodes, batch)

    print("Graph construction complete.")

In [None]:
if __name__ == "__main__":
    # Load the input files, write the graph, and always release the driver.
    try:
        words, stats, char_stats = load_and_prep_data()

        build_graph(driver, words, stats, char_stats)
    finally:
        # Close the connection even when loading or writing fails. `driver` is
        # created in the configuration cell, so that cell has to be run again
        # before this one can be re-executed.
        driver.close()

Loading InterCorp data...
Loading Corpus words...
Calculating character frequencies...
Starting graph construction for 48881 words...
Processed 10000/48881 words... (57.84s)
Processed 20000/48881 words... (174.03s)
Processed 30000/48881 words... (350.94s)
Processed 40000/48881 words... (586.80s)
Graph construction complete.
