In [11]:
from graphdatascience import GraphDataScience
from getpass import getpass
import pandas as pd
import numpy as np

In [7]:
# Run main.py on the `emails` dataset with --nt 1000 / --nq 1000 for 20 epochs; the log below
# reports 1000 sampled train items and 1000 sampled query items.
# NOTE(review): build_data_df later reads embedding_x*.npy and *_idx.npy for this dataset —
# presumably the artifacts written by --save-embed / --save-split; confirm against main.py (not shown).
!python main.py --dataset emails --nt 1000 --nq 1000 --epochs 20 --save-split --recall --save-embed --save-model

2024-07-07 18:54:29.163901: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
# Loading time: 0.6244535446166992
# loading indices from file
# Unique signature     : 39
# Maximum length       : 44
# Sampled Train Items  : 1000
# Sampled Query Items  : 1000
# Number of Base Items : 93905
# Number of Items      : 95905
# loading dist and knn from file
# train dist : (1000, 1000)
# query dist : (1000, 93905)
# batch embedding: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.15s/it]
# batch embedding: 100%|████████████████████████| 92/92 [00:40<00:00,  2.29it/s]
# batch embedding: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.22it/s]
# Embedding time: 40.18893504142761


In [7]:
# Run main.py on the `ssn` dataset with --nt 1000 / --nq 1000 for 20 epochs; the log below
# reports 1000 sampled train items and 1000 sampled query items.
# Unlike the `emails` run, --save-model is not passed here.
# NOTE(review): build_data_df later reads embedding_x*.npy and *_idx.npy for this dataset —
# presumably the artifacts written by --save-embed / --save-split; confirm against main.py (not shown).
!python main.py --dataset ssn --nt 1000 --nq 1000 --epochs 20 --save-split --recall --save-embed

2024-07-08 16:01:49.049096: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
# Loading time: 0.24382495880126953
# loading indices from file
# Unique signature     : 11
# Maximum length       : 12
# Sampled Train Items  : 1000
# Sampled Query Items  : 1000
# Number of Base Items : 97990
# Number of Items      : 99990
# edit distance 1000x1000: 100%|██████████| 1000/1000 [00:00<00:00, 2485.90it/s]
# Calculate edit distance time: 0.40700721740722656
# sorting for KNN indices: 100%|█████████| 1000/1000 [00:00<00:00, 53727.67it/s]
# edit distance 97990x1000: 100%|███████| 97990/97990 [00:39<00:00, 2466.28it/s]
# Calculate edit distance time: 39.74365019798279
# sorting for KNN indices: 100%|███████████| 1000/1000 [00:01<00:00, 570.94it/s]
# 

In [26]:
def build_data_df(dataset_name, property_name, data_dir="data",
                  model_dir="model/808/cnn", run_name="nt1000_nq1000"):
    """Join the raw values of a dataset with their saved embeddings and split labels.

    Reads ``{data_dir}/{dataset_name}`` as a header-less, single-column CSV and,
    from ``{model_dir}/{dataset_name}/{run_name}/``, the three embedding arrays
    (``embedding_xb.npy``, ``embedding_xt.npy``, ``embedding_xq.npy``) together
    with the matching row-id arrays (``base_idx.npy``, ``train_idx.npy``,
    ``query_idx.npy``). The row ids are used as index labels into the CSV rows.

    Args:
        dataset_name: File name under ``data_dir`` and directory name under ``model_dir``.
        property_name: Column name given to the single CSV column.
        data_dir: Directory holding the raw CSV files.
        model_dir: Root directory of the saved embedding runs.
        run_name: Sub-directory identifying the run whose artifacts are loaded.

    Returns:
        A DataFrame indexed by CSV row number with three columns:
        ``property_name``, ``embedding`` (a Python list of floats per row) and
        ``split`` (``'Base'``, ``'Train'`` or ``'Query'``). Only rows that have
        an embedding are kept (inner join).

    Raises:
        ValueError: If a split's embedding array and index array differ in length,
            which would otherwise silently attach vectors to the wrong rows.
    """
    property_df = pd.read_csv(f"{data_dir}/{dataset_name}", header=None)
    property_df.columns = [property_name]

    run_dir = f"{model_dir}/{dataset_name}/{run_name}"
    # (split label, embedding file suffix, index file prefix), in stacking order.
    split_files = (
        ("Base", "xb", "base"),
        ("Train", "xt", "train"),
        ("Query", "xq", "query"),
    )

    split_vectors = []
    split_row_ids = []
    for label, embedding_suffix, index_prefix in split_files:
        vectors = np.load(f"{run_dir}/embedding_{embedding_suffix}.npy")
        row_ids = np.load(f"{run_dir}/{index_prefix}_idx.npy")
        if len(vectors) != len(row_ids):
            raise ValueError(
                f"{label} split of '{dataset_name}': {len(vectors)} embeddings "
                f"but {len(row_ids)} indices"
            )
        split_vectors.append(vectors)
        split_row_ids.append(row_ids)

    # One list-of-floats per row, built in a single vectorised conversion
    # instead of a Python-level apply over every row.
    embedding_series = pd.Series(
        np.concatenate(split_vectors).tolist(),
        index=np.concatenate(split_row_ids),
        name="embedding",
    )

    data_df = property_df.merge(embedding_series, left_index=True, right_index=True)

    # Label rows by the split their row id came from.
    for (label, _, _), row_ids in zip(split_files, split_row_ids):
        data_df.loc[row_ids, "split"] = label

    return data_df


In [7]:
# Email addresses joined with their embeddings and split labels.
email_df = build_data_df(dataset_name="emails", property_name="address")

In [27]:
# SSN strings joined with their embeddings and split labels.
ssn_df = build_data_df(dataset_name="ssn", property_name="ssn")

In [28]:
# Preview the first five rows to sanity-check the join.
ssn_df.head(5)

Unnamed: 0,ssn,embedding,split
0,797-19-5522,"[0.12009803205728531, -0.029904861003160477, 0...",Base
1,607-53-7106,"[0.20407545566558838, -0.0458657331764698, 0.0...",Base
2,883-80-7822,"[0.04416758194565773, -0.009732005186378956, 0...",Base
3,444-34-9110,"[0.026955336332321167, 0.04990869015455246, -0...",Base
4,473-41-3914,"[0.10247407108545303, -0.062402576208114624, -...",Base


In [29]:
# Prompt interactively so the password is never stored in the notebook.
neo4j_password = getpass(prompt="neo4j password")

neo4j password ········


In [30]:
# Open the Graph Data Science client against the hosted Neo4j instance,
# authenticating as the `neo4j` user with the password entered above.
gds = GraphDataScience(
    "neo4j+s://3bddbcd7.databases.neo4j.io",
    auth=("neo4j", neo4j_password),
)



In [None]:
# Node key on :Email(address) so the MERGE in the load cell matches on a unique property.
# IF NOT EXISTS keeps this cell safe to re-run after the constraint has been created once.
gds.run_cypher("""CREATE CONSTRAINT email_node_key IF NOT EXISTS FOR (e:Email) REQUIRE e.address IS NODE KEY""")

In [25]:
# Upsert one :Email node per row of email_df, keyed on `address`, then store the row's
# split label and its embedding as the `editEmbedding` vector property.
# The row keys must match the email_df columns produced by build_data_df("emails", "address"):
# `address`, `embedding` and `split`.
gds.run_cypher("""
UNWIND $data AS row
CALL {
    WITH row
    MERGE (e:Email {address:row['address']})
    SET e.split = row['split']
    WITH row, e
    CALL db.create.setNodeVectorProperty(e, 'editEmbedding', row['embedding']) 
} IN CONCURRENT TRANSACTIONS OF 10000 rows""",
              {"data": email_df.to_dict("records")})

In [31]:
# Node key on :SSN(ssn) so the MERGE in the load cell matches on a unique property.
# IF NOT EXISTS keeps this cell safe to re-run after the constraint has been created once.
gds.run_cypher("""CREATE CONSTRAINT ssn_node_key IF NOT EXISTS FOR (s:SSN) REQUIRE s.ssn IS NODE KEY""")

In [32]:
# Upsert one :SSN node per row of ssn_df, keyed on `ssn`, then store the row's split label
# and its embedding as the `editEmbedding` vector property, batching 10000 rows per transaction.
ssn_load_query = """
UNWIND $data AS row
CALL {
    WITH row
    MERGE (e:SSN {ssn:row['ssn']})
    SET e.split = row['split']
    WITH row, e
    CALL db.create.setNodeVectorProperty(e, 'editEmbedding', row['embedding']) 
} IN CONCURRENT TRANSACTIONS OF 10000 rows"""
ssn_records = ssn_df.to_dict("records")
gds.run_cypher(ssn_load_query, {"data": ssn_records})