In [1]:
from graphdatascience import GraphDataScience
from getpass import getpass
import pandas as pd
import numpy as np
from time import perf_counter

# Run the string-embed Python code

In [2]:
# Move up one directory so the relative paths used by later cells
# (main.py, data/, model/) resolve from the working directory.
# NOTE: not idempotent -- re-running this cell moves up one more level.
%cd ..

/home/sagemaker-user/string-embed


In [3]:
# Start a timer; a later cell subtracts this from a second perf_counter()
# reading to report how long the script run took.
start_time = perf_counter()

In [4]:
# Run the string-embed script on the "emails" dataset with 1000 train and
# 1000 query items for 20 epochs.
# NOTE(review): the --save-* flags presumably write the split indices,
# embeddings and model under model/ that are loaded later -- confirm in main.py.
!python main.py --dataset emails --nt 1000 --nq 1000 --epochs 20 --save-split --save-embed --save-model

2024-07-17 19:37:47.912984: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
# Loading time: 0.8150429725646973
# shuffled index:  [117303  40009  15903 ... 122964  25047  81273]
# Unique signature     : 39
# Maximum length       : 44
# Sampled Train Items  : 1000
# Sampled Query Items  : 1000
# Number of Base Items : 139286
# Number of Items      : 141286
# edit distance 1000x1000: 100%|██████████| 1000/1000 [00:00<00:00, 2173.59it/s]
# Calculate edit distance time: 0.4646010398864746
# sorting for KNN indices: 100%|█████████| 1000/1000 [00:00<00:00, 39935.86it/s]
# edit distance 139286x1000: 100%|████| 139286/139286 [01:03<00:00, 2188.02it/s]
# Calculate edit distance time: 63.67589807510376
# sorting for KNN indices: 100%|███████████|

In [5]:
# Print the elapsed time in seconds since start_time was recorded.
end_time = perf_counter()
print(end_time - start_time)

92.63070385399988


# Send results to Neo4j

In [6]:
def build_data_df(dataset_name, property_name,
                  model_root="model/808/cnn", run_name="nt1000_nq1000"):
    """Join the raw strings of a dataset with their saved embeddings and split labels.

    Reads ``data/{dataset_name}`` as a header-less single-column CSV, then loads
    the embedding arrays (``embedding_xb/xt/xq.npy``) and the matching row-index
    arrays (``base_idx/train_idx/query_idx.npy``) from
    ``{model_root}/{dataset_name}/{run_name}``. Each embedding row is attached
    to the raw string whose positional row number equals its saved index.

    Parameters
    ----------
    dataset_name : str
        File name under ``data/`` and sub-directory name under ``model_root``.
    property_name : str
        Column name given to the raw string column.
    model_root : str, optional
        Directory that contains one sub-directory per dataset. Defaults to the
        location that was previously hard-coded.
    run_name : str, optional
        Sub-directory of the dataset directory holding the ``.npy`` files.

    Returns
    -------
    pandas.DataFrame
        Columns ``[property_name, 'embedding', 'split']`` where ``embedding``
        is a Python list of floats and ``split`` is one of ``'Base'``,
        ``'Train'`` or ``'Query'``. Only rows that have an embedding are kept
        (inner join on the row index).

    Raises
    ------
    ValueError
        If a split's embedding array and index array differ in length.
    """
    property_df = pd.read_csv(f"data/{dataset_name}", header=None)
    property_df.columns = [property_name]

    run_dir = f"{model_root}/{dataset_name}/{run_name}"

    # (split label, embedding file suffix, index file stem)
    split_specs = [
        ("Base", "xb", "base"),
        ("Train", "xt", "train"),
        ("Query", "xq", "query"),
    ]

    split_frames = []
    for label, embedding_suffix, index_stem in split_specs:
        vectors = np.load(f"{run_dir}/embedding_{embedding_suffix}.npy")
        row_ids = np.load(f"{run_dir}/{index_stem}_idx.npy")
        if len(vectors) != len(row_ids):
            raise ValueError(
                f"{label} split: {len(vectors)} embeddings but {len(row_ids)} indices"
            )
        # ndarray.tolist() converts every row to a plain Python list in one
        # call, avoiding a slow row-wise DataFrame.apply.
        # Assumes the embedding arrays are 2-D (rows x dims).
        split_frames.append(
            pd.DataFrame({"embedding": vectors.tolist(), "split": label}, index=row_ids)
        )

    embedding_df = pd.concat(split_frames)

    # Inner join on the index: the saved indices are row numbers of the raw file.
    return property_df.merge(embedding_df, left_index=True, right_index=True)


In [7]:
# Build the combined frame for the "emails" dataset; the raw string column is named "address".
email_df = build_data_df("emails", "address")

In [8]:
# Preview the first rows to check the address / embedding / split columns.
email_df.head()

Unnamed: 0,address,embedding,split
0,cooperjames@yahoo.com,"[-0.04751335829496384, -0.039060041308403015, ...",Base
1,cindy24@hotmail.com,"[-0.08039160817861557, -0.04634513705968857, 0...",Base
2,averyjohn@ruiz.com,"[0.041277457028627396, -0.009661229327321053, ...",Base
3,santanachristopher@burns-robinson.com,"[-0.1594521701335907, 0.16107752919197083, 0.1...",Base
4,alvarezsherri@gmail.com,"[-0.044671107083559036, -0.08574065566062927, ...",Base


In [9]:
# Prompt for the password interactively so it is not stored in the notebook source.
neo4j_password = getpass("neo4j password")

neo4j password ········


In [10]:
# Open a client connection to the Neo4j instance as user "neo4j",
# using the password entered in the previous cell.
gds = GraphDataScience("neo4j+s://3bddbcd7.databases.neo4j.io", auth=("neo4j", neo4j_password))



In [11]:
# Require every :Email node to have a unique, non-null `address` (node key).
# IF NOT EXISTS makes this cell safe to re-run.
gds.run_cypher("""CREATE CONSTRAINT email_node_key IF NOT EXISTS FOR (e:Email) REQUIRE e.address IS NODE KEY""")

In [12]:
# Bulk-load every row of email_df into Neo4j and print the elapsed seconds.
# Each record is MERGEd on its address, so re-running the cell updates the
# existing nodes instead of creating duplicates.
# The record keys come from the DataFrame column names and Cypher map lookups
# are case-sensitive, so the split label must be read as row['split']
# (row['Split'] evaluates to null and leaves e.split unset).
start_time = perf_counter()
gds.run_cypher("""
UNWIND $data AS row
CALL {
    WITH row
    MERGE (e:Email {address:row['address']})
    SET e.split = row['split']
    WITH row, e
    CALL db.create.setNodeVectorProperty(e, 'editEmbedding', row['embedding'])
} IN CONCURRENT TRANSACTIONS OF 10000 rows""",
              {"data": email_df.to_dict("records")})
end_time = perf_counter()
print(end_time - start_time)

66.8922534340004
