In [2]:
import os

from neo4j import GraphDatabase
# Connection settings for the Neo4j instance. Each value can be overridden with
# an environment variable so real credentials never have to be saved inside the
# notebook; the fallbacks are the original local-development values.
URI = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASS = os.environ.get("NEO4J_PASS", "test")
# Name under which the in-memory GDS graph projection is registered.
GRAPH_NAME = 'nyc2'

# NOTE(review): later cells open their own drivers with
# `with GraphDatabase.driver(...) as driver`, which rebinds this name; this
# module-level driver is never closed in the visible cells — confirm it is
# still needed.
driver = GraphDatabase.driver(URI, auth=(NEO4J_USER, NEO4J_PASS))

In [26]:
def create_gds_graph(tx, graph_name, nodes, relationships):
    """Project an in-memory GDS graph by calling ``gds.graph.project``.

    The graph name and both projections are sent as query parameters instead
    of being formatted into the Cypher text, so a quote character inside a
    value can no longer break or alter the statement.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        graph_name: Name to register the projected graph under.
        nodes: Node projection, handed unchanged to the procedure as its
            second argument (the visible caller passes a list of labels).
        relationships: Relationship projection, handed unchanged to the
            procedure as its third argument (the visible caller passes a list
            of relationship types).

    Returns:
        None. The summary row yielded by the procedure is not returned.
    """
    qry = """
    CALL gds.graph.project(
      $graph_name,
      $nodes,
      $relationships
    )
    YIELD
      graphName AS graph, nodeProjection, nodeCount AS nodes, relationshipCount AS rels
    """
    _ = tx.run(qry, graph_name=graph_name, nodes=nodes, relationships=relationships)

In [None]:
with GraphDatabase.driver(URI, auth=(NEO4J_USER, NEO4J_PASS)) as driver:
    with driver.session() as session:
        # Drop any earlier projection with this name before re-creating it.
        # The second argument (`failIfMissing` in the GDS documentation) is
        # false so a missing graph is not an error. The graph name is sent as
        # a query parameter rather than being formatted into the Cypher text.
        query = "CALL gds.graph.drop($graph_name, false)"
        session.run(query, graph_name=GRAPH_NAME)
        _ = session.read_transaction(create_gds_graph, GRAPH_NAME, ['OWNER', 'BBL', 'PHONE'], ['TAX_ASSESSOR_OWNER', 'DEED_OWNER', 'PERMIT_OWNER', "PERMIT_OWNER_PHONE"])

        # NOTE(review): `get_gds_graph_stats` is not defined in the visible
        # cells — confirm it is defined before this cell runs on a fresh kernel.
        graph_stats, wcc_count = session.read_transaction(get_gds_graph_stats, GRAPH_NAME)

print(graph_stats)
print(wcc_count)

Convert hyperparameters into a Cypher map literal

In [42]:
def settings_to_cypher(settings):
    """Render a settings mapping as a Cypher map literal.

    Keys and values are written verbatim as ``key: value`` pairs, so string
    values must already carry their own Cypher quoting.

    Args:
        settings: Mapping of configuration names to values.

    Returns:
        A string such as ``"{walkLength: 5, iterations: 3}"``; ``"{}"`` for an
        empty mapping.
    """
    entries = []
    for key, value in settings.items():
        entries.append(f"{key}: {value}")
    body = ", ".join(entries)
    return "{" + body + "}"

In [30]:
# Hyperparameter sets to try with node2vec. Every set starts with
# 'sudo': 'TRUE'; values are written exactly as they should appear in the
# Cypher map, which is why the property names carry their own single quotes.
# NOTE(review): the first set has no 'mutateProperty' although
# `node2vec_properties` lists 'emb_1' — confirm that is intentional.
node2vec_settings = [
    {'sudo': 'TRUE', **overrides}
    for overrides in (
        {'embeddingDimension': 64},
        {'walkLength': 5, 'iterations': 3, 'mutateProperty': "'emb_2'"},
        {'walkLength': 5, 'iterations': 3, 'embeddingDimension': 64, 'mutateProperty': "'emb_3'"},
        {'walkLength': 5, 'iterations': 3, 'inOutFactor': 0.5, 'mutateProperty': "'emb_4'"},
        {'walkLength': 5, 'iterations': 3, 'returnFactor': 1.5, 'mutateProperty': "'emb_5'"},
    )
]

# Property names emb_1 .. emb_5 (presumably one per hyperparameter set above).
node2vec_properties = [f'emb_{idx}' for idx in range(1, 6)]

# NOTE(review): this repeats the `settings_to_cypher` definition from the
# "Convert hyperparameters" cell; whichever cell runs last wins.
def settings_to_cypher(settings):
    """Format a settings mapping as a Cypher map literal.

    Each item becomes ``key: value`` with the value inserted verbatim, so
    strings must be pre-quoted by the caller.

    Args:
        settings: Mapping of configuration names to values.

    Returns:
        The map literal as a string, e.g. ``"{sudo: TRUE}"``.
    """
    pairs = ("{}: {}".format(name, value) for name, value in settings.items())
    return "{%s}" % ", ".join(pairs)

def create_node2vec_embeddings(tx, graph_name, n2v_settings):
    """Stream node2vec embeddings for a projected graph.

    Args:
        tx: Transaction-like object exposing ``run(query)``.
        graph_name: Name of the graph handed to ``gds.beta.node2vec.stream``.
        n2v_settings: Mapping rendered into the procedure's configuration map
            by ``settings_to_cypher``.

    Returns:
        A ``(query_text, records)`` tuple: the Cypher text that was run and a
        list holding every record of the result, in iteration order.
    """
    config = settings_to_cypher(n2v_settings)
    qry = f"""
    CALL gds.beta.node2vec.stream('{graph_name}', {config})
    YIELD nodeId, embedding
    RETURN nodeId, embedding
    """

    # Echo the generated Cypher so the cell output shows what was executed.
    print(qry)

    return qry, [row for row in tx.run(qry)]

In [55]:
# Stream embeddings for the first hyperparameter set; the driver and session
# are both closed when the block exits.
# NOTE(review): newer neo4j drivers rename `read_transaction` to
# `execute_read` — confirm the installed driver version before switching.
with GraphDatabase.driver(URI, auth=(NEO4J_USER, NEO4J_PASS)) as driver, driver.session() as session:
    qry, result = session.read_transaction(
        create_node2vec_embeddings,
        GRAPH_NAME,
        node2vec_settings[0],
    )

  qry, result = session.read_transaction(create_node2vec_embeddings, GRAPH_NAME, node2vec_settings[0])



    CALL gds.beta.node2vec.stream('nyc2', {sudo: TRUE, embeddingDimension: 64})
    YIELD nodeId, embedding
    RETURN nodeId, embedding
    


In [36]:
import pandas as pd 
# Pull the BBL and OWNER nodes (with their stored `embedding` property) into
# DataFrames whose columns are named after the returned Cypher expressions.
with GraphDatabase.driver(URI, auth=(NEO4J_USER, NEO4J_PASS)) as driver, driver.session() as session:
    bbl_result = session.run("match (b:BBL) return b.bbl, b.address, b.embedding")
    bbl_df = pd.DataFrame(
        data=[record.values() for record in bbl_result],
        columns=bbl_result.keys(),
    )

    owner_result = session.run("match (o:OWNER) return o.name, o.embedding")
    owner_df = pd.DataFrame(
        data=[record.values() for record in owner_result],
        columns=owner_result.keys(),
    )

Visualize embeddings with t-SNE

In [111]:
from sklearn.manifold import TSNE
import numpy as np

def get_tsne_projections(df, embedding_col, n_components=2):
    """Add t-SNE projections of an embedding column to ``df``.

    Args:
        df: DataFrame with one embedding per row. It is modified in place.
        embedding_col: Name of the column holding the embedding vectors.
        n_components: Number of t-SNE dimensions to compute.

    Returns:
        The same ``df`` object with one added column per computed component,
        named ``comp-1`` ... ``comp-<n_components>``.
    """
    # A fixed random_state keeps the projection reproducible between runs.
    tsne = TSNE(n_components=n_components, verbose=1, random_state=123)
    x = tsne.fit_transform(np.array(df[embedding_col].tolist()))
    # Write exactly as many columns as were computed. The earlier hard-coded
    # comp-1..comp-3 raised IndexError for the default n_components=2.
    for i in range(x.shape[1]):
        df[f"comp-{i + 1}"] = x[:, i]

    return df

In [112]:
# Project both embedding tables onto three t-SNE components (used by the 3-D
# scatter plot further down).
bbl_projected_df = get_tsne_projections(df=bbl_df, embedding_col="b.embedding", n_components=3)
owner_projected_df = get_tsne_projections(df=owner_df, embedding_col="o.embedding", n_components=3)


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 15753 samples in 0.001s...
[t-SNE] Computed neighbors for 15753 samples in 0.219s...
[t-SNE] Computed conditional probabilities for sample 1000 / 15753
[t-SNE] Computed conditional probabilities for sample 2000 / 15753
[t-SNE] Computed conditional probabilities for sample 3000 / 15753
[t-SNE] Computed conditional probabilities for sample 4000 / 15753
[t-SNE] Computed conditional probabilities for sample 5000 / 15753
[t-SNE] Computed conditional probabilities for sample 6000 / 15753
[t-SNE] Computed conditional probabilities for sample 7000 / 15753
[t-SNE] Computed conditional probabilities for sample 8000 / 15753
[t-SNE] Computed conditional probabilities for sample 9000 / 15753
[t-SNE] Computed conditional probabilities for sample 10000 / 15753
[t-SNE] Computed conditional probabilities for sample 11000 / 15753
[t-SNE] Computed conditional probabilities for sample 12000 / 15753
[t-SNE] Computed conditional probabilities for sam


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



[t-SNE] Computed neighbors for 13169 samples in 0.203s...
[t-SNE] Computed conditional probabilities for sample 1000 / 13169
[t-SNE] Computed conditional probabilities for sample 2000 / 13169
[t-SNE] Computed conditional probabilities for sample 3000 / 13169
[t-SNE] Computed conditional probabilities for sample 4000 / 13169
[t-SNE] Computed conditional probabilities for sample 5000 / 13169
[t-SNE] Computed conditional probabilities for sample 6000 / 13169
[t-SNE] Computed conditional probabilities for sample 7000 / 13169
[t-SNE] Computed conditional probabilities for sample 8000 / 13169
[t-SNE] Computed conditional probabilities for sample 9000 / 13169
[t-SNE] Computed conditional probabilities for sample 10000 / 13169
[t-SNE] Computed conditional probabilities for sample 11000 / 13169
[t-SNE] Computed conditional probabilities for sample 12000 / 13169
[t-SNE] Computed conditional probabilities for sample 13000 / 13169
[t-SNE] Computed conditional probabilities for sample 13169 / 13169

In [51]:
# bbl_projected_df.to_csv("data/bbl_embeddings.csv", index=False)
# owner_projected_df.to_csv("data/owner_embeddings.csv", index=False)

In [113]:
# Down-sample each node type to 1000 rows for plotting, drop the raw embedding
# vectors and tag every row with its node type. A fixed random_state makes the
# sample — and therefore the exported CSV and the plot — reproducible.
bbl_subset = (
    bbl_projected_df
    .sample(1000, random_state=123)
    .drop("b.embedding", axis=1)
    .assign(node="property")
    .reset_index(drop=True)
)
owner_subset = (
    owner_projected_df
    .sample(1000, random_state=123)
    .drop("o.embedding", axis=1)
    .assign(node="owner")
    .reset_index(drop=True)
)
# Backward-compatible alias for the original, misspelled variable name.
owenr_subset = owner_subset

plot_df = pd.concat([bbl_subset, owner_subset])

In [117]:
# Create the output directory first so the export does not fail when "data/"
# does not exist yet (e.g. on a fresh checkout).
os.makedirs("data", exist_ok=True)
plot_df.to_csv("data/tsne-plot-data.csv", index=False)

In [116]:
import plotly.express as px
# t-SNE components are spread around zero and include negative values, which a
# log axis cannot display, so x and y use the same linear scale as z.
fig = px.scatter_3d(plot_df, x='comp-1', y='comp-2', z='comp-3', color='node')
fig.show()