# GRAPH REPRESENTATION AND TRANSACTION PATH IN CRYPTOCURRENCY NETWORKS

In [1]:
#Installing Neo4j for pip install Neo4j Bolt driver for Python
!pip install neo4j





Importing Libraries of python

In [2]:
import os              # module provides a portable way of using operating system dependent functionality
import bz2             # This module provides a comprehensive interface for...
                      #...compressing and decompressing data using the bzip2 compression algorithm
import glob           #The glob() function returns an array of filenames or directories matching a specified pattern.
from neo4j import GraphDatabase  # Neo4j provides drivers which allow you to make a connection...
                                 #...to the database and develop applications 
                                 #...which create, read, update, and delete information from the graph.
import io
import copy
        
import pandas            #to access many of matplotlib's and NumPy's methods with less code
from tqdm import tqdm    #tqdm is a library in Python which is used for creating Progress Meters or Progress Bars.

uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(uri, auth=("neo4j", "Friends!12345")) #Neo4j credentials to connect with neo4j database

Assigning data path and Converting hexadecimel values to decimel

In [3]:
DATA_PATH = "C:/Users/praka/DATA"
WEI_IN_ETH = 10**18  #HEXA TO DECIMEL

checking sample text data with neo4j database

In [4]:
def read_example():
    """Read the example data file."""
    df = pandas.read_csv(os.path.join(
        DATA_PATH, "example.txt"), sep=" ", header=None)
    df.columns = ["symbol", "block", "txno", "from", "to", "value"]
    return df

Loading the dataset block ranges downloaded and saved in local disk

In [5]:
def get_block_ranges(path):
    """Get the block ranges of the data files in the given path."""
    block_ranges = []
    for file in glob.glob(os.path.join(path, "eth-blocktime-*.txt.bz2")):
        block_ranges.append(
         (int(file.split("-")[2]), int(file.split("-")[3].split(".")[0])))
    return sorted(block_ranges)

In [6]:
get_block_ranges(DATA_PATH)

[(0, 999999),
 (1000000, 1999999),
 (2000000, 2999999),
 (3000000, 3999999),
 (4000000, 4099999),
 (4100000, 4199999),
 (4200000, 4299999),
 (4300000, 4399999),
 (4400000, 4499999),
 (4500000, 4599999),
 (4600000, 4699999),
 (4700000, 4799999),
 (4800000, 4899999),
 (4900000, 4999999),
 (5000000, 5099999),
 (5100000, 5199999),
 (5200000, 5299999),
 (5300000, 5399999),
 (5400000, 5499999),
 (5500000, 5599999),
 (5600000, 5699999),
 (5700000, 5799999),
 (5800000, 5899999),
 (5900000, 5999999),
 (6000000, 6099999),
 (6100000, 6199999),
 (6200000, 6299999),
 (6300000, 6399999),
 (6400000, 6499999),
 (6500000, 6599999),
 (6600000, 6699999),
 (6700000, 6799999),
 (6800000, 6899999),
 (6900000, 6999999),
 (7000000, 7099999),
 (7100000, 7199999),
 (7200000, 7299999),
 (7300000, 7399999),
 (7400000, 7499999),
 (7500000, 7599999),
 (7600000, 7699999),
 (7700000, 7799999),
 (7800000, 7899999),
 (7900000, 7999999),
 (8000000, 8099999),
 (8100000, 8199999),
 (8200000, 8299999),
 (8300000, 8399999),

Loading the dataset block ranges with time downloaded and saved in local disk

In [7]:
def load_blocktime_in_range(start, end):
#Load the blocktime data for the given block range.
    data = []
    for file in glob.glob(os.path.join(DATA_PATH, "eth-blocktime-*.txt.bz2")):
        if int(file.split("-")[2]) >= start and int(file.split("-")[3].split(".")[0]) <= end:
            data.append(pandas.read_csv(file, sep=" ", header=None))

    df = pandas.concat(data)
    df.columns = ["block", "time"]
    return df

In [8]:
load_blocktime_in_range(0, 999999)

Unnamed: 0,block,time
0,0,0x0
1,1,0x55ba4224
2,2,0x55ba4241
3,3,0x55ba4260
4,4,0x55ba427d
...,...,...
999995,999995,0x56bfb39c
999996,999996,0x56bfb3c0
999997,999997,0x56bfb3d6
999998,999998,0x56bfb3ed


Load the transaction data for the given block range

In [9]:
def load_tx_in_range(start, end):
    data = []
    for file in glob.glob(os.path.join(DATA_PATH, "eth-tx-*.txt.bz2")):
        if int(file.split("-")[2]) >= start and int(file.split("-")[3].split(".")[0]) <= end:
            data.append(pandas.read_csv(file, sep=" ", header=None))

    df = pandas.concat(data)
    df.columns = ["symbol", "block", "txno", "from", "to", "value"]
    return df

In [10]:
load_tx_in_range(0, 999999)

Unnamed: 0,symbol,block,txno,from,to,value
0,ETH,0,0,ETHMAINBLOCK,0x0000000000000000000000000000000000000000,0x4563918244f40000
1,ETH,0,1,0x0000000000000000000000000000000000000000,0x3282791d6fd713f1e94f4bfd565eaa78b3a0599d,0x487a9a304539440000
2,ETH,0,2,0x0000000000000000000000000000000000000000,0x17961d633bcf20a7b029a7d94b7df4da2ec5427f,0xc6ff070f1938b8000
3,ETH,0,3,0x0000000000000000000000000000000000000000,0x493a67fe23decc63b10dda75f3287695a81bd5ab,0x2fb474098f67c00000
4,ETH,0,4,0x0000000000000000000000000000000000000000,0x01fb8ec12425a04f813e46c54c05748ca6b29aa9,0xe15730385467c0000
...,...,...,...,...,...,...
2769395,ETH,999999,7,0x5c51467399bc655f0cc6db88df15946717534633,0x32be343b94f860124dc4fee278fdcbd38c102d88,0x1c54e302456eb400
2769396,ETH,999999,8,0x055d9d7ec193d1e062c6ec4fa80ef89b5c1258f4,0x32be343b94f860124dc4fee278fdcbd38c102d88,0xf0447b1edca4000
2769397,ETH,999999,9,0x8e68c0c9b5275fa684291304af9cafe6ceaf2772,0x26016a2b5d872adc1b131a4cd9d4b18789d0d9eb,0x16345785d8a0000
2769398,ETH,999999,10,0x2a65aca4d5fc5b5c859090a6c34d164135398226,0x5275c3371ece4d4a5b1e14cf6dbfc2277d58ef92,0xe93ea6a35f2e000


unzip the given file and call the given function with the path to the unzipped file

In [11]:
def temp_unzip_and_call(func, filename):
    zipfile = bz2.BZ2File(filename)
    data = zipfile.read()

    df = pandas.read_csv(io.BytesIO(data), sep=" ", header=None)
    df.columns = ["symbol", "block", "txno", "from", "to", "value"]
    df["value"] = df["value"].apply(normalize)

    data = io.BytesIO()
    df.to_csv(data, index=False, sep=" ", header=False)
    print(df.head())
    data.seek(0)

    newfilepath = filename[:-4]
    open(newfilepath, 'wb').write(data.getvalue())
    print("After serialization: ", pandas.read_csv(newfilepath, sep=" ", header=None).head())

    try:
        result = func(newfilepath)
    except Exception as e:
        print("Exception Found: ", e)
    os.remove(newfilepath)
    return result


 Get the filename for the given atomic block range.

In [12]:
def get_filename_for_range(start, end):
    return os.path.abspath(os.path.join(DATA_PATH, "eth-tx-{}-{}.txt.bz2".format(start, end)))
get_filename_for_range(0, 999999)

'C:\\Users\\praka\\DATA\\eth-tx-0-999999.txt.bz2'

Function converting values in the dataframe to integer and float

In [13]:
def hex2int(x): return int(x, 16)
def normalize(x): return hex2int(x) / WEI_IN_ETH


Graph Database(Neo4j) directory path to import the dataset

In [14]:
def get_import_path(tx):
    results = tx.run("CALL dbms.listConfig() YIELD name, value "
                     "WHERE name='dbms.directories.neo4j_home' OR name='server.directories.neo4j_home' "
                     "RETURN value")
    return os.path.join(results.single()[0], "import")

Loading dataframe as CSV using symlink function

In [15]:

def create_symlink_and_load_csv(file_path):
    print("In the create symlink function")
    symlink_path = create_symlink_for_file(file_path)
    print("Symlink path:" , symlink_path)
    try:
        with driver.session() as session:
            create_txs_from_csv(session, os.path.basename(
                os.path.normpath(symlink_path)))
    except Exception as e:
        print("Exception Found: file", e)
        return False
    return True


CREATING SYMLINK AND UNZIPPING THE ZIP FILES AND REMOVING LEFT OVER THE DATA

In [16]:
def create_symlink_for_file(file_path):
    print("file_path ",file_path)  
    with driver.session() as session:
        
        import_path = session.execute_read(get_import_path)
    print("import_path " , import_path)
    filename = os.path.basename(os.path.normpath(file_path))
    print("File_name ", filename)
    symlink_path = os.path.join(import_path, filename)
    try:
        os.symlink(file_path, symlink_path)
    except FileExistsError:
        pass
    return symlink_path

CREATING NEW TRANSACTIONS 

In [17]:
def create_txs(tx, tx_df):
    for _, row in tqdm(tx_df.iterrows()):
        tx.run("MERGE (a:Account {address: $address}) "
               "MERGE (b:Account {address: $address1}) "
               "MERGE (a)-[:SENT {value: $value, symbol: $symbol, block: $block, txno: $txno}]->(b)",
               address=row["from"], address1=row["to"], value=normalize(row["value"]), symbol=row["symbol"], block=row["block"], txno=row["txno"])


ADDING BLOCKTIME

In [18]:

def add_block_time(tx, blocktime_df):
    for _, row in tqdm(blocktime_df.iterrows()):
        tx.run("MATCH ()-[a:SENT]-() "
               "WHERE a.block = $block "
               "SET a.time = $time",
               block=row["block"], time=row["time"])


Running in automatic transaction mode to load data

In [19]:
def create_txs_from_csv(session, filename):
    query = """LOAD CSV FROM 'file:///{}' AS row FIELDTERMINATOR ' '
    CALL {{ WITH row
    MERGE (a:Account {{address: row[3]}})
    MERGE (b:Account {{address: row[4]}})
    MERGE (a)-[:SENT {{value: toFloat(row[5]), symbol: row[0], block: toInteger(row[1]), txno:
    toInteger(row[2])}}]->(b)
    }} IN TRANSACTIONS OF 10000 ROWS""".format(filename)
    session.run(query)

In [20]:
os.listdir(DATA_PATH)

['erc20tokens.json',
 'eth-blocktime-0-999999.txt.bz2',
 'eth-blocktime-1000000-1999999.txt.bz2',
 'eth-blocktime-10000000-10099999.txt.bz2',
 'eth-blocktime-10100000-10199999.txt.bz2',
 'eth-blocktime-2000000-2999999.txt.bz2',
 'eth-blocktime-3000000-3999999.txt.bz2',
 'eth-blocktime-4000000-4099999.txt.bz2',
 'eth-blocktime-4100000-4199999.txt.bz2',
 'eth-blocktime-4200000-4299999.txt.bz2',
 'eth-blocktime-4300000-4399999.txt.bz2',
 'eth-blocktime-4400000-4499999.txt.bz2',
 'eth-blocktime-4500000-4599999.txt.bz2',
 'eth-blocktime-4600000-4699999.txt.bz2',
 'eth-blocktime-4700000-4799999.txt.bz2',
 'eth-blocktime-4800000-4899999.txt.bz2',
 'eth-blocktime-4900000-4999999.txt.bz2',
 'eth-blocktime-5000000-5099999.txt.bz2',
 'eth-blocktime-5100000-5199999.txt.bz2',
 'eth-blocktime-5200000-5299999.txt.bz2',
 'eth-blocktime-5300000-5399999.txt.bz2',
 'eth-blocktime-5400000-5499999.txt.bz2',
 'eth-blocktime-5500000-5599999.txt.bz2',
 'eth-blocktime-5600000-5699999.txt.bz2',
 'eth-blocktime-

TRANSACTIONS MADE FROM EACH ACCOUNT

In [21]:
def txs_per_address(tx):
    results = tx.run("MATCH(a: Account)-[r:SENT] -> (: Account) "
                     "RETURN a.address, count(r) as sent "
                     "ORDER BY sent DESC")
    return [record for record in results]

Minimum and maximum transactions sent per address

In [22]:
def min_max_of_txs_per_address(tx):
    results = txs_per_address(tx)
    return results[0], results[-1]

Transaction Size Distribution

In [23]:
def tx_size_distribution(tx):
    results = tx.run("MATCH(: Account)-[r:SENT] -> (: Account) "
                     "RETURN r.value as size, count(r.value) as occurrence "
                     "ORDER BY occurrence DESC")
    return [record for record in results]


Ether Balance remaining in each account

In [24]:
def remaining_ether_balance(tx, address):
    results = tx.run("MATCH (a: Account {address: $address})"
                     "OPTIONAL MATCH p=(a)<-[received:SENT]-(: Account) WHERE received.symbol = 'ETH' "
                     "OPTIONAL MATCH p1=(a)-[sent:SENT] -> (:Account) WHERE sent.symbol = 'ETH' "
                     "RETURN sum(received.value) - sum(sent.value)", address=address)
    return [record for record in results]


IN and OUT DEGREE of each account

In [25]:
def in_and_out_degree(tx, address, start_block, end_block):
    results = tx.run("MATCH(a: Account {address: $address})"
                     "OPTIONAL MATCH outdegree=(a)-[s:SENT] -> (: Account) WHERE s.block >= $start_block AND s.block <= $end_block "
                     "OPTIONAL MATCH indegree=(a) <-[r:SENT]-(: Account) WHERE r.block >= $start_block AND r.block <= $end_block "
                     "RETURN count(outdegree), count(indegree)", address=address, start_block=start_block, end_block=end_block)
    return [record for record in results]

Lorenz curve distribution of Ether balances

In [26]:
def lorenz_curve(tx, start_block, end_block):
    results = tx.run("MATCH (a:Account) "
                     "CALL { "
                     "WITH a "
                     "OPTIONAL MATCH p = (b:Account {address: a.address})<-[received:SENT]-(:Account) WHERE received.block >= $start_block AND received.block <= $end_block "
                     "OPTIONAL MATCH p1 = (b)-[sent:SENT]->(:Account) WHERE sent.block >= $start_block  AND sent.block <= $end_block "
                     "RETURN sum(received.value) - sum(sent.value) as balance "
                     "} "
                     "RETURN a.address, balance ORDER BY balance DESC", start_block=start_block, end_block=end_block)
    return [record for record in results]

Relation between Balance and degrees

In [27]:
def relation_between_balance_and_degrees(tx):
    results = tx.run("MATCH (a:Account) "
                     "CALL { "
                     "WITH a "
                     "OPTIONAL MATCH p = (b:Account {address: a.address})<-[received:SENT]-(:Account) "
                     "OPTIONAL MATCH p1 = (b)-[sent:SENT]->(:Account) "
                     "RETURN sum(received.value) - sum(sent.value) as balance, count(indegree) as in_degree, count(outdegree) as out_degree "
                     "} "
                     "RETURN a.address, balance, in_degree, out_degree ORDER BY balance DESC")
    return [record for record in results]

Get Addresses of Each Account

In [28]:
def get_addresses(tx):
    results = tx.run("MATCH (a: Account) RETURN DISTINCT a.address as address")
    return [record for record in results]

TRANSACTIONS DETAILS OF ACCOUNTS

In [29]:
def get_account_txs(tx, address):
    results = tx.run("MATCH (a: Account {address: $address})-[r:SENT]->(b: Account) "
                     "RETURN r.symbol as symbol, r.block as block, r.txno as txno, r.value as value, r.time as time, b.address as to",
                     address=address)
    return [record for record in results]

query to delete the whole graph

In [30]:
def delete_graph(tx):
    tx.run("MATCH (n) DETACH DELETE n")

projection of Graph Data model to run the graph algorithms

In [31]:
def project_graph_into_catalog(tx, projection_name):
    tx.run("CALL gds.graph.project($projection_name, 'Account', { SENT : { properties: ['block', 'txno', 'value'] } } )",
           projection_name=projection_name)

Adds a graph to the catalog by filtering an existing graph using node and relationship predicates.

In [32]:
def project_subgraph_into_catalog_per_block(tx, projection_name, subgraph_name, block):
    results = tx.run("CALL gds.beta.graph.project.subgraph( "
                     "$projection, "
                     "$new_name, "
                     "'*', "
                     "'r.block = $block' "
                     ") "
                     "YIELD graphName, fromGraphName, nodeCount, relationshipCount",
                     projection=projection_name, new_name=subgraph_name, block=float(block))
    return [record for record in results]

Degree centrality to find influential nodes

In [33]:
def degree_centrality(tx, projection_name):
    results = tx.run("CALL gds.degree.stream('{}') "
                     "YIELD nodeId, score "
                     "RETURN gds.util.asNode(nodeId).address, score ".format(projection_name))
    return [record for record in results]

Degree closeness to find the shortest path of node to the cluster nodes

In [34]:
def degree_closeness(tx, projection_name):
    results = tx.run("CALL gds.beta.closeness.stream('{}') "
                     "YIELD nodeId, score "
                     "RETURN gds.util.asNode(nodeId).address AS address, score "
                     "ORDER BY score DESC".format(projection_name))
    return [record for record in results]

Degree Betweeness to find bridge node which connect one part of graph to another

In [35]:
def degree_betweenness(tx, projection_name):
    results = tx.run("CALL gds.betweenness.stream('{}') "
                     "YIELD nodeId, score "
                     "RETURN gds.util.asNode(nodeId).address AS address, score "
                     "ORDER BY score DESC".format(projection_name))
    return [record for record in results]

Pagerank to calculate importance of node in the cluster

In [36]:
def degree_pagerank(tx, projection_name):
    results = tx.run("CALL gds.pageRank.stream('{}') "
                     "YIELD nodeId, score "
                     "RETURN gds.util.asNode(nodeId).address AS address, score "
                     "ORDER BY score DESC".format(projection_name))
    return [record for record in results]

# COMMUNITY DETECTION

Louvain algorithm to detect communities in large networks

In [37]:
def louvain_algorithm(tx, projection_name):
    results = tx.run("CALL gds.louvain.stream('{}') "
                     "YIELD nodeId, communityId "
                     "RETURN gds.util.asNode(nodeId).address AS address, communityId "
                     "ORDER BY communityId DESC".format(projection_name))
    return [record for record in results]

It maximizes a modularity score for each community, where the modularity quantifies the quality of an assignment of nodes to communities. 

In [38]:
def write_louvain_communityId(tx, projection_name):
    results = tx.run("CALL gds.louvain.write('{}', { writeProperty: 'communityId' }) "
                     "YIELD communityCount, modularity, modularities".format(projection_name))
    return [record for record in results]

Weakly connected components in the network

In [39]:
def weakly_connected_components(tx, projection_name):
    results = tx.run("CALL gds.wcc.stream('{}') "
                     "YIELD nodeId, componentId "
                     "RETURN gds.util.asNode(nodeId).address AS address, componentId "
                     "ORDER BY componentId, address DESC".format(projection_name))
    return [record for record in results]

weakly connected component id

In [40]:
def write_wcc_componentId(tx, projection_name):
    results = tx.run("CALL gds.wcc.write('{}', { writeProperty: 'componentId' }) "
                     "YIELD nodePropertiesWritten, componentCount".format(projection_name))
    return [record for record in results]

To drop the projected graph

In [41]:
def drop_graph(tx, projection_name):
    tx.run("CALL gds.graph.drop('{}')".format(projection_name))

Loading dataset using Symlink function to neo4j database

In [42]:
def load_data(limit_range=1):
    # delete the graph
    with driver.session() as session:
        session.write_transaction(delete_graph)
        
    block_ranges = get_block_ranges(DATA_PATH)[:limit_range]
    for start, end in tqdm(block_ranges):
        print("Loading file for block range {}-{}".format(start, end))
        filename = get_filename_for_range(start, end)
        temp_unzip_and_call(create_symlink_and_load_csv, filename)

Loading data to Neo4j

In [43]:
#load_data()

In [44]:
#with driver.session() as session:
#     print(session.execute_read(get_import_path))

In [45]:
#CALL dbms.listConfig() YIELD name, value 
#WHERE name='dbms.directories.neo4j_home' 
#RETURN value

Run a graph algorithm on the projection in the catalog provided a projection name and algorithm name.

In [46]:
def run_gds_algorithm(projection_name, algorithm="louvain"):   
    with driver.session() as session:
        print("Projecting graph into catalog")
        session.write_transaction(project_graph_into_catalog, projection_name)

        print("Running algorithm: {}".format(algorithm))
        
        if algorithm == "louvain":
            results = session.execute_read(louvain_algorithm, projection_name)
        elif algorithm == "wcc":
            results = session.execute_read(weakly_connected_components, projection_name)
        elif algorithm == "degree_centrality":
            results = session.execute_read(degree_centrality, projection_name)
        elif algorithm == "degree_closeness":
            results = session.execute_read(degree_closeness, projection_name)
        elif algorithm == "degree_betweenness":
            results = session.execute_read(degree_betweenness, projection_name)
        elif algorithm == "degree_pagerank":
            results = session.execute_read(degree_pagerank, projection_name)
        else:
            raise ValueError("Algorithm not supported")

        session.write_transaction(drop_graph, projection_name)
        return results

In [47]:
run_gds_algorithm("centrality6", "wcc")

Projecting graph into catalog


  session.write_transaction(project_graph_into_catalog, projection_name)


Running algorithm: wcc


  session.write_transaction(drop_graph, projection_name)


[<Record address='ETHMAINUNCLE' componentId=0>,
 <Record address='ETHMAINBLOCK' componentId=0>,
 <Record address='0xfff7ac99c8e4feb60c9750054bdc14ce1857f181' componentId=0>,
 <Record address='0xfff4bad596633479a2a29f9a8b3f78eefd07e6ee' componentId=0>,
 <Record address='0xfff33a3bd36abdbd412707b8e310d6011454a7ae' componentId=0>,
 <Record address='0xffec0913c635baca2f5e57a37aa9fb7b6c9b6e26' componentId=0>,
 <Record address='0xffeac0305ede3a915295ec8e61c7f881006f4474' componentId=0>,
 <Record address='0xffe8cbc1681e5e9db74a0f93f8ed25897519120f' componentId=0>,
 <Record address='0xffe2e28c3fb74749d7e780dc8a5d422538e6e451' componentId=0>,
 <Record address='0xffe28db53c9044b4ecd4053fd1b4b10d7056c688' componentId=0>,
 <Record address='0xffe0e997f1977a615f5a315af413fd4869343ba0' componentId=0>,
 <Record address='0xffd6da958eecbc016bab91058440d39b41c7be83' componentId=0>,
 <Record address='0xffd5170fd1a8118d558e7511e364b24906c4f6b3' componentId=0>,
 <Record address='0xffc9cc3094b041ad0e076f968a

# MACHINE LEARNING- LINK PREDICTION

Adds a graph to the catalog using Native projection.

In [48]:
def project_graph_into_catalog_undirected(tx, projection_name):
    tx.run("CALL gds.graph.project($projection_name, 'Account', { SENT : { properties: ['block', 'txno', 'value'], orientation: 'UNDIRECTED' } } )",
           projection_name=projection_name)

1. Creating the pipeline

In [49]:
def create_link_pred_pipeline(tx, pipeline):
    tx.run("CALL gds.beta.pipeline.linkPrediction.create($pipeline)",
           pipeline=pipeline)

2. Adding node properties

In [50]:
def add_node_property_step(tx, pipeline, procedure, config):
    results = tx.run("CALL gds.beta.pipeline.linkPrediction.addNodeProperty($pipeline, $procedure, $config) YIELD nodePropertySteps",
           pipeline=pipeline, procedure=procedure, config=config)
    return [record for record in results]

3. Adding link features

In [51]:
def add_link_feature(tx, pipeline, feature, config):
    results = tx.run("CALL gds.beta.pipeline.linkPrediction.addFeature($pipeline, $feature, $config) YIELD featureSteps",
           pipeline=pipeline, feature=feature, config=config)
    return [record for record in results]


4. Configuring the relationship splits

In [52]:
def config_split(tx, pipeline, config):
    results = tx.run("CALL gds.beta.pipeline.linkPrediction.configureSplit($pipeline, $config) YIELD splitConfig",
           pipeline=pipeline, config=config)
    return [record for record in results]

5. Adding model candidates

In [53]:

def add_candidate_model(tx, pipeline, config, model="logistic_regression"):
    if model == "logistic_regression":
        results = tx.run("CALL gds.beta.pipeline.linkPrediction.addLogisticRegression($pipeline, $config) YIELD parameterSpace",
               pipeline=pipeline, config=config)
    elif model == "random_forest":
        results=  tx.run("CALL gds.alpha.pipeline.linkPrediction.addRandomForest($pipeline, $config) YIELD parameterSpace",
               pipeline=pipeline, config=config)
    else:
        raise ValueError("Model not supported")
    
    return [record for record in results]

TRAINING THE PIPELINE

In [54]:
def run_link_prediction_pipeline(tx, projection_name, pipeline, model_name):
    results = tx.run("""CALL gds.beta.pipeline.linkPrediction.train($projection_name, {
    pipeline: $pipeline,
    modelName: $model_name,
   targetRelationshipType: 'SENT',
    randomSeed: 73
    }) YIELD modelInfo, modelSelectionStats
    RETURN
        modelInfo.bestParameters AS winningModel,
        modelInfo.metrics.AUCPR.train.avg AS avgTrainScore,
        modelInfo.metrics.AUCPR.outerTrain AS outerTrainScore,
        modelInfo.metrics.AUCPR.test AS testScore,
        [cand IN modelSelectionStats.modelCandidates | cand.metrics.AUCPR.validation.avg] AS validationScores"""
    , projection_name=projection_name, pipeline=pipeline, model_name=model_name)
    return [record for record in results]

Applying a trained model for prediction

In [55]:
def construct_sample_pipeline(projection_name, pipeline, model_name):
    with driver.session() as session:
        # project the graph 
        session.write_transaction(project_graph_into_catalog_undirected, projection_name)
        
        # create the pipeline
        session.write_transaction(create_link_pred_pipeline, pipeline)

        # add node property steps
        session.write_transaction(add_node_property_step, pipeline, "beta.node2vec", {
            'mutateProperty': 'embedding',
            'embeddingDimension': 64,
            'randomSeed': 42
        })

        # add link features
        session.write_transaction(add_link_feature, pipeline, "hadamard", {
                'nodeProperties': ['embedding']
                })

        # configure split
        session.write_transaction(config_split, pipeline,  {
            'testFraction': 0.25,
            'trainFraction': 0.6,
            'validationFolds': 3
            })
        
        # add candidate model
        ## add logistic regression
        session.write_transaction(add_candidate_model, pipeline, {'maxEpochs': 500}, model="logistic_regression")
        ## add random forest
        session.write_transaction(add_candidate_model, pipeline,
                                  {'numberOfDecisionTrees': 10}, model="random_forest")

        # run the pipeline
        results = session.write_transaction(run_link_prediction_pipeline, projection_name, pipeline, model_name)
        
        return results

Training the model after given configurations

In [56]:
construct_sample_pipeline("undirec1", "sample_pipeline1", "sample_model1")

  session.write_transaction(project_graph_into_catalog_undirected, projection_name)
  session.write_transaction(create_link_pred_pipeline, pipeline)
  session.write_transaction(add_node_property_step, pipeline, "beta.node2vec", {
  session.write_transaction(add_link_feature, pipeline, "hadamard", {
  session.write_transaction(config_split, pipeline,  {
  session.write_transaction(add_candidate_model, pipeline, {'maxEpochs': 500}, model="logistic_regression")
  session.write_transaction(add_candidate_model, pipeline,
  results = session.write_transaction(run_link_prediction_pipeline, projection_name, pipeline, model_name)


[<Record winningModel={'maxDepth': 2147483647, 'minLeafSize': 1, 'criterion': 'GINI', 'minSplitSize': 2, 'numberOfDecisionTrees': 10, 'methodName': 'RandomForest', 'numberOfSamplesRatio': 1.0} avgTrainScore=0.9999447194624992 outerTrainScore=0.9999399066139623 testScore=0.9964957400916541 validationScores=[0.6149365640404344, 0.9993703635737341]>]

PREDICT WITH CONTEXT FILTERING

In [57]:
def perform_inference(tx, projection_name, model_name):
    results = tx.run("""CALL gds.beta.pipeline.linkPrediction.predict.stream($projection_name, {
        modelName: $model_name,
        topN: 50,
        threshold: 0.5
    })
    YIELD node1, node2, probability
    RETURN gds.util.asNode(node1).address AS acc1, gds.util.asNode(node2).address AS acc2, probability
    ORDER BY probability DESC, acc1""", projection_name=projection_name, model_name=model_name)

    return [record for record in results]

PRINTING THE OUTPUT OF PREDICTION

In [58]:
with driver.session() as session:
    results = session.write_transaction(perform_inference, "undirec1", "sample_model1")
    for record in results:
        print(record)

  results = session.write_transaction(perform_inference, "undirec1", "sample_model1")


<Record acc1='0x0b7d339371e5be6727e6e331b5821fa24bdb9d5a' acc2='0xf8f28f38d73956da4ab44ccd82ff58cecadc5f75' probability=0.9>
<Record acc1='0x28967280214e218a120c5dda37041b111ea36d74' acc2='0x9da8e22ca10e67fea44e525e4751eeac36a31194' probability=0.9>
<Record acc1='0x28967280214e218a120c5dda37041b111ea36d74' acc2='0x6205c2d5647470848a3840f3887e9b015d34755c' probability=0.9>
<Record acc1='0x81139bfdcca656c430203f72958c543b6580d40c' acc2='0xf8f28f38d73956da4ab44ccd82ff58cecadc5f75' probability=0.9>
<Record acc1='0x952183cfd38e352e579d36decec5b18450f7fba0' acc2='0xf8f28f38d73956da4ab44ccd82ff58cecadc5f75' probability=0.9>
<Record acc1='0x95d98d0c1069908f067a52acac2b8b534da37afd' acc2='0xf8f28f38d73956da4ab44ccd82ff58cecadc5f75' probability=0.9>
<Record acc1='0xf8f28f38d73956da4ab44ccd82ff58cecadc5f75' acc2='0xe1dfb5cc890ee8b2877e885d267c256187d019e6' probability=0.9>
<Record acc1='0x17961d633bcf20a7b029a7d94b7df4da2ec5427f' acc2='0xbb7b8287f3f0a933474a79eae42cbca977791171' probability=0.8>
