# Importing a CSV file into Neo4j

In [154]:
import pandas as pd
from neo4j.v1 import GraphDatabase, basic_auth

# Column names in transactions.csv (semicolon-separated).
IN_ADDRESS = 'from'
OUT_ADDRESS = 'to'
AMOUNT = 'value'

# Executed once per CSV row: make sure both Address nodes exist, make sure a
# single Transactions relationship links sender to receiver, then accumulate
# the transferred amount and the transaction count on that relationship.
# Every clause ends with a space so the concatenated literals never run together.
# NOTE(review): both accumulators are updated with a minus sign, so the stored
# values are negative. This looks deliberate (the MCL cell subtracts `number`
# again when building its matrix) - confirm before changing.
IMPORT_ROW_QUERY = (
    "MERGE (address1:Address {address:{address1}}) "
    "MERGE (address2:Address {address:{address2}}) "
    "CREATE UNIQUE (address1)-[t:Transactions]->(address2) "
    "SET t.amount = coalesce(t.amount, 0) - {amount} "
    "SET t.number = coalesce(t.number, 0) - 1"
)

driver = GraphDatabase.driver("bolt://localhost:7687", auth=basic_auth("neo4j", "blockchain"))
session = driver.session()
try:
    # Only the first 100 rows are imported. To stream the whole file in chunks use:
    # ethereum = pd.read_csv("./transactions.csv", chunksize=1000, iterator=True, sep=";")
    ethereum = [pd.read_csv("./transactions.csv", nrows=100, sep=";")]
    for chunk in ethereum:
        for _index, row in chunk.iterrows():
            session.run(IMPORT_ROW_QUERY,
                        {"address1": row[IN_ADDRESS], "address2": row[OUT_ADDRESS], 'amount': row[AMOUNT]})
finally:
    # Release the session even when reading the CSV or a query fails part-way.
    session.close()

# Clustering a Neo4j graph

```cypher
// Addresses reachable from the start address in one or two outgoing hops.
// The relationship type is `Transactions`, matching what the import cell creates
// and what the statements below use.
MATCH (start:Address{address: "0x32be343b94f860124dc4fee278fdcbd38c102d88"})-[:Transactions*1..2]->(address:Address) RETURN address;

//Find all connected components
// (write:true stores the result on each node in the `component` property)
CALL algo.unionFind('Address', 'Transactions', {write:true, partitionProperty:"component"})
YIELD nodes, setCount, loadMillis, computeMillis, writeMillis;

// Create minimum spanning tree starting from the given address, using the
// `number` relationship property as the weight. With write:true the result is
// written back under the name "MSTree" (the statements below match it as a
// relationship type).
// TODO: use connected components only
MATCH(a:Address{address: '0x0d4ecc7d750180ebf4a9df728b6669b5bceb5e40'}) 
CALL algo.mst(a, 'number', {write:true, writeProperty:"MSTree"})
YIELD loadMillis, computeMillis, writeMillis, weightSum, weightMin, weightMax, relationshipCount
RETURN relationshipCount;

// Create clusters: delete a fixed number of MSTree relationships, taken in
// ascending order of `number`, so the tree breaks into separate pieces.
// NOTE(review): the Python cell for this step orders by `number` DESC and uses
// LIMIT 40 - confirm which ordering and limit are intended.
MATCH ()-[t:MSTree]->() 
WITH t 
ORDER BY t.number 
LIMIT 80 // relationships to delete = number of clusters - number of components - 1
DELETE t;

// Cleanup: remove every Transactions relationship
MATCH ()-[t:Transactions]->() DELETE t;

// Cleanup: remove every Address node together with any relationships still attached
MATCH (a:Address) DETACH DELETE a;
```

### Open session

In [2]:
import pandas as pd
from neo4j.v1 import GraphDatabase, basic_auth

# Connect to the local Neo4j instance over Bolt and open the session that the
# following cells run their queries on.
neo4j_uri = "bolt://localhost:7687"
neo4j_auth = basic_auth("neo4j", "blockchain")
driver = GraphDatabase.driver(neo4j_uri, auth=neo4j_auth)
session = driver.session()

NameError: name 'GraphDatabase' is not defined

### Find connected components

In [None]:
# Run union-find over Address nodes and Transactions relationships; with
# write:true the result is stored on each node in the `component` property.
union_find_query = (
    "CALL algo.unionFind('Address', 'Transactions', {write:true, partitionProperty:'component'}) "
    "YIELD nodes, setCount, loadMillis, computeMillis, writeMillis "
    "RETURN nodes;"
)
session.run(union_find_query)

### Create minimum spanning tree

In [122]:
# Build a minimum spanning tree starting from the given address, weighted by the
# `number` relationship property, and write it back under the name 'MSTree'.
mst_query = (
    "MATCH(a:Address{address: '0x0d4ecc7d750180ebf4a9df728b6669b5bceb5e40'}) "
    "CALL algo.mst(a, 'number', {write:true, writeProperty:'MSTree'}) "
    "YIELD loadMillis, computeMillis, writeMillis, weightSum, weightMin, weightMax, relationshipCount "
    "RETURN relationshipCount;"
)
session.run(mst_query)

<neo4j.v1.result.BoltStatementResult at 0x7f0a2170bbe0>

### Create clusters

In [123]:
# Cut the spanning tree into clusters: delete the 40 MSTree relationships that
# come first when ordered by `number` descending.
cut_tree_query = (
    "MATCH ()-[t:MSTree]->() "
    "WITH t "
    "ORDER BY t.number DESC "
    "LIMIT 40 "
    "DELETE t;"
)
session.run(cut_tree_query)

<neo4j.v1.result.BoltStatementResult at 0x7f0a2170b160>

### Remove MSTree relation

In [44]:
# Drop every MSTree relationship so the spanning tree can be rebuilt from scratch.
drop_mstree_query = "MATCH ()-[t:MSTree]->() DELETE t;"
session.run(drop_mstree_query)

<neo4j.v1.result.BoltStatementResult at 0x7f0a2180c8d0>

### Remove transactions relation

In [109]:
# Drop every Transactions relationship; Address nodes are left in place.
drop_transactions_query = "MATCH ()-[t:Transactions]->() DELETE t;"
session.run(drop_transactions_query)

<neo4j.v1.result.BoltStatementResult at 0x7f0a218c5e80>

### Remove addresses

In [153]:
# Delete every Address node together with any relationships still attached to
# it, then release the session.
drop_addresses_query = "MATCH (a:Address) DETACH DELETE a;"
session.run(drop_addresses_query)
session.close()

# Clustering using MCL

1. Get adjacency matrix
2. Normalize matrix
3. Use MCL to get clusters

In [155]:
import pandas as pd
from neo4j.v1 import GraphDatabase, basic_auth
import numpy as np

# Fresh Bolt connection and session for the MCL clustering cells.
bolt_uri = "bolt://localhost:7687"
credentials = basic_auth("neo4j", "blockchain")
driver = GraphDatabase.driver(bolt_uri, auth=credentials)
session = driver.session()

In [156]:
# Fix an ordering of all Address nodes: the position of an address string in
# `address_indices` is the row/column it gets in the adjacency matrix.
addresses = session.run("MATCH (address:Address) RETURN address")
address_indices = [record['address'].properties['address'] for record in addresses]

# Reverse lookup: address string -> position in `address_indices`.
invert_address_indices = dict(zip(address_indices, range(len(address_indices))))

In [157]:
# Square adjacency matrix indexed by address position (sender row, receiver column).
node_count = len(address_indices)
transactions_matrix = np.zeros((node_count, node_count))

transactions = session.run("MATCH p=()-[:Transactions]->() RETURN p")
for record in transactions.data():
    path = record['p']
    sender = invert_address_indices[path.start.properties['address']]
    receiver = invert_address_indices[path.end.properties['address']]
    # `number` is subtracted rather than added: the import cell accumulates it
    # with a minus sign, so subtracting yields a positive entry here.
    transactions_matrix[sender, receiver] -= path.relationships[0].properties['number']

In [158]:
from mcl.mcl_clustering import mcl

# Tunable MCL settings, gathered in one place.
# NOTE(review): an inflation factor below 1 is unusual for MCL (values above 1
# are the norm) and the recorded clusters overlap heavily - confirm that 0.1 is
# intended.
mcl_settings = {
    'expand_factor': 10,
    'inflate_factor': 0.1,
    'max_loop': 1000,
    'mult_factor': 10,
}

# `clusters` maps a cluster id to a list of matrix indices (see the output of
# the cell that writes them back); `M` is the first value returned by mcl().
M, clusters = mcl(transactions_matrix, **mcl_settings)

In [159]:
# Write the MCL result back to the graph: each member index is translated to
# its address string and that Address node gets a `cluster` property.
# coalesce() keeps an existing value, so for an address that appears in several
# clusters the first cluster id written wins.
# NOTE(review): for the same reason, re-running this cell after a new MCL run
# does not overwrite old `cluster` values - reset them first if that matters.
for cluster_id, member_indices in clusters.items():
    for node_index in member_indices:
        print(node_index)
        session.run("MATCH (a:Address{address: {address}}) WITH a "
                    "SET a.cluster = coalesce(a.cluster, {cluster})",
                    {'address': address_indices[node_index], 'cluster': cluster_id})
clusters

0
1
2
3
4
13
34
35
45
49
54
55
68
71
72
73
78
5
6
7
8
28
44
0
1
3
4
9
10
13
34
35
45
49
54
55
68
71
72
73
78
11
12
14
15
30
15
30
0
1
3
4
10
13
16
34
35
45
49
54
55
68
71
72
73
78
0
1
3
4
13
17
34
35
45
49
54
55
68
71
72
73
78
18
19
19
6
20
12
21
6
22
23
6
23
24
25
25
0
1
3
4
13
26
27
31
34
35
45
49
54
55
68
71
72
73
78
0
1
4
27
54
71
72
8
28
29
44
0
1
3
4
13
31
34
35
45
49
54
55
68
71
72
73
78
8
28
32
44
8
28
33
44
36
37
37
0
1
3
4
13
34
35
38
39
45
49
54
55
68
71
72
73
78
0
1
3
4
13
34
35
39
45
49
54
55
68
71
72
73
78
30
40
41
30
41
42
43
42
43
0
1
3
4
13
34
35
45
46
49
54
55
68
71
72
73
78
6
47
6
48
0
1
3
4
13
34
35
45
46
49
50
54
55
68
71
72
73
78
51
52
52
53
6
56
6
57
8
28
44
58
0
1
3
4
13
34
35
45
49
54
55
59
60
68
71
72
73
78
0
1
3
4
13
34
35
45
49
54
55
60
68
71
72
73
78
61
62
62
12
63
0
1
3
4
13
34
35
45
49
54
55
64
68
71
72
73
78
0
1
3
4
13
34
35
45
49
54
55
65
68
71
72
73
78
30
66
67
30
67
0
1
3
4
13
34
35
45
49
54
55
68
69
70
71
72
73
78
0
1
3
4
13
34
35
45
49
54
55
68
70
7

{0: [0, 1, 2, 3, 4, 13, 34, 35, 45, 49, 54, 55, 68, 71, 72, 73, 78],
 1: [5, 6],
 2: [7, 8, 28, 44],
 3: [0, 1, 3, 4, 9, 10, 13, 34, 35, 45, 49, 54, 55, 68, 71, 72, 73, 78],
 4: [11, 12],
 5: [14, 15, 30],
 6: [15, 30],
 7: [0, 1, 3, 4, 10, 13, 16, 34, 35, 45, 49, 54, 55, 68, 71, 72, 73, 78],
 8: [0, 1, 3, 4, 13, 17, 34, 35, 45, 49, 54, 55, 68, 71, 72, 73, 78],
 9: [18, 19],
 10: [19],
 11: [6, 20],
 12: [12, 21],
 13: [6, 22, 23],
 14: [6, 23],
 15: [24, 25],
 16: [25],
 17: [0, 1, 3, 4, 13, 26, 27, 31, 34, 35, 45, 49, 54, 55, 68, 71, 72, 73, 78],
 18: [0, 1, 4, 27, 54, 71, 72],
 19: [8, 28, 29, 44],
 20: [0, 1, 3, 4, 13, 31, 34, 35, 45, 49, 54, 55, 68, 71, 72, 73, 78],
 21: [8, 28, 32, 44],
 22: [8, 28, 33, 44],
 23: [36, 37],
 24: [37],
 25: [0, 1, 3, 4, 13, 34, 35, 38, 39, 45, 49, 54, 55, 68, 71, 72, 73, 78],
 26: [0, 1, 3, 4, 13, 34, 35, 39, 45, 49, 54, 55, 68, 71, 72, 73, 78],
 27: [30, 40, 41],
 28: [30, 41],
 29: [42, 43],
 30: [42, 43],
 31: [0, 1, 3, 4, 13, 34, 35, 45, 46, 49

### Greedy clustering

```cypher
// NOTE(review): the statements in this section match relationships of type
// `Transaction`, while the import cell at the top of the notebook creates
// `Transactions` - confirm which type the target database actually contains.

// Convert amount to a float so that it can be summed below

MATCH (a:Address)-[t:Transaction]->(b:Address)
WITH t
SET t.amount = toFloat(t.amount);

// Reset properties

MATCH (a:Address) 
SET a.all = 0.0
SET a.rotate = 0.0
SET a.cluster = 0;

// Initialize sum of all transactions for an address
// (undirected match: both incoming and outgoing amounts are added to a.all)

MATCH (a:Address)-[t:Transaction]-(b:Address) WITH t, a, b
SET a.all = a.all + t.amount;

// Set start: the chosen address becomes the first member of cluster 1

MATCH (start:Address{address: '0x22a0fbf89ad1362d74f626436d8c4fc6dc4f0679'}) SET start.cluster = 1;

// Iteration: reset rotate for candidates
// (a candidate is an address linked by a transaction to a cluster-1 member
// that is not itself in cluster 1)

MATCH (:Address{cluster: 1})-[transaction:Transaction]-(candidate:Address) WHERE candidate.cluster <> 1 WITH candidate
SET candidate.rotate = 0.0;

// Iteration: recount rotate for each candidate vertex as the summed amount of
// its transactions with cluster-1 members (candidate.all is not touched here)

MATCH (:Address{cluster: 1})-[transaction:Transaction]-(candidate:Address) WHERE candidate.cluster <> 1 WITH candidate, transaction
SET candidate.rotate = candidate.rotate + transaction.amount;

// Iteration: add best node to a cluster and recount cluster params
// The running cluster totals live on the start node. Candidates are ranked by
//   (start.rotate + candidate.rotate) / (start.all + candidate.all - candidate.rotate)
// and the totals are then updated to exactly that numerator and denominator:
// rotate grows by candidate.rotate, all grows by candidate.all - candidate.rotate.

MATCH (start:Address{address: '0x22a0fbf89ad1362d74f626436d8c4fc6dc4f0679'}) WITH start
MATCH (:Address{cluster: 1})-[t:Transaction]-(candidate:Address) WHERE candidate.cluster <> 1 WITH ((start.rotate + candidate.rotate) / (start.all + candidate.all - candidate.rotate)) AS rotate_all, candidate, start ORDER BY rotate_all DESC LIMIT 1 
SET candidate.cluster = 1
SET start.all = start.all + candidate.all - candidate.rotate
SET start.rotate = start.rotate + candidate.rotate
SET start.rotate_all = rotate_all;

// Return all nodes: cluster-1 members with their transactions (first 50 paths)

MATCH p=(:Address{cluster: 1})-[:Transaction]-(:Address) RETURN p LIMIT 50;
```

In [12]:
from neo4j.v1 import GraphDatabase, basic_auth
# Bolt connection and session used by the greedy clustering cells below.
server_uri = "bolt://localhost:7687"
server_auth = basic_auth("neo4j", "blockchain")
driver = GraphDatabase.driver(server_uri, auth=server_auth)
session = driver.session()

In [13]:
# Step 1: zero the per-address accumulators and clear any cluster assignment.
reset_query = ("MATCH (a:Address)"
               "SET a.all = 0 "
               "SET a.rotate = 0 "
               "SET a.cluster = 0;")
session.run(reset_query, {})

# Step 2: a.all becomes the summed amount over every transaction touching the
# address (the match is undirected).
# NOTE(review): these queries match `Transaction`, while the import cell creates
# `Transactions` - confirm the relationship type in the target database.
total_amount_query = ("MATCH (a:Address)-[t:Transaction]-(b:Address) WITH t, a, b "
                      "SET a.all = coalesce(a.all, 0) + t.amount")
session.run(total_amount_query, {})

# Step 3: seed cluster 1 with the start address.
seed_query = "MATCH (start:Address{address: {address}}) SET start.cluster = 1;"
seed_params = {'address': '0x32be343b94f860124dc4fee278fdcbd38c102d88'}
session.run(seed_query, seed_params)

<neo4j.v1.result.BoltStatementResult at 0x7fa45a8bf4a8>

In [14]:
# The three statements of one greedy step, built once outside the loop.

# (a) Clear `rotate` on every candidate, i.e. every address linked to a
#     cluster-1 member that is not in cluster 1 itself.
reset_candidates_query = (
    "MATCH (:Address{cluster: 1})-[transaction:Transaction]-(candidate:Address) WHERE candidate.cluster <> 1 WITH candidate "
    "SET candidate.rotate = 0.0;"
)

# (b) Recompute `rotate` per candidate as the summed amount of its transactions
#     with cluster-1 members.
score_candidates_query = (
    "MATCH (:Address{cluster: 1})-[transaction:Transaction]-(candidate:Address) WHERE candidate.cluster <> 1 WITH candidate, transaction "
    "SET candidate.rotate = coalesce(candidate.rotate, 0) + transaction.amount"
)

# (c) Take the candidate with the highest rotate/all ratio into cluster 1 and
#     fold its totals into the running totals kept on the start node.
absorb_best_query = (
    "MATCH (start:Address{address: {address}}) WITH start "
    "MATCH (:Address{cluster: 1})-[t:Transaction]-(candidate:Address) WHERE candidate.cluster <> 1 WITH ((start.rotate + candidate.rotate) / (start.all + candidate.all - candidate.rotate)) AS rotate_all, candidate, start ORDER BY rotate_all DESC LIMIT 1 "
    "SET candidate.cluster = 1 "
    "SET start.all = coalesce(start.all, 0) + candidate.all - candidate.rotate "
    "SET start.rotate = coalesce(start.rotate, 0) + candidate.rotate;"
)
start_params = {'address': '0x32be343b94f860124dc4fee278fdcbd38c102d88'}

# Run 100 greedy steps; each step adds at most one address to cluster 1.
for i in range(100):
    print(i)
    session.run(reset_candidates_query, {})
    session.run(score_candidates_query, {})
    session.run(absorb_best_query, start_params)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [5]:
# Release the session opened for the greedy clustering cells.
session.close()