### Imports

In [1]:
import pandas as pd
import os

In [2]:
from neo4j import GraphDatabase

In [3]:
from graphdatascience import GraphDataScience

### Connect to Neo4j server

In [4]:
# Bolt endpoint of the Neo4j server; set the NEO4J_URI environment variable
# to point somewhere other than the local default.
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")

In [None]:
# (username, password) pair handed to both the Neo4j driver and the GDS client.
# Values are read from the environment first (same pattern as NEO4J_URI) so the
# password does not have to be typed into, and saved with, the notebook.
NEO4J_AUTH = (
    os.environ.get("NEO4J_USERNAME", "neo4j"),
    # Export NEO4J_PASSWORD, or replace <Your Password> with your database password
    os.environ.get("NEO4J_PASSWORD", "<Your Password>"),
)

### Make an instance of the Neo4j driver and an instance of GraphDataScience

In [6]:
# Plain Neo4j driver handle; used further down to run a query through
# driver.execute_query.
driver = GraphDatabase.driver(
    NEO4J_URI,
    auth=NEO4J_AUTH,
)

In [7]:
# Graph Data Science client bound to the same server and credentials as the
# driver; the cells below use it via gds.run_cypher.
gds = GraphDataScience(
    NEO4J_URI,
    auth=NEO4J_AUTH,
)

### Step 5: Create SHARED_PII relationships between clients

In [8]:
# Link every pair of Client nodes that point at the same identifier node via
# HAS_EMAIL, HAS_PHONE or HAS_SSN.
#   * id(c) < id(d) keeps each unordered pair exactly once.
#   * cnt is the number of matched paths between the pair, stored as `count`.
#   * The relationship is MERGEd on its type only and `count` is SET afterwards,
#     so re-running the cell updates the existing SHARED_PII relationship
#     instead of adding a second one whenever the count has changed.
#   * Relationship types use the `:A|B|C` alternation form (the `|:` form is
#     deprecated in current Cypher).
create_pii_query = '''
MATCH (c:Client)-[:HAS_EMAIL|HAS_PHONE|HAS_SSN]->(n)<-[:HAS_EMAIL|HAS_PHONE|HAS_SSN]-(d:Client)
WHERE id(c) < id(d)
WITH c, d, count(*) AS cnt
MERGE (c)-[r:SHARED_PII]->(d)
SET r.count = cnt
'''

In [11]:
# Execute the write query that creates the SHARED_PII relationships.
# The query has no RETURN clause, so no result rows are expected.
gds.run_cypher(create_pii_query)



### Step 6: Create a GDS projection of the Client / SHARED_PII graph built above

In [12]:
# Cypher for gds.graph.project: registers a graph named 'clientClusters' that
# contains
#   * nodes with the label Client, and
#   * SHARED_PII relationships, projected as UNDIRECTED and carrying their
#     `count` property.
# The procedure yields the graph name plus node and relationship counts.
graph_projection_query = '''
CALL gds.graph.project(
    'clientClusters' ,
    { Client: {
        label: 'Client' }
    },
    { SHARED_PII: {
        type: 'SHARED_PII',
        orientation: 'UNDIRECTED',
        properties: {
                count: {
                    property: 'count' }
            }
        }
    }
)
YIELD graphName, nodeCount, relationshipCount
'''

In [13]:
# gds.graph.project raises if a graph called 'clientClusters' is already in the
# graph catalog, which happens on every re-run of this notebook against the
# same server. Drop any previous projection first; the second argument
# (failIfMissing = false) makes the drop a no-op on a fresh database.
gds.run_cypher("CALL gds.graph.drop('clientClusters', false)")

# Create the projection; the last expression is displayed as the cell output
# (graphName, nodeCount, relationshipCount).
gds.run_cypher(graph_projection_query)

Unnamed: 0,graphName,nodeCount,relationshipCount
0,clientClusters,2433,1518


### Step 7: Use the WCC algorithm to identify clusters of Client nodes

In [14]:
# Cypher that streams Weakly Connected Components over the 'clientClusters'
# projection, restricted to Client nodes and SHARED_PII relationships.
# consecutiveIds: true asks for component ids numbered consecutively.
# Each streamed nodeId is resolved back to its node with gds.util.asNode and
# returned as (clientId = the node's `id` property, clusterId = componentId).
streaming_query = '''
CALL gds.wcc.stream(
    'clientClusters',
        {
        nodeLabels: ['Client'],
        relationshipTypes: ['SHARED_PII'],
        consecutiveIds: true
        }
    )
    YIELD nodeId, componentId
    RETURN gds.util.asNode(nodeId).id AS clientId, componentId AS clusterId
'''

In [15]:
import  neo4j

In [16]:
# Run the WCC stream through the plain driver (database "neo4j") and let the
# driver's Result.to_df transformer turn the records into a DataFrame.
pandasDF = driver.execute_query(
    streaming_query,
    result_transformer_=neo4j.Result.to_df,
    database_="neo4j",
)

In [17]:
# Sanity check: show the type produced by the result transformer above.
print(type(pandasDF))

<class 'pandas.core.frame.DataFrame'>


In [18]:
# Display the clientId / clusterId rows returned by the WCC stream.
pandasDF

Unnamed: 0,clientId,clusterId
0,4997933060327094,0
1,4776276949898423,1
2,4858607188760216,2
3,4287186486553145,3
4,4661202154682409,4
...,...,...
2428,4413385955087620,1767
2429,4550448544478545,1862
2430,4114683318919154,334
2431,4172817689754167,2113


### Step 8: Mark possible fraudsters

In [19]:
# Tag members of every multi-client WCC component as a possible fraud ring.
#   1. Stream WCC over the 'clientClusters' projection (Client nodes,
#      SHARED_PII relationships, consecutive component ids).
#   2. Resolve each nodeId to its node and group the nodes per component.
#   3. Keep only components with at least two members.
#   4. Write the component id to `secondPartyFraudRing` on each member node.
# The property is set directly on the nodes returned by gds.util.asNode, so no
# second lookup by the `id` property is needed (that lookup scanned Client
# nodes per member and relied on `id` being unique).
streaming_query_2 = '''
CALL gds.wcc.stream(
    'clientClusters',
    {
        nodeLabels: ['Client'],
        relationshipTypes: ['SHARED_PII'],
        consecutiveIds: true
    }
)
YIELD nodeId, componentId
WITH componentId AS clusterId, collect(gds.util.asNode(nodeId)) AS members
WHERE size(members) >= 2
UNWIND members AS client
SET client.secondPartyFraudRing = clusterId
'''

In [20]:
# Execute the tagging query: it sets `secondPartyFraudRing` on Client nodes
# that belong to a cluster of two or more clients.
gds.run_cypher(streaming_query_2)