In [115]:
import pandas as pd
import configparser
import os
from neo4j import GraphDatabase, basic_auth
from graphdatascience import GraphDataScience
# Notebook-wide pandas display settings, applied once right after the imports.
for _option_name, _option_value in (
    ('display.width', 0),
    ('display.max_colwidth', 500),
    ('display.max_rows', 50),
):
    pd.set_option(_option_name, _option_value)

In [116]:
# NOTE(review): kernel smoke-test cell; it only prints a fixed string and nothing
# else in the notebook reads from it.
print("hello world")

hello world


In [117]:
# Path of the ini file holding the Neo4j connection settings (a [NEO4J] section).
# Set to None, or point at a missing file, to fall back to the built-in defaults.
NEO4J_PROPERTIES_FILE = 'resources/aura-p2p-fd.ini'

In [118]:
## Using an ini file for credentials, otherwise providing defaults
# Defaults used when no properties file is available. DATABASE gets a default
# here as well so the name is always defined, whichever branch runs below.
HOST = 'neo4j://localhost'
USERNAME = 'neo4j'
PASSWORD = 'password'
DATABASE = 'frauddb'

if NEO4J_PROPERTIES_FILE is not None and os.path.exists(NEO4J_PROPERTIES_FILE):
    config = configparser.RawConfigParser()
    config.read(NEO4J_PROPERTIES_FILE)
    neo4j_settings = config['NEO4J']
    # HOST / USERNAME / PASSWORD are required keys; a missing one raises KeyError.
    HOST = neo4j_settings['HOST']
    USERNAME = neo4j_settings['USERNAME']
    PASSWORD = neo4j_settings['PASSWORD']
    # DATABASE is optional in the ini file; keep the default when it is absent.
    DATABASE = neo4j_settings.get('DATABASE', fallback=DATABASE)

    print('Using custom database properties')
else:
    print('Could not find database properties file, using defaults')

Using custom database properties


In [119]:
# Connect the Graph Data Science client using the URI and credentials resolved
# above; adjust those values to match your own Neo4j setup.
# (A plain driver could be created instead with
#  GraphDatabase.driver(HOST, auth=basic_auth(USERNAME, PASSWORD)).)
gds = GraphDataScience(
    HOST,
    auth=(USERNAME, PASSWORD),
    database="frauddb",
)

In [None]:
def clear_graph_by_name(g_name):
    """
    Drop a projected graph from the GDS graph catalog if it is present.

    Parameters:
    g_name (str): The name of the projected graph to remove.

    Returns:
    None. When no projection with that name is in the catalog, nothing is done.
    """
    existence = gds.graph.exists(g_name)
    if not existence.exists:
        # No projection registered under this name, so there is nothing to drop.
        return

    # Look the projection up by name, then hand the graph object to drop().
    projection = gds.graph.get(g_name)
    gds.graph.drop(projection)

def clear_all_graphs():
    """Drop every projected graph currently listed in the GDS graph catalog."""
    # Snapshot the names first; the catalog changes as graphs are dropped.
    catalog_names = gds.graph.list().graphName.tolist()
    for name in catalog_names:
        gds.graph.drop(gds.graph.get(name))

def identifier_degrees(user_label, degree_property):
    """
    Compute degree centrality for identifier nodes and write it to the database.

    A temporary in-memory GDS projection is built from the given user label plus
    the identifier labels 'Card', 'Device' and 'IP'. Degree centrality is computed
    on it and the result is written back to the identifier nodes under
    `degree_property`.

    The degree of a node is the number of direct connections (edges) it has to
    other nodes. The calling cell uses it to count how many users of
    `user_label` are attached to each identifier.

    Parameters:
    user_label (str): Label of the user nodes to include in the projection
        (e.g. 'User' or 'FlaggedUser').
    degree_property (str): Name of the node property that receives the degree
        value (e.g. 'degree' or 'flaggedDegree').

    Returns:
    None. The result is the `degree_property` written on Card/Device/IP nodes.

    Steps:
    1. Drop any leftover projection that still uses the fixed name 'id-projection'.
    2. Project the user and identifier nodes with the HAS_CC, HAS_IP and USED
       relationships in REVERSE orientation.
    3. Run degree centrality in mutate mode (the result stays in the projection).
    4. Write the property back to the database for Card, Device and IP nodes only.
    5. Drop the projection, even if step 3 or 4 raised, so no stale in-memory
       graph is left behind.
    """
    # The projection name is fixed, so a leftover graph from an earlier
    # (possibly failed) run must be removed before projecting again.
    g_name = 'id-projection'
    identifier_labels = ['Card', 'Device', 'IP']

    clear_graph_by_name(g_name)

    # project() returns the graph object plus a result summary; the summary is
    # not needed here, hence the `_` placeholder.
    # NOTE(review): the relationships presumably point from the user to the
    # identifier in the database; REVERSE then makes them originate at the
    # identifier so its degree counts connected users. Confirm against the schema.
    g, _ = gds.graph.project(
        g_name,
        [user_label] + identifier_labels,
        {
            'HAS_CC': {'orientation': 'REVERSE'},
            'HAS_IP': {'orientation': 'REVERSE'},
            'USED': {'orientation': 'REVERSE'},
        },
    )

    try:
        # mutate stores the degree on the in-memory projection only.
        gds.degree.mutate(g, mutateProperty=degree_property)

        # Persist the property to the database, restricted to identifier nodes.
        gds.graph.writeNodeProperties(g, [degree_property], identifier_labels)
    finally:
        # Always free the projection, including when a step above fails.
        g.drop()


In [122]:
# Environment check: print the version of the NumPy package the kernel imports.
import numpy
print(numpy.__version__)

2.2.2


In [None]:
# Total node count per label.
#
# The query calls apoc.meta.stats() (APOC = "Awesome Procedures on Cypher") and
# keeps only its `labels` field, which the query treats as a map of
# label -> node count.
# UNWIND keys(labels) turns that single map into one row per label, so a map like
#   {User: 100, Card: 200, Device: 300, IP: 400}
# is returned as four (nodeLabel, nodeCount) rows instead of one row holding the map.
gds.run_cypher('''
    CALL apoc.meta.stats()
    YIELD labels
    UNWIND keys(labels) AS nodeLabel
    RETURN nodeLabel, labels[nodeLabel] AS nodeCount
''', database='frauddb')

Unnamed: 0,nodeLabel,nodeCount
0,User,33732
1,FlaggedUser,241
2,Device,51451
3,Card,118818
4,IP,585855


In [124]:
# Total relationship count per relationship type: take the `relTypesCount` map
# from apoc.meta.stats() and unwind it into one row per type.
gds.run_cypher('''
    CALL apoc.meta.stats()
    YIELD relTypesCount
    UNWIND keys(relTypesCount) AS relationshipType
    RETURN relationshipType, relTypesCount[relationshipType] AS relationshipCount
''', database='frauddb')

Unnamed: 0,relationshipType,relationshipCount
0,USED,55026
1,HAS_IP,1488949
2,HAS_CC,128066
3,REFERRED,1870
4,P2P,102832


In [125]:
# Distribution of the `fraudMoneyTransfer` property across User nodes
# (one row per distinct value with the number of users carrying it).
gds.run_cypher('''
    MATCH(u:User) RETURN u.fraudMoneyTransfer AS fraudMoneyTransfer, count(u) AS cnt
''', database='frauddb')

Unnamed: 0,fraudMoneyTransfer,cnt
0,0,33491
1,1,241


In [None]:
# Add the FlaggedUser label to every User node whose fraudMoneyTransfer property
# equals 1, and return how many users were matched. This writes to the database.

gds.run_cypher('''
    MATCH(u:User) WHERE u.fraudMoneyTransfer=1 SET u:FlaggedUser RETURN count(u)
''', database='frauddb')

Unnamed: 0,count(u)
0,241


In [129]:
# Use GDS degree centrality to count the number of Users connected to each identifier type - Card, Device, IP
identifier_degrees('User', 'degree')
# Use GDS degree centrality to count the number of FLAGGED Users connected to each identifier type - Card, Device, IP
identifier_degrees('FlaggedUser', 'flaggedDegree')

# Calculate the ratio of flagged users to total users and store it on each
# identifier node as `flaggedRatio`.
# Unlike the earlier query cells, this call passes no `database=` argument, so it
# relies on the database the `gds` client was created with.
# NOTE(review): for an identifier whose `degree` is 0 this divides by 0.0;
# presumably Cypher float division yields NaN here rather than raising - confirm,
# or restrict the update to nodes with n.degree > 0.
gds.run_cypher('''
    MATCH(n) WHERE n:Card OR n:Device OR n:IP
    SET n.flaggedRatio = toFloat(n.flaggedDegree)/toFloat(n.degree)
''')

In [135]:
# Distribution of flaggedRatio over Card nodes that have degree > 1.
# The first MATCH computes the total number of such cards; the second buckets the
# same cards into '0', '1' or 'Between 0-1' by flaggedRatio and reports each
# bucket's count and its share of the total, rounded to 3 decimals.
# Depends on the `degree` and `flaggedRatio` properties written by earlier cells.
print('Flagged User Ratio for Card Count')
gds.run_cypher('''
MATCH (n:Card)
WHERE n.degree > 1
WITH toFloat(count(n)) AS total  // ✅ Calculate total count first

MATCH (n:Card)
WHERE n.degree > 1
WITH 
    CASE 
        WHEN n.flaggedRatio = 0 THEN '0'
        WHEN n.flaggedRatio = 1 THEN '1'
        ELSE 'Between 0-1' 
    END AS flaggedUserRatio, 
    total, 
    count(n) AS count  // ✅ Grouping on flaggedUserRatio

RETURN 
    flaggedUserRatio, 
    count, 
    round(toFloat(count)/total, 3) AS percentCount
ORDER BY flaggedUserRatio;

''')

Flagged User Ratio for Card Count


Unnamed: 0,flaggedUserRatio,count,percentCount
0,0,8185,0.964
1,1,31,0.004
2,Between 0-1,274,0.032
