In [166]:
"""Overall Summary of the Application
This application is a fraud detection and analysis system using Neo4j's Graph Data Science (GDS) library. It focuses on detecting fraudulent activities by analyzing relationships between users, devices, credit cards, and transactions in a graph database.

The application:

1) Connects to a Neo4j database (frauddb) to analyze financial transactions and user behavior.
2) Projects in-memory graphs using Neo4j's GDS library for running graph algorithms efficiently.
3) Computes centrality metrics (degree centrality) to identify users and entities (e.g., devices, IPs, and cards) that are highly connected and potentially high-risk.
4) Flags fraudulent users based on their transaction history and connections, assigning them the FlaggedUser label.
5) Calculates fraud ratios for various financial entities, such as cards and devices, determining the proportion of fraudulent activity within each category.
6) Applies the Louvain Community Detection algorithm to group users into communities based on their interactions, helping uncover fraud rings or coordinated attack groups.
7) Analyzes and ranks communities based on fraud prevalence, identifying clusters with a high proportion of fraudulent activity.
8) Outputs fraud-related insights, including flagged user ratios, community detection results, and risk scores for different financial identifiers.

This system is designed to identify fraudulent behavior, detect hidden fraud rings, and prioritize high-risk entities for investigation, helping prevent financial fraud and reduce risks in financial transactions.

"""

"Overall Summary of the Application\nThis application is a fraud detection and analysis system using Neo4j's Graph Data Science (GDS) library. It focuses on detecting fraudulent activities by analyzing relationships between users, devices, credit cards, and transactions in a graph database.\n\nThe application:\n\n1) Connects to a Neo4j database (frauddb) to analyze financial transactions and user behavior.\n2) Projects in-memory graphs using Neo4j's GDS library for running graph algorithms efficiently.\n3) Computes centrality metrics (degree centrality) to identify users and entities (e.g., devices, IPs, and cards) that are highly connected and potentially high-risk.\n4) Flags fraudulent users based on their transaction history and connections, assigning them the FlaggedUser label.\n5) Calculates fraud ratios for various financial entities, such as cards and devices, determining the proportion of fraudulent activity within each category.\n6) Applies the Louvain Community Detection algo

In [165]:
import pandas as pd
import configparser
import os
from neo4j import GraphDatabase, basic_auth
from graphdatascience import GraphDataScience
# Notebook-wide pandas display settings, applied one (option, value) pair at a time.
for _option_name, _option_value in (
    ('display.width', 0),
    ('display.max_colwidth', 500),
    ('display.max_rows', 50),
):
    pd.set_option(_option_name, _option_value)

In [146]:
# Presumably a kernel smoke test; it has no effect on the analysis below.
print("hello world")

hello world


In [147]:
# Path of the ini file holding the Neo4j connection settings. The next cell
# falls back to built-in defaults when this is None or the file does not exist.
NEO4J_PROPERTIES_FILE = 'resources/aura-p2p-fd.ini'

In [148]:
## Using an ini file for credentials, otherwise providing defaults.
# Every setting gets a default up front so that all four names exist on both
# branches below (DATABASE used to be bound only when the ini file was found).
HOST = 'neo4j://localhost'
USERNAME = 'neo4j'
PASSWORD = 'password'
DATABASE = 'frauddb'

if NEO4J_PROPERTIES_FILE is not None and os.path.exists(NEO4J_PROPERTIES_FILE):
    # RawConfigParser does no value interpolation, so entries are read verbatim.
    config = configparser.RawConfigParser()
    config.read(NEO4J_PROPERTIES_FILE)
    # HOST, USERNAME and PASSWORD are required: a missing [NEO4J] section or
    # key raises KeyError rather than silently connecting with defaults.
    neo4j_settings = config['NEO4J']
    HOST = neo4j_settings['HOST']
    USERNAME = neo4j_settings['USERNAME']
    PASSWORD = neo4j_settings['PASSWORD']
    # DATABASE is optional in the ini file; keep the default when it is absent.
    DATABASE = neo4j_settings.get('DATABASE', DATABASE)

    print('Using custom database properties')
else:
    print('Could not find database properties file, using defaults')

Using custom database properties


In [149]:
# Use Neo4j URI and credentials according to your setup
# Alternative kept for reference: the plain Neo4j driver instead of the GDS client.
# gds = GraphDatabase.driver(HOST, auth=basic_auth(USERNAME, PASSWORD))
# NOTE(review): the database name is hard-coded here even though the previous
# cell reads DATABASE from the ini file when it exists - confirm which is intended.
gds = GraphDataScience(HOST, auth=(USERNAME, PASSWORD), database="frauddb")

In [None]:
def clear_graph_by_name(g_name):
    """
    Drop the named graph projection from the GDS catalog if it is present.

    Parameters:
    g_name (str): Name of the projected graph to remove.

    Returns:
    None. Nothing happens when no projection with that name exists.
    """
    catalog_entry = gds.graph.exists(g_name)
    if not catalog_entry.exists:
        # Nothing projected under this name, so there is nothing to free.
        return

    # Look up the graph object for the name and drop it from the catalog.
    gds.graph.drop(gds.graph.get(g_name))

def clear_all_graphs():
    """
    Drop every graph projection currently listed in the GDS catalog.

    The names are read once from `gds.graph.list()`; each one is then looked
    up and dropped in turn.
    """
    for projection_name in gds.graph.list().graphName.tolist():
        gds.graph.drop(gds.graph.get(projection_name))

def identifier_degrees(user_label, degree_property):
    """
    Compute degree centrality for Card, Device and IP nodes against one kind of
    user node and write the result back to the database.

    A temporary in-memory projection named 'id-projection' is created from the
    `user_label` nodes plus the Card, Device and IP nodes, using the HAS_CC,
    HAS_IP and USED relationships in REVERSE orientation. Degree centrality is
    computed on that projection and written to the database for the Card,
    Device and IP nodes only. The projection is always dropped afterwards,
    including when the computation or the write fails.

    Parameters:
    user_label (str): Label of the user nodes to include in the projection
        (e.g. 'User' or 'FlaggedUser').
    degree_property (str): Name of the node property that receives the degree
        value (e.g. 'degree' or 'flaggedDegree').

    Returns:
    None. The result is the `degree_property` written on the database nodes.

    Raises:
    Whatever the GDS client raises for a failed projection, algorithm run or
    write; such errors are not caught here.
    """
    # A projection is an in-memory copy of part of the database graph that GDS
    # algorithms run on. The name is fixed, so any leftover from an earlier
    # (possibly interrupted) run is removed first to avoid a name clash.
    g_name = 'id-projection'
    identifier_labels = ['Card', 'Device', 'IP']
    clear_graph_by_name(g_name)

    # The second return value (projection statistics) is not needed.
    # REVERSE flips each relationship inside the projection; presumably the
    # relationships run from user to identifier in the database, so that the
    # degree of an identifier becomes the number of users attached to it.
    g, _ = gds.graph.project(
        g_name,
        [user_label] + identifier_labels,
        {
            'HAS_CC': {'orientation': 'REVERSE'},
            'HAS_IP': {'orientation': 'REVERSE'},
            'USED': {'orientation': 'REVERSE'},
        },
    )

    try:
        # mutate stores the degree on the in-memory projection only; the
        # database is not touched until the explicit write below.
        gds.degree.mutate(g, mutateProperty=degree_property)

        # Persist the property for the identifier nodes only. The degree of a
        # node is the number of direct connections (edges) it has.
        gds.graph.writeNodeProperties(g, [degree_property], identifier_labels)
    finally:
        # Free the projection even if the steps above raised, so a failed run
        # does not leave 'id-projection' behind in the catalog.
        g.drop()


In [151]:
# Environment check: print the installed NumPy version.
import numpy
print(numpy.__version__)

2.2.2


In [152]:
# APOC stands for "Awesome Procedures on Cypher".
# apoc.meta.stats() is a procedure that returns metadata about the Neo4j database.
# YIELD labels keeps only the `labels` field of the apoc.meta.stats() result.
# UNWIND keys(labels) AS nodeLabel - converts the label map into one row per label.
# Without UNWIND, the data would remain a single row, e.g.:
# {User: 100, Card: 200, Device: 300, IP: 400}
# UNWIND therefore gives individual access to each label and its count.

# total node counts
gds.run_cypher('''
    CALL apoc.meta.stats()
    YIELD labels
    UNWIND keys(labels) AS nodeLabel
    RETURN nodeLabel, labels[nodeLabel] AS nodeCount
''', database='frauddb')

Unnamed: 0,nodeLabel,nodeCount
0,User,33732
1,FlaggedUser,241
2,Device,51451
3,Card,118818
4,IP,585855


In [153]:
# Total relationship counts: one row per relationship type, read from the
# relTypesCount map returned by apoc.meta.stats().
gds.run_cypher('''
    CALL apoc.meta.stats()
    YIELD relTypesCount
    UNWIND keys(relTypesCount) AS relationshipType
    RETURN relationshipType, relTypesCount[relationshipType] AS relationshipCount
''', database='frauddb')

Unnamed: 0,relationshipType,relationshipCount
0,USED,55026
1,HAS_IP,1488949
2,HAS_CC,128066
3,REFERRED,1870
4,P2P,102832


In [154]:
# Count User nodes per distinct value of their fraudMoneyTransfer property.
gds.run_cypher('''
    MATCH(u:User) RETURN u.fraudMoneyTransfer AS fraudMoneyTransfer, count(u) AS cnt
''', database='frauddb')

Unnamed: 0,fraudMoneyTransfer,cnt
0,0,33491
1,1,241


In [155]:
# Adds the FlaggedUser label to every User node whose fraudMoneyTransfer
# property equals 1 and returns how many users matched.
# The later FlaggedUser degree computation relies on this label being present.

gds.run_cypher('''
    MATCH(u:User) WHERE u.fraudMoneyTransfer=1 SET u:FlaggedUser RETURN count(u)
''', database='frauddb')

Unnamed: 0,count(u)
0,241


In [156]:
# Use GDS degree centrality to count the number of Users connected to each identifier type - Card, Device, IP
identifier_degrees('User', 'degree')
# Use GDS degree centrality to count the number of FLAGGED Users connected to each identifier type - Card, Device, IP
identifier_degrees('FlaggedUser', 'flaggedDegree')

# Calculate, for every Card, Device and IP node, the ratio of flagged users to
# total users and store it on the node as flaggedRatio.
# NOTE(review): a node whose `degree` is 0 would divide by zero here - confirm
# that every identifier node has at least one connected user.
# NOTE(review): unlike the other queries, no `database=` argument is passed, so
# this presumably runs against the database the client was created with.
gds.run_cypher('''
    MATCH(n) WHERE n:Card OR n:Device OR n:IP
    SET n.flaggedRatio = toFloat(n.flaggedDegree)/toFloat(n.degree)
''')

In [157]:
# Distribution of Card nodes with degree > 1, bucketed by flaggedRatio:
# exactly 0, exactly 1, or anything else ('Between 0-1').
# The first MATCH counts those cards (`total`); the second groups them by bucket,
# and percentCount is each bucket's share of `total`, rounded to 3 decimals.
# NOTE(review): the `//` comments inside the query contain mis-encoded characters;
# they are Cypher comments inside the query text and are left as they are.
print('Flagged User Ratio for Card Count')
gds.run_cypher('''
MATCH (n:Card)
WHERE n.degree > 1
WITH toFloat(count(n)) AS total  // âœ… Calculate total count first

MATCH (n:Card)
WHERE n.degree > 1
WITH 
    CASE 
        WHEN n.flaggedRatio = 0 THEN '0'
        WHEN n.flaggedRatio = 1 THEN '1'
        ELSE 'Between 0-1' 
    END AS flaggedUserRatio, 
    total, 
    count(n) AS count  // âœ… Grouping on flaggedUserRatio

RETURN 
    flaggedUserRatio, 
    count, 
    round(toFloat(count)/total, 3) AS percentCount
ORDER BY flaggedUserRatio;

''')

Flagged User Ratio for Card Count


Unnamed: 0,flaggedUserRatio,count,percentCount
0,0,8185,0.964
1,1,31,0.004
2,Between 0-1,274,0.032


In [None]:
"""
    Why is This Useful in Fraud Detection?
    - Identifies Devices Linked to Fraudsters - Helps find shared devices between flagged & unflagged users.
    - Detects High-Risk Devices - If many flagged users share the same device, it's suspicious.
    - Enhances Anomaly Detection - Helps monitor fraud rings using common devices.
"""

# Distribution of Device nodes with degree > 1, bucketed by flaggedRatio:
# exactly 0, exactly 1, or anything else ('Between 0-1').
# The first MATCH counts those devices (`total`); the second groups them by bucket,
# and percentCount is each bucket's share of `total`, rounded to 3 decimals.
print('Flagged User Ratio for Device Count')
gds.run_cypher('''
    MATCH (n:Device) 
    WHERE n.degree > 1
    WITH toFloat(count(n)) AS total  // Compute total first

    MATCH (n:Device) 
    WHERE n.degree > 1
    WITH n, total, 
        CASE 
            WHEN n.flaggedRatio = 0 THEN '0'
            WHEN n.flaggedRatio = 1 THEN '1'
            ELSE 'Between 0-1' 
        END AS flaggedUserRatio
    WITH flaggedUserRatio, total, count(n) AS count  // Group results before return
    RETURN flaggedUserRatio, count, round(toFloat(count)/total, 3) AS percentCount
    ORDER BY flaggedUserRatio;
''')

Flagged User Ratio for Device Count


Unnamed: 0,flaggedUserRatio,count,percentCount
0,0,2655,0.967
1,1,2,0.001
2,Between 0-1,88,0.032


In [None]:
# Projects a graph for community detection, runs the Louvain algorithm on it and
# stores the detected community id of each node in the Neo4j database.

# Remove any projection left over from an earlier run to avoid a name clash.
clear_graph_by_name('comm-projection')

# Create a graph projection for community detection.
#   'comm-projection'             - name of the in-memory graph projection
#   ['User', 'Card', 'Device']    - only these node labels are included
#   {'HAS_CC', 'USED', 'P2P'}     - only these relationship types, each with
#                                   its own orientation
#
# Why UNDIRECTED for HAS_CC and USED?
#   Fraud analysis often looks at shared resources (e.g. several users using
#   the same card or device). Making these relationships undirected lets the
#   algorithm group such entities together regardless of who initiated the
#   connection.
#
# Why NATURAL for P2P?
#   P2P (person-to-person) transactions have a natural direction
#   (sender -> receiver), which is kept in the projection.
#   'aggregation': 'SINGLE' keeps a single relationship where several parallel
#   P2P relationships exist between the same pair of nodes.
g, _ = gds.graph.project('comm-projection', ['User', 'Card', 'Device'], {
    'HAS_CC': {'orientation': 'UNDIRECTED'},
    'USED': {'orientation': 'UNDIRECTED'},
    'P2P': {'orientation': 'NATURAL', 'aggregation': 'SINGLE'}
})

# Run Louvain on the in-memory graph to find communities (clusters) of closely
# connected nodes, writing each node's community id to the database property
# louvainCommunityId. The projection is dropped even if the run fails, so an
# error does not leave 'comm-projection' behind in the catalog.
try:
    df = gds.louvain.write(g, writeProperty='louvainCommunityId')
finally:
    g.drop()
df

writeMillis                                                                                                                                                                                                                                                                                                                                                                                                                 156
nodePropertiesWritten                                                                                                                                                                                                                                                                                                                                                                                                    204001
modularity                                                                                                                                                              

In [None]:
print("Louvain Communities Ordered by count of Flagged Users")

"""
1) This retrieves all User nodes in the database.
Each User has already been assigned a Louvain community (louvainCommunityId) from the Louvain community detection algorithm.

2) Groups all users by louvainCommunityId (each user belongs to a detected fraud community).

3) Compute the Ratio of Fraudsters in Each Community

4) Sort and Limit to the Top 100 Fraudulent Communities

Why is This Useful for Fraud Detection?
- Identifies Fraud Rings - Communities with high fraud ratios are likely organized fraud groups.
- Prioritizes Investigations - Sorting by fraud count ensures investigators focus on the worst offenders first.
- Detects Hidden Patterns - Users may not be directly connected but belong to the same fraudulent network.

"""

# Per community: cnt is the number of users, flaggedCount is the sum of their
# fraudMoneyTransfer values (the earlier count query shows only 0 and 1, so this
# is the number of flagged users), and flaggedRatio is flaggedCount / cnt.
# Rows are ordered by flaggedCount (not by ratio) and capped at 100.
gds.run_cypher('''
    MATCH (u:User)
    WITH u.louvainCommunityId AS community,
        count(u) AS cnt,
        sum(u.fraudMoneyTransfer) as flaggedCount
    RETURN community,
        cnt,
        flaggedCount,
        toFloat(flaggedCount)/toFloat(cnt) AS flaggedRatio
    ORDER BY flaggedCount DESC LIMIT 100
''')

Louvain Communities Ordered by count of Flagged Users


Unnamed: 0,community,cnt,flaggedCount,flaggedRatio
0,200458,155,7,0.045161
1,182733,7,4,0.571429
2,188313,6,3,0.500000
3,173038,240,3,0.012500
4,201222,7,3,0.428571
...,...,...,...,...
95,172511,3,1,0.333333
96,174011,20,1,0.050000
97,173401,5,1,0.200000
98,172844,2,1,0.500000
