In [1]:
import pandas as pd
import warnings
from functions import *
import matplotlib.pyplot as plt
import seaborn as sns
from pyvis.network import Network
from IPython.core.display import display, HTML
import dash
from dash import dcc, html, Input, Output
import dash_cytoscape as cyto
from jupyter_dash import JupyterDash 
from dash import dcc, html
import plotly.graph_objects as go

In [2]:
# Functions

def create_network_graph(df, node_columns, node_feature):
    """
    Creates a network graph from the provided DataFrame.

    Args:
    - df (pd.DataFrame): DataFrame containing the data.
    - node_columns (list of str): Columns in the DataFrame to be used as nodes.
    - node_feature (list of str): Columns in the DataFrame to be used as node features.

    Returns:
    - G (nx.Graph): The generated network graph.
    """
    # Initialize the graph
    G = nx.Graph()

    # Add nodes for each unique identifier in the specified columns
    for column in node_columns:
        for value in df[column].unique():
            # Initialize the node with default feature values
            G.add_node(value, node_type=column, is_fraud=False)

    # Update node attributes based on the DataFrame, ensuring once 'is_fraud' is True, it remains True
    for _, row in df.iterrows():
        # Function to update the is_fraud attribute, ensuring it cannot change from True to False
        def update_node_fraud_status(node):
            if G.nodes[node]['is_fraud'] == False and row['is_fraud'] == True:
                G.nodes[node]['is_fraud'] = True
        
        # Update fraud status for nodes
        for column in node_columns:
            update_node_fraud_status(row[column])

    # Add edges based on relationships in the dataframe, avoiding self-edges
    for _, row in df.iterrows():
        # Define a function to add edges while avoiding self-edges
        def add_edge_no_self(source, target):
            if source != target:
                G.add_edge(source, target)

        # Connecting customer_id to other associated entities, avoiding self-edges
        for column in node_columns[1:]:  # Skip the first column 'customer_id'
            add_edge_no_self(row[node_columns[0]], row[column])

    return G



def get_disjoint_subgraphs(G):
    """
    Returns all disjoint subgraphs from a given graph G.

    Args:
    - G (nx.Graph): The input network graph.

    Returns:
    - List of nx.Graph: A list of disjoint subgraphs.
    """
    # Get all connected components in the graph
    connected_components = nx.connected_components(G)
    
    # Create a list of subgraphs for each connected component
    disjoint_subgraphs = [G.subgraph(component).copy() for component in connected_components]
    
    return disjoint_subgraphs

def plot_graph_colored_by_feature(G, node_feature='is_fraud', show_labels=False, node_size=70):
    """
    Plots the graph G with nodes colored by a boolean node feature.

    Args:
    - G (nx.Graph): The input network graph.
    - node_feature (str): The node attribute used for coloring the nodes (default is 'is_fraud').
    - show_labels (bool): Whether to display node labels (default is False).
    - node_size (int): The size of the nodes in the plot (default is 70).

    Returns:
    - None
    """
    # Extract node colors based on the node_feature (bool)
    node_colors = ['red' if G.nodes[node][node_feature] else 'green' for node in G.nodes]

    # Draw the graph
    plt.figure(figsize=(24, 16))
    pos = nx.kamada_kawai_layout(G)  # Position nodes using Kamada-Kawai path-length cost-function
    #pos = nx.spring_layout(G, seed=42)  # Position nodes using Fruchterman-Reingold force-directed algorithm

    # Draw nodes with color mapping
    nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=node_size, alpha=0.8)

    # Draw edges
    nx.draw_networkx_edges(G, pos, alpha=0.5)

    # Conditionally draw labels
    if show_labels:
        nx.draw_networkx_labels(G, pos, font_size=10, font_family='sans-serif')

    # Display plot
    plt.title(f"Network Graph Colored by '{node_feature}'")
    plt.grid(False)
    plt.show()
    

def compute_node_centrality(G):
    """
    Computes and returns node centrality measures for the given graph G.

    Args:
    - G (nx.Graph): The input network graph.

    Returns:
    - dict: A dictionary with centrality measures for each node.
    """
    # Compute different centrality measures
    centrality_measures = {
        'degree_centrality': nx.degree_centrality(G),
        'betweenness_centrality': nx.betweenness_centrality(G),
        'closeness_centrality': nx.closeness_centrality(G),
        'eigenvector_centrality': nx.eigenvector_centrality(G)
    }

    return centrality_measures

def go_visualize_graph(G):
    # Adjusting Kawai layout with better parameters
    pos = nx.kamada_kawai_layout(G)

    # Normalize positions to balance the spacing
    def normalize_positions(pos_dict, scale=1.0):
        min_x = min(pos[0] for pos in pos_dict.values())
        max_x = max(pos[0] for pos in pos_dict.values())
        min_y = min(pos[1] for pos in pos_dict.values())
        max_y = max(pos[1] for pos in pos_dict.values())
        
        for key in pos_dict:
            pos_dict[key] = ((pos_dict[key][0] - min_x) / (max_x - min_x) * scale,
                             (pos_dict[key][1] - min_y) / (max_y - min_y) * scale)
    
    normalize_positions(pos, scale=10.0)  # Adjust scale to balance inner and outer spacing

    # Define unique colors for different node types
    node_type_colors = {
        'customer_id': 'blue',
        'end_customer_id': 'green',
        'account_hash': 'purple',
        'email': 'orange',
        'given_names': 'red',
        'middle_name': 'cyan',
        'surname': 'magenta'
    }

    # Define node shapes and colors
    node_shapes = []
    node_colors = []
    hover_texts = []

    for node in G.nodes():
        node_type = G.nodes[node]['node_type']
        hover_texts.append(f"Node: {node}, Type: {node_type}")
        if G.nodes[node]['is_fraud']:
            node_colors.append('black')  # Color for fraudulent nodes
            node_shapes.append('triangle-up')  # Shape for fraudulent nodes
        else:
            node_colors.append(node_type_colors.get(node_type, 'LightSkyBlue'))
            node_shapes.append('circle')

    # Create Plotly scatter plot for nodes
    nodes = go.Scatter(
        x=[pos[node][0] for node in G.nodes()],
        y=[pos[node][1] for node in G.nodes()],
        mode='markers',
        marker=dict(
            size=10,
            color=node_colors,
            symbol=node_shapes
        ),
        text=hover_texts,
        hoverinfo='text'
    )

    # Create Plotly scatter plot for edges
    edges = []
    for edge in G.edges():
        edge_trace = go.Scatter(
            x=[pos[edge[0]][0], pos[edge[1]][0], None],
            y=[pos[edge[0]][1], pos[edge[1]][1], None],
            mode='lines',
            line=dict(width=1, color='black'),
            hoverinfo='none'
        )
        edges.append(edge_trace)

    # Create a Dash app
    app = JupyterDash(__name__)

    # Define the app layout
    app.layout = html.Div([
        dcc.Dropdown(
            id='node-type-filter',
            options=[{'label': node_type, 'value': node_type} for node_type in set(nx.get_node_attributes(G, 'node_type').values())],
            multi=True,
            value=list(set(nx.get_node_attributes(G, 'node_type').values()))
        ),
        dcc.Graph(id='network-graph'),
    ])

    @app.callback(
        Output('network-graph', 'figure'),
        Input('node-type-filter', 'value')
    )
    def update_graph(selected_types):
        filtered_nodes = [node for node in G.nodes() if G.nodes[node]['node_type'] in selected_types]
        filtered_pos = {node: pos[node] for node in filtered_nodes}

        filtered_node_colors = [node_colors[list(G.nodes()).index(node)] for node in filtered_nodes]
        filtered_node_shapes = [node_shapes[list(G.nodes()).index(node)] for node in filtered_nodes]
        filtered_hover_texts = [hover_texts[list(G.nodes()).index(node)] for node in filtered_nodes]

        filtered_nodes_trace = go.Scatter(
            x=[filtered_pos[node][0] for node in filtered_nodes],
            y=[filtered_pos[node][1] for node in filtered_nodes],
            mode='markers',
            marker=dict(
                size=10,
                color=filtered_node_colors,
                symbol=filtered_node_shapes
            ),
            text=filtered_hover_texts,
            hoverinfo='text'
        )

        return go.Figure(
            data=edges + [filtered_nodes_trace],
            layout=go.Layout(
                title='NetworkX Graph Visualization',
                showlegend=False,
                hovermode='closest',
                margin=dict(b=0, l=0, r=0, t=40),
                width=2000,
                height=2000,
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
            )
        )

    # Run the app inline in the notebook
    app.run_server(debug=True)


In [3]:
# Read the data
df1 = read_excel('idtobank.xlsx')
df2 = read_excel('hashtoid.xlsx')
fraud_data = read_excel('fraud.xlsx', sheet_name='Detail data')

df1 = preprocess_dataframe(df1)
df2 = preprocess_dataframe(df2)

df = pd.concat([df1, df2], ignore_index=True)
fraud_ids = fraud_data['customer_id'].unique()
df['is_fraud'] = df['customer_id'].isin(fraud_ids).astype(object)
df['amount'] = pd.to_numeric(df['amount'], errors='coerce')

describe_dataframe(df)

Number of rows: 4063
Number of columns: 57

Column names and data types:
success_redirect_url                object
account_details                     object
expires_at                          object
client                              object
fee                                 object
refunded_amount                     object
payment_channel_transaction_id      object
linked_account_id                   object
is_otp_required                     object
otp_mobile_number                   object
settlement_date                     object
failure_code                        object
callback_url                        object
status                              object
channel_account_reference           object
updated                             object
version                             object
checkout_url                        object
vat                                 object
client_type                         object
id                                  object
status2                 

In [4]:
df.columns

Index(['success_redirect_url', 'account_details', 'expires_at', 'client',
       'fee', 'refunded_amount', 'payment_channel_transaction_id',
       'linked_account_id', 'is_otp_required', 'otp_mobile_number',
       'settlement_date', 'failure_code', 'callback_url', 'status',
       'channel_account_reference', 'updated', 'version', 'checkout_url',
       'vat', 'client_type', 'id', 'status2',
       'payment_channel_verification_id', 'transacting_entity',
       'idempotency_key', 'ledger_transaction_id', 'connector_metadata',
       'time', 'given_names', 'business_id', 'end_customer_id', 'created',
       'middle_name', 'amount', 'email', 'channel_code', 'client_reference',
       'failure_redirect_url', 'customer_id', 'required_action', 'surname',
       'payment_channel_reference_id', 'entity', 'type', 'account_hash', 'dt',
       'account_type', 'bank_acc', 'otp_expiration_timestamp', 'currency',
       'payment_method_id', 'mobile_number', 'reference_id', 'enable_otp',
       'd

In [5]:
df_copy = df.copy()
df= df[['customer_id', 'end_customer_id', 'account_hash', 'amount', 'email', 'given_names', 'middle_name', 'surname','is_fraud']]
df['is_fraud'] = df['is_fraud'].astype(bool)

describe_dataframe(df)
print(df.head())

Number of rows: 4063
Number of columns: 9

Column names and data types:
customer_id         object
end_customer_id     object
account_hash        object
amount             float64
email               object
given_names         object
middle_name         object
surname             object
is_fraud              bool
dtype: object

Number of NaN values per column (sorted by highest to lowest):
middle_name        1924
surname            1619
email              1320
end_customer_id    1315
given_names        1315
customer_id         869
account_hash          0
amount                0
is_fraud              0
dtype: int64

Number of unique values per column (sorted by highest to lowest):
account_hash       693
customer_id        650
amount             545
end_customer_id    481
given_names        425
surname            399
middle_name        383
email              347
is_fraud             2
dtype: int64

Basic statistics for numeric columns:
             amount
count   4063.000000
mean   10556

In [6]:
# Create the network graph
node_columns = ['customer_id', 'end_customer_id', 'account_hash', 'email', 'given_names', 'middle_name', 'surname']
node_feature = ['is_fraud']

G = create_network_graph(df, node_columns, node_feature)

mapping = {node: str(node) for node in G.nodes()}
G = nx.relabel_nodes(G, mapping)

In [7]:
disjoint_subgraphs = get_disjoint_subgraphs(G)
print(f"Number of disjoint subgraphs: {len(disjoint_subgraphs)}")
print('\n')



# Output the number of disjoint subgraphs and their nodes to verify
for i, subgraph in enumerate(disjoint_subgraphs):
    print(f"Subgraph {i+1}:")
    print(subgraph)
    print("-----")

Number of disjoint subgraphs: 61


Subgraph 1:
Graph with 2365 nodes and 2795 edges
-----
Subgraph 2:
Graph with 10 nodes and 9 edges
-----
Subgraph 3:
Graph with 50 nodes and 49 edges
-----
Subgraph 4:
Graph with 7 nodes and 6 edges
-----
Subgraph 5:
Graph with 7 nodes and 6 edges
-----
Subgraph 6:
Graph with 7 nodes and 6 edges
-----
Subgraph 7:
Graph with 8 nodes and 7 edges
-----
Subgraph 8:
Graph with 7 nodes and 6 edges
-----
Subgraph 9:
Graph with 12 nodes and 11 edges
-----
Subgraph 10:
Graph with 9 nodes and 8 edges
-----
Subgraph 11:
Graph with 7 nodes and 6 edges
-----
Subgraph 12:
Graph with 7 nodes and 6 edges
-----
Subgraph 13:
Graph with 9 nodes and 8 edges
-----
Subgraph 14:
Graph with 13 nodes and 12 edges
-----
Subgraph 15:
Graph with 13 nodes and 12 edges
-----
Subgraph 16:
Graph with 7 nodes and 6 edges
-----
Subgraph 17:
Graph with 8 nodes and 7 edges
-----
Subgraph 18:
Graph with 12 nodes and 11 edges
-----
Subgraph 19:
Graph with 7 nodes and 6 edges
-----
Subgrap

In [8]:
# for subgraph in disjoint_subgraphs:
#     plot_graph_colored_by_feature(subgraph)

In [9]:
# Compute node graph metrics
centrality = compute_node_centrality(G)


In [10]:
G = disjoint_subgraphs[0]  # Make sure you assign the correct subgraph here

go_visualize_graph(G)

Dash app running on http://127.0.0.1:8050/


In [11]:
print(disjoint_subgraphs[0])

Graph with 2365 nodes and 2795 edges
