In [1]:
# %% Importing Libraries
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm

# %% Load your dataset into a pandas DataFrame
# The path is relative to the working directory and uses a Windows-style
# separator. The same file is written by the `to_csv` cell near the end of
# this notebook, so that cell must have been run at least once beforehand.
df = pd.read_csv('Small_test_data\\New_data_2.csv')
# Show the available column names.
print(df.columns)

Index(['Sender_account', 'Receiver_account', 'Amount', 'Payment_currency',
       'Received_currency', 'Sender_bank_location', 'Receiver_bank_location',
       'Payment_type', 'Is_laundering', 'Laundering_type', 'Year', 'Month',
       'Day', 'Week'],
      dtype='object')


In [2]:
# Initialize a directed graph (a DiGraph keeps one edge per ordered
# sender -> receiver pair).
G = nx.DiGraph()

# Iterate over the needed columns directly instead of `df.iterrows()`.
# `iterrows()` builds a Series per row, which is slow and does not preserve
# dtypes: on a frame mixing int and float columns the integer account ids and
# the 0/1 label come back as floats. Iterating the columns keeps native types.
# NOTE: if the same (sender, receiver) pair occurs in several rows, the edge
# attributes of the last such row overwrite the earlier ones.
edge_columns = zip(
    df['Sender_account'],
    df['Receiver_account'],
    df['Amount'],
    df['Is_laundering'],
)

# Add edges to the graph with tqdm for progress
for sender, receiver, amount, label in tqdm(edge_columns, total=len(df), desc="Building Graph"):
    # Add the edge from sender to receiver, storing the transaction data
    G.add_edge(sender, receiver, amount=amount, label=label)

Building Graph:   0%|          | 0/157968 [00:00<?, ?it/s]

Building Graph: 100%|██████████| 157968/157968 [00:07<00:00, 21046.01it/s]


In [None]:
import random
import matplotlib.pyplot as plt
import networkx as nx
from tqdm import tqdm

# Colour each edge by its 'label' attribute: 1 -> red, anything else -> black.
# `G.edges(data='label')` yields (u, v, label) in the same order as `G.edges()`,
# which is the order `nx.draw` uses, so colours line up with edges. A missing
# 'label' attribute yields None and is drawn black.
edge_colors = ['red' if label == 1 else 'black' for _, _, label in G.edges(data='label')]

# NOTE(review): the layout below is computed for the whole graph, which may be
# very slow on a large graph. To plot only a sample, draw
# `G.subgraph(random.sample(list(G.nodes()), k))` instead of `G`.
plt.figure(figsize=(10, 10))
pos = nx.spring_layout(G, seed=42)  # fixed seed -> identical layout on every run

# Draw the nodes and edges (no node labels, so no font size is needed)
nx.draw(G, pos, with_labels=False, node_color='blue', edge_color=edge_colors, node_size=10)

# The full graph is drawn without text labels, so the title says exactly that.
plt.title("Transaction Graph (edges with label 1 in red)")
plt.show()


In [None]:
import networkx as nx
from tqdm import tqdm

# Function to find all simple paths with a fixed number of edges using DFS
def find_paths_of_length_4(graph, num_edges=5):
    """Enumerate every simple directed path in ``graph`` with ``num_edges`` edges.

    NOTE: despite the function name, the default is 5 edges (6 nodes), which is
    the depth this function has always collected. Pass ``num_edges=4`` to get
    paths with 4 edges (5 nodes).

    Args:
        graph: Object exposing ``nodes()`` and ``neighbors(node)``, e.g. a
            networkx ``DiGraph``.
        num_edges: Number of edges every returned path must have (>= 0).

    Returns:
        A list of paths in DFS discovery order; each path is a list of
        ``num_edges + 1`` distinct nodes.

    Raises:
        ValueError: If ``num_edges`` is negative.
    """
    if num_edges < 0:
        raise ValueError("num_edges must be non-negative")

    # A path with k edges visits k + 1 nodes.
    target_nodes = num_edges + 1
    paths_of_length_4 = []

    # Helper that extends `path` depth-first until it reaches the target size
    def dfs(current_node, path):
        if len(path) == target_nodes:
            paths_of_length_4.append(path)
            return

        # Visit all successors of the current node
        for neighbor in graph.neighbors(current_node):
            if neighbor not in path:  # Never revisit a node (simple paths only)
                # `path + [neighbor]` creates a new list, so stored paths are
                # never mutated by later recursion.
                dfs(neighbor, path + [neighbor])

    # Start a search from every node in the graph
    for node in tqdm(graph.nodes(), desc=f"Finding Paths of Length {num_edges}"):
        dfs(node, [node])

    return paths_of_length_4

# Find all simple paths with 5 edges: find_paths_of_length_4 collects paths of
# 6 nodes. The variable keeps its historical name because later cells use it.
paths_of_length_4 = find_paths_of_length_4(G)

# Report the length that was actually searched (5 edges), not the "4" in the name.
print(f"Number of paths of length 5: {len(paths_of_length_4)}")


In [None]:
# Check whether all nodes of each path lie in one strongly connected component
def check_paths_connectivity(graph, paths):
    """Print every path whose nodes all belong to the same SCC of ``graph``.

    Args:
        graph: Directed graph accepted by ``nx.strongly_connected_components``.
        paths: Iterable of paths, each a sequence of nodes.

    Returns:
        None. One line ``Path <path> is connected.`` is printed per matching
        path. Empty paths and paths containing a node that is not in the graph
        are skipped.
    """
    # Map each node to the index of its component once, so every path is
    # checked in time proportional to its length instead of scanning all
    # components for every path.
    component_of = {
        node: component_id
        for component_id, component in enumerate(nx.strongly_connected_components(graph))
        for node in component
    }

    for path in tqdm(paths, desc="Checking Connectivity"):
        if not path:
            continue
        first_component = component_of.get(path[0])
        # `None` means the first node is unknown to the graph -> no component.
        if first_component is not None and all(
            component_of.get(node) == first_component for node in path[1:]
        ):
            print(f"Path {path} is connected.")
# Report which of the previously enumerated paths stay inside a single
# strongly connected component of G
check_paths_connectivity(graph=G, paths=paths_of_length_4)

In [10]:
import torch

In [None]:
# Load the serialized graph object. The backslash is escaped explicitly: the
# previous "\g" is an invalid escape sequence that only worked by accident and
# triggers a warning on recent Python versions. The resulting path is unchanged.
# NOTE(review): the variable is named *_train but the file is graph_data_test.pt — confirm.
# NOTE: torch.load unpickles the file; only load files from a trusted source.
graph_data_train = torch.load("Small_test_data\\graph_data_test.pt")

In [None]:
# Size of dimension 1 of the edge-attribute tensor
# (presumably the number of features per edge — TODO confirm).
graph_data_train.edge_attr.size(1)

In [None]:
# Inspect the `keys` attribute of the loaded object.
# NOTE(review): if `keys` is a method in the installed library version, this
# shows the bound method rather than the key names — confirm.
graph_data_train.keys

In [None]:
# Create DataLoader
# NOTE(review): `DataLoader` is not imported in any cell visible in this
# notebook (only `torch` is); confirm which loader class is intended and
# import it before running this cell.
# The dataset passed in is a one-element list holding the loaded graph object.
train_loader = DataLoader([graph_data_train], batch_size=256)
# test_loader = DataLoader([graph_data_test], batch_size=256)


In [None]:
# Display the loader object created in the previous cell.
train_loader

In [8]:
import pandas as pd

In [9]:
# Load the training split of the transformed transactions (path relative to the working directory).
df_train = pd.read_csv("SAML/train_transactions_transformed.csv")

In [10]:
# Load the test split of the transformed transactions (path relative to the working directory).
df_test = pd.read_csv("SAML/test_transactions_transformed.csv")


In [11]:
# Stack the train and test splits into one frame with a fresh 0..n-1 row index
df = pd.concat(
    (df_train, df_test),
    ignore_index=True,
)

In [12]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type,Year,Month,Day,Week
0,3293686547,2857585278,-0.664485,10.0,10.0,16.0,16.0,0.0,0,20.0,2023,5,5,18
1,345409480,3629277366,-0.713173,10.0,10.0,16.0,16.0,6.0,0,20.0,2022,10,21,42
2,2207083075,8166004515,0.391648,10.0,10.0,16.0,16.0,6.0,0,14.0,2023,5,3,18
3,1715402599,1146874022,0.401941,10.0,10.0,16.0,16.0,6.0,0,13.0,2023,7,12,28
4,3059424812,3532465761,-0.458078,10.0,10.0,16.0,16.0,6.0,0,20.0,2023,2,25,8


In [13]:
# Shapes of the train split, the test split and their concatenation, one per line
print(df_train.shape, df_test.shape, df.shape, sep="\n")

(6653396, 14)
(2851456, 14)
(9504852, 14)


In [14]:
# Drop the references to the two splits now that they are combined in `df`.
del df_train,df_test

In [15]:
# Class balance of the combined data: number of rows per Is_laundering value.
df['Is_laundering'].value_counts()

Is_laundering
0    9494979
1       9873
Name: count, dtype: int64

In [16]:
# Rows flagged as laundering (Is_laundering equal to 1)
df_1 = df.loc[df['Is_laundering'].eq(1)]

In [17]:
# Rows not flagged as laundering (Is_laundering equal to 0)
df_0 = df.loc[df['Is_laundering'].eq(0)]

In [18]:
# Drop the reference to the combined frame; only df_1 and df_0 are used from here on.
del df

In [19]:
# Collect the distinct account ids that appear as sender or receiver in any
# laundering row (df_1); ravel flattens the two columns into one 1-D array
# before de-duplication.
accounts_to_filter = pd.unique(df_1[['Sender_account', 'Receiver_account']].values.ravel('K'))
# Show the ids and how many distinct accounts were found.
print(accounts_to_filter)
print(len(accounts_to_filter))


[5194755834 5793273539 4877416145 ... 5895391380 5684292815 8916362580]
7902


In [20]:
# Keep the df_0 rows whose sender or receiver is one of accounts_to_filter
filtered_df = df_0.loc[
    df_0['Sender_account'].isin(accounts_to_filter)
    | df_0['Receiver_account'].isin(accounts_to_filter)
]
print(filtered_df.shape)


(1476622, 14)


In [21]:
# Complement of the previous selection: df_0 rows where neither the sender nor
# the receiver is one of accounts_to_filter
not_filtered_df = df_0.loc[
    ~df_0['Sender_account'].isin(accounts_to_filter)
    & ~df_0['Receiver_account'].isin(accounts_to_filter)
]
not_filtered_df

Unnamed: 0,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type,Year,Month,Day,Week
1,345409480,3629277366,-0.713173,10.0,10.0,16.0,16.0,6.0,0,20.0,2022,10,21,42
2,2207083075,8166004515,0.391648,10.0,10.0,16.0,16.0,6.0,0,14.0,2023,5,3,18
4,3059424812,3532465761,-0.458078,10.0,10.0,16.0,16.0,6.0,0,20.0,2023,2,25,8
5,5764457799,3894530454,-0.421399,10.0,10.0,16.0,16.0,4.0,0,20.0,2022,12,10,49
6,18484669,2501673012,-0.618066,10.0,10.0,16.0,16.0,3.0,0,20.0,2023,6,26,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9504847,6964651170,9982278941,-0.708286,10.0,10.0,16.0,16.0,2.0,0,12.0,2023,1,21,3
9504848,2424585029,2163161736,0.382141,10.0,10.0,16.0,16.0,3.0,0,13.0,2023,8,11,32
9504849,742695398,5335064989,-0.077174,10.0,10.0,16.0,16.0,4.0,0,14.0,2022,10,24,43
9504850,2285174529,5262806905,-0.131770,10.0,10.0,16.0,16.0,0.0,0,14.0,2022,12,30,52


In [22]:
# Drop the reference to df_0; its rows now live in filtered_df / not_filtered_df.
del df_0

In [23]:
# Number of distinct accounts (as sender or receiver) occurring in filtered_df
len(pd.unique(filtered_df[['Sender_account', 'Receiver_account']].values.ravel('K')))

131239

In [28]:
# Sample-size arithmetic: 9873 laundering rows x 15 = 148095 non-laundering
# rows in total; 60% of them (88857) are drawn from not_filtered_df and the
# remaining 40% (59238) from filtered_df in the next cell.
# The factor must be 0.6 to match the recorded result 88857.0; a bare `0.`
# evaluates to 0.0.
9873 * 15 * 0.6

88857.0

In [29]:
# Draw 59,238 random rows from filtered_df (fixed random_state for repeatability).
# NOTE: the variable name still says 118476, which no longer matches n.
subset_118476 = filtered_df.sample(n=59238, random_state=1)

# Draw 88,857 random rows from not_filtered_df (fixed random_state for repeatability).
# NOTE: the variable name still says 177714, which no longer matches n.
subset_177714 = not_filtered_df.sample(n=88857, random_state=1)


In [30]:
# Drop the references to the two large pools; only the samples are needed now.
del filtered_df,not_filtered_df

In [31]:
# Stack both samples. ignore_index=True already renumbers the rows 0..n-1; the
# extra reset_index() then copies that numbering into a new 'index' column
# (hence 15 columns in the shape below), which a later cell drops again.
df = pd.concat([subset_118476, subset_177714],ignore_index=True).reset_index()
df.shape

(148095, 15)

In [32]:
# Drop the references to the two samples now that they are combined in `df`.
del subset_118476,subset_177714

In [33]:
# Preview the last rows of the sampled frame (still includes the 'index' column).
df.tail()

Unnamed: 0,index,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type,Year,Month,Day,Week
148090,148090,9362022609,2732978640,-0.305125,10.0,10.0,16.0,16.0,0.0,0,20.0,2023,5,28,21
148091,148091,6688839046,5619888985,-0.716508,10.0,10.0,16.0,16.0,4.0,0,20.0,2023,2,21,8
148092,148092,8354604070,6585837726,1.386891,10.0,10.0,16.0,16.0,6.0,0,13.0,2023,5,17,20
148093,148093,9398298924,420287614,-0.172666,10.0,10.0,16.0,16.0,0.0,0,14.0,2023,5,8,19
148094,148094,7282330957,552809214,0.437225,10.0,2.0,16.0,1.0,5.0,0,14.0,2022,12,6,49


In [34]:
# Remove the helper 'index' column that reset_index() introduced
df = df.drop(columns="index")

In [35]:
# Preview the first rows after dropping the 'index' column.
df.head()

Unnamed: 0,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type,Year,Month,Day,Week
0,8874183186,1171598148,0.487131,10.0,10.0,16.0,16.0,0.0,0,14.0,2022,12,23,51
1,5004625051,8160081398,-0.623787,10.0,10.0,16.0,16.0,4.0,0,20.0,2022,12,4,48
2,3724240056,4989715346,0.374361,10.0,10.0,16.0,16.0,0.0,0,14.0,2022,10,17,42
3,685439805,6094891583,2.477372,10.0,10.0,16.0,16.0,3.0,0,14.0,2022,12,6,49
4,4063617528,876717764,0.007196,2.0,10.0,3.0,16.0,5.0,0,14.0,2023,1,1,52


In [36]:
# Append all laundering rows (df_1) after the sampled non-laundering rows (df)
# and renumber the rows 0..n-1.
df_new = pd.concat([df, df_1],ignore_index=True)

In [37]:
# Preview the first rows of the final dataset.
df_new.head()

Unnamed: 0,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type,Year,Month,Day,Week
0,8874183186,1171598148,0.487131,10.0,10.0,16.0,16.0,0.0,0,14.0,2022,12,23,51
1,5004625051,8160081398,-0.623787,10.0,10.0,16.0,16.0,4.0,0,20.0,2022,12,4,48
2,3724240056,4989715346,0.374361,10.0,10.0,16.0,16.0,0.0,0,14.0,2022,10,17,42
3,685439805,6094891583,2.477372,10.0,10.0,16.0,16.0,3.0,0,14.0,2022,12,6,49
4,4063617528,876717764,0.007196,2.0,10.0,3.0,16.0,5.0,0,14.0,2023,1,1,52


In [38]:
# Write the final dataset without the row index. This is the same relative,
# Windows-style path that the first cell of the notebook reads from.
df_new.to_csv("Small_test_data\\New_data_2.csv", index=False)

In [39]:
# Class balance of the final dataset: number of rows per Is_laundering value.
df_new["Is_laundering"].value_counts()

Is_laundering
0    148095
1      9873
Name: count, dtype: int64