In [None]:
import pandas as pd
import torch
import torch.nn as nn
import networkx as nx
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GATv2Conv
from sklearn.preprocessing import MinMaxScaler
import numpy as np

## Defining a Model

In [None]:
class EdgeAwareGAE(nn.Module):
    """Graph auto-encoder that reconstructs edge attributes from node embeddings.

    Edge attributes are projected to ``hidden_dim`` by a linear layer and passed
    as edge features to three stacked ``GATv2Conv`` layers (the encoder).  The
    decoder concatenates the embeddings of each edge's two endpoints and maps
    the result back to ``edge_dim`` values with a linear layer.

    Args:
        input_dim: width of the node feature matrix ``data.x``.
        edge_dim: width of the edge attribute matrix ``data.edge_attr``.
        hidden_dim: width of the hidden layers and of the projected edge features.
        embedding_dim: width of the node embeddings returned by ``encoder``.
    """

    def __init__(self, input_dim, edge_dim, hidden_dim, embedding_dim):
        super().__init__()
        # Settings shared by every attention layer; edge features reach the
        # layers already projected to hidden_dim by `edge_transform`.
        conv_kwargs = dict(edge_dim=hidden_dim, heads=1, add_self_loops=False)

        self.edge_transform = nn.Linear(edge_dim, hidden_dim)
        self.encoder1 = GATv2Conv(input_dim, hidden_dim, **conv_kwargs)
        self.encoder2 = GATv2Conv(hidden_dim, hidden_dim, **conv_kwargs)
        self.embedding_layer = GATv2Conv(hidden_dim, embedding_dim, **conv_kwargs)
        # Decoder input is [source embedding | destination embedding].
        self.lin1 = nn.Linear(2 * embedding_dim, edge_dim)

    def forward(self, data):
        """Encode ``data`` and return the reconstructed attributes of its edges."""
        return self.decoder(self.encoder(data), data.edge_index)

    def encoder(self, data):
        """Return the node embeddings produced by the three attention layers.

        ``data`` must expose ``x``, ``edge_index`` and ``edge_attr``.
        """
        edge_index = data.edge_index
        edge_feats = self.edge_transform(data.edge_attr)

        hidden = data.x
        # The two hidden layers are followed by ReLU; the embedding layer is not.
        for conv in (self.encoder1, self.encoder2):
            hidden = F.relu(conv(hidden, edge_index, edge_feats))
        return self.embedding_layer(hidden, edge_index, edge_feats)

    def decoder(self, z, edge_index):
        """Predict edge attributes from the embeddings of each edge's endpoints.

        Row 0 of ``edge_index`` holds source node indices, row 1 destination
        node indices; both index into the rows of ``z``.
        """
        endpoint_pairs = torch.cat((z[edge_index[0]], z[edge_index[1]]), dim=1)
        return self.lin1(endpoint_pairs)

## Loading the Data

In [None]:
# Load the full transaction table from the working directory. Later cells read its
# Month, Day, Hour, amount, From_Account_id and To_Account_id columns.
data_orignal = pd.read_csv("all_data.csv")

## Preprocessing data

In [None]:
# Build one PyG `Data` graph per month: accounts are nodes, transactions are directed edges.
all_data = []
# The range starts one past the smallest Month value and stops before the largest,
# so the first and last months present in the file are not converted.
for i in range(data_orignal["Month"].min()+1,data_orignal["Month"].max()):
    # Work on an independent copy so the columns added below are not written
    # onto a slice of `data_orignal` (avoids pandas' SettingWithCopyWarning).
    data = data_orignal[data_orignal["Month"]==i].copy()

    # Map every account id seen this month (as sender or receiver) to a 0-based node index.
    all_ids = pd.concat([data['From_Account_id'], data['To_Account_id']]).unique()
    node_map = {account_id: idx for idx, account_id in enumerate(all_ids)}
    data['source_node'] = data['From_Account_id'].map(node_map)
    data['target_node'] = data['To_Account_id'].map(node_map)

    # Edge features: log-compressed amount plus a cyclical (sin/cos) encoding of
    # the transaction's hour within the month.
    data['normalized_amount'] = np.log1p(data['amount'])
    # assumes Day is 1-based and Hour is 0-23 — TODO confirm against the CSV
    hour_of_month = (data["Day"] - 1) * 24 + data["Hour"]
    # One full cycle spans the month, measured in hours. The divisor must be
    # parenthesised: `h / days * 24` evaluates as `(h / days) * 24`, which gave
    # the encoding the wrong period. Checkpoints trained on the old encoding
    # should be retrained.
    hours_in_month = data["Day"].max() * 24
    month_angle = 2 * np.pi * hour_of_month / hours_in_month
    data['sin_hour_month'] = np.sin(month_angle)
    data['cos_hour_month'] = np.cos(month_angle)

    edge_index = torch.tensor(data[['source_node', 'target_node']].values.T, dtype=torch.long)
    edge_attr = torch.tensor(data[['normalized_amount',"sin_hour_month","cos_hour_month"]].values, dtype=torch.float)

    # Per-node features come from a separate per-month file.
    # NOTE(review): "Unnamed: 0" presumably holds the account id (a saved index column),
    # and the file is assumed to list exactly the accounts in `all_ids`, so that after
    # sorting row k describes node k — confirm; nothing here enforces it.
    node_feature = pd.read_csv(f"month-{i}-nodes.csv")
    node_feature["Unnamed: 0"] = node_feature["Unnamed: 0"].map(node_map)
    node_feature = node_feature.sort_values(by='Unnamed: 0', ascending=True)
    node_feature["Outgoing_amount"] = np.log1p(node_feature["Outgoing_Amount"])
    node_feature["Incoming_amount"] = np.log1p(node_feature["Incoming_Amount"])
    # Degree centralities are rescaled to [0, 1] independently for each month.
    in_scaler = MinMaxScaler()
    out_scaler = MinMaxScaler()
    node_feature['in_degree_centrality'] = in_scaler.fit_transform(node_feature[['in_degree_centrality']])
    node_feature['out_degree_centrality'] = out_scaler.fit_transform(node_feature[['out_degree_centrality']])
    x = torch.tensor(node_feature[["Outgoing_amount","Incoming_amount","in_degree_centrality","out_degree_centrality"]].values,dtype=torch.float)

    all_data.append(Data(x=x, edge_index=edge_index, edge_attr=edge_attr, num_nodes=len(all_ids)))
    print(i)

## Initialize the model and optimizer

In [None]:
# Hyper-parameters and target device.
embedding_dim = 64
hidden_dim = 128
epochs=20
device = torch.device('cpu')

# Feature widths are read off the first monthly graph.
num_edge_features = all_data[0].edge_attr.size(1) 
num_node_featuress = all_data[0].num_node_features

# The training loop moves every monthly graph to `device`, so the model must live
# there too; move it before the optimizer captures its parameters.
model = EdgeAwareGAE(num_node_featuress,num_edge_features,hidden_dim,embedding_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

## Start Training

In [None]:
# Full-graph training: one optimisation step per monthly graph, `epochs` passes in total.
for epoch in range(epochs):
    print("Epochs: ", epoch)
    for month_no, monthly_data in enumerate(all_data, start=1):
        model.train()
        monthly_data = monthly_data.to(device)

        # Gradients accumulate across backward() calls, so clear them before each step.
        optimizer.zero_grad()

        # Reconstruct every edge's attributes and score them against the originals.
        reconstructed = model(monthly_data)
        loss = F.mse_loss(reconstructed, monthly_data.edge_attr)

        loss.backward()
        optimizer.step()

        print(f"Month {month_no}, Loss: {loss.item()}")

    # Overwrite the single checkpoint file at the end of every epoch.
    torch.save(model.state_dict(), "gae_model.pt")

## Loading Trained Weights

In [None]:
# Restore trained weights into `model`, mapping all tensors onto the CPU.
# NOTE(review): the training cell writes "gae_model.pt", but this cell reads
# 'gae_model_14.pt', which nothing in this notebook creates — on a fresh
# Restart & Run All it must already exist on disk. Confirm which checkpoint is intended.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(torch.load('gae_model_14.pt', map_location=torch.device('cpu')))

## Extracting and Saving Node Embeddings

In [None]:
# Encode the first graph built by the preprocessing loop.
# NOTE(review): the original author treats all_data[0] as month 7, and later cells
# filter on Month == 7 explicitly — confirm that the first converted month really is 7.
data = all_data[0] # for only 7 month data
# Inference only: switch to eval mode and skip gradient tracking.
model.eval()
with torch.no_grad():
    embeddings = model.encoder(data)

In [None]:
# Convert the embedding tensor to a NumPy array on the CPU and persist it to disk.
embeddings_np = embeddings.cpu().numpy()
np.save('7-month-node-embeddings.npy', embeddings_np)

## Performing Anomaly Detection on node embeddings

In [None]:
# Reload the saved embeddings from disk.
# NOTE(review): this rebinds `embeddings` from a torch tensor to a NumPy array, so
# re-running the save cell afterwards fails (`ndarray` has no `.cpu()`).
embeddings = np.load('7-month-node-embeddings.npy')

In [None]:
from sklearn.ensemble import IsolationForest
# Fit an isolation forest on the node embeddings and label every node in one pass.
# contamination=0.01 sets the expected share of outliers to 1%; random_state pins the result.
clf = IsolationForest(contamination=0.01, random_state=42)  # Adjust contamination rate
anomalies = clf.fit_predict(embeddings)  # -1 for anomalies, 1 for normal

In [None]:
# Number of labels returned by the isolation forest (one per embedding row).
len(anomalies)

In [None]:
# Positions (node indices) of the embeddings the isolation forest labelled -1.
suspicious_indices = np.flatnonzero(anomalies == -1)

In [None]:
len(suspicious_indices) # Number of nodes the isolation forest labelled -1 (suspicious)

In [None]:
# Transactions of month 7 only. Taken as an independent copy because a later cell adds
# node-index columns to this frame; without .copy() that would be a write onto a slice
# of `data_orignal` (pandas SettingWithCopyWarning).
month_7 = data_orignal[data_orignal["Month"]==7].copy()

In [None]:
# Rebuild the account-id -> node-index mapping the same way the preprocessing loop does
# (senders first, then receivers, in order of first appearance), so the indices line up
# with the rows of the month-7 embeddings.
all_ids = pd.concat([month_7['From_Account_id'], month_7['To_Account_id']]).unique()
node_map = {account_id: idx for idx, account_id in enumerate(all_ids)}
# .assign returns a new frame instead of writing columns onto a filtered slice,
# which avoids SettingWithCopyWarning and keeps this cell safe to re-run.
month_7 = month_7.assign(
    source_node=month_7['From_Account_id'].map(node_map),
    target_node=month_7['To_Account_id'].map(node_map),
)

## Getting suspicious transactions

In [None]:
# Keep only transactions whose sender AND receiver were both flagged as suspicious.
sender_flagged = month_7['source_node'].isin(suspicious_indices)
receiver_flagged = month_7['target_node'].isin(suspicious_indices)
suspicious_transactions = month_7.loc[sender_flagged & receiver_flagged]

In [None]:
# Display the transactions between flagged accounts.
suspicious_transactions

## Creating a graph object

In [None]:
# Directed graph with one edge per (source_node, target_node) pair found in the
# suspicious transactions; repeated pairs collapse into a single edge.
# Built straight from the two columns instead of iterating rows with iterrows(),
# which is slow and can upcast integer node ids when a row is turned into a Series.
G = nx.from_pandas_edgelist(
    suspicious_transactions,
    source='source_node',
    target='target_node',
    create_using=nx.DiGraph,
)

In [None]:
# Independent subgraph copies of every weakly connected component with at least 4 nodes.
connected_components = [
    G.subgraph(member_nodes).copy()
    for member_nodes in nx.weakly_connected_components(G)
    if len(member_nodes) >= 4
]

In [None]:
# Number of weakly connected components that passed the size filter.
len(connected_components)

In [None]:
import matplotlib.pyplot as plt

# Draw the first retained component. `G[0]` is the adjacency view of node 0 (or a
# KeyError if node 0 is absent), not a graph, so it cannot be passed to nx.draw.
if connected_components:
    nx.draw(connected_components[0], with_labels=True)
    plt.show()
else:
    print("No weakly connected component with more than 3 nodes to draw.")