# Import Graph

In [1]:
import os
import numpy as np
import pandas as pd

# Set the environment variable before importing TensorFlow and TensorFlow GNN
os.environ['TF_USE_LEGACY_KERAS'] = '1'

import tensorflow as tf
import networkx as nx


#  Import networkX graph from file
G = nx.read_graphml('data/graph.graphml')

# Extract node indices
node_list = list(G.nodes)
node_indices = {node: idx for idx, node in enumerate(node_list)}

# Calculate edge weights
max_transactions = max([attrs["total_transactions"] for _, _, attrs in G.edges(data=True)])
min_transactions = min([attrs["total_transactions"] for _, _, attrs in G.edges(data=True)])

print("Max transactions: ", max_transactions)
print("Min transactions: ", min_transactions)

# Max min regulzarization
for u, v, attrs in G.edges(data=True):
    attrs["weight"] = (attrs["total_transactions"] - min_transactions) / (max_transactions - min_transactions)


# Extract edge indices using the node indices
edge_indices = np.array([(node_indices[u], node_indices[v]) for u, v, attrs in G.edges(data=True)])
edge_weights = np.array([attrs["weight"] for _, _, attrs in G.edges(data=True)])

labels = np.array([0 if attrs["fraud_proportion"] == 0 else 1 for _, _, attrs in G.edges(data=True)])

# Create a DataFrame with the edge indices
adjency_matrix_coo_df = pd.DataFrame([], columns=["source", "target", "weight", "label"])
for ix, edge in enumerate(edge_indices):
    adjency_matrix_coo_df.loc[len(adjency_matrix_coo_df)] = [edge[0], edge[1], edge_weights[ix], labels[ix]]


adjency_matrix_coo_df.info()

Max transactions:  181
Min transactions:  1
<class 'pandas.core.frame.DataFrame'>
Index: 8288 entries, 0 to 8287
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   source  8288 non-null   float64
 1   target  8288 non-null   float64
 2   weight  8288 non-null   float64
 3   label   8288 non-null   float64
dtypes: float64(4)
memory usage: 323.8 KB


# Convert Graph to GNN input

In [2]:
# How many edges?
print('Number of edges:', G.number_of_edges())

# How many nodes?
print('Number of nodes:', G.number_of_nodes())

Number of edges: 8288
Number of nodes: 7311


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer

country_vocab = set()
account_type_vocab = set()
customer_id_vocab = set()

for n, attrs in G.nodes(data=True):

    # Preprocess the customer ID
    attrs["customer_id"] = attrs.get("customer_id").replace("_", "")

    country_vocab.add(attrs.get("country", "unknown"))
    account_type_vocab.add(attrs.get("type", "unknown"))
    customer_id_vocab.add(attrs.get("customer_id"))

print(f"Country vocab size: {len(country_vocab)}")
print(f"Type vocab size: {len(account_type_vocab)}")
print(f"Customer ID vocab size: {len(customer_id_vocab)}")


# Define a TensorFlow encoder
class NodeAttributeEncoder(tf.keras.layers.Layer):
    def __init__(
        self, country_vocab_size, type_vocab_size, customer_id_vocab_size, output_dim
    ):
        super(NodeAttributeEncoder, self).__init__()
        self.country_embedding = tf.keras.layers.Embedding(
            country_vocab_size, output_dim
        )
        self.account_type_embedding = tf.keras.layers.Embedding(
            type_vocab_size, output_dim
        )
        self.customer_id_normalizer = tf.keras.layers.Embedding(
            customer_id_vocab_size, output_dim
        )

        # Initialize tokenizers
        self.country_tokenizer = Tokenizer(
            num_words=country_vocab_size, oov_token="<OOV>", split=" "
        )
        self.type_tokenizer = Tokenizer(num_words=type_vocab_size, oov_token="<OOV>", split=" ")
        self.customer_id_tokenizer = Tokenizer(
            num_words=customer_id_vocab_size, oov_token="<OOV>", split=" "
        )

    def fit_tokenizers(self, country_texts, type_texts, customer_id_texts):
        # Fit tokenizers on the respective input data
        self.country_tokenizer.fit_on_texts(country_texts)
        self.type_tokenizer.fit_on_texts(type_texts)
        self.customer_id_tokenizer.fit_on_texts(customer_id_texts)

    def call(self, inputs):
        country, customer_id, account_type_ = inputs

        # Convert text to sequences of integers
        country_seq = self.country_tokenizer.texts_to_sequences([country])
        type_seq = self.type_tokenizer.texts_to_sequences([account_type_])
        customer_id_seq = self.customer_id_tokenizer.texts_to_sequences([customer_id])

        # Convert sequences to tensors
        country_seq = tf.constant(country_seq)
        type_seq = tf.constant(type_seq)
        customer_id_seq = tf.constant(customer_id_seq)

        # Pad sequences to ensure uniform length
        country_seq = tf.keras.preprocessing.sequence.pad_sequences(
            country_seq, padding="post"
        )
        type_seq = tf.keras.preprocessing.sequence.pad_sequences(
            type_seq, padding="post"
        )
        customer_id_seq = tf.keras.preprocessing.sequence.pad_sequences(
            customer_id_seq, padding="post"
        )

        # Embed the sequences
        country_encoded = self.country_embedding(country_seq)
        type_encoded = self.account_type_embedding(type_seq)
        customer_id_encoded = self.customer_id_normalizer(customer_id_seq)
        return tf.concat([country_encoded, customer_id_encoded, type_encoded], axis=-1)


node_encoder = NodeAttributeEncoder(
    len(country_vocab) + 1,
    len(account_type_vocab) + 1,
    len(customer_id_vocab) + 1,
    output_dim=1,
)
node_encoder.fit_tokenizers(
    list(country_vocab), list(account_type_vocab), list(customer_id_vocab)
)

# Encode node attributes to feature vectors
node_features = tf.TensorArray(tf.float32, size=G.number_of_nodes())
for index, node_obj in enumerate(G.nodes(data=True)):
    node, attrs = node_obj
    encoded_features = node_encoder.call(
        (
            attrs.get("country", "unknown"),
            attrs.get("customer_id", "unknown"),
            attrs.get("type", "unknown"),
        )
    )
    node_features = node_features.write(index, tf.squeeze(encoded_features, axis=0))

# Convert TensorArray to Tensor
node_features_tf = node_features.stack()
node_features_tf = tf.reshape(node_features_tf, (G.number_of_nodes(), 3))
print(node_features_tf.shape)

Country vocab size: 1
Type vocab size: 1
Customer ID vocab size: 7311
(7311, 3)


In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Normalization
from sklearn.preprocessing import StandardScaler

# Sample data
total_amount_vec = [attrs.get("total_amount") for a, b, attrs in G.edges(data=True)]

# Convert to NumPy array and reshape
total_amount_vec = np.array(total_amount_vec).reshape(-1, 1)

# Check for NaN or infinite values
if np.any(np.isnan(total_amount_vec)) or np.any(np.isinf(total_amount_vec)):
    raise ValueError("Data contains NaN or infinite values")

# Alternative normalization using StandardScaler
scaler = StandardScaler()

scaler.fit(total_amount_vec)
total_amount_normalized = scaler.transform(total_amount_vec)


total_transactions_vec = np.array(
    [attrs.get("total_transactions") for a, b, attrs in G.edges(data=True)]
).reshape(-1, 1)
scaler.fit(total_transactions_vec)
total_transactions_normalized = scaler.transform(total_transactions_vec)

fraud_proportion_vec = np.array(
    [attrs.get("fraud_proportion") for a, b, attrs in G.edges(data=True)]
).reshape(-1, 1)
scaler.fit(fraud_proportion_vec)
fraud_proportion_normalized = scaler.transform(fraud_proportion_vec)


edge_features = tf.concat(
    [
        total_amount_normalized,
        # min_amount_normalized,
        # max_amount_normalized,
        # mean_amount_normalized,
        total_transactions_normalized,
        fraud_proportion_normalized,
    ],
    axis=-1,
)

print(edge_features.shape)

(8288, 3)


# Create GNN 

In [5]:
# Create a GraphTensor
from spektral.data.graph import Graph
from spektral.data import SingleLoader, Dataset
from scipy.sparse import coo_matrix

num_nodes = len(G.nodes)

source_nodes = adjency_matrix_coo_df["source"].apply(lambda x: int(x)).values
target_nodes = adjency_matrix_coo_df["target"].apply(lambda x: int(x)).values
weights = adjency_matrix_coo_df["weight"].values

a = coo_matrix(
    (weights, (source_nodes, target_nodes)),
    shape=(num_nodes, num_nodes),
)

x = node_features_tf.numpy()  # node features
e = edge_features.numpy()  # edge features
y = adjency_matrix_coo_df["label"].values  # edge labels

print("Node features shape: ", x.shape)
print("Edge features shape: ", e.shape)
print("Edge labels shape: ", y.shape)


graph = Graph(x=x, a=a, e=e, y=y)

# Create a custom dataset class
class SingleGraphDataset(Dataset):
    def __init__(self, graph, **kwargs):
        self.graph = graph
        super().__init__(**kwargs)

    def read(self):
        return [self.graph]

dataset = SingleGraphDataset(graph)
loader = SingleLoader(dataset=dataset)

Node features shape:  (7311, 3)
Edge features shape:  (8288, 3)
Edge labels shape:  (8288,)


In [None]:
from spektral.layers import EdgeConv
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Define the GCN model
class EdgeGCN(Model):
    def __init__(self):
        super().__init__()
        self.conv1 = EdgeConv(32, activation='relu')
        self.conv2 = EdgeConv(16, activation='relu')
        self.dense = Dense(1, activation='sigmoid')

    def call(self, inputs):
        x, a, e = inputs
        x = self.conv1([x, a, e])
        x = self.conv2([x, a, e])
        # Aggregate edge features
        edge_indices = tf.stack([a.indices[:, 0], a.indices[:, 1]], axis=1)
        edge_features = tf.gather(x, edge_indices[:, 0]) + tf.gather(x, edge_indices[:, 1])
        return self.dense(edge_features)

# Create the model
model = EdgeGCN()

tf.debugging.set_log_device_placement(True)


# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy')

# Train the model
model.fit(loader.load(), epochs=1, steps_per_epoch=loader.steps_per_epoch)

# Predict the fraud proportion
y_pred = model.predict(loader.load(), steps=loader.steps_per_epoch)


(<tf.Tensor 'IteratorGetNext:0' shape=(7311, 3) dtype=float32>, SparseTensor(indices=Tensor("DeserializeSparse:0", shape=(None, 2), dtype=int64), values=Tensor("edge_gcn/Cast:0", shape=(None,), dtype=float32), dense_shape=Tensor("shape:0", shape=(2,), dtype=int64)), <tf.Tensor 'edge_gcn/Cast_1:0' shape=(8288, 3) dtype=float32>)
(<tf.Tensor 'IteratorGetNext:0' shape=(7311, 3) dtype=float32>, SparseTensor(indices=Tensor("DeserializeSparse:0", shape=(None, 2), dtype=int64), values=Tensor("edge_gcn/Cast:0", shape=(None,), dtype=float32), dense_shape=Tensor("shape:0", shape=(2,), dtype=int64)), <tf.Tensor 'edge_gcn/Cast_1:0' shape=(8288, 3) dtype=float32>)


2024-11-12 16:19:39.898671: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: INVALID_ARGUMENT: Incompatible shapes: [7311,1] vs. [8288,1]
	 [[{{node gradient_tape/binary_crossentropy/logistic_loss/mul/BroadcastGradientArgs}}]]


In [None]:
from matplotlib import pyplot as plt

print(y_pred.shape)
print(y.shape)

# Merge original fraud proportions with predicted fraud proportions
y_pred = np.squeeze(y_pred)
y = np.squeeze(y)

for original, predicted in sorted(zip(y, y_pred), key=lambda x: x[0], reverse=True):
    print(f"Original: {original} - Predicted: {predicted}")

# Plot the results
plt.scatter(y, y_pred)
plt.xlabel('True fraud proportion')
plt.ylabel('Predicted fraud proportion')
plt.show()


NameError: name 'y_pred' is not defined