In [1]:
import networkx as nx
from tqdm import tqdm
from collections import Counter
import torch
import torch.nn as nn
import torch.nn.functional as F
from func import data_loading, data_loading_to_model

In [2]:
train, test = data_loading(frac=.1)
train.head()

(2438690, 13)


Year,Month,Day,Amount,Use Chip,Merchant Name,Merchant City,MCC,Errors?,Is Fraud?,card_id,hour,minute
i64,i64,i64,f32,i64,i64,i64,i64,i64,i64,str,i8,i8
2003,6,14,0.7,2,6135208568923449408,1086,9402,20,0,"""188_3""",10,51
2009,9,12,25.4,2,-4693979874497918566,8363,7538,20,0,"""801_3""",6,38
2006,5,22,-436.0,2,4552887027432897467,7176,3596,20,0,"""959_4""",13,41
2013,5,20,3.28,2,-4334232547381218591,5359,5541,20,0,"""1860_2""",9,19
2006,3,21,22.549999,2,-8506198726270667357,6402,5813,20,0,"""1018_0""",19,35


In [3]:
train['Is Fraud?'].value_counts()

Is Fraud?,count
i64,u32
0,1631929
1,1993


In [4]:
G = nx.MultiGraph()
G.add_nodes_from(train["card_id"].unique(), type='card_id')
G.add_nodes_from(train["Merchant Name"].unique(), type='merchant_name')

In [5]:
for row in tqdm(train.to_dicts()):
    year = row['Year'],
    month = row['Month'],
    day = row['Day'],
    hour = row['hour'],
    minute =row['minute'],  
    amount = row['Amount'],
    use_chip =  row['Use Chip'],
    merchant_city = row['Merchant City'],
    errors =  row['Errors?'],
    mcc = row['MCC']

    G.add_edge(
        row['card_id'],
        row['Merchant Name'],
        year=year,
        month=month,
        day=day,
        hour=hour, 
        minute=minute, 
        amount=amount, 
        use_chip=use_chip,
        merchant_city=merchant_city, 
        errors=errors, 
        mcc=mcc
    )

100%|██████████| 1633922/1633922 [00:06<00:00, 262112.00it/s]


In [6]:
# Get the number of nodes and edges in the graph
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()

# Print the number of nodes and edges
print("Number of nodes:", num_nodes)
print("Number of edges:", num_edges)

Number of nodes: 46501
Number of edges: 1633922


In [7]:
adj_matrix = nx.adjacency_matrix(G).todense()
adj_matrix.shape

(46501, 46501)

In [8]:
sample_nodes = list(G.nodes())[:10]

# Retrieve the properties of the sample nodes
node_properties = nx.get_node_attributes(G, 'type')
for node in sample_nodes:
    print(f"Node: {node}, Properties: {node_properties[node]}")

Node: 1826_0, Properties: card_id
Node: 1995_1, Properties: card_id
Node: 1609_1, Properties: card_id
Node: 885_0, Properties: card_id
Node: 352_1, Properties: card_id
Node: 331_4, Properties: card_id
Node: 627_1, Properties: card_id
Node: 1330_5, Properties: card_id
Node: 407_2, Properties: card_id
Node: 977_0, Properties: card_id


In [9]:
sample_size = 5
for i, edge in enumerate(G.edges()):
    print(G.get_edge_data(*edge))
    if i >= sample_size - 1:
        break

{0: {'year': (2020,), 'month': (1,), 'day': (21,), 'hour': (7,), 'minute': (36,), 'amount': (-56.0,), 'use_chip': (2,), 'merchant_city': (3330,), 'errors': (20,), 'mcc': 5541}, 1: {'year': (2020,), 'month': (1,), 'day': (21,), 'hour': (7,), 'minute': (45,), 'amount': (56.0,), 'use_chip': (2,), 'merchant_city': (3330,), 'errors': (20,), 'mcc': 5541}}
{0: {'year': (2020,), 'month': (1,), 'day': (21,), 'hour': (7,), 'minute': (36,), 'amount': (-56.0,), 'use_chip': (2,), 'merchant_city': (3330,), 'errors': (20,), 'mcc': 5541}, 1: {'year': (2020,), 'month': (1,), 'day': (21,), 'hour': (7,), 'minute': (45,), 'amount': (56.0,), 'use_chip': (2,), 'merchant_city': (3330,), 'errors': (20,), 'mcc': 5541}}
{0: {'year': (2020,), 'month': (1,), 'day': (27,), 'hour': (23,), 'minute': (46,), 'amount': (21.459999084472656,), 'use_chip': (2,), 'merchant_city': (3330,), 'errors': (20,), 'mcc': 5411}}
{0: {'year': (2017,), 'month': (4,), 'day': (29,), 'hour': (14,), 'minute': (59,), 'amount': (1.929999947

In [10]:
# Retrieve the properties errors of all the edges
edge_properties = nx.get_edge_attributes(G, 'errors')

# Count the number of edges by property value
edge_count_by_property = Counter(edge_properties.values())
for property_value, count in edge_count_by_property.items():
    print(f"Property value: {property_value}, Count: {count}")

Property value: (20,), Count: 1608026
Property value: (21,), Count: 3174
Property value: (18,), Count: 16146
Property value: (13,), Count: 4002
Property value: (3,), Count: 845
Property value: (9,), Count: 707
Property value: (0,), Count: 747
Property value: (16,), Count: 149
Property value: (14,), Count: 38
Property value: (7,), Count: 11
Property value: (1,), Count: 9
Property value: (5,), Count: 5
Property value: (17,), Count: 2
Property value: (19,), Count: 37
Property value: (10,), Count: 7
Property value: (2,), Count: 1
Property value: (4,), Count: 6
Property value: (15,), Count: 8
Property value: (8,), Count: 1
Property value: (11,), Count: 1


In [11]:
x_train, y_train = data_loading_to_model(G, train)
x_test, y_test = data_loading_to_model(G, test)

100%|██████████| 1633922/1633922 [00:03<00:00, 485885.66it/s]
100%|██████████| 1633922/1633922 [00:03<00:00, 491076.64it/s]


In [13]:
# Define the model
class FraudGNN(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(FraudGNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)
        
    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x.squeeze(-1)
    
    
input_dim = len(x_train[0])
hidden_dim = 16
model = FraudGNN(input_dim, hidden_dim)
num_epochs=201

# Define the loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [14]:
# Train the model
for i in range(num_epochs):
    output = model(x_train)
    loss = criterion(output, y_train)
    if i % 20 == 0:
        print(f'Epoch: {i}, Loss: {loss.item()}')
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Epoch: 0, Loss: 1.7549034357070923
Epoch: 20, Loss: 0.35467100143432617
Epoch: 40, Loss: 0.30478575825691223
Epoch: 60, Loss: 1.2197191715240479
Epoch: 80, Loss: 1.512757658958435
Epoch: 100, Loss: 1.1619799137115479
Epoch: 120, Loss: 0.7744260430335999
Epoch: 140, Loss: 0.42933258414268494
Epoch: 160, Loss: 0.24903957545757294
Epoch: 180, Loss: 0.18386967480182648
Epoch: 200, Loss: 0.15792910754680634


In [15]:
model(x_test)

tensor([-125.7524, -123.1959, -119.8451,  ..., -123.4304, -122.9815,
        -228.9663], grad_fn=<SqueezeBackward1>)

In [16]:
with torch.no_grad():  # No need to track gradients for inference
    output = model(x_test)

# 4. Interpret the output
# This depends on your task (node classification, graph classification, etc.)
predictions = output.argmax(dim=-1)

In [18]:
output

tensor([-125.7524, -123.1959, -119.8451,  ..., -123.4304, -122.9815,
        -228.9663])