# ✨ SHINE DEMO 

------

In [1]:
import copy
import os
from tqdm import tqdm
import numpy as np
import uuid
from sklearn.metrics import average_precision_score, roc_auc_score

# Torch
import torch
import torch.nn.functional as F
from torch.utils.data import WeightedRandomSampler

# Torch Geometric
from torch_geometric.transforms import NormalizeFeatures
from torch_geometric.nn import SAGEConv, to_hetero, Linear
import torch_geometric.transforms as T

# FUCC metrics
from fucc.metrics import log_performance

In [2]:
import sys
sys.path.append("./code")

from utils import EarlyStopping, HeteroSubgraph
from fraud_dataloader import HeteroFraudSubset
from custom_neighbor_loader import NeighborLoader
from models import HeteroGraphSAGE

In [3]:
subset = 5

In [4]:
dataset = HeteroFraudSubset(root='./data', subset=subset)

Processing...


df_train_shape (1275446, 61)
df_val_shape (141717, 61)
df_test_shape (282415, 61)
df_train_shape (1275446, 61)
df_val_shape (141717, 61)
df_test_shape (282415, 61)
(1699578, 61)
False    1697010
True        2568
Name: Is Fraud?, dtype: int64
1699578


Done!


In [5]:
data = dataset[0]

In [6]:
data

HeteroData(
  [1mtransaction[0m={
    x=[1699578, 31],
    y=[1699578],
    train_mask=[1699578],
    val_mask=[1699578],
    test_mask=[1699578]
  },
  [1mcardholder[0m={ x=[1531, 1] },
  [1mmerchant[0m={ x=[34804, 1] },
  [1m(cardholder, pays, transaction)[0m={ edge_index=[2, 1699578] },
  [1m(merchant, receives, transaction)[0m={ edge_index=[2, 1699578] }
)

In [7]:
#Environment
# Number of worker processes handed to every data loader through `kwargs`.
num_workers = 10


# other
# Window lengths written as '<days>d'; the numeric part is read with int(...[:-1]).
train_size = '300d'
test_size = '60d'

#Model
# One fan-out entry per layer: the model is built with num_layers=len(num_neighbors).
num_neighbors = [2,32] 
num_hidden_channels = 64
sage_aggr = 'add'
hetero_aggr = 'sum'
# NOTE(review): `concat` is not referenced by any other cell visible in this notebook.
concat = False
project = False
dropout = False
normalize = False


# weighted neighbor sampling 
# `weighted` is passed to HeteroSubgraph; `weight_func`, `skip` and `exp` are passed to NeighborLoader.
weighted = False
weight_func = None #'sub' #('add', 'mul', 'sub')
skip = False
exp = False

#Learning
learning_rate = 0.00001 # worldline: 0.00001 ibm_ai: 0.0001
batch_size = 500
# Upper bound on the number of epochs; `patience` and `delta` configure EarlyStopping.
max_epochs = 2
patience = 5
delta = 0

In [8]:
# Keyword arguments shared by every NeighborLoader created in this notebook.
# pin_memory=True is intentionally left out: only set it if the dataset does
# not fit in GPU memory entirely.
kwargs = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    persistent_workers=True,
)

# Numeric part of the test window (e.g. 60 for '60d') scaled by 150.
number_of_positives = 150 * int(test_size[:-1])

# One GNN layer per sampling hop.
num_layers = len(num_neighbors)

In [9]:
#data_path = os.path.join(base_path ,  "data/fraudsage")
#output_path_model = (os.path.join(data_path, "models", str(dataset_name), str(setting), str(train_size) + '_' + str(test_size)))
#root = os.path.join(data_path, "subsets", str(dataset_name), str(train_size) + '_' + str(test_size) )
# we add a random string to the model filename to differentiate between multiple models of runs running concurrently.
#rnd_str = str(uuid.uuid4())
#filename_model = '_'.join(['subset', str(subset), 'run_id',  str(run_id), rnd_str])

#try:
#    os.makedirs(output_path_model)
#except:
#    pass

In [10]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Data Preprocessing

In [11]:
# Add a reverse relation for each edge type so information can also flow from
# transactions back to cardholders and merchants (shows up as rev_pays / rev_receives
# in the graph summary printed by the next cell).
# NOTE(review): this rebinds `data`; re-running the cell on the already converted graph
# presumably adds further reverse relations — re-run from the dataset cell instead.
data = T.ToUndirected()(data)

In [12]:
# if data fits on GPU, move it!
data.to(device)

HeteroData(
  [1mtransaction[0m={
    x=[1699578, 31],
    y=[1699578],
    train_mask=[1699578],
    val_mask=[1699578],
    test_mask=[1699578]
  },
  [1mcardholder[0m={ x=[1531, 1] },
  [1mmerchant[0m={ x=[34804, 1] },
  [1m(cardholder, pays, transaction)[0m={ edge_index=[2, 1699578] },
  [1m(merchant, receives, transaction)[0m={ edge_index=[2, 1699578] },
  [1m(transaction, rev_pays, cardholder)[0m={ edge_index=[2, 1699578] },
  [1m(transaction, rev_receives, merchant)[0m={ edge_index=[2, 1699578] }
)

In [13]:
data['transaction'].x

tensor([[0.0785, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
        [0.0722, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
        [0.0766, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0719, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0731, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
        [0.0779, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000]])

In [14]:
# Training set statistics: number of training transactions and the sum of their labels
# (i.e. the number of positive / fraudulent training transactions).
transaction_data = data['transaction']
print(transaction_data.y[transaction_data.train_mask].size())
print(transaction_data.y[transaction_data.train_mask].sum())

torch.Size([1275446])
tensor(1892)


In [15]:
# Test set statistics 
print(transaction_data.y[transaction_data.test_mask].size())
print(transaction_data.y[transaction_data.test_mask].sum())

torch.Size([282415])
tensor(447)


## Inductive preprocessing

There are two important *inductive* preprocessing steps:

1. **the train/val/test split** has to be inductive. In other words, the validation and test nodes should be separated from the training nodes. During training, the validation and test nodes cannot be observed! 

2. **time gap**: when the model is trained, we only know the true label for the first portion of our transaction nodes. The most recent training transactions are too recent to have a fixed label (they are still under investigation). Hence, we *mask* the labels of all transactions in the x most recent training days. They can still be observed by the algorithm and used in the neighborhood exploration, but their labels are unknown!

**Note**: In the *transductive setting*, all nodes are present in the network. Training is limited to the nodes designated as training nodes; nevertheless, during neighborhood exploration, these training nodes can use the validation and test nodes!

### Train/Val/Test split 

Create a train/val/test graph

We only sample the transactions! 
Train data contains train transactions
Val data contains train and val transactions
Test data contains train, val and test transactions

In [16]:
# Cardholder and merchant nodes are kept in every split, so their masks select all nodes.
# torch.ones(..., dtype=torch.bool) allocates the mask directly on `device` instead of
# materialising a Python list and a float tensor first.
cardholder_mask = torch.ones(data['cardholder'].x.size(0), dtype=torch.bool, device=device)
merchant_mask = torch.ones(data['merchant'].x.size(0), dtype=torch.bool, device=device)

# Transaction masks are cumulative: validation also sees the training transactions and
# test sees training + validation. `|` spells out the logical OR of the boolean masks.
node_mask_dict_train = {
    'transaction': data['transaction'].train_mask,
    'cardholder': cardholder_mask,
    'merchant': merchant_mask,
}
node_mask_dict_val = {
    'transaction': data['transaction'].train_mask | data['transaction'].val_mask,
    'cardholder': cardholder_mask,
    'merchant': merchant_mask,
}
node_mask_dict_test = {
    'transaction': (data['transaction'].train_mask
                    | data['transaction'].val_mask
                    | data['transaction'].test_mask),
    'cardholder': cardholder_mask,
    'merchant': merchant_mask,
}

The Subgraph function filters the edges to only include those that link nodes which are in the node_mask_dict. 
E.g. for train_data -> node_mask_dict_train contains the training transactions, all cardholders and all merchants
Only the edges linking these nodes are retained. Hence, no validation or test transactions are in the train data.

In [17]:
# Build one subgraph per split from the matching node-mask dictionary.
# Per the original author's note, HeteroSubgraph copies `data` internally, so no
# explicit deepcopy is taken here.
train_data, val_data, test_data = (
    HeteroSubgraph(data, node_mask_dict, weighted=weighted)
    for node_mask_dict in (node_mask_dict_train, node_mask_dict_val, node_mask_dict_test)
)

### Time gap

In [18]:
# Time gap: the labels of the most recent training days are treated as unknown, which
# mimics the real-life situation in which a fraud label is not known instantly.
# Those transactions stay in the training graph (they can still be sampled as
# neighbors) but are not used as labelled seed nodes.
label_delay_days = 8
train_percentage = 1 - (label_delay_days / int(train_size[:-1]))

# Work on a CPU copy of the training mask so the seed mask stays on the CPU.
train_mask_cpu = data['transaction'].train_mask.cpu()
cutoff = int(int(train_mask_cpu.count_nonzero()) * train_percentage)

# Pick the first `cutoff` training transactions through the training mask itself rather
# than assuming that training nodes occupy positions [0, cutoff) of the node index.
# assumes transaction nodes are stored in chronological order — TODO confirm against HeteroFraudSubset
train_node_idx = train_mask_cpu.nonzero(as_tuple=False).view(-1)
train_input_nodes_mask = torch.zeros_like(train_mask_cpu, dtype=torch.bool)
train_input_nodes_mask[train_node_idx[:cutoff]] = True
train_input_nodes = ('transaction', train_input_nodes_mask)

In [19]:
# Seed nodes for evaluation: the validation / test transactions, selected through the
# boolean masks stored on the full graph.
val_input_nodes = ('transaction', data['transaction'].val_mask)
test_input_nodes = ('transaction', data['transaction'].test_mask)

## Data Loaders with Neighbor Sampling

In [20]:
# Options common to all three loaders; only the graph, the seed nodes and the shuffling
# differ between the splits.
loader_options = dict(
    num_neighbors=num_neighbors,
    skip=skip,
    weight_func=weight_func,
    exp=exp,
    **kwargs,
)

# Training batches are shuffled; validation and test batches keep their order.
train_loader = NeighborLoader(train_data, input_nodes=train_input_nodes, shuffle=True, **loader_options)
val_loader = NeighborLoader(val_data, input_nodes=val_input_nodes, shuffle=False, **loader_options)
test_loader = NeighborLoader(test_data, input_nodes=test_input_nodes, shuffle=False, **loader_options)

## Model Construction

In [21]:
# Build the heterogeneous GraphSAGE model with a single output channel per node and
# move it to the selected device in one expression.
model = HeteroGraphSAGE(
    num_hidden_channels,
    out_channels=1,
    num_layers=len(num_neighbors),
    num_features=data.num_features,
    metadata=data.metadata(),
    dropout=dropout,
    project=project,
    hetero_aggr=hetero_aggr,
    sage_aggr=sage_aggr,
    normalize=normalize,
).to(device)

In [22]:
model

HeteroGraphSAGE(
  (convs): ModuleList(
    (0): HeteroConv(num_relations=4)
    (1): HeteroConv(num_relations=4)
  )
  (lin): Linear(-1, 1, bias=True)
)

In [23]:
@torch.no_grad()
def init_params():
    """Initialize lazy parameters by forwarding a single training batch through the model.

    Reads the module-level `train_loader`, `model` and `device`. Gradients are
    disabled for the forward pass and nothing is returned.
    """
    first_batch = next(iter(train_loader)).to(device)
    model(first_batch.x_dict, first_batch.edge_index_dict)


def train():
    """Run one optimisation pass over `train_loader` and return the mean training loss.

    For every mini-batch only the first `batch['transaction'].batch_size` transaction
    rows contribute to the loss. The returned value is the loss averaged over all of
    those rows seen during the pass. Uses the module-level `model`, `optimizer`,
    `criterion` and `device`.
    """
    model.train()

    loss_sum = 0
    seen = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        batch = batch.to(device)
        num_seeds = batch['transaction'].batch_size

        # Keep only the rows belonging to the batch's own transactions.
        logits = model(batch.x_dict, batch.edge_index_dict)[:num_seeds]
        targets = batch['transaction'].y[:num_seeds].float()

        loss = criterion(logits.squeeze(1), targets)
        loss.backward()
        optimizer.step()

        seen += num_seeds
        # Weight the batch mean by its size so the final average is per transaction.
        loss_sum += float(loss) * num_seeds

    return loss_sum / seen


@torch.no_grad()
def test(loader, feature_list=None):
    """Evaluate the model on every batch of `loader` without tracking gradients.

    Args:
        loader: iterable of mini-batches exposing `x_dict`, `edge_index_dict` and a
            'transaction' store with `y` and `batch_size`.
        feature_list: accepted for interface compatibility; not used by this function.

    Returns:
        A tuple `(y_true, y_pred_proba, mean_loss)`: the concatenated labels, the
        concatenated sigmoid probabilities (keeping the model's trailing output
        dimension) and the loss averaged over all evaluated transactions.
    """
    model.eval()

    labels = []
    probabilities = []
    loss_sum = 0
    seen = 0

    for batch in tqdm(loader):
        batch = batch.to(device)
        num_seeds = batch['transaction'].batch_size

        # Only the first `num_seeds` transaction rows are scored.
        targets = batch['transaction'].y[:num_seeds]
        logits = model(batch.x_dict, batch.edge_index_dict)[:num_seeds]
        batch_loss = criterion(logits.squeeze(1), targets.float())

        seen += num_seeds
        loss_sum += float(batch_loss) * num_seeds
        # The model emits logits; convert to probabilities on the CPU for the metrics.
        probabilities.append(torch.sigmoid(logits.cpu()))
        labels.append(targets.cpu())

    return np.concatenate(labels), np.concatenate(probabilities), loss_sum / seen





## Model Training

In [25]:
# Materialize the model's lazy parameters with a dry-run forward pass *before* handing
# model.parameters() to the optimizer, which is the order recommended for lazily
# initialized modules.
init_params()  # Initialize parameters.

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# loss function: expects raw logits, which matches the single-channel model output
# that train()/test() pass in without applying a sigmoid first.
criterion = torch.nn.BCEWithLogitsLoss()

# initialize Early Stopping
early_stopping = EarlyStopping(patience=patience, verbose=True, delta=delta, path='checkpoint.pt')

In [26]:
# Train for at most `max_epochs` epochs; after each epoch, score the training seeds and
# the validation split, let EarlyStopping inspect the validation loss, and report metrics.
for epoch in range(max_epochs):

    loss = train()

    print(f'Epoch {epoch:02d}, Loss: {loss:.4f}')

    y_train, y_train_pred_proba, train_loss = test(train_loader)
    y_val, y_val_pred_proba, val_loss = test(val_loader)

    # early_stopping needs the validation loss to check if it has decreased,
    # and if it has, it will make a checkpoint of the current model
    early_stopping(val_loss, model, epoch)

    if early_stopping.early_stop:
        print("Early stopping")
        break

    # Average precision
    ap_train = average_precision_score(y_train, y_train_pred_proba)
    ap_val = average_precision_score(y_val, y_val_pred_proba)

    # ROC
    roc_train = roc_auc_score(y_train, y_train_pred_proba)
    roc_val = roc_auc_score(y_val, y_val_pred_proba)

    # These are validation metrics; the held-out test split is only scored after training.
    print(f'Epoch {epoch:02d}, AP train: {ap_train:.4f}, AP val: {ap_val:.4f}')
    print(f'Epoch {epoch:02d}, ROC train: {roc_train:.4f}, ROC val: {roc_val:.4f}')

100%|███████████████████████████████████████| 2483/2483 [04:07<00:00, 10.02it/s]


Epoch 00, Loss: 0.1565


100%|███████████████████████████████████████| 2483/2483 [02:23<00:00, 17.36it/s]
100%|█████████████████████████████████████████| 284/284 [00:31<00:00,  8.89it/s]


Validation loss decreased (inf --> 0.012887).  Saving model ...
Epoch 00, AP train: 0.0019, AP test: 0.0028
Epoch 00, ROC train: 0.5927, ROC test: 0.6804


100%|███████████████████████████████████████| 2483/2483 [04:26<00:00,  9.33it/s]


Epoch 01, Loss: 0.0113


100%|███████████████████████████████████████| 2483/2483 [02:08<00:00, 19.35it/s]
100%|█████████████████████████████████████████| 284/284 [00:14<00:00, 20.02it/s]


Validation loss decreased (0.012887 --> 0.010750).  Saving model ...
Epoch 01, AP train: 0.0077, AP test: 0.0164
Epoch 01, ROC train: 0.7357, ROC test: 0.8120


-----

## Model Evaluation

In [27]:
# load the last checkpoint with the best model
# map_location makes the restore independent of the device the checkpoint was saved from
# (e.g. saved on a GPU machine, evaluated on a CPU-only one).
model.load_state_dict(torch.load(early_stopping.path, map_location=device))

# NOTE(review): `test_loader`, built earlier on `test_data` with the same settings, is never
# used; this cell samples neighbors from the full `data` graph instead — confirm which
# graph is intended for the final evaluation.
test_loader_non_shuffled = NeighborLoader(data, input_nodes=test_input_nodes,
                              num_neighbors=num_neighbors, shuffle=False, skip=skip, weight_func=weight_func, exp=exp, **kwargs)
y_true_test, y_pred_proba_test, test_loss = test(test_loader_non_shuffled)

# Final held-out metrics: average precision and ROC AUC on the test transactions.
ap_test = average_precision_score(y_true_test, y_pred_proba_test)
roc_test = roc_auc_score(y_true_test, y_pred_proba_test)

print(ap_test)
print(roc_test)

100%|█████████████████████████████████████████| 565/565 [00:52<00:00, 10.80it/s]


0.028525297185399433
0.7840098725722093
