# Libraries

We begin by importing all the necessary libraries used throughout this notebook.

In [4]:
from typing import Any
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch_geometric.data import Data
from sklearn.preprocessing import LabelEncoder
from torch_geometric.utils import k_hop_subgraph, degree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import yaml

# Loading the Dataset

In this section, we load the dataset used in our experiment.
The dataset is a simulated financial fraud dataset containing the following columns: `Time`, `Source`, `Target`, `Amount`, `Location`, `Type`, and `Labels`. The `Labels` column takes one of three values:

- 0 indicates a legitimate transaction,
- 1 indicates a fraudulent transaction, and
- 2 denotes unlabeled data.

In [5]:
# Read the raw S-FFSD transactions into `df`. The path is absolute, so it only
# resolves when the dataset sits at the filesystem root (TODO: make configurable).
df = pd.read_csv('/S-FFSD-dataset/data/raw/S-FFSD.csv')

# Exploring the Dataset

In [6]:
df.head(10)

Unnamed: 0,Time,Source,Target,Amount,Location,Type,Labels
0,0,S10000,T1000,13.74,L100,TP100,2
1,1,S10001,T1001,73.17,L101,TP101,2
2,2,S10002,T1000,68.59,L100,TP100,2
3,3,S10003,T1002,57.0,L100,TP102,2
4,4,S10004,T1000,11.55,L100,TP100,2
5,5,S10005,T1000,245.4,L100,TP100,2
6,6,S10006,T1000,134.85,L100,TP100,2
7,7,S10007,T1000,59.92,L100,TP100,0
8,8,S10008,T1003,805.97,L100,TP100,2
9,9,S10009,T1000,44.13,L100,TP100,2


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77881 entries, 0 to 77880
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Time      77881 non-null  int64  
 1   Source    77881 non-null  object 
 2   Target    77881 non-null  object 
 3   Amount    77881 non-null  float64
 4   Location  77881 non-null  object 
 5   Type      77881 non-null  object 
 6   Labels    77881 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 4.2+ MB


In [8]:
df.describe()

Unnamed: 0,Time,Amount,Labels
count,77881.0,77881.0,77881.0
mean,38940.0,195.624898,1.306249
std,22482.452494,4642.50852,0.915825
min,0.0,0.0,0.0
25%,19470.0,5.0,0.0
50%,38940.0,16.61,2.0
75%,58410.0,69.0,2.0
max,77880.0,800000.0,2.0


# Time-Based Feature Engineering

In this section, we define a function to perform feature engineering on the ` Time ` column. First, we segment the time values into defined time spans by setting specific upper and lower bounds. This allows us to extract meaningful statistical patterns based on when each transaction occurred.
Next, we iterate through the dataset to calculate various statistics within each time span, including the average, total, and standard deviation of transaction amounts, as well as the transaction bias. We also compute the number of transactions, the number of unique locations, and the number of unique transaction types in each span. Finally, we concatenate these newly generated features with the original dataframe.

In [9]:
def featmap_gen(tmp_df=None):
    """Append sliding time-window statistics to every transaction.

    For each row and each look-back length ``w`` in ``time_span``, the rows
    whose ``Time`` lies in ``[row.Time - w, row.Time]`` (both ends inclusive,
    so a row is always part of its own window) are selected and summarised.

    Args:
        tmp_df: DataFrame with at least the columns ``Time``, ``Amount``,
            ``Target``, ``Location`` and ``Type``.

    Returns:
        A new DataFrame indexed like ``tmp_df`` that holds the original
        columns plus, per window length ``w``:
        ``trans_at_avg_w``, ``trans_at_totl_w``, ``trans_at_std_w``
        (NaN for one-row windows, pandas' default sample std),
        ``trans_at_bias_w`` (row amount minus window mean),
        ``trans_at_num_w``, ``trans_target_num_w``,
        ``trans_location_num_w`` and ``trans_type_num_w``.

    Raises:
        TypeError: if ``tmp_df`` is None.
    """
    if tmp_df is None:
        # The original failed here with an opaque "'NoneType' object is not
        # subscriptable"; keep the exception type, improve the message.
        raise TypeError("featmap_gen() requires a DataFrame, got None")

    time_span = [2, 3, 5, 15, 20, 50, 100, 150,
                 200, 300, 864, 2590, 5100, 10000, 24000]
    time_list = tmp_df['Time']
    post_fe = []
    # Every window is a boolean-mask scan over the whole frame, so the total
    # cost grows quadratically with the number of rows.
    for trans_idx, trans_feat in tqdm(tmp_df.iterrows(), total=len(tmp_df)):
        new_df = pd.Series(trans_feat)
        temp_time = new_df.Time
        temp_amt = new_df.Amount
        # The upper bound does not depend on the window length: build it once
        # per row instead of once per (row, window) pair.
        upbound = (time_list <= temp_time)
        for length in time_span:
            tname = str(length)
            lowbound = (time_list >= temp_time - length)
            correct_data = tmp_df[lowbound & upbound]
            amounts = correct_data['Amount']
            avg_amt = amounts.mean()  # reused for both the avg and bias columns
            new_df['trans_at_avg_{}'.format(tname)] = avg_amt
            new_df['trans_at_totl_{}'.format(tname)] = amounts.sum()
            new_df['trans_at_std_{}'.format(tname)] = amounts.std()
            new_df['trans_at_bias_{}'.format(tname)] = temp_amt - avg_amt
            new_df['trans_at_num_{}'.format(tname)] = len(correct_data)
            new_df['trans_target_num_{}'.format(tname)] = len(
                correct_data.Target.unique())
            new_df['trans_location_num_{}'.format(tname)] = len(
                correct_data.Location.unique())
            new_df['trans_type_num_{}'.format(tname)] = len(
                correct_data.Type.unique())
        post_fe.append(new_df)
    return pd.DataFrame(post_fe)

# Neighbor-Based Feature Engineering

**In this section, we define three methods for later use.**

First, we define a method to find the neighbors of each node in the graph, which will be constructed based on the feature labels of the data points. This helps the model incorporate information from related nodes, enhancing its ability to detect anomalous patterns.

The parameter `k` specifies the number of hops for the neighbor search (only `1` or `2` is accepted). The `where` parameter indicates whether to find incoming (`"in"`) or outgoing (`"out"`) neighbors. When the `choose_risk` flag is set, the neighbor indices `neigh_idxs` are filtered so that only neighbors whose label equals `risk_label` are kept.


In [10]:
def k_neighs(
        graph: Data,
        center_idx: int,
        k: int,
        where: str,
        choose_risk: bool = False,
        risk_label: int = 1
) -> torch.Tensor:
    """Return the nodes that are exactly ``k`` hops away from ``center_idx``.

    For ``k == 2`` the result is the set of nodes reachable within two hops
    minus the set reachable within one hop. The centre node itself is never
    returned.

    Args:
        graph: PyG ``Data`` object; ``edge_index`` is read, and ``y`` as well
            when ``choose_risk`` is set.
        center_idx: index of the node whose neighbourhood is queried.
        k: hop distance, 1 or 2.
        where: ``"in"`` follows edges that point towards the centre
            (predecessors); ``"out"`` follows edges that leave the centre
            (successors).
        choose_risk: when True, keep only neighbours whose label equals
            ``risk_label``.
        risk_label: label value regarded as risky.

    Returns:
        1-D tensor of node indices.

    Raises:
        ValueError: if ``k`` is not 1 or 2, or ``where`` is not "in"/"out".
    """
    if k not in [1, 2]:
        raise ValueError("k must be 1 or 2")
    if where not in ("in", "out"):
        # Previously any other string silently meant "out", hiding typos.
        raise ValueError("where must be 'in' or 'out'")

    # In PyG, flow='source_to_target' gathers the *sources* of edges whose
    # target is already in the frontier, i.e. the incoming neighbours. The
    # earlier mapping was the other way round, so "in" returned successors.
    flow = 'source_to_target' if where == 'in' else 'target_to_source'

    def _within(num_hops: int) -> torch.Tensor:
        """Nodes reachable in at most ``num_hops`` hops, centre excluded."""
        subset, _, _, _ = k_hop_subgraph(
            center_idx,
            num_hops=num_hops,
            edge_index=graph.edge_index,
            relabel_nodes=False,        # only `subset` is used; skip relabelling
            flow=flow,
            num_nodes=graph.num_nodes,  # avoid re-inferring the size per call
        )
        return subset[subset != center_idx]

    neigh_idxs = _within(k)
    if k == 2:
        # Drop the 1-hop ring so only nodes at distance two remain.
        neigh_idxs = neigh_idxs[~torch.isin(neigh_idxs, _within(1))]

    if choose_risk:
        return neigh_idxs[graph.y[neigh_idxs] == risk_label]
    return neigh_idxs

The `count_risk_neighs` method calculates the number of **risky** neighbors each node has in a graph. It iterates through each node, counts how many of its neighbors have a **risky** label, and returns these counts as a tensor.

In [11]:
def count_risk_neighs(
        graph: Data,
        risk_label: int = 1
) -> torch.Tensor:
    """Count, for every node, its 1-hop neighbours labelled ``risk_label``.

    Neighbours are obtained from ``k_neighs(..., k=1, where="out",
    choose_risk=True)``, one call per node.

    Args:
        graph: PyG ``Data`` object passed straight through to ``k_neighs``.
        risk_label: label value regarded as risky.

    Returns:
        Float tensor of length ``graph.num_nodes`` holding the counts.
    """
    risky_counts = [
        len(k_neighs(graph, node, k=1, where="out",
                     choose_risk=True, risk_label=risk_label))
        for node in range(graph.num_nodes)
    ]
    return torch.tensor(risky_counts, dtype=torch.float)

The `feat_map` method is designed to generate node features based on neighborhood information in the graph. It computes aggregated features from the neighbors of each node and returns a tensor containing these features, along with their corresponding feature names. For each node, the method generates the following features:

- **1hop_degree**: Sum of the `degree` feature for 1-hop neighbors.
- **2hop_degree**: Sum of the `degree` feature for 2-hop neighbors.
- **1hop_riskstat**: Sum of the `riskstat` feature for 1-hop neighbors.
- **2hop_riskstat**: Sum of the `riskstat` feature for 2-hop neighbors.

In [12]:
def feat_map(graph, node_feat):
    """Aggregate per-node features over 1-hop and 2-hop neighbourhoods.

    Args:
        graph: graph object handed to ``k_neighs`` (queried with ``"in"``).
        node_feat: 2-D tensor indexed as ``[node, feature]``; column 0 is
            summed into the ``*_degree`` outputs and column 1 into the
            ``*_riskstat`` outputs.

    Returns:
        ``(features, names)`` where ``features`` has one row per node and the
        four columns listed in ``names``.
    """
    feat_names = ["1hop_degree", "2hop_degree",
                  "1hop_riskstat", "2hop_riskstat"]

    per_node = []
    for idx in tqdm(range(graph.num_nodes)):
        hop1 = k_neighs(graph, idx, 1, "in")
        hop2 = k_neighs(graph, idx, 2, "in")
        # Column-major order: (col 0: hop1, hop2), then (col 1: hop1, hop2),
        # matching the order of `feat_names`.
        sums = [node_feat[hop, col].sum().item()
                for col in (0, 1)
                for hop in (hop1, hop2)]
        per_node.append(torch.FloatTensor(sums))

    return torch.stack(per_node), feat_names

# Data Preprocessing

As a first step, we perform feature engineering using the previously defined ` featmap_gen ` function.

In [13]:
# Rebuild a clean 0..n-1 index, then add the time-window statistics.
# NOTE: the recorded run below took about 23 minutes for 77,881 rows.
df = featmap_gen(df.reset_index(drop=True))

77881it [23:30, 55.23it/s]


**Next, we handle the missing values by filling them with zeros.**

This approach is appropriate because the number of missing entries is relatively small compared to the size of the dataset. Moreover, since the dataset is simulated, there is no real-world information available to impute the missing values more accurately.

In [14]:
# Replace every NaN with 0 in place (for example the standard deviation of a
# one-row window), then rebuild a contiguous 0..n-1 index: the edge-building
# cell further down uses the index labels as node ids.
df.replace(np.nan, 0, inplace=True)
df.reset_index(drop=True, inplace=True)

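In this part we build the edge list (the adjacency structure) of the transaction graph from the categorical features.
To begin, we initialize two empty lists that accumulate the edges across all columns:

- `alls`: Keeps track of the source nodes.
- `allt`: Keeps track of the target nodes.

Two temporary lists, `src` and `tgt`, collect the edges of the column currently being processed before they are appended to `alls` and `allt`.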

Next, in the **outer loop**, we iterate through each column specified in the `pair` list.
Within the **inner loop**, we group the data based on the current column. For each group, we identify transactions that share the same value and create edges between them.

However, to limit the number of connections and preserve temporal relevance, each transaction is only connected to itself and to the next `edge_per_trans - 1` transactions of its group (ordered by `Time`). This sequential threshold is set by the `edge_per_trans` parameter.

In [15]:
# Disabled shortcut: presumably reloads a previously saved engineered frame
# instead of recomputing it. The recorded output shows the file was missing
# the last time this line ran. TODO: confirm the file exists before re-enabling.
#df = pd.read_csv('/S-FFSD-dataset/data/processed/df.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/S-FFSD-dataset/data/processed/df.csv'

In [16]:
# Global edge lists: alls[e] -> allt[e] is one directed edge between row ids.
alls, allt = [], []
pair = ["Source", "Target", "Location", "Type"]
edge_per_trans = 3

for column in pair:
    src, tgt = [], []
    for c_id, c_df in tqdm(df.groupby(column), desc=column):
        # Order the group chronologically; the index labels act as node ids.
        c_df = c_df.sort_values(by="Time")
        df_len = len(c_df)
        sorted_idxs = c_df.index
        for i in range(df_len):
            # j == 0 links the transaction to itself; j > 0 links it to the
            # following transactions of the same group.
            for j in range(edge_per_trans):
                if i + j < df_len:
                    src.append(sorted_idxs[i])
                    tgt.append(sorted_idxs[i + j])
    alls.extend(src)
    allt.extend(tgt)

Source: 100%|██████████| 30346/30346 [00:00<00:00, 31953.18it/s]
Target: 100%|██████████| 886/886 [00:00<00:00, 6349.44it/s]
Location: 100%|██████████| 296/296 [00:00<00:00, 2343.28it/s]
Type: 100%|██████████| 166/166 [00:00<00:00, 1304.04it/s]


Next, we store the edge information in `edge_index` tensor.

In [17]:
# Row 0 holds the source node of every edge, row 1 the matching target node.
edge_index = torch.tensor([alls, allt], dtype=torch.long)

As part of preprocessing, we need to ensure that all data is numeric, as non-numeric values can cause issues during the training process. Therefore, all categorical columns must be encoded.

In [18]:
# Replace each categorical column with integer codes. A fresh encoder is
# fitted per column, so codes are not comparable across columns.
# NOTE: the columns are overwritten in place, so re-running this cell would
# encode the stringified codes rather than the original identifiers.
cal_list = ["Source", "Target", "Location", "Type"]
for col in cal_list:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

Let's split the data into feature and target sets.

In [19]:
feat_data = df.drop("Labels", axis=1)
labels = df["Labels"]

We need to convert the feature nodes and label nodes into tensors to create the graph object in the next step.

In [20]:
x = torch.tensor(feat_data.values, dtype=torch.float32)
y = torch.tensor(labels.values, dtype=torch.long)

In [21]:
graph = Data(x=x, edge_index=edge_index, y=y)

Now it's time to use the previously defined methods to create neighbor-based features, which include:

- The in-degree of each node
- The number of risky neighbors for each node

Next, we fill any missing values with zero and concatenate these features into a single DataFrame. Finally, to ensure that all feature values are within a similar range, we standardize the DataFrame.


In [22]:
print(f"graph info: {graph}")

# Per-node base features: how often the node appears as an edge target, and
# how many of its neighbours carry the risky label.
in_degree = degree(graph.edge_index[1], num_nodes=graph.num_nodes)
risk_count = count_risk_neighs(graph)
node_feat = torch.stack([in_degree.float(), risk_count.float()], dim=1)

origin_feat_name = ['degree', 'riskstat']

# Neighbourhood aggregates of the two base features (slow: one pass per node).
features_neigh, feat_names = feat_map(graph, node_feat)
feat_names = origin_feat_name + feat_names

features_neigh = torch.cat((node_feat, features_neigh), dim=1).numpy()
features_neigh[np.isnan(features_neigh)] = 0.
features_neigh = pd.DataFrame(features_neigh, columns=feat_names)

graph info: Data(x=[77881, 126], edge_index=[2, 860968], y=[77881])


100%|██████████| 77881/77881 [10:05<00:00, 128.68it/s]


In [23]:
# Standardise every neighbour feature to zero mean and unit variance.
scaler = StandardScaler()
features_neigh = pd.DataFrame(scaler.fit_transform(features_neigh), columns=features_neigh.columns)
# Write next to the other processed artefacts. The previous target pointed
# into one user's home directory and could not resolve on any other machine.
features_neigh.to_csv('/S-FFSD-dataset/data/processed/features_neigh.csv', index=False)

# Entropy-Based Feature Engineering

An important feature in fraud detection problems is **Trading Entropy**. For each user, trading entropy can be calculated to capture the variability in their trading behavior. A significant deviation from the normal entropy pattern may indicate a higher probability of fraudulent activity.

In [24]:
def calcu_trading_entropy(
        data_2: pd.DataFrame
) -> float:
    """Entropy of how the traded amount is spread across transaction types.

    The amounts are summed per ``Type``, turned into proportions of the grand
    total, and combined as ``-sum(p * log(1e-5 + p))``.

    Args:
        data_2: frame with an ``Amount`` and a ``Type`` column.

    Returns:
        The entropy as a float; ``0.0`` for an empty frame. When the grand
        total is zero every proportion is set to 1, which yields a tiny
        negative value (kept from the original formula).
    """
    if len(data_2) == 0:
        return 0.0

    # One grouped pass instead of one boolean-mask scan per distinct type.
    # sort=False keeps the types in order of first appearance.
    amounts = data_2.groupby('Type', sort=False)['Amount'].sum().to_numpy()
    total = amounts.sum()
    proportions = amounts / total if total else np.ones_like(amounts)
    # The 1e-5 offset keeps log() finite for a zero proportion.
    return float(-(proportions * np.log(1e-5 + proportions)).sum())

# Data Transformation

In [26]:
def span_data_2d(
        data: pd.DataFrame,
        time_windows=None
) -> Any:
    """Build a (record, feature, window) tensor of look-back statistics.

    Unlabelled rows (``Labels == 2``) are dropped first. For every remaining
    row and every window size ``w``, the up-to-``w`` rows directly preceding
    it (by position) are considered, restricted to the same ``Source``.

    Per window the five features are, in order:
        0. sum of previous amounts divided by ``w``
        1. sum of previous amounts
        2. current amount minus feature 0
        3. number of previous records
        4. entropy of the previous records minus entropy of previous + current

    Args:
        data: frame with ``Labels``, ``Source``, ``Amount`` and ``Type``.
        time_windows: look-back sizes in rows; defaults to
            ``[1, 3, 5, 10, 20, 50, 100, 500]``.

    Returns:
        ``(features, labels)``: float32 array of shape
        ``(n_records, 5, len(time_windows))`` and int64 array of labels.
    """
    if time_windows is None:
        time_windows = [1, 3, 5, 10, 20, 50, 100, 500]
    data = data[data['Labels'] != 2]

    nume_feature_ret, label_ret = [], []
    for row_idx in tqdm(range(len(data))):
        record = data.iloc[row_idx]
        acct_no = record['Source']
        feature_of_one_record = []

        for time_span in time_windows:
            # Clamp at 0: a negative start makes iloc count from the END of
            # the frame, which left early rows with an empty (or wrong)
            # history instead of the rows that actually precede them.
            start = max(row_idx - time_span, 0)
            prev_records = data.iloc[start:row_idx, :]
            prev_and_now_records = data.iloc[start:row_idx + 1, :]
            prev_records = prev_records[prev_records['Source'] == acct_no]

            prev_total = prev_records['Amount'].sum()
            prev_avg = prev_total / time_span

            # NOTE(review): `prev_and_now_records` is not filtered by Source,
            # unlike `prev_records`; kept as in the original, confirm intent.
            old_ent = calcu_trading_entropy(prev_records[['Amount', 'Type']])
            new_ent = calcu_trading_entropy(
                prev_and_now_records[['Amount', 'Type']])

            feature_of_one_record.append([
                prev_avg,
                prev_total,
                record['Amount'] - prev_avg,
                len(prev_records),
                old_ent - new_ent,
            ])

        nume_feature_ret.append(feature_of_one_record)
        label_ret.append(record['Labels'])

    # The explicit reshape is a no-op for non-empty input and keeps an empty
    # input 3-D so the transpose below does not fail.
    nume_feature_ret = np.array(nume_feature_ret).reshape(
        len(data), len(time_windows), 5).transpose(0, 2, 1)

    assert nume_feature_ret.shape == (
        len(data), 5, len(time_windows)), "output shape invalid."

    return nume_feature_ret.astype(np.float32), np.array(label_ret).astype(np.int64)

In [27]:
def _bucket_location(raw_location) -> int:
    """Map a raw location string such as ``'L103'`` to a coarse zone id."""
    code = int(raw_location.split('L')[1])
    if code == 100:
        return 1
    if 100 < code <= 102:
        return 2
    if 102 < code <= 110:
        return 3
    if 110 < code <= 140:
        return 4
    if code > 140:
        return 5
    return code  # codes below 100 are passed through unchanged


def span_data_3d(
        data: pd.DataFrame,
        time_windows=None,
        spatio_windows=None,
) -> Any:
    """Build a (record, time window, spatial window, feature) tensor.

    Unlabelled rows (``Labels == 2``) are dropped and ``Location`` strings are
    bucketed into zone ids. For every remaining row, every time window ``w``
    and every spatial radius ``s``, the up-to-``w`` preceding rows of the same
    ``Source`` whose zone lies strictly within ``s`` of the current zone are
    summarised.

    Per (time, spatial) cell the five features are, in order:
        0. sum of matching amounts divided by ``w``
        1. sum of matching amounts
        2. current amount minus feature 0
        3. number of matching records
        4. entropy of the previous same-source records minus entropy of
           previous + current (independent of the spatial radius)

    Args:
        data: frame with ``Labels``, ``Source``, ``Location`` (strings
            containing ``'L'`` followed by a number), ``Amount`` and ``Type``.
            The caller's frame is not modified.
        time_windows: look-back sizes in rows; defaults to
            ``[1, 3, 5, 10, 20, 50, 100, 500]``.
        spatio_windows: zone radii; defaults to ``[1, 2, 3, 4, 5]``.

    Returns:
        ``(features, labels)``: float32 array of shape
        ``(n_records, len(time_windows), len(spatio_windows), 5)`` and int64
        array of labels.
    """
    if time_windows is None:
        time_windows = [1, 3, 5, 10, 20, 50, 100, 500]
    if spatio_windows is None:
        spatio_windows = [1, 2, 3, 4, 5]

    # Explicit copy: the Location column is rewritten below, and assigning
    # into a filtered frame triggers pandas' SettingWithCopy warning.
    data = data[data['Labels'] != 2].copy()
    data['Location'] = data['Location'].apply(_bucket_location)

    nume_feature_ret, label_ret = [], []
    for row_idx in tqdm(range(len(data))):
        record = data.iloc[row_idx]
        acct_no = record['Source']
        location = int(record['Location'])
        feature_of_one_record = []
        for time_span in time_windows:
            # Clamp at 0: a negative start makes iloc count from the END of
            # the frame, which left early rows without their real history.
            start = max(row_idx - time_span, 0)
            prev_records = data.iloc[start:row_idx, :]
            prev_and_now_records = data.iloc[start:row_idx + 1, :]
            prev_records = prev_records[prev_records['Source'] == acct_no]

            # The entropy shift does not depend on the spatial radius, so it
            # is computed once per time window rather than once per cell.
            old_ent = calcu_trading_entropy(prev_records[['Amount', 'Type']])
            new_ent = calcu_trading_entropy(
                prev_and_now_records[['Amount', 'Type']])
            ent_shift = old_ent - new_ent

            feature_of_one_timestamp = []
            for spatio_span in spatio_windows:
                nearby = prev_records[
                    (prev_records['Location'] > location - spatio_span)
                    & (prev_records['Location'] < location + spatio_span)]
                nearby_total = nearby['Amount'].sum()
                nearby_avg = nearby_total / time_span
                feature_of_one_timestamp.append([
                    nearby_avg,
                    nearby_total,
                    record['Amount'] - nearby_avg,
                    len(nearby),
                    ent_shift,
                ])
            feature_of_one_record.append(feature_of_one_timestamp)
        nume_feature_ret.append(feature_of_one_record)
        label_ret.append(record['Labels'])

    # The explicit reshape is a no-op for non-empty input and keeps an empty
    # input 4-D so the shape check below still holds.
    nume_feature_ret = np.array(nume_feature_ret).reshape(
        len(data), len(time_windows), len(spatio_windows), 5)
    print(nume_feature_ret.shape)
    assert nume_feature_ret.shape == (
        len(data), len(time_windows), len(spatio_windows), 5), "output shape invalid."

    return nume_feature_ret.astype(np.float32), np.array(label_ret).astype(np.int64)

# Training

In [62]:
def parse_args(yaml_file="/S-FFSD-dataset/config/cfg.yaml"):
    """Load the run configuration from YAML and attach the method name.

    Args:
        yaml_file: path of the YAML configuration file. Defaults to the
            location the notebook has always used.

    Returns:
        dict with the YAML contents plus a ``'method'`` entry.
    """
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter,
                            conflict_handler='resolve')
    parser.add_argument("--method", default="mcnn", type=str)
    # An explicit empty list is parsed instead of sys.argv (presumably so the
    # notebook kernel's own command line is ignored); as a consequence
    # `--method` always takes its default value here.
    args = parser.parse_args(args=[])

    with open(yaml_file) as file:
        # safe_load returns None for an empty document; fall back to an empty
        # dict so the assignment below cannot fail with a TypeError.
        config = yaml.safe_load(file) or {}
    config['method'] = args.method
    return config

In [66]:
def base_load_data(args: dict, data_path: str = "/S-FFSD-dataset/data/raw/S-FFSD.csv"):
    """Build the 2-D span features, split them and save the four arrays.

    Args:
        args: configuration dict. Reads ``test_size`` and the output paths
            ``trainfeature``, ``testfeature``, ``trainlabel`` and
            ``testlabel``. An optional ``seed`` entry makes the split
            reproducible; without it the split is random, as before.
        data_path: CSV file with the raw transactions. Defaults to the
            location the notebook has always used.

    Returns:
        None. The arrays are written with ``np.save`` to the configured paths.
    """
    feat_df = pd.read_csv(data_path)
    train_size = 1 - args['test_size']
    features, labels = span_data_2d(feat_df)
    trf, tef, trl, tel = train_test_split(
        features, labels, train_size=train_size, stratify=labels, shuffle=True,
        random_state=args.get('seed'))

    # Resolve every output path before writing, so a missing key fails before
    # any file has been touched.
    trf_file, tef_file, trl_file, tel_file = args['trainfeature'], args[
        'testfeature'], args['trainlabel'], args['testlabel']
    np.save(trf_file, trf)
    np.save(tef_file, tef)
    np.save(trl_file, trl)
    np.save(tel_file, tel)

In [67]:
from training import mcnn_main
def main(args):
    """Prepare the train/test arrays on disk, then train the MCNN model.

    Args:
        args: configuration dict. Passed to ``base_load_data`` and read for
            the four array paths plus ``epochs``, ``batch_size``, ``lr`` and
            ``device``.
    """
    base_load_data(args)

    training_options = {
        option: args[option]
        for option in ('epochs', 'batch_size', 'lr', 'device')
    }
    mcnn_main(
        args['trainfeature'],
        args['trainlabel'],
        args['testfeature'],
        args['testlabel'],
        **training_options
    )

In [68]:
main(parse_args())

100%|██████████| 29643/29643 [03:36<00:00, 137.10it/s]


Epoch: 0, loss: 1.4094, auc: 0.5214, F1: 0.5187, AP: 0.1846
Epoch: 1, loss: 0.9019, auc: 0.6538, F1: 0.5826, AP: 0.2513
Epoch: 2, loss: 0.7857, auc: 0.6668, F1: 0.5275, AP: 0.2500
Epoch: 3, loss: 0.6944, auc: 0.6775, F1: 0.5316, AP: 0.2560
Epoch: 4, loss: 0.6663, auc: 0.6808, F1: 0.5513, AP: 0.2602
Epoch: 5, loss: 0.6528, auc: 0.7002, F1: 0.5726, AP: 0.2742
Epoch: 6, loss: 0.5902, auc: 0.7124, F1: 0.6327, AP: 0.2979
Epoch: 7, loss: 0.6682, auc: 0.7054, F1: 0.6646, AP: 0.3110
Epoch: 8, loss: 0.5908, auc: 0.7202, F1: 0.6700, AP: 0.3215
Epoch: 9, loss: 0.6079, auc: 0.7202, F1: 0.6678, AP: 0.3201
Epoch: 10, loss: 0.6548, auc: 0.7050, F1: 0.6732, AP: 0.3171
Epoch: 11, loss: 0.5864, auc: 0.7033, F1: 0.6784, AP: 0.3208
Epoch: 12, loss: 0.6165, auc: 0.7236, F1: 0.6799, AP: 0.3301
Epoch: 13, loss: 0.5937, auc: 0.7189, F1: 0.6816, AP: 0.3294
Epoch: 14, loss: 0.6250, auc: 0.7043, F1: 0.6602, AP: 0.3077
Epoch: 15, loss: 0.5583, auc: 0.7122, F1: 0.6832, AP: 0.3282
Epoch: 16, loss: 0.5655, auc: 0.72