In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

In [None]:
# Load the sampled transaction table straight from the GitHub-hosted CSV.
url = 'https://raw.githubusercontent.com/geniusai-research/interns_task/main/sampled_data2.csv'
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Inspect the column names of the freshly loaded transaction table.
df.columns

Index(['step', 'customer', 'age', 'gender', 'merchant', 'category', 'amount',
       'fraud'],
      dtype='object')

1.   We need to select nodes and edges to construct a heterogeneous graph
2.   Making a heterogeneous graph for fraud detection -> https://arxiv.org/pdf/2011.12193.pdf
3. Nodes -> (transaction,  customer,  merchant, category)
            transaction -> (step,amount)
            customer -> (age,gender)
            merchant -> (mean,count) for fraud
            category -> (mean,count) for fraud
4. edges -> (transaction-> customer , merchant , category)
    

In [None]:
# Encoding (step, gender, customer, merchant, category)
# pd.factorize replaces every distinct value by an integer code (0, 1, 2, ...),
# numbered in order of first appearance. Keys are the new encoded columns,
# values are the raw columns they are derived from.
encoded_columns = {
    'gen': 'gender',
    'customerID': 'customer',
    'merchantID': 'merchant',
    'stepID': 'step',
    'categoryID': 'category',
}
for encoded_name, raw_name in encoded_columns.items():
    df[encoded_name] = pd.factorize(df[raw_name])[0]

# The raw columns are no longer needed once their encoded twins exist.
df = df.drop(columns=['gender', 'customer', 'merchant', 'step', 'category'])

In [None]:
# Encoding Age
# Map every age-bracket label to an ordinal code (7 marks an unknown age).
# The result is assigned back to the column on purpose: calling
# replace(..., inplace=True) on the df['age'] selection is a chained in-place
# update, which pandas does not guarantee to write through to df (with
# Copy-on-Write enabled it leaves df untouched).
age_codes = {
    'lt18': 0,
    '19to25': 1,
    '26to35': 2,
    '36to45': 3,
    '46to55': 4,
    '56to65': 5,
    'gt65': 6,
    'Unknown': 7,
}
# Labels that are not listed in age_codes are left unchanged by replace().
df['age'] = df['age'].replace(age_codes)

In [None]:
# Preview the first rows after encoding; the remaining columns are expected to be numeric.
df.head()

Unnamed: 0,age,amount,fraud,gen,customerID,merchantID,stepID,categoryID
0,3,15.78,0,0,0,0,0,0
1,2,608.32,1,1,1,1,1,1
2,2,43.76,0,1,2,2,2,0
3,3,41.57,0,1,3,0,3,0
4,3,42.97,0,0,4,2,4,0


In [None]:
# Making the customer node
# Keep exactly one feature row per customer. De-duplicating on customerID alone
# (instead of on the whole (customerID, age, gen) triple) guarantees that a
# customer whose age/gender differs between transactions still yields a single
# node; otherwise the feature table would have more rows than there are
# customer IDs and every later row would be shifted.
customer_node = (
    df[['customerID', 'age', 'gen']]
    .drop_duplicates(subset='customerID')
    .sort_values('customerID')
    .reset_index(drop=True)
)

# mapping: row position in the feature table -> customerID
customerID_mapping = customer_node['customerID']
# The edges address customers by customerID, so row i must describe customer i.
assert (customerID_mapping.to_numpy() == np.arange(len(customer_node))).all(), \
    "customer feature rows are not aligned with customerID"

# customer features
customer_node = customer_node[['age', 'gen']]
customer_node.tail()

Unnamed: 0,age,gen
4056,2,0
4057,5,1
4058,0,1
4059,4,0
4060,4,0


In [None]:
# Convert the customer feature table (age, gen) into a float32 tensor.
customer_features = customer_node.astype(np.float32).to_numpy()
customer_node = torch.from_numpy(customer_features)
customer_node

tensor([[3., 0.],
        [2., 1.],
        [2., 1.],
        ...,
        [0., 1.],
        [4., 0.],
        [4., 0.]])

In [None]:
# Making transaction nodes
# Every transaction is described by its encoded time step and its amount.
txn = df.loc[:, ['stepID', 'amount']]
txn.head()

Unnamed: 0,stepID,amount
0,0,15.78
1,1,608.32
2,2,43.76
3,3,41.57
4,4,42.97


In [None]:
# Convert the transaction feature table (stepID, amount) into a float32 tensor.
txn_features = txn.astype(np.float32).to_numpy()
txn = torch.from_numpy(txn_features)
txn

tensor([[  0.0000,  15.7800],
        [  1.0000, 608.3200],
        [  2.0000,  43.7600],
        ...,
        [ 74.0000,   2.5000],
        [  8.0000,   5.3800],
        [  0.0000,  93.8900]])

In [None]:
# Making category nodes
# a: number of fraudulent transactions per category. Summing a 0/1 flag per
# group covers every category (0 where a category has no fraud), so no category
# ID has to be patched in by hand and the index stays integer-typed.
a = df['fraud'].eq(1).astype('int64').groupby(df['categoryID']).sum()
# b: total number of transactions per category. It is named 'categoryID'
# explicitly because the following cell reads the merged column under that
# name, and the name value_counts() gives its result is not the same in every
# pandas release.
b = df.groupby('categoryID').size().rename('categoryID').rename_axis(None)

In [None]:
# Combine the per-category totals (b) with the per-category fraud counts (a).
# Both series are renamed before merging so the column names used below do not
# depend on how pandas happened to name them. The left merge keeps every
# category that occurs in the data; categories without a fraud count get 0.
c = pd.merge(
    b.rename('total'),
    a.rename('count'),
    how='left',
    left_index=True,
    right_index=True,
)
c['count'] = c['count'].fillna(0)
# 'mean' is the share of the category's transactions that are fraudulent.
c['mean'] = c['count'] / c['total']
c = c.drop('total', axis=1)
# NOTE(review): these statistics use the fraud labels of all rows, including
# the rows later selected by val_mask -- confirm this is intended.
category_nodes = c.sort_index()
category_nodes.head()

Unnamed: 0,count,mean
0,280,0.017582
1,718,0.605907
2,1982,0.971569
3,1696,0.796992
4,474,1.0


In [None]:
# Convert the category feature table (count, mean) into a float32 tensor.
category_features = category_nodes.astype(np.float32).to_numpy()
category_nodes = torch.from_numpy(category_features)
category_nodes

tensor([[2.8000e+02, 1.7582e-02],
        [7.1800e+02, 6.0591e-01],
        [1.9820e+03, 9.7157e-01],
        [1.6960e+03, 7.9699e-01],
        [4.7400e+02, 1.0000e+00],
        [0.0000e+00, 0.0000e+00],
        [5.4800e+02, 9.3515e-01],
        [1.1600e+02, 3.6250e-01],
        [5.7800e+02, 9.9313e-01],
        [2.2800e+02, 9.0837e-01],
        [1.5800e+02, 7.3832e-01],
        [1.2000e+02, 4.0541e-01],
        [3.0200e+02, 8.7032e-01],
        [0.0000e+00, 0.0000e+00]])

In [None]:
# Making merchant nodes
# Each merchant gets two features: the number of fraudulent transactions it
# received ('count') and the share of its transactions that are fraudulent
# ('mean'). Both series are built with groupby and named explicitly, so the
# code does not rely on the name value_counts() assigns to its result, and the
# group keys stay integer-typed.
fraud_flag = df['fraud'].eq(1).astype('int64')
a = fraud_flag.groupby(df['merchantID']).sum().rename('count')  # fraud rows per merchant (0 if none)
b = df.groupby('merchantID').size().rename('total')             # all rows per merchant
c = pd.concat([a, b], axis=1)
c['mean'] = c['count'] / c['total']
c = c.drop('total', axis=1)
# NOTE(review): these statistics use the fraud labels of all rows, including
# the rows later selected by val_mask -- confirm this is intended.
merchant_nodes = c.sort_index()
merchant_nodes.head()

Unnamed: 0,count,mean
0,0.0,0.0
1,358.0,0.895
2,0.0,0.0
3,1472.0,0.993923
4,1634.0,0.968583


In [None]:
# Convert the merchant feature table (count, mean) into a float32 tensor.
merchant_features = merchant_nodes.astype(np.float32).to_numpy()
merchant_nodes = torch.from_numpy(merchant_features)
merchant_nodes

tensor([[0.0000e+00, 0.0000e+00],
        [3.5800e+02, 8.9500e-01],
        [0.0000e+00, 0.0000e+00],
        [1.4720e+03, 9.9392e-01],
        [1.6340e+03, 9.6858e-01],
        [2.9000e+02, 1.0000e+00],
        [0.0000e+00, 0.0000e+00],
        [1.6600e+02, 9.8225e-01],
        [1.1600e+02, 8.1119e-01],
        [2.8000e+02, 6.5728e-01],
        [5.1800e+02, 9.9234e-01],
        [2.2800e+02, 9.0837e-01],
        [1.7400e+02, 7.6991e-01],
        [0.0000e+00, 0.0000e+00],
        [1.5800e+02, 8.1026e-01],
        [6.4000e+01, 9.8462e-01],
        [0.0000e+00, 0.0000e+00],
        [1.2000e+02, 4.0541e-01],
        [1.7800e+02, 9.8343e-01],
        [1.2400e+02, 7.9487e-01],
        [0.0000e+00, 0.0000e+00],
        [2.0000e+02, 9.6618e-01],
        [0.0000e+00, 0.0000e+00],
        [1.8400e+02, 1.0000e+00],
        [2.1600e+02, 9.9083e-01],
        [1.6000e+01, 3.0769e-01],
        [9.2000e+01, 1.0000e+00],
        [0.0000e+00, 0.0000e+00],
        [1.0600e+02, 4.9533e-01],
        [1.960

In [None]:
# Making edges
# transaction and customers edge
# reset_index() turns the row index into a column, so after transposing
# row 0 holds the transaction (df row index) and row 1 the customerID.
customer_links = df['customerID'].reset_index()
t_c_edge = torch.from_numpy(customer_links.to_numpy().T)
t_c_edge

tensor([[    0,     1,     2,  ..., 25197, 25198, 25199],
        [    0,     1,     2,  ...,  3541,   856,  1788]])

In [None]:
# transaction and merchant edge
# Row 0 holds the transaction (df row index), row 1 the merchantID it was paid to.
merchant_links = df['merchantID'].reset_index()
t_m_edge = torch.from_numpy(merchant_links.to_numpy().T)
t_m_edge

tensor([[    0,     1,     2,  ..., 25197, 25198, 25199],
        [    0,     1,     2,  ...,     0,     0,    14]])

In [None]:
# transaction and category edge
# Row 0 holds the transaction (df row index), row 1 the categoryID of that transaction.
category_links = df['categoryID'].reset_index()
t_cat_edge = torch.from_numpy(category_links.to_numpy().T)
t_cat_edge

tensor([[    0,     1,     2,  ..., 25197, 25198, 25199],
        [    0,     1,     0,  ...,     0,     0,    10]])

In [None]:
# Making Labels
# The per-transaction fraud flag is the classification target.
labels = df['fraud']
y = torch.from_numpy(labels.to_numpy())
y.dtype

torch.int64

In [None]:
# downloading pytorch geometric
!pip install torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}+${CUDA}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}+${CUDA}.html
!pip install torch-geometric

In [None]:
#Imports
import argparse
import os.path as osp

import torch
import torch.nn.functional as F
from torch.nn import ReLU
from tqdm import tqdm

import torch_geometric
import torch_geometric.transforms as T
from torch_geometric.datasets import OGB_MAG
from torch_geometric.loader import HGTLoader, NeighborLoader
from torch_geometric.nn import Linear, SAGEConv, Sequential, to_hetero

In [None]:
# Making heterograph
from torch_geometric.data import HeteroData

data = HeteroData()

# nodes: one feature matrix per node type
node_features = {
    'transaction': txn,
    'customer': customer_node,
    'merchant': merchant_nodes,
    'category': category_nodes,
}
for node_type, feature_matrix in node_features.items():
    data[node_type].x = feature_matrix

# edges: every relation starts at a transaction node
edge_indices = {
    ('transaction', 'isMadeBy', 'customer'): t_c_edge,
    ('transaction', 'isMadeTo', 'merchant'): t_m_edge,
    ('transaction', 'has', 'category'): t_cat_edge,
}
for edge_type, edge_index in edge_indices.items():
    data[edge_type].edge_index = edge_index

# labels: only transaction nodes are classified
data['transaction'].y = y

In [None]:
# Make the graph undirected. Per the PyG documentation, ToUndirected adds a
# reverse edge type for each relation of a heterogeneous graph, so messages can
# also flow from customers, merchants and categories back to transactions.
to_undirected = T.ToUndirected()
data = to_undirected(data)

In [None]:
# Split the transactions into a training and a validation set by row position:
# the first 70% of the rows train the model, the remaining rows validate it.
# The sizes are derived from the graph instead of being hard-coded
# (25200 rows / 17640 training rows), so the masks stay valid if the CSV
# changes length.
num_transactions = data['transaction'].num_nodes
num_train = (num_transactions * 7) // 10  # integer arithmetic: 25200 -> 17640 exactly

# train_mask
train_mask = torch.zeros(num_transactions, dtype=torch.bool)
train_mask[:num_train] = True

# val_mask: every transaction that is not used for training
val_mask = ~train_mask

In [None]:
# Attach both split masks to the transaction node store of the graph.
txn_store = data['transaction']
txn_store.train_mask = train_mask
txn_store.val_mask = val_mask

In [None]:
# Prefer the GPU when one is visible; otherwise stay on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
# Only the 'x' and 'y' attributes are named in this call; edge indices are
# moved per batch inside the training / evaluation loops.
data = data.to(device, 'x', 'y')

In [None]:
# Seed nodes for neighbour sampling: the transaction nodes selected by each mask.
transaction_store = data['transaction']
train_input_nodes = ('transaction', transaction_store.train_mask)
val_input_nodes = ('transaction', transaction_store.val_mask)
# Options shared by the training and the validation loader.
kwargs = dict(batch_size=1024, num_workers=6, persistent_workers=True)

In [None]:
# Both loaders use the same sampling budget (num_neighbors=[10, 10]);
# only the training loader shuffles its seed nodes.
train_loader = NeighborLoader(
    data,
    num_neighbors=[10, 10],
    shuffle=True,
    input_nodes=train_input_nodes,
    **kwargs,
)
val_loader = NeighborLoader(
    data,
    num_neighbors=[10, 10],
    input_nodes=val_input_nodes,
    **kwargs,
)

  cpuset_checked))


In [None]:
# Homogeneous template network: two SAGEConv layers with 64 output channels,
# each followed by ReLU, and a linear head with 2 outputs (one per class).
# The -1 input sizes are lazy and get resolved on the first forward pass.
gnn_layers = [
    (SAGEConv((-1, -1), 64), 'x, edge_index -> x'),
    ReLU(inplace=True),
    (SAGEConv((-1, -1), 64), 'x, edge_index -> x'),
    ReLU(inplace=True),
    (Linear(-1, 2), 'x -> x'),
]
model = Sequential('x, edge_index', gnn_layers)

# Convert the template to a heterogeneous model using the graph's node and
# edge types; aggr='sum' is the aggregation passed to to_hetero.
model = to_hetero(model, data.metadata(), aggr='sum')
model = model.to(device)
model = model.float()

In [None]:
@torch.no_grad()
def init_params():
    """Forward one training batch through the model to initialize its lazy parameters.

    Uses the notebook-level ``train_loader``, ``device`` and ``model``.
    Gradient tracking is disabled for the call; the model output is discarded.
    """
    first_batch = next(iter(train_loader)).to(device, 'edge_index')
    model(first_batch.x_dict, first_batch.edge_index_dict)

In [None]:
def train():
    """Run one optimisation pass over ``train_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``train_loader`` and
    ``device``.

    Returns:
        The cross-entropy loss averaged over all seed transactions seen in
        this pass (each batch weighted by its number of seed nodes).
    """
    model.train()

    loss_sum = 0
    seen = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        batch = batch.to(device, 'edge_index')

        # Only the first `n_seed` transaction rows of a sampled batch are the
        # seed nodes; the loss is restricted to them.
        n_seed = batch['transaction'].batch_size
        logits = model(batch.x_dict, batch.edge_index_dict)['transaction'][:n_seed]
        targets = batch['transaction'].y[:n_seed]

        loss = F.cross_entropy(logits, targets)
        loss.backward()
        optimizer.step()

        seen += n_seed
        loss_sum += loss.item() * n_seed

    return loss_sum / seen

In [None]:
@torch.no_grad()
def test(loader):
    """Measure classification accuracy of the notebook-level ``model``.

    Args:
        loader: Iterable of sampled batches whose ``'transaction'`` store
            provides ``batch_size`` and ``y``.

    Returns:
        Fraction of seed transactions whose arg-max prediction equals the label.
    """
    model.eval()

    n_correct = 0
    n_seen = 0
    for batch in tqdm(loader):
        batch = batch.to(device, 'edge_index')

        # Score only the seed transactions of the batch.
        n_seed = batch['transaction'].batch_size
        scores = model(batch.x_dict, batch.edge_index_dict)['transaction'][:n_seed]
        hits = scores.argmax(dim=-1) == batch['transaction'].y[:n_seed]

        n_seen += n_seed
        n_correct += int(hits.sum())

    return n_correct / n_seen

In [None]:
# The lazy layers must be initialized before their parameters are handed to
# the optimizer.
init_params()  # Initialize parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train for 20 epochs and report the validation accuracy after each one.
num_epochs = 20
for epoch in range(1, num_epochs + 1):
    loss = train()
    val_acc = test(val_loader)
    report = f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Val: {val_acc:.4f}'
    print(report)

100%|██████████| 18/18 [00:01<00:00, 10.57it/s]
100%|██████████| 8/8 [00:00<00:00, 24.57it/s]


Epoch: 01, Loss: 0.4058, Val: 0.7713


100%|██████████| 18/18 [00:01<00:00, 15.53it/s]
100%|██████████| 8/8 [00:00<00:00, 24.16it/s]


Epoch: 02, Loss: 0.1688, Val: 0.9694


100%|██████████| 18/18 [00:01<00:00, 15.32it/s]
100%|██████████| 8/8 [00:00<00:00, 22.91it/s]


Epoch: 03, Loss: 0.1925, Val: 0.9714


100%|██████████| 18/18 [00:01<00:00, 15.18it/s]
100%|██████████| 8/8 [00:00<00:00, 23.46it/s]


Epoch: 04, Loss: 0.1948, Val: 0.9509


100%|██████████| 18/18 [00:01<00:00, 14.87it/s]
100%|██████████| 8/8 [00:00<00:00, 22.66it/s]


Epoch: 05, Loss: 0.1019, Val: 0.9499


100%|██████████| 18/18 [00:01<00:00, 14.98it/s]
100%|██████████| 8/8 [00:00<00:00, 23.33it/s]


Epoch: 06, Loss: 0.1052, Val: 0.9672


100%|██████████| 18/18 [00:01<00:00, 15.08it/s]
100%|██████████| 8/8 [00:00<00:00, 24.00it/s]


Epoch: 07, Loss: 0.0701, Val: 0.9679


100%|██████████| 18/18 [00:01<00:00, 15.47it/s]
100%|██████████| 8/8 [00:00<00:00, 23.96it/s]


Epoch: 08, Loss: 0.0625, Val: 0.7286


100%|██████████| 18/18 [00:01<00:00, 15.53it/s]
100%|██████████| 8/8 [00:00<00:00, 22.75it/s]


Epoch: 09, Loss: 0.1364, Val: 0.9728


100%|██████████| 18/18 [00:01<00:00, 15.32it/s]
100%|██████████| 8/8 [00:00<00:00, 24.03it/s]


Epoch: 10, Loss: 0.1390, Val: 0.9705


100%|██████████| 18/18 [00:01<00:00, 15.57it/s]
100%|██████████| 8/8 [00:00<00:00, 25.16it/s]


Epoch: 11, Loss: 0.1213, Val: 0.9731


100%|██████████| 18/18 [00:01<00:00, 14.40it/s]
100%|██████████| 8/8 [00:00<00:00, 24.14it/s]


Epoch: 12, Loss: 0.1079, Val: 0.9643


100%|██████████| 18/18 [00:01<00:00, 14.88it/s]
100%|██████████| 8/8 [00:00<00:00, 24.12it/s]


Epoch: 13, Loss: 0.1252, Val: 0.9671


100%|██████████| 18/18 [00:01<00:00, 15.06it/s]
100%|██████████| 8/8 [00:00<00:00, 24.08it/s]


Epoch: 14, Loss: 0.2578, Val: 0.9448


100%|██████████| 18/18 [00:01<00:00, 15.63it/s]
100%|██████████| 8/8 [00:00<00:00, 24.23it/s]


Epoch: 15, Loss: 0.1473, Val: 0.9440


100%|██████████| 18/18 [00:01<00:00, 15.47it/s]
100%|██████████| 8/8 [00:00<00:00, 23.82it/s]


Epoch: 16, Loss: 0.1277, Val: 0.9418


100%|██████████| 18/18 [00:01<00:00, 15.60it/s]
100%|██████████| 8/8 [00:00<00:00, 22.79it/s]


Epoch: 17, Loss: 0.1247, Val: 0.9455


100%|██████████| 18/18 [00:01<00:00, 15.50it/s]
100%|██████████| 8/8 [00:00<00:00, 24.23it/s]


Epoch: 18, Loss: 0.1159, Val: 0.9483


100%|██████████| 18/18 [00:01<00:00, 15.23it/s]
100%|██████████| 8/8 [00:00<00:00, 22.03it/s]


Epoch: 19, Loss: 0.1268, Val: 0.9491


100%|██████████| 18/18 [00:01<00:00, 15.12it/s]
100%|██████████| 8/8 [00:00<00:00, 24.40it/s]

Epoch: 20, Loss: 0.1136, Val: 0.9481



