# Modeling

In [4]:
import os
import sys
import glob

os.environ['DGLBACKEND'] = 'pytorch'

import torch as th
import dgl
import numpy as np

from gnn.estimator_fns import *
from gnn.graph_utils import *
from gnn.data import *
from gnn.utils import *
from gnn.pytorch_model import *
from train import *

### Load data

In [5]:
# Collect every edge-list CSV under ./data. glob returns matches in arbitrary
# order, so the list is sorted to keep the result stable between runs.
file_list = sorted(glob.glob('./data/*edgelist.csv'))

# Comma-separated file names (directory part stripped) of the edge lists whose
# path contains "relation". os.path.basename is used rather than splitting on
# "/" so the directory is removed whatever separator the platform reports.
edges = ",".join(os.path.basename(path) for path in file_list if "relation" in path)

### Generate graph

In [6]:
# Report the versions of the numerical stack used for this run.
library_versions = [module.__version__ for module in (np, th, dgl)]
print('numpy version:{} PyTorch version:{} DGL version:{}'.format(*library_versions))

# Build the run configuration and display it. parse_args is not defined in this
# notebook; it presumably comes from one of the wildcard imports above.
args = parse_args()
print(args)

numpy version:1.20.3 PyTorch version:1.10.1 DGL version:0.7.2
Namespace(training_dir='./data', model_dir='./model/2022_01_24_17_24_08', output_dir='./output', nodes='features.csv', target_ntype='TransactionID', edges='relation*', labels='tags.csv', new_accounts='test.csv', compute_metrics=True, threshold=0, num_gpus=0, optimizer='adam', lr=0.01, n_epochs=1000, n_hidden=32, n_layers=6, weight_decay=0.0005, dropout=0.2, embedding_size=360)


In [7]:
# Resolve the edge-list files matching 'relation*' inside the training directory.
# This is the only assignment to args.edges that matters: the previous
# `args.edges = edges` was overwritten here before anything read it, so it has
# been dropped.
args.edges = get_edgelists('relation*', args.training_dir)

# Build the graph from the edge lists and the node-feature file. Alongside the
# graph, construct_graph returns the feature array and two id-to-node lookups
# (one for the target node type, one general) -- their exact structure is
# defined by the helper, not visible here.
g, features, target_id_to_node, id_to_node = construct_graph(args.training_dir,
                                                             args.edges,
                                                             args.nodes,
                                                             args.target_ntype)

# Convert the numpy feature array to a tensor and normalize it. The returned
# mean and stdev are kept because they are stored with the model when it is saved.
mean, stdev, features = normalize(th.from_numpy(features))

print('feature mean shape:{}, std shape:{}'.format(mean.shape, stdev.shape))

Getting relation graphs from the following edge lists : ['relation_addr1_edgelist.csv', 'relation_addr2_edgelist.csv', 'relation_card1_edgelist.csv', 'relation_card2_edgelist.csv', 'relation_card3_edgelist.csv', 'relation_card4_edgelist.csv', 'relation_card5_edgelist.csv', 'relation_card6_edgelist.csv', 'relation_DeviceInfo_edgelist.csv', 'relation_DeviceType_edgelist.csv', 'relation_id_01_edgelist.csv', 'relation_id_02_edgelist.csv', 'relation_id_03_edgelist.csv', 'relation_id_04_edgelist.csv', 'relation_id_05_edgelist.csv', 'relation_id_06_edgelist.csv', 'relation_id_07_edgelist.csv', 'relation_id_08_edgelist.csv', 'relation_id_09_edgelist.csv', 'relation_id_10_edgelist.csv', 'relation_id_11_edgelist.csv', 'relation_id_12_edgelist.csv', 'relation_id_13_edgelist.csv', 'relation_id_14_edgelist.csv', 'relation_id_15_edgelist.csv', 'relation_id_16_edgelist.csv', 'relation_id_17_edgelist.csv', 'relation_id_18_edgelist.csv', 'relation_id_19_edgelist.csv', 'relation_id_20_edgelist.csv', 're

feature mean shape:torch.Size([390]), std shape:torch.Size([390])


In [8]:
# Attach the feature tensor to the 'target' nodes of the graph.
g.nodes['target'].data['features'] = features

print("Getting labels")
n_target_nodes = g.number_of_nodes('target')
labels_path = os.path.join(args.training_dir, args.labels)
new_accounts_path = os.path.join(args.training_dir, args.new_accounts)

# get_labels returns three values; the middle one is not used in this notebook.
labels, _, test_mask = get_labels(target_id_to_node,
                                  n_target_nodes,
                                  args.target_ntype,
                                  labels_path,
                                  new_accounts_path)
print("Got labels")

# Both arrays arrive as numpy arrays and are converted to float tensors.
labels = th.from_numpy(labels).float()
test_mask = th.from_numpy(test_mask).float()

# Totals over every node type and every edge type in the heterogeneous graph.
node_counts = [g.number_of_nodes(n_type) for n_type in g.ntypes]
edge_counts = [g.number_of_edges(e_type) for e_type in g.etypes]
n_nodes = th.tensor(node_counts).sum()
n_edges = th.tensor(edge_counts).sum()

stats_template = """----Data statistics------'
            #Nodes: {}
            #Edges: {}
            #Features Shape: {}
            #Labeled Test samples: {}"""
print(stats_template.format(n_nodes, n_edges, features.shape, test_mask.sum()))

Getting labels
Got labels
----Data statistics------'
            #Nodes: 726345
            #Edges: 19518802
            #Features Shape: torch.Size([590540, 390])
            #Labeled Test samples: 118108.0


### Training

In [9]:
# Use the first GPU when any GPUs were requested, otherwise fall back to the CPU.
cuda = bool(args.num_gpus)
device = th.device('cuda:0') if cuda else th.device('cpu')

In [None]:
print("Initializing Model")
in_feats = features.shape[1]  # width of the per-node feature vector
n_classes = 2                 # two output classes

# Number of nodes per node type, as required by get_model.
ntype_dict = {n_type: g.number_of_nodes(n_type) for n_type in g.ntypes}

model = get_model(ntype_dict, g.etypes, vars(args), in_feats, n_classes, device)
print("Initialized Model")

# Move the tensors used during training onto the selected device.
features = features.to(device)
# CrossEntropyLoss takes class-index targets, so labels are cast to int64.
labels = labels.long().to(device)
test_mask = test_mask.to(device)
# NOTE(review): the graph itself is not moved to `device` here -- confirm that
# train_fg handles graph placement before running with num_gpus > 0.

loss = th.nn.CrossEntropyLoss()
optim = th.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

print("Starting Model training")

# Project helper with no arguments or return value used here; presumably resets
# the training log -- TODO confirm.
initial_record()

# The same graph is passed for both graph arguments of train_fg.
model, class_preds, pred_proba = train_fg(model, optim, loss, features, labels, g, g,
                                          test_mask, device, args.n_epochs,
                                          args.threshold, args.compute_metrics)
print("Finished Model training")

print("Saving model")

# exist_ok=True creates the directory if needed without a separate existence
# check, so a directory that appears in between cannot make this fail.
os.makedirs(args.model_dir, exist_ok=True)

# Store the model with the graph, the id-to-node mapping and the feature
# normalization statistics.
save_model(g, model, args.model_dir, id_to_node, mean, stdev)
print("Model and metadata saved")

Initializing Model
Initialized Model
Starting Model training
Epoch 00000, Time(s) 2019.3499, Loss 0.5230, F1 0.0000 
Epoch 00001, Time(s) 2408.0788, Loss 17.7347, F1 0.1790 
Epoch 00002, Time(s) 2036.9546, Loss 0.2719, F1 0.0000 
Epoch 00003, Time(s) 2106.3953, Loss 0.4772, F1 0.0000 
Epoch 00004, Time(s) 2024.7838, Loss 0.2795, F1 0.0000 
Epoch 00005, Time(s) 2075.7785, Loss 0.2205, F1 0.0000 
Epoch 00006, Time(s) 2099.2977, Loss 0.1873, F1 0.0000 
Epoch 00007, Time(s) 2134.7374, Loss 0.1464, F1 0.1388 
Epoch 00008, Time(s) 2146.4198, Loss 1.4278, F1 0.0000 
Epoch 00009, Time(s) 2140.8130, Loss 0.2177, F1 0.0000 
Epoch 00010, Time(s) 2187.4841, Loss 0.2550, F1 0.0000 
Epoch 00011, Time(s) 2259.3356, Loss 0.2419, F1 0.0000 
Epoch 00012, Time(s) 2241.1635, Loss 0.2051, F1 0.0000 
