In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
cur_path = "/content/drive/MyDrive/graphfrauddetectionworking"
os.chdir(cur_path)
!pwd

/content/drive/MyDrive/graphfrauddetectionworking


In [None]:
!pip install dgl

Collecting dgl
  Downloading dgl-0.6.1-cp37-cp37m-manylinux1_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 36.1 MB/s 
Installing collected packages: dgl
Successfully installed dgl-0.6.1


In [None]:
pip install gnn

Collecting gnn
  Downloading gnn-1.1.9-py3-none-any.whl (11 kB)
Installing collected packages: gnn
Successfully installed gnn-1.1.9


In [None]:
import numpy as np
import pandas as pd
import torch, torchvision.models

In [None]:
import os
import sys
import glob

os.environ['DGLBACKEND'] = 'pytorch'

import torch as th
import dgl
import numpy as np

from estimator_fns import *

Using backend: pytorch


In [None]:
from train import *

DLG version: 0.6.1


In [None]:
def load_train_graph_info(file_path):

    with open(file_path, 'rb') as f:
        info_dict = pickle.load(f)

    etypes = [can_etype for src_type, can_etype, dst_type in info_dict['etypes']]
    ntype_dict = info_dict['ntype_cnt']

    global TARGET_FEAT_MEAN
    TARGET_FEAT_MEAN = info_dict['feat_mean']
    global TARGET_FEAT_STD
    TARGET_FEAT_STD = info_dict['feat_std']

    return etypes, ntype_dict

In [None]:
INPUT_SIZE = 390
HIDDEN_SIZE = 16
N_LAYERS = 2
OUT_SIZE = 2
EMBEDDING_SIZE = 390
TARGET_FEAT_MEAN = None
TARGET_FEAT_STD = None      

In [None]:
def initialize_arguments(metadata_file):

    etypes, ntype_dict = load_train_graph_info(metadata_file)

    input_size = INPUT_SIZE
    hidden_size = HIDDEN_SIZE
    n_layers = N_LAYERS
    out_size = OUT_SIZE
    embedding_size = EMBEDDING_SIZE

    return ntype_dict, etypes, input_size, hidden_size, out_size, n_layers, embedding_size

In [None]:
class HeteroRGCNLayer(nn.Module):
    def __init__(self, in_size, out_size, etypes):
        super(HeteroRGCNLayer, self).__init__()
        # W_r for each relation
        self.weight = nn.ModuleDict({
                name: nn.Linear(in_size, out_size) for name in etypes
            })

    def forward(self, G, feat_dict):
        # The input is a dictionary of node features for each type
        funcs = {}
        for srctype, etype, dsttype in G.canonical_etypes:
            # Compute W_r * h
            if srctype in feat_dict:
                Wh = self.weight[etype](feat_dict[srctype])
                # Save it in graph for message passing
                G.nodes[srctype].data['Wh_%s' % etype] = Wh
                # Specify per-relation message passing functions: (message_func, reduce_func).
                funcs[etype] = (fn.copy_u('Wh_%s' % etype, 'm'), fn.mean('m', 'h'))
        # Trigger message passing of multiple types.
        G.multi_update_all(funcs, 'sum')
        # return the updated node feature dictionary
        return {ntype: G.nodes[ntype].data['h'] for ntype in G.ntypes if 'h' in G.nodes[ntype].data}

In [None]:
class HeteroRGCN(nn.Module):
    def __init__(self, ntype_dict, etypes, in_size, hidden_size, out_size, n_layers, embedding_size):
        super(HeteroRGCN, self).__init__()
        # Use trainable node embeddings as featureless inputs.
        embed_dict = {ntype: nn.Parameter(th.Tensor(num_nodes, in_size))
                      for ntype, num_nodes in ntype_dict.items() if ntype != 'target'}
        for key, embed in embed_dict.items():
            nn.init.xavier_uniform_(embed)
        self.embed = nn.ParameterDict(embed_dict)
        # create layers
        self.layers = nn.ModuleList()
        self.layers.append(HeteroRGCNLayer(embedding_size, hidden_size, etypes))
        # hidden layers
        for i in range(n_layers - 1):
            self.layers.append(HeteroRGCNLayer(hidden_size, hidden_size, etypes))

        # output layer
        self.layers.append(nn.Linear(hidden_size, out_size))

    def forward(self, g, features):

        # To use in real-time case, need to set embedding with input embeddings that are extracted from GrahpDB.
        h_dict = self.embed
        #h_dict = features

        # pass through all layers
        for i, layer in enumerate(self.layers[:-1]):
            if i != 0:
                h_dict = {k: F.leaky_relu(h) for k, h in h_dict.items()}
            h_dict = layer(g, h_dict)

        # get user binary logits
        bin_logist = self.layers[-1](h_dict['target'])

        # compute softmax value of binary logits
        softmax_logits = bin_logist.softmax(dim=-1)

        # return the probability to be One
        return softmax_logits

In [None]:
num_nodes={'DeviceInfo': 1200, 'DeviceType': 2, 'P_emaildomain': 59, 'ProductCD': 5, 'R_emaildomain': 60, 'addr1': 206, 'addr2': 46, 'card1': 8816, 'card2': 500, 'card3': 86, 'card4': 4, 'card5': 87, 'card6': 4, 'id_01': 47, 'id_02': 32065, 'id_03': 21, 'id_04': 15, 'id_05': 82, 'id_06': 95, 'id_07': 72, 'id_08': 85, 'id_09': 34, 'id_10': 50, 'id_11': 229, 'id_12': 2, 'id_13': 41, 'id_14': 21, 'id_15': 3, 'id_16': 2, 'id_17': 83, 'id_18': 13, 'id_19': 452, 'id_20': 281, 'id_21': 232, 'id_22': 18, 'id_23': 3, 'id_24': 10, 'id_25': 196, 'id_26': 72, 'id_27': 2, 'id_28': 2, 'id_29': 2, 'id_30': 72, 'id_31': 116, 'id_32': 4, 'id_33': 157, 'id_34': 4, 'id_35': 2, 'id_36': 2, 'id_37': 2, 'id_38': 2, 'target': 141730}

In [None]:
ntype_dict, etypes, in_size, hidden_size, out_size, n_layers, embedding_size = \
initialize_arguments('/content/drive/MyDrive/graphfrauddetectionworking/model/2021_08_10_00_00_51/metadata.pkl')

rgcn_model = HeteroRGCN(num_nodes, etypes, in_size, hidden_size, out_size, n_layers, embedding_size)

path = '/content/drive/MyDrive/graphfrauddetectionworking/model/2021_08_10_00_00_51/model.pth'

torch.save(rgcn_model.state_dict(), path) 
rgcn_model.load_state_dict(torch.load(path))

<All keys matched successfully>

In [None]:
rgcn_model.eval()

HeteroRGCN(
  (embed): ParameterDict(
      (DeviceInfo): Parameter containing: [torch.FloatTensor of size 1200x390]
      (DeviceType): Parameter containing: [torch.FloatTensor of size 2x390]
      (P_emaildomain): Parameter containing: [torch.FloatTensor of size 59x390]
      (ProductCD): Parameter containing: [torch.FloatTensor of size 5x390]
      (R_emaildomain): Parameter containing: [torch.FloatTensor of size 60x390]
      (addr1): Parameter containing: [torch.FloatTensor of size 206x390]
      (addr2): Parameter containing: [torch.FloatTensor of size 46x390]
      (card1): Parameter containing: [torch.FloatTensor of size 8816x390]
      (card2): Parameter containing: [torch.FloatTensor of size 500x390]
      (card3): Parameter containing: [torch.FloatTensor of size 86x390]
      (card4): Parameter containing: [torch.FloatTensor of size 4x390]
      (card5): Parameter containing: [torch.FloatTensor of size 87x390]
      (card6): Parameter containing: [torch.FloatTensor of size 4

In [None]:
test_df = pd.read_csv('./data/test_2.csv')

In [None]:
test_df.head(5)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,3146209,0,3349504,57.95,W,12648,317.0,150.0,visa,226.0,debit,264.0,87.0,288.0,,yahoo.com,,41.0,33.0,0.0,0.0,29.0,20.0,0.0,0.0,19.0,0.0,21.0,0.0,181.0,40.0,11.0,11.0,11.0,475.0,39.0,,,,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,107.949997,0.0,0.0,107.949997,0.0,0.0,107.949997,107.949997,107.949997,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,
1,3563707,0,15346124,39.0,W,12577,268.0,150.0,visa,166.0,debit,272.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,
2,3299439,0,7773774,108.51,W,18018,452.0,150.0,mastercard,117.0,debit,110.0,87.0,,,gmail.com,,7.0,8.0,0.0,0.0,4.0,6.0,0.0,0.0,4.0,0.0,4.0,0.0,13.0,7.0,157.0,157.0,70.0,0.0,,,,,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,
3,2996670,0,282096,59.0,W,9500,321.0,150.0,visa,226.0,debit,420.0,87.0,,,bellsouth.net,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,
4,3289863,0,7496722,97.0,W,6550,,150.0,visa,226.0,debit,299.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,


In [None]:
id_cols = ['card1','card2','card3','card4','card5','card6','ProductCD','addr1','addr2','P_emaildomain','R_emaildomain']
cat_cols = ['M1','M2','M3','M4','M5','M6','M7','M8','M9']

In [None]:
non_feature_cols = ['isFraud', 'TransactionDT'] + id_cols
print(non_feature_cols)

['isFraud', 'TransactionDT', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'ProductCD', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain']


In [None]:
print(list(test_df.columns))

['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V

In [None]:
feature_cols = [col for col in test_df.columns if col not in non_feature_cols]
print(feature_cols)

['TransactionID', 'TransactionAmt', 'dist1', 'dist2', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V88', 'V89', 'V90', 'V91', 'V92', 'V93', 'V94', 'V95', 'V96', 'V97', 'V98', 'V99', 'V100', 'V101', 'V102',

In [None]:
features = pd.get_dummies(test_df[feature_cols], columns=cat_cols).fillna(0)
features['TransactionAmt'] = features['TransactionAmt'].apply(np.log10)

In [None]:
features.shape

(141730, 391)

In [None]:
print(list(features.columns))

['TransactionID', 'TransactionAmt', 'dist1', 'dist2', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V88', 'V89', 'V90', 'V91', 'V92', 'V93', 'V94', 'V95', 'V96', 'V97', 'V98', 'V99', 'V100', 'V101', 'V102', 'V103', 'V104', 'V105', 'V106', 'V107', 'V108', 'V109

In [None]:
features.head(5)

Unnamed: 0,TransactionID,TransactionAmt,dist1,dist2,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,V1,V2,V3,V4,V5,V6,V7,...,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,M1_F,M1_T,M2_F,M2_T,M3_F,M3_T,M4_M0,M4_M1,M4_M2,M5_F,M5_T,M6_F,M6_T,M7_F,M7_T,M8_F,M8_T,M9_F,M9_T
0,3146209,1.763053,288.0,0.0,41.0,33.0,0.0,0.0,29.0,20.0,0.0,0.0,19.0,0.0,21.0,0.0,181.0,40.0,11.0,11.0,11.0,475.0,39.0,0.0,0.0,0.0,0.0,0.0,453.0,0.0,0.0,0.0,507.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
1,3563707,1.591065,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0
2,3299439,2.03547,0.0,0.0,7.0,8.0,0.0,0.0,4.0,6.0,0.0,0.0,4.0,0.0,4.0,0.0,13.0,7.0,157.0,157.0,70.0,0.0,0.0,0.0,0.0,0.0,0.0,70.0,70.0,0.0,0.0,0.0,157.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0,1,0,0,0,0,0,1,0,1,0,0,1,0,1
3,2996670,1.770852,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0
4,3289863,1.986772,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0


In [None]:
features.to_csv('./prediction/features.csv', index=False, header=False)

In [None]:
test_df[['TransactionID', 'isFraud']].to_csv('./prediction/tags.csv', index=False)

In [None]:
identity_df = pd.read_csv('./ieee-data/train_identity.csv')

In [None]:
edge_types = id_cols + list(identity_df.columns)
print(edge_types)

['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'ProductCD', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 'TransactionID', 'id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_07', 'id_08', 'id_09', 'id_10', 'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']


In [None]:
all_id_cols = ['TransactionID'] + id_cols
full_identity_df = test_df[all_id_cols].merge(identity_df, on='TransactionID', how='left')
full_identity_df.head(5)

Unnamed: 0,TransactionID,card1,card2,card3,card4,card5,card6,ProductCD,addr1,addr2,P_emaildomain,R_emaildomain,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,3146209,12648,317.0,150.0,visa,226.0,debit,W,264.0,87.0,yahoo.com,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,3563707,12577,268.0,150.0,visa,166.0,debit,W,272.0,87.0,gmail.com,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,3299439,18018,452.0,150.0,mastercard,117.0,debit,W,110.0,87.0,gmail.com,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2996670,9500,321.0,150.0,visa,226.0,debit,W,420.0,87.0,bellsouth.net,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,3289863,6550,,150.0,visa,226.0,debit,W,299.0,87.0,gmail.com,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
edges = {}
for etype in edge_types:
    edgelist = full_identity_df[['TransactionID', etype]].dropna()
    edgelist.to_csv('./prediction/relation_{}_edgelist.csv'.format(etype), index=False, header=True)
    edges[etype] = edgelist

print(edges)

2021-12-17 08:05:29,874 INFO numexpr.utils: NumExpr defaulting to 2 threads.


{'card1':         TransactionID  card1
0             3146209  12648
1             3563707  12577
2             3299439  18018
3             2996670   9500
4             3289863   6550
...               ...    ...
141725        3178740  10112
141726        3035015  16393
141727        3288897  12839
141728        3272637   8131
141729        3275329  12695

[141730 rows x 2 columns], 'card2':         TransactionID  card2
0             3146209  317.0
1             3563707  268.0
2             3299439  452.0
3             2996670  321.0
5             3048088  106.0
...               ...    ...
141725        3178740  360.0
141726        3035015  177.0
141727        3288897  321.0
141728        3272637  583.0
141729        3275329  490.0

[139504 rows x 2 columns], 'card3':         TransactionID  card3
0             3146209  150.0
1             3563707  150.0
2             3299439  150.0
3             2996670  150.0
4             3289863  150.0
...               ...    ...
141725        317

In [None]:
import glob

file_list = glob.glob('./prediction/*edgelist.csv')

edges_files = ",".join(map(lambda x: x.split("/")[-1], [file for file in file_list if "relation" in file]))

edges_full = ''
for etype in edge_types:
    edges_full += ',relation_{}_edgelist.csv'.format(etype)

In [None]:
edges_files

'relation_card1_edgelist.csv,relation_card2_edgelist.csv,relation_ProductCD_edgelist.csv,relation_addr1_edgelist.csv,relation_card3_edgelist.csv,relation_card5_edgelist.csv,relation_card4_edgelist.csv,relation_addr2_edgelist.csv,relation_card6_edgelist.csv,relation_R_emaildomain_edgelist.csv,relation_P_emaildomain_edgelist.csv,relation_id_03_edgelist.csv,relation_id_01_edgelist.csv,relation_id_04_edgelist.csv,relation_TransactionID_edgelist.csv,relation_id_05_edgelist.csv,relation_id_08_edgelist.csv,relation_id_07_edgelist.csv,relation_id_10_edgelist.csv,relation_id_11_edgelist.csv,relation_id_09_edgelist.csv,relation_id_02_edgelist.csv,relation_id_13_edgelist.csv,relation_id_12_edgelist.csv,relation_id_06_edgelist.csv,relation_id_14_edgelist.csv,relation_id_17_edgelist.csv,relation_id_18_edgelist.csv,relation_id_15_edgelist.csv,relation_id_19_edgelist.csv,relation_id_16_edgelist.csv,relation_id_21_edgelist.csv,relation_id_20_edgelist.csv,relation_id_22_edgelist.csv,relation_id_24_edge

In [None]:
file_list = glob.glob('./prediction/*edgelist.csv')

edges = ",".join(map(lambda x: x.split("/")[-1], [file for file in file_list if "relation" in file]))

In [None]:
args = parse_args()
print(args)

Namespace(aggregator_type='gcn', compute_metrics=True, dropout=0.5, edges='relation*', embedding_size=360, labels='tags.csv', lr=0.01, model_dir='./model/2021_12_17_08_05_47', n_epochs=300, n_hidden=64, n_layers=3, new_accounts='test.csv', new_accounts_test='test2.csv', nodes='features.csv', num_gpus=0, optimizer='adam', output_dir='./output', target_ntype='TransactionID', test_dir='./prediction', threshold=0, training_dir='./data', weight_decay=0.0005)


In [None]:
test_dir = './prediction'

args.edges = edges

args.edges = get_edgelists('relation*', test_dir)

g, features2, target_id_to_node, id_to_node = construct_graph(test_dir,
                                                                args.edges,
                                                                args.nodes,
                                                                args.target_ntype)


Getting relation graphs from the following edge lists : ['relation_card1_edgelist.csv', 'relation_card2_edgelist.csv', 'relation_ProductCD_edgelist.csv', 'relation_addr1_edgelist.csv', 'relation_card3_edgelist.csv', 'relation_card5_edgelist.csv', 'relation_card4_edgelist.csv', 'relation_addr2_edgelist.csv', 'relation_card6_edgelist.csv', 'relation_R_emaildomain_edgelist.csv', 'relation_P_emaildomain_edgelist.csv', 'relation_id_03_edgelist.csv', 'relation_id_01_edgelist.csv', 'relation_id_04_edgelist.csv', 'relation_TransactionID_edgelist.csv', 'relation_id_05_edgelist.csv', 'relation_id_08_edgelist.csv', 'relation_id_07_edgelist.csv', 'relation_id_10_edgelist.csv', 'relation_id_11_edgelist.csv', 'relation_id_09_edgelist.csv', 'relation_id_02_edgelist.csv', 'relation_id_13_edgelist.csv', 'relation_id_12_edgelist.csv', 'relation_id_06_edgelist.csv', 'relation_id_14_edgelist.csv', 'relation_id_17_edgelist.csv', 'relation_id_18_edgelist.csv', 'relation_id_15_edgelist.csv', 'relation_id_19_

In [None]:
num_nodes={'DeviceInfo': 1200, 'DeviceType': 2, 'P_emaildomain': 59, 'ProductCD': 5, 'R_emaildomain': 60, 'addr1': 206, 'addr2': 46, 'card1': 8816, 'card2': 500, 'card3': 86, 'card4': 4, 'card5': 87, 'card6': 4, 'id_01': 47, 'id_02': 32065, 'id_03': 21, 'id_04': 15, 'id_05': 82, 'id_06': 95, 'id_07': 72, 'id_08': 85, 'id_09': 34, 'id_10': 50, 'id_11': 229, 'id_12': 2, 'id_13': 41, 'id_14': 21, 'id_15': 3, 'id_16': 2, 'id_17': 83, 'id_18': 13, 'id_19': 452, 'id_20': 281, 'id_21': 232, 'id_22': 18, 'id_23': 3, 'id_24': 10, 'id_25': 196, 'id_26': 72, 'id_27': 2, 'id_28': 2, 'id_29': 2, 'id_30': 72, 'id_31': 116, 'id_32': 4, 'id_33': 157, 'id_34': 4, 'id_35': 2, 'id_36': 2, 'id_37': 2, 'id_38': 2, 'target': 141730}

In [None]:
mean, stdev, features2 = normalize(th.from_numpy(features2))

print('feature mean shape:{}, std shape:{}'.format(mean.shape, stdev.shape))

feature mean shape:torch.Size([390]), std shape:torch.Size([390])


In [None]:
g.nodes['target'].data['features'] = features2

n_nodes = g.number_of_nodes('target')

n_nodes = th.sum(th.tensor([g.number_of_nodes(n_type) for n_type in g.ntypes]))
n_edges = th.sum(th.tensor([g.number_of_edges(e_type) for e_type in g.etypes]))

print("""----Data statistics------'
            #Nodes: {}
            #Edges: {}
            #Features Shape: {}""".format(n_nodes, n_edges, features.shape))

----Data statistics------'
            #Nodes: 187396
            #Edges: 4683632
            #Features Shape: (141730, 391)


In [None]:
if args.num_gpus:
    cuda = True
    device = th.device('cuda:0')
else:
    cuda = False
    device = th.device('cpu')

In [None]:
pip install pixiedust

Collecting pixiedust
  Downloading pixiedust-1.1.19.tar.gz (197 kB)
[?25l[K     |█▋                              | 10 kB 23.6 MB/s eta 0:00:01[K     |███▎                            | 20 kB 28.6 MB/s eta 0:00:01[K     |█████                           | 30 kB 32.7 MB/s eta 0:00:01[K     |██████▋                         | 40 kB 33.4 MB/s eta 0:00:01[K     |████████▎                       | 51 kB 35.0 MB/s eta 0:00:01[K     |██████████                      | 61 kB 37.6 MB/s eta 0:00:01[K     |███████████▋                    | 71 kB 34.1 MB/s eta 0:00:01[K     |█████████████▎                  | 81 kB 33.6 MB/s eta 0:00:01[K     |███████████████                 | 92 kB 35.3 MB/s eta 0:00:01[K     |████████████████▋               | 102 kB 30.6 MB/s eta 0:00:01[K     |██████████████████▎             | 112 kB 30.6 MB/s eta 0:00:01[K     |████████████████████            | 122 kB 30.6 MB/s eta 0:00:01[K     |█████████████████████▋          | 133 kB 30.6 MB/s eta 0:00:0

In [None]:
import pixiedust

Pixiedust database opened successfully
Table VERSION_TRACKER created successfully
Table METRICS_TRACKER created successfully

Share anonymous install statistics? (opt-out instructions)

PixieDust will record metadata on its environment the next time the package is installed or updated. The data is anonymized and aggregated to help plan for future releases, and records only the following values:

{
   "data_sent": currentDate,
   "runtime": "python",
   "application_version": currentPixiedustVersion,
   "space_id": nonIdentifyingUniqueId,
   "config": {
       "repository_id": "https://github.com/ibm-watson-data-lab/pixiedust",
       "target_runtimes": ["Data Science Experience"],
       "event_id": "web",
       "event_organizer": "dev-journeys"
   }
}
You can opt out by calling pixiedust.optOut() in a new cell.


[31mPixiedust runtime updated. Please restart kernel[0m
Table USER_PREFERENCES created successfully
Table service_connections created successfully


In [None]:
!jupyter pixiedust list


Available kernels:
  ir         /usr/local/share/jupyter/kernels/ir
  python2    /usr/local/share/jupyter/kernels/python2
  python3    /usr/local/share/jupyter/kernels/python3


In [None]:
!jupyter pixiedust installyyyy

Invalid syntax: You need to specify a subcommand from this list ['install', 'list', 'generate']

Subcommands
Subcommands are launched as `application cmd [args]`. For information on using
subcommand 'cmd', do: `application cmd -h`.

install
    Install Kernels locally for development with Pixiedust
list
    List of pixiedust kernels
generate
    Generate boiler plate code for a PixieDust plugin



In [None]:
import pdb

In [None]:
%xmode Verbose

Exception reporting mode: Verbose


In [None]:
rgcn_model(g, features2)

tensor([[0.4600, 0.5400],
        [0.4580, 0.5420],
        [0.4572, 0.5428],
        ...,
        [0.4582, 0.5418],
        [0.4608, 0.5392],
        [0.4559, 0.5441]], grad_fn=<SoftmaxBackward0>)

In [None]:
rgcn_model.eval()
predictions = rgcn_model(g, features2)
preds = th.argmax(predictions, axis=1).numpy()

In [None]:
predictions

tensor([[0.4673, 0.5327],
        [0.4670, 0.5330],
        [0.4655, 0.5345],
        ...,
        [0.4670, 0.5330],
        [0.4777, 0.5223],
        [0.4559, 0.5441]], grad_fn=<SoftmaxBackward0>)

In [None]:
preds

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
TransactionID = test_df['TransactionID']
Pred = list(preds)

In [None]:
pred_df = pd.DataFrame(TransactionID)
pred_df['Prediction'] = Pred

In [None]:
pred_df.head(5)

Unnamed: 0,TransactionID,Prediction
0,3146209,1
1,3563707,1
2,3299439,1
3,2996670,1
4,3289863,1


In [None]:
pred_df.to_csv('./prediction/prediction.csv', index=False)