In [1]:
import numpy as np
import torch
import pandas as pd
import numpy as np
#
import pickle as pkl
#
from sklearn import preprocessing
#
import matplotlib
matplotlib.use('agg')

In [7]:
print(torch.__version__) # v1.5.0
print(torch.version.cuda) # v10.2

1.5.0
10.1


In [2]:
from torch_geometric.nn import GCNConv, RGCNConv, global_sort_pool, global_add_pool
from torch_geometric.utils import dropout_adj

# Overall Architecture

0. Arrange multi-view matrices (prepare matrices)
1. Select closely related matrices (matrices that share common entity)
2. Generate subgraphs (multi-partite graphs) for all matrix groups
3. Node labeling 
4. Transform a multi-partite graph to bipartite graphs as layers
5. Group up similar bipartite-graphs for training (inspired by IGMC)
6. Training with GNN using multiple layers (potentially using R-GCN) concurrently

**Loss function**: Reduce MSE with random bias for each layers (need to revisit GC-MC and R-GCN)

**Optimization**: Adam optimizer

**Evaluation with other models**: RMSE and AUC


In [3]:
# initialise all necesarry functions here. Maybe should be moved to another file and import later
def one_hot(idx, length):
    idx = np.array(idx)
    x = np.zeros([len(idx), length])
    x[np.arange(len(idx)), idx] = 1.0
    return x

# Step 0:  Prepare matrices
This includes:
* Load matrices 
* Arrange matrices

In [4]:
data_dir = '../../data/sample_data/'
num_folds = 1


# Load data

In [25]:
# load matrices
print("== Loading data from data_dir: ", data_dir)
U1 = pkl.load(open(data_dir+"X_13.pkl", 'rb'))
U2 = pkl.load(open(data_dir+"X_14.pkl", 'rb'))
V1 = pkl.load(open(data_dir+"X_26.pkl", 'rb'))
W1 = pkl.load(open(data_dir+"X_53.pkl", 'rb'))
r_temp_dict = {}
for fold_num in np.arange(1, num_folds+1):
    r_train = pkl.load(open(data_dir+'/X_12_train_fold_'+str(fold_num)+'.pkl', 'rb'))
    r_train_idx = pkl.load(open(data_dir+'/X_12_train_idx_'+str(fold_num)+'.pkl', 'rb'))
    r_test = pkl.load(open(data_dir+'/X_12_test_fold_'+str(fold_num)+'.pkl', 'rb'))
    r_test_idx = pkl.load(open(data_dir+'/X_12_test_idx_'+str(fold_num)+'.pkl', 'rb'))
    r_doublets = pkl.load(open(data_dir+'/R_doublets_'+str(fold_num)+'.pkl', 'rb'))
    r_temp_dict[fold_num] = {"Rtrain": r_train, "Rtrain_idx": r_train_idx, "Rtest": r_test, "Rtest_idx": r_test_idx, "Rdoublets": r_doublets}

data_dict = {"U1": U1, "U2": U2, "V1": V1, "W1": W1, "R": r_temp_dict}
print("== Finish loading data from data_dir: ", U1[2])

== Loading data from data_dir:  ../../data/sample_data/
== Finish loading data from data_dir:  [0.83691185 0.82596964 0.77240981 0.80207477 0.76281822 0.77551997
 0.80361883 0.79251464 0.78068612 0.76320014 0.79757753 0.79181566
 0.81303564 0.8065129  0.83852059 0.79130437 0.82792796 0.80002413
 0.85594075 0.78325622]


# Preprocess matrices
Load using a method from Monti et al.

In [32]:
u1_trch = torch.from_numpy(U1)
a = torch.nonzero(u1_trch)

print('torch:', u1_trch)
print('nonzero:', a)

torch: tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.8369, 0.8260, 0.7724,  ..., 0.8000, 0.8559, 0.7833],
        ...,
        [0.8135, 0.8352, 0.7965,  ..., 0.7297, 0.8365, 0.7799],
        [0.8509, 0.8953, 0.7573,  ..., 0.8010, 0.8494, 0.7836],
        [0.9231, 0.8716, 0.8780,  ..., 0.9730, 0.9534, 0.9023]],
       dtype=torch.float64)
nonzero: tensor([[  2,   0],
        [  2,   1],
        [  2,   2],
        ...,
        [999,  17],
        [999,  18],
        [999,  19]])


In [None]:
from preprocessing import load_data_monti


In [14]:
print("U1.shape: ",U1.shape)
print("U2.shape: ",U2.shape)
print("V1.shape: ",V1.shape)
print("W1.shape: ",W1.shape)
print("R.shape: ",data_dict['R'][1]['Rtrain'].shape)
print("U1 data:",U1)

U1.shape:  (1000, 20)
U2.shape:  (1000, 150)
V1.shape:  (2000, 250)
W1.shape:  (300, 20)
R.shape:  (1000, 2000)
U1 data: [[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.83691185 0.82596964 0.77240981 ... 0.80002413 0.85594075 0.78325622]
 ...
 [0.81349176 0.83518323 0.79653601 ... 0.72965597 0.83648859 0.77993258]
 [0.8508699  0.8952519  0.75729878 ... 0.80102254 0.84939108 0.78360706]
 [0.92305017 0.87163958 0.87802393 ... 0.97301008 0.95341843 0.90227069]]


In [7]:
G = {
    "e1":["X1","X2","X3"],\
    "e2":["X1","X4"],\
    "e3":["X2","X5"],\
    "e4":["X3"],\
    "e5":["X5"],\
    "e6":["X4"]}

In [8]:
X_data = {
    "X1":{"1":data_dict['R'][1]["Rtrain"]},\
    "X2":{"1":U1},\
    "X3":U2,\
    "X4":V1,\
    "X5":W1}

In [9]:
X_meta = {
    "X1":["e1","e2"],\
    "X2":["e1","e3"],\
    "X3":["e1","e4"],\
    "X4":["e2","e6"],\
    "X5":["e5","e3"]}

In [10]:
Rtest_triplets1 = [[1,1,1],[2,2,0]]
Rtest_triplets2 = [[1,1,1],[3,3,0],[1,2,0],[0,1,0],[0,2,0],[0,3,0]]

In [11]:
X_val = {
    "X1":{"1":Rtest_triplets1},
    "X2":{"1":Rtest_triplets2}
}

# Step 1: Select sub-matrix that shares common entities

After arranging matrices, we will 
* Select the size of sub-matrix *(m x n)* where 'm' is the size of common domain (i.e. user)

In [None]:


# Question: How to extract data from matrix and build it into subgraphs? Using torch_geometric! see documentation

# Step 2: Generate subgraphs for all matrix groups

Do the following until cover all groups
* In one group, by using sub-matrix size from first step, loop connected matrices to generate multi-partite graphs.
* Please refer to step 3 for node-labelling process.

# Step 3: Node labelling

From arranged matrices, label them from 0 to N to build multipartite graph. For each entities, the label goes from N+1 to N+N, in the first hop. The following hops goes 2N+1 to 3N. So, the general formula for this node labelling is:

> i x (N + k), 

where *i* is the number of hop, *N* is the number of involved matrices,*k* is the index of selected and arranged matrices
**bold text**


In [None]:
# below is just a sample model
# Prepare dummy variables 
global_ids = np.array([0,3,3,3,3,3,1,4,4,4,4,4,2,5,5,5,5]) # assume 3 matrices: 0 = user; 1 = item; 2 = description
ids = torch.from_numpy(global_ids)

# encode it
global_encoded = torch.from_numpy(one_hot(global_ids, max(global_ids)+1))
global_encoded

tensor([[1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 1.]], dtype=torch.float64)

In [None]:
# parent = [target parent ids]
# children = [target child index, relationship weight]

# user
user_parent = [[2,2],[2,2],[],[]]
user_children = [\
                 [[0,4],[1,3]],\
                 [[2,1],[3,4]],\
                 [],\
                 []\
                ]
# item
item_parent = [[0,4],[0],[0,4],[0]]
item_children =[\
                [[0,4],[0,1]],\
                [[1,3]],\
                [[2,1],[2,1]],\
                [[3,4]]\
               ]

# feature
feature_parent = [[2],[2],[],[]]
feature_children = [[[0,1]],[[2,1]],[],[]]

data = [user_parent, user_children, item_parent, item_children, feature_parent, feature_children]
data

[[[2, 2], [2, 2], [], []],
 [[[0, 4], [1, 3]], [[2, 1], [3, 4]], [], []],
 [[0, 4], [0], [0, 4], [0]],
 [[[0, 4], [0, 1]], [[1, 3]], [[2, 1], [2, 1]], [[3, 4]]],
 [[2], [2], [], []],
 [[[0, 1]], [[2, 1]], [], []]]

In [None]:
# Question: How to merge similar relationship to make it undirected?

## Node labeling functions

In [None]:
def get_parent_id(data, parent_id):
    all_parents_pos = [id for (id,item) in enumerate(data) if item % 2 == 0]    
    print('function', all_parents_pos)
    local_parent_id = int(parent_id/2) # give index 0,1,2 etc. from parent id 2,4,6 etc.
    parent_idx = all_parents_pos[local_parent_id]
    return parent_idx

def get_relationships(idx):
    # variables
    # idx = index
    # pos = position

    c_idx = idx
    target_parents = []
    target_children = []

    all_parents_pos = [id for (id,item) in enumerate(global_ids) if item % 2 == 0]    

    # calculate positions
    c_pos = global_ids[idx] # child position
    p_id = c_pos if(c_idx in all_parents_pos) else c_pos - 1
    local_parent_id = int(p_id/2) # give index 0,1,2 etc. from parent id 2,4,6 etc.
    p_idx = all_parents_pos[local_parent_id]
    cp_pos = c_idx - p_idx - 1

    # return
    target_parents = data[p_idx][cp_pos] if(not p_idx == c_idx ) else data[p_id]
    target_children = data[c_pos][cp_pos] if(not p_idx == c_idx ) else data[p_id+1]

    return [target_parents,target_children]

In [None]:
# Test
print('Node labels', global_ids)

index = 7
relationships = get_relationships(index)
print('index:', index)
print('target parent:', relationships[0])
print('target weights:', relationships[1])

Node labels [0, 1, 1, 1, 1, 2, 3, 3, 3, 3, 4, 5, 5, 5, 5]
index: 7
target parent: [[2, 1]]
target weights: [[1, 3]]


# Step 4: Transform multipartite-graph into bipartite graphs

# Step 5: Group up similar bipartite-graphs for training

In [None]:
# Question: How to convert it to PyTorch, for training later?

# Step 6: Train in GNN

Use R-GCN with 'concat' + MLP

Activation function: either Swish/ReLU by default

Optimized using Adam optimizer (default)

Reduce 'MSE' with customised loss function by considering layers (need a research paper about this)

# Questions/Problems
* How to deal the ranking problems? It happens in IGMC
* What will happen when we update a matrix?
* Can it be transferable for other dataset? It has to be inductive!