In [13]:
import sys
sys.path.append("..")

In [1]:

import numpy as np
import torch
import pandas as pd
import numpy as np
import os
#
import pickle as pkl
#
from sklearn import preprocessing
#
import matplotlib
matplotlib.use('agg')

In [2]:
print('PyTorch version:', torch.__version__) # v1.5.0
print("PyTorch CUDA version:",torch.version.cuda) # v10.2
torch.cuda.current_device()
print('is CUDA available?',torch.cuda.is_available())

PyTorch version: 1.5.0
PyTorch CUDA version: 10.1
is CUDA available? True


In [3]:
from torch_geometric.nn import GCNConv, RGCNConv, global_sort_pool, global_add_pool
from torch_geometric.utils import dropout_adj

# Overall Architecture

0. Arrange multi-view matrices (prepare matrices)
1. Select closely related matrices (matrices that share common entity)
2. Generate subgraphs (multi-partite graphs) for all matrix groups
3. Node labeling 
4. Transform a multi-partite graph to bipartite graphs as layers
5. Group up similar bipartite-graphs for training (inspired by IGMC)
6. Training with GNN using multiple layers (potentially using R-GCN) concurrently

**Loss function**: Reduce MSE with random bias for each layers (need to revisit GC-MC and R-GCN)

**Optimization**: Adam optimizer

**Evaluation with other models**: RMSE and AUC


In [4]:
# initialise all necesarry functions here. Maybe should be moved to another file and import later
def one_hot(idx, length):
    idx = np.array(idx)
    x = np.zeros([len(idx), length])
    x[np.arange(len(idx)), idx] = 1.0
    return x

# Step 0:  Prepare matrices
This includes:
* Load matrices 
* Arrange matrices

In [5]:
data_dir = '../../data/sample_data/'
num_folds = 1

## 1.Load data
Load matrices. Data is taken from dCMF paper

In [6]:
# load matrices
print("== Loading data from data_dir: ", data_dir)
U1 = pkl.load(open(data_dir+"X_13.pkl", 'rb'))
U2 = pkl.load(open(data_dir+"X_14.pkl", 'rb'))
V1 = pkl.load(open(data_dir+"X_26.pkl", 'rb'))
W1 = pkl.load(open(data_dir+"X_53.pkl", 'rb'))
r_temp_dict = {}
for fold_num in np.arange(1, num_folds+1):
    r_train = pkl.load(open(data_dir+'/X_12_train_fold_'+str(fold_num)+'.pkl', 'rb'))
    r_train_idx = pkl.load(open(data_dir+'/X_12_train_idx_'+str(fold_num)+'.pkl', 'rb'))
    r_test = pkl.load(open(data_dir+'/X_12_test_fold_'+str(fold_num)+'.pkl', 'rb'))
    r_test_idx = pkl.load(open(data_dir+'/X_12_test_idx_'+str(fold_num)+'.pkl', 'rb'))
    r_doublets = pkl.load(open(data_dir+'/R_doublets_'+str(fold_num)+'.pkl', 'rb'))
    r_temp_dict[fold_num] = {"Rtrain": r_train, "Rtrain_idx": r_train_idx, "Rtest": r_test, "Rtest_idx": r_test_idx, "Rdoublets": r_doublets}

data_dict = {"U1": U1, "U2": U2, "V1": V1, "W1": W1, "R": r_temp_dict}
print("Loaded!", data_dict.keys())

== Loading data from data_dir:  ../../data/sample_data/
Loaded! dict_keys(['U1', 'U2', 'V1', 'W1', 'R'])


In [7]:
print("U1.shape: ",U1.shape)
print("U2.shape: ",U2.shape)
print("V1.shape: ",V1.shape)
print("W1.shape: ",W1.shape)
print("R.shape: ",data_dict['R'][1]['Rtrain'].shape)
# print("U1 data:",U1)

U1.shape:  (1000, 20)
U2.shape:  (1000, 150)
V1.shape:  (2000, 250)
W1.shape:  (300, 20)
R.shape:  (1000, 2000)


## 2. Preprocess matrices
> TODO: create training, validation, and test data for each matrices by dropping out matrices randomly 

Inspired by IGMC, load matrix using a method from Monti et al.

In [15]:
import src.preprocessing # custom preprocessing functions

In [14]:
# TODO: work on this function to split matrix between testing, validation and test
# after that, move it to src.preprocessing file!
def splitting_data(matrix):
    m_torch = torch.from_numpy(matrix)
    holes = torch.empty(m_torch.shape).random_(2)
    holes_idx = torch.nonzero(holes)
    training_data = torch.mul(m_torch,holes) # one-hot encoding to give holes on matrices
    training_idx = torch.nonzero(training_data)
    print(holes)
    print(holes_idx)
    return (matrix,training_data, None, None)
    
matrix = data_dict['R'][1]['Rtrain']
m, train_data, test_data, val_data =  splitting_data(matrix)
print(m)
train_data

tensor([[0., 1., 0.,  ..., 1., 0., 0.],
        [1., 0., 0.,  ..., 1., 1., 0.],
        [1., 1., 0.,  ..., 0., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 0., 1.],
        [0., 1., 1.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 1., 1., 1.]])
tensor([[   0,    1],
        [   0,    6],
        [   0,    9],
        ...,
        [ 999, 1997],
        [ 999, 1998],
        [ 999, 1999]])
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.8057179  0.         0.         ... 0.83843999 0.75367971 0.        ]
 ...
 [0.81335164 0.         0.         ... 0.87319588 0.76432996 0.78645457]
 [0.82397774 0.         0.         ... 0.83491009 0.80835884 0.86646459]
 [0.9348797  0.         0.         ... 0.90277274 0.83821403 0.90299402]]


tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.8057, 0.0000, 0.0000,  ..., 0.0000, 0.7537, 0.0000],
        ...,
        [0.8134, 0.0000, 0.0000,  ..., 0.8732, 0.0000, 0.7865],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.8665],
        [0.0000, 0.0000, 0.0000,  ..., 0.9028, 0.8382, 0.9030]],
       dtype=torch.float64)

In [17]:
# TODO

"""
function to extract labels and indices from a matrix.
Taken from Monti et al. with slight modifications (remove training,validation, and test)
"""
def extract_matrix_info(M, testing=False):
    num_users = M.shape[0]
    num_items = M.shape[1]

    u_nodes_ratings = np.where(M)[0]
    v_nodes_ratings = np.where(M)[1]
    ratings = M[np.where(M)]

    u_nodes_ratings, v_nodes_ratings = u_nodes_ratings.astype(
        np.int64), v_nodes_ratings.astype(np.int32)
    ratings = ratings.astype(np.float64)

    u_nodes = u_nodes_ratings
    v_nodes = v_nodes_ratings

    print('number of users = ', len(set(u_nodes)))
    print('number of item = ', len(set(v_nodes)))

    # assumes that ratings_train contains at least one example of every rating type
    rating_dict = {r: i for i, r in enumerate(
        np.sort(np.unique(ratings)).tolist())}

    labels = np.full((num_users, num_items), neutral_rating, dtype=np.int32)
    labels[u_nodes, v_nodes] = np.array([rating_dict[r] for r in ratings])


    for i in range(len(u_nodes)):
        assert(labels[u_nodes[i], v_nodes[i]] == rating_dict[ratings[i]])

    labels = labels.reshape([-1])

    # number training
    num_train = np.where(Otraining)[0].shape[0]
    num_val = int(np.ceil(num_train * 0.2))
    num_train = num_train - num_val

    # non_zero arrays
    pairs_nonzero_train = torch.nonzero(torch.from_numpy(M))
    idx_nonzero_train = np.array(
        [u * num_items + v for u, v in pairs_nonzero_train])

     # Internally shuffle training set (before splitting off validation set)
    rand_idx = list(range(len(idx_nonzero_train)))
    np.random.seed(42)
    np.random.shuffle(rand_idx)
    idx_nonzero_train = idx_nonzero_train[rand_idx]
    pairs_nonzero_train = pairs_nonzero_train[rand_idx]

    idx_nonzero = np.concatenate([idx_nonzero_train, idx_nonzero_test], axis=0)
    pairs_nonzero = np.concatenate(
        [pairs_nonzero_train, pairs_nonzero_test], axis=0)

    train_idx = idx_nonzero[num_val:num_train + num_val]
    train_pairs_idx = pairs_nonzero[num_val:num_train + num_val]
    u_train_idx, v_train_idx = train_pairs_idx.transpose()

    train_labels = labels[train_idx]

    class_values = np.sort(np.unique(ratings))

    '''Note here rating matrix elements' values + 1 !!!'''

    rating_mx_train = np.zeros(num_users * num_items, dtype=np.float32)

    if post_rating_map is None:
        rating_mx_train[train_idx] = labels[train_idx].astype(np.float32) + 1.
    else:
        rating_mx_train[train_idx] = np.array(
            [post_rating_map[r] for r in class_values[labels[train_idx]]]) + 1.

    rating_mx_train = sp.csr_matrix(
        rating_mx_train.reshape(num_users, num_items))

    if u_features is not None:
        u_features = sp.csr_matrix(u_features)
        print("User features shape: " + str(u_features.shape))

    if v_features is not None:
        v_features = sp.csr_matrix(v_features)
        print("Item features shape: " + str(v_features.shape))

    return u_features, v_features, rating_mx_train, train_labels, u_train_idx, v_train_idx, class_values

In [7]:
G = {
    "e1":["X1","X2","X3"],\
    "e2":["X1","X4"],\
    "e3":["X2","X5"],\
    "e4":["X3"],\
    "e5":["X5"],\
    "e6":["X4"]}

In [8]:
X_data = {
    "X1":{"1":data_dict['R'][1]["Rtrain"]},\
    "X2":{"1":U1},\
    "X3":U2,\
    "X4":V1,\
    "X5":W1}

In [9]:
X_meta = {
    "X1":["e1","e2"],\
    "X2":["e1","e3"],\
    "X3":["e1","e4"],\
    "X4":["e2","e6"],\
    "X5":["e5","e3"]}

In [10]:
Rtest_triplets1 = [[1,1,1],[2,2,0]]
Rtest_triplets2 = [[1,1,1],[3,3,0],[1,2,0],[0,1,0],[0,2,0],[0,3,0]]

In [11]:
X_val = {
    "X1":{"1":Rtest_triplets1},
    "X2":{"1":Rtest_triplets2}
}

# Step 1: Select sub-matrix that shares common entities

After arranging matrices, we will 
* Select the size of sub-matrix *(m x n)* where 'm' is the size of common domain (i.e. user)

In [None]:


# Question: How to extract data from matrix and build it into subgraphs? Using torch_geometric! see documentation

# Step 2: Generate subgraphs for all matrix groups

Do the following until cover all groups
* In one group, by using sub-matrix size from first step, loop connected matrices to generate multi-partite graphs.
* Please refer to step 3 for node-labelling process.

# Step 3: Node labelling

From arranged matrices, label them from 0 to N to build multipartite graph. For each entities, the label goes from N+1 to N+N, in the first hop. The following hops goes 2N+1 to 3N. So, the general formula for this node labelling is:

> i x (N + k), 

where *i* is the number of hop, *N* is the number of involved matrices,*k* is the index of selected and arranged matrices
**bold text**


In [None]:
# below is just a sample model
# Prepare dummy variables 
global_ids = np.array([0,3,3,3,3,3,1,4,4,4,4,4,2,5,5,5,5]) # assume 3 matrices: 0 = user; 1 = item; 2 = description
ids = torch.from_numpy(global_ids)

# encode it
global_encoded = torch.from_numpy(one_hot(global_ids, max(global_ids)+1))
global_encoded

tensor([[1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 1.]], dtype=torch.float64)

In [None]:
# parent = [target parent ids]
# children = [target child index, relationship weight]

# user
user_parent = [[2,2],[2,2],[],[]]
user_children = [\
                 [[0,4],[1,3]],\
                 [[2,1],[3,4]],\
                 [],\
                 []\
                ]
# item
item_parent = [[0,4],[0],[0,4],[0]]
item_children =[\
                [[0,4],[0,1]],\
                [[1,3]],\
                [[2,1],[2,1]],\
                [[3,4]]\
               ]

# feature
feature_parent = [[2],[2],[],[]]
feature_children = [[[0,1]],[[2,1]],[],[]]

data = [user_parent, user_children, item_parent, item_children, feature_parent, feature_children]
data

[[[2, 2], [2, 2], [], []],
 [[[0, 4], [1, 3]], [[2, 1], [3, 4]], [], []],
 [[0, 4], [0], [0, 4], [0]],
 [[[0, 4], [0, 1]], [[1, 3]], [[2, 1], [2, 1]], [[3, 4]]],
 [[2], [2], [], []],
 [[[0, 1]], [[2, 1]], [], []]]

In [None]:
# Question: How to merge similar relationship to make it undirected?

## Node labeling functions

In [None]:
def get_parent_id(data, parent_id):
    all_parents_pos = [id for (id,item) in enumerate(data) if item % 2 == 0]    
    print('function', all_parents_pos)
    local_parent_id = int(parent_id/2) # give index 0,1,2 etc. from parent id 2,4,6 etc.
    parent_idx = all_parents_pos[local_parent_id]
    return parent_idx

def get_relationships(idx):
    # variables
    # idx = index
    # pos = position

    c_idx = idx
    target_parents = []
    target_children = []

    all_parents_pos = [id for (id,item) in enumerate(global_ids) if item % 2 == 0]    

    # calculate positions
    c_pos = global_ids[idx] # child position
    p_id = c_pos if(c_idx in all_parents_pos) else c_pos - 1
    local_parent_id = int(p_id/2) # give index 0,1,2 etc. from parent id 2,4,6 etc.
    p_idx = all_parents_pos[local_parent_id]
    cp_pos = c_idx - p_idx - 1

    # return
    target_parents = data[p_idx][cp_pos] if(not p_idx == c_idx ) else data[p_id]
    target_children = data[c_pos][cp_pos] if(not p_idx == c_idx ) else data[p_id+1]

    return [target_parents,target_children]

In [None]:
# Test
print('Node labels', global_ids)

index = 7
relationships = get_relationships(index)
print('index:', index)
print('target parent:', relationships[0])
print('target weights:', relationships[1])

Node labels [0, 1, 1, 1, 1, 2, 3, 3, 3, 3, 4, 5, 5, 5, 5]
index: 7
target parent: [[2, 1]]
target weights: [[1, 3]]


# Step 4: Transform multipartite-graph into bipartite graphs

# Step 5: Group up similar bipartite-graphs for training

In [None]:
# Question: How to convert it to PyTorch, for training later?

# Step 6: Train in GNN

Use R-GCN with 'concat' + MLP

Activation function: either Swish/ReLU by default

Optimized using Adam optimizer (default)

Reduce 'MSE' with customised loss function by considering layers (need a research paper about this)

# Questions/Problems
* How to deal the ranking problems? It happens in IGMC
* What will happen when we update a matrix?
* Can it be transferable for other dataset? It has to be inductive!