In [None]:
# default_exp utils.graph

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.nb_imports import *
from fastcore.test import *

In [None]:
#export
import numpy as np
import pandas as pd
import time
from tqdm.notebook import tqdm

import torch
from torch.utils.data import Dataset

## Bi-partite Dataset
> A PyTorch `Dataset` over the user–item edges of a bipartite graph, with weighted negative sampling of items for each edge.

In [None]:
#export
class BipartiteDataset(Dataset):
    """Edge-level dataset for a user-item bipartite graph with sampled negatives.

    Each row of ``train`` is one edge. User ids and item ids are shifted by -1
    (so the raw ids are presumably 1-based -- confirm against the data), and
    item node ids are additionally offset by ``num_u`` so that users and items
    share a single node-id space.

    Args:
        args: object exposing ``user_col``, ``item_col`` and ``feedback_col``
            (column names in ``train``).
        train: DataFrame with one row per observed edge.
        neg_dist: per-item sampling weights; indexed with arrays of 0-based
            item ids and normalised per user before sampling.
        offset: value subtracted from the feedback column to obtain edge weights.
        num_u: number of user nodes (offset applied to item node ids).
        num_v: number of item nodes (size of the candidate pool).
        K: number of negatives drawn per positive edge.

    ``__getitem__`` reads ``edge_4``, which only exists after ``negs_gen_`` has
    run (or after the caller assigns it, e.g. from ``edge_4_tot``).
    """

    def __init__(self, args, train, neg_dist, offset, num_u, num_v,K):
        self.args = args
        self.edge_1 = torch.tensor(train[self.args.user_col].values-1)
        self.edge_2 = torch.tensor(train[self.args.item_col].values-1) +num_u
        self.edge_3 = torch.tensor(train[self.args.feedback_col].values) - offset
        self.neg_dist = neg_dist
        self.K = K
        self.num_u = num_u
        self.num_v = num_v
        self.tot = np.arange(num_v)
        self.train = train

    def _sample_negatives(self, n_rounds, desc):
        """Return a long tensor of shape (n_edges, K, n_rounds) of negative item node ids.

        For every user, candidates are the items the user has no edge to; they
        are drawn with replacement, weighted by ``neg_dist`` restricted to the
        candidates. Raises ``ValueError`` when a user has no candidates or the
        candidates' weights do not sum to a positive number.
        """
        # Loop-invariant column lookups, hoisted out of the per-user loop.
        user_ids = self.train[self.args.user_col].values
        item_ids = self.train[self.args.item_col].values
        users = set(user_ids)

        sampled = torch.empty((len(self.edge_1), self.K, n_rounds), dtype=torch.long)
        prog = tqdm(desc=desc, total=len(users), position=0)
        try:
            for j in users:
                pos = item_ids[user_ids == j] - 1
                neg = np.setdiff1d(self.tot, pos)
                if neg.size == 0:
                    raise ValueError(
                        'user %s has an edge to every item; no negatives to sample.' % j)
                weights = self.neg_dist[neg]
                total = weights.sum()
                # Also catches NaN totals, which would otherwise reach numpy as NaN probabilities.
                if not float(total) > 0:
                    raise ValueError(
                        'sampling weights of the candidate items for user %s sum to %s; '
                        'cannot normalise.' % (j, float(total)))
                draws = np.random.choice(
                    neg, len(pos) * self.K * n_rounds, replace=True, p=weights / total)
                # Shift item ids into the shared node-id space, then lay out as (edge, k, round).
                block = (torch.as_tensor(draws) + self.num_u).long()
                sampled[self.edge_1 == j - 1] = block.view(len(pos), self.K, n_rounds)
                prog.update(1)
        finally:
            prog.close()
        return sampled

    def negs_gen_(self):
        """Sample K negatives per edge for one epoch and store them in ``edge_4`` (n_edges, K)."""
        print('negative sampling...'); st=time.time()
        # A single round has the same element layout as the (n_edges, K) result; drop the last axis.
        self.edge_4 = self._sample_negatives(1, 'negative sampling for each epoch...')[:, :, 0]
        print('complete ! %s'%(time.time()-st))

    def negs_gen_EP(self,epoch):
        """Pre-sample negatives for ``epoch`` epochs into ``edge_4_tot`` (n_edges, K, epoch)."""
        print('negative sampling for next epochs...'); st=time.time()
        self.edge_4_tot = self._sample_negatives(epoch, 'negative sampling for next epochs...')
        print('complete ! %s'%(time.time()-st))

    def __len__(self):
        """Number of positive edges."""
        return len(self.edge_1)

    def __getitem__(self,idx):
        """Return ``(user node, item node, edge weight, negatives)`` for edge ``idx``."""
        u = self.edge_1[idx]
        v = self.edge_2[idx]
        w = self.edge_3[idx]
        negs = self.edge_4[idx]
        return u,v,w,negs

Example

In [None]:
# Toy interaction log: one (userId, itemId, rating) record per observed edge.
train = pd.DataFrame(
    [(1, 1, 4), (1, 2, 5), (2, 1, 2), (2, 3, 5), (3, 2, 3), (4, 4, 2), (5, 5, 4)],
    columns=['userId', 'itemId', 'rating'],
)

train

Unnamed: 0,userId,itemId,rating
0,1,1,4
1,1,2,5
2,2,1,2
3,2,3,5
4,3,2,3
5,4,4,2
6,5,5,4


In [None]:
class Args:
    """Configuration for the toy ``BipartiteDataset`` example below."""
    # --- column names in the interaction frame ---
    user_col, item_col, feedback_col = 'userId', 'itemId', 'rating'

    # --- sampling / labelling parameters ---
    K = 1          # negatives drawn per positive edge
    offset = 3.5   # like/dislike threshold subtracted from the feedback value

    # --- graph size ---
    num_u, num_v = 5, 5

In [None]:
# Instantiate the example configuration; the cells below read column names and sizes from `args`.
args = Args()

In [None]:
def deg_dist(train, num_v, item_col=None, power=0.75):
    """Per-item sampling weights: item degree raised to ``power``.

    Args:
        train: DataFrame with one row per edge.
        num_v: number of items; length of the returned tensor.
        item_col: name of the item-id column. Defaults to ``args.item_col``
            from the notebook-level configuration when omitted.
        power: exponent applied to the item counts (0.75 by default).

    Returns:
        float64 tensor of length ``num_v``; items that never occur in
        ``train`` get weight 0.
    """
    if item_col is None:
        # Backward-compatible fallback to the notebook-level config object.
        item_col = args.item_col
    # Item ids are shifted by -1 so they can be used as positions in `deg`.
    uni, cou = np.unique(train[item_col].values-1, return_counts=True)
    deg = np.zeros(num_v)
    deg[uni] = cou**power
    return torch.tensor(deg)

# Negative-sampling weights for the toy data, one entry per item (see the printed tensor below).
neg_dist = deg_dist(train, args.num_v)
neg_dist

tensor([1.6818, 1.6818, 1.0000, 1.0000, 1.0000], dtype=torch.float64)

In [None]:
import warnings
# Suppress every warning raised from this point on.
# NOTE(review): a blanket filter also hides unrelated warnings from later cells;
# consider narrowing it with `category=` / `message=` once the source is confirmed.
warnings.filterwarnings('ignore')

In [None]:
# Build the dataset from the toy frame; `args.K` negatives are drawn per positive edge.
training_dataset = BipartiteDataset(args, train, neg_dist, args.offset, args.num_u, args.num_v, args.K)
# Pre-sample negatives for a single epoch; the result is stored in `edge_4_tot`.
training_dataset.negs_gen_EP(1)
# `__getitem__` reads `edge_4`, which `negs_gen_EP` does not set, so point it at the
# pre-sampled tensor. The full slice keeps all three axes (edge, negative, epoch).
training_dataset.edge_4 = training_dataset.edge_4_tot[:,:,:]

negative sampling for next epochs...


negative sampling for next epochs...:   0%|          | 0/5 [00:00<?, ?it/s]

complete ! 0.05542922019958496


In [None]:
train

Unnamed: 0,userId,itemId,rating
0,1,1,4
1,1,2,5
2,2,1,2
3,2,3,5
4,3,2,3
5,4,4,2
6,5,5,4


In [None]:
# One (user node, item node, edge weight, first sampled negative) tuple per edge.
list(zip(
    training_dataset.edge_1.tolist(),
    training_dataset.edge_2.tolist(),
    training_dataset.edge_3.tolist(),
    training_dataset.edge_4[:, 0, 0].tolist(),
))

[(0, 5, 0.5, 7),
 (0, 6, 1.5, 9),
 (1, 5, -1.5, 8),
 (1, 7, 1.5, 9),
 (2, 6, -0.5, 8),
 (3, 8, -1.5, 6),
 (4, 9, 0.5, 6)]

## PandasGraphBuilder

In [None]:
#export
class PandasGraphBuilder(object):
    """Collects entity and relation DataFrames and builds a DGL heterograph from them.

    Register every node type with ``add_entities`` first, then every edge type
    with ``add_binary_relations``, and finally call ``build``.
    """

    def __init__(self):
        # Raw tables, keyed by entity / relation name.
        self.entity_tables = {}
        self.relation_tables = {}

        # Entity bookkeeping.
        self.entity_pk_to_name = {}     # primary-key column name -> entity name
        self.entity_pk = {}             # entity name -> primary-key column name
        self.entity_key_map = {}        # entity name -> categorical of primary-key values
        self.num_nodes_per_type = {}    # entity name -> number of rows in its table

        # Relation bookkeeping.
        self.edges_per_relation = {}    # (src type, relation, dst type) -> (src codes, dst codes)
        self.relation_name_to_etype = {}
        self.relation_src_key = {}      # relation name -> source key column
        self.relation_dst_key = {}      # relation name -> destination key column

    def add_entities(self, entity_table, primary_key, name):
        """Register ``entity_table`` as node type ``name``, identified by column ``primary_key``.

        Raises ``ValueError`` if a primary-key value occurs more than once.
        """
        key_column = entity_table[primary_key]
        keys = key_column.astype('category')
        if (keys.value_counts() != 1).any():
            raise ValueError('Different entity with the same primary key detected.')
        # Category order follows row order, so category codes equal row positions.
        keys = keys.cat.reorder_categories(key_column.values)

        self.entity_pk_to_name[primary_key] = name
        self.entity_pk[name] = primary_key
        self.num_nodes_per_type[name] = entity_table.shape[0]
        self.entity_key_map[name] = keys
        self.entity_tables[name] = entity_table

    def _align_to_entity(self, relation_table, key):
        """Return column ``key`` as a categorical sharing the categories of its entity table."""
        column = relation_table[key].astype('category')
        known_keys = self.entity_key_map[self.entity_pk_to_name[key]].cat.categories
        # Values absent from the entity table become missing after this call.
        return column.cat.set_categories(known_keys)

    def add_binary_relations(self, relation_table, source_key, destination_key, name):
        """Register ``relation_table`` as edge type ``name`` from ``source_key`` to ``destination_key``.

        Both key columns must already be registered as entity primary keys.
        Raises ``ValueError`` if an endpoint value is not present in its entity table.
        """
        src = self._align_to_entity(relation_table, source_key)
        dst = self._align_to_entity(relation_table, destination_key)

        if src.isnull().any():
            raise ValueError(
                'Some source entities in relation %s do not exist in entity %s.' %
                (name, source_key))
        if dst.isnull().any():
            raise ValueError(
                'Some destination entities in relation %s do not exist in entity %s.' %
                (name, destination_key))

        etype = (
            self.entity_pk_to_name[source_key],
            name,
            self.entity_pk_to_name[destination_key],
        )
        src_codes = src.cat.codes.values.astype('int64')
        dst_codes = dst.cat.codes.values.astype('int64')

        self.relation_name_to_etype[name] = etype
        self.edges_per_relation[etype] = (src_codes, dst_codes)
        self.relation_tables[name] = relation_table
        self.relation_src_key[name] = source_key
        self.relation_dst_key[name] = destination_key

    def build(self):
        """Create and return the DGL heterograph for everything registered so far."""
        # Imported lazily so the class can be defined without DGL installed.
        import dgl
        dgl.backend_name = 'pytorch'
        return dgl.heterograph(self.edges_per_relation, self.num_nodes_per_type)

Example

In [None]:
!pip install dgl

In [None]:
# Node table: one row per user, keyed by `user_id`.
users = pd.DataFrame(dict(
    user_id=['A', 'B', 'C'],
    country=['US', 'Chine', 'US'],
    age=[24, 26, 21],
))

# Node table: one row per game, keyed by `game_id`.
games = pd.DataFrame(dict(
    game_id=[1, 2],
    title=['Minecraft', 'Tetris 99'],
    is_sandbox=[True, False],
    is_multiplayer=[True, True],
))

# Edge table: one row per (user, game) play record.
plays = pd.DataFrame(dict(
    user_id=['A', 'B', 'B', 'C'],
    game_id=[1, 1, 2, 2],
    hours=[24, 20, 16, 28],
))

In [None]:
builder = PandasGraphBuilder()
# Entities must be registered before any relation that refers to their key columns.
builder.add_entities(users, 'user_id', 'user')
builder.add_entities(games, 'game_id', 'game')
# The same table is registered twice to get one edge type per direction.
builder.add_binary_relations(plays, 'user_id', 'game_id', 'plays')
builder.add_binary_relations(plays, 'game_id', 'user_id', 'played-by')
g = builder.build()

Using backend: pytorch


In [None]:
# `users` has 3 rows and `plays` has 4 rows, so expect 3 user nodes and 4 'plays' edges.
test_eq(g.number_of_nodes('user'), 3)
test_eq(g.number_of_edges('plays'), 4)

In [None]:
#hide
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d

Author: Sparsh A.

Last updated: 2021-12-19 17:12:48

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.104+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

numpy  : 1.19.5
IPython: 5.5.0
pandas : 1.1.5
torch  : 1.10.0+cu111

