# Try on a temporary graph

In [1]:
import dgl
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Genre names for the 100K variant (per the constant's name); 'unknown' is the first entry.
GENRES_ML_100K =\
    ['unknown', 'Action', 'Adventure', 'Animation',
     'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
     'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
     'Thriller', 'War', 'Western']
# The 1M list is the 100K list with its first entry ('unknown') sliced off,
# so code using GENRES_ML_1M has no 'unknown' genre to fall back on.
GENRES_ML_1M = GENRES_ML_100K[1:]

In [3]:
# Toy heterograph: 'rates' edges go user -> movie, 'rev-rates' edges go movie -> user.
graph_data = {
    ('user','rates','movie') : (torch.tensor([5, 2, 2]), torch.tensor([10, 10, 7])),
    ('movie','rev-rates','user') : (torch.tensor([10, 7]), torch.tensor([5, 15]))
}

hetero_graph = dgl.heterograph(graph_data)

all_edges_rates = hetero_graph.edges(etype='rates')
all_edges_rev_rates = hetero_graph.edges(etype='rev-rates')

# Ids that occur as an endpoint of at least one edge, in either direction.
actual_users = list(set(all_edges_rates[0].tolist()).union(all_edges_rev_rates[1].tolist()))
actual_movies = list(set(all_edges_rates[1].tolist()).union(all_edges_rev_rates[0].tolist()))

# Every node id the graph currently holds for each type.
ex_users = hetero_graph.nodes(ntype='user').tolist()
ex_movies = hetero_graph.nodes(ntype='movie').tolist()

# Remember each node's original id before any node is removed.
# No .squeeze(): the tensors are already 1-D, and squeezing a single-node type
# would collapse it to a 0-d tensor.
hetero_graph.nodes['user'].data['node_ID'] = torch.tensor(ex_users, dtype=torch.int64)
hetero_graph.nodes['movie'].data['node_ID'] = torch.tensor(ex_movies, dtype=torch.int64)

# Node ids that never appear on an edge.
not_users = list(set(ex_users).difference(set(actual_users)))
not_movies = list(set(ex_movies).difference(set(actual_movies)))

# Explicit int64 keeps the index tensor integral even when a list is empty
# (torch.tensor([]) would otherwise default to float32).
hetero_graph.remove_nodes(torch.tensor(not_users, dtype=torch.int64), ntype='user')
hetero_graph.remove_nodes(torch.tensor(not_movies, dtype=torch.int64), ntype='movie')

In [4]:
hetero_graph.nodes['user'].data['node_ID']

tensor([ 2,  5, 15])

In [5]:
# Hand-written 5-element vector for each user id present in the 'user' node_ID
# tensor (2, 5, 15); the stacked rows are later stored as the 'age' node feature.
MAP = {
    2 : torch.tensor([1,1,0,0,1]),
    5 : torch.tensor([1,1,0,0,1]),
    15: torch.tensor([1,0,0,0,1]),
}

In [6]:
# NOTE(review): `a` and `b` are not referenced by any later cell visible in this
# notebook; they are also unseeded, so their values differ between runs.
a=torch.randn(2,3)
b=torch.randn(2,3)

In [7]:
torch.stack([MAP[2], MAP[5], MAP[15]], axis=0)

tensor([[1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 0, 0, 0, 1]])

In [8]:
 third_tensor = torch.stack([MAP[2], MAP[5], MAP[15]], axis=0)

In [9]:
third_tensor

tensor([[1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 0, 0, 0, 1]])

In [10]:
hetero_graph.nodes['user'].data['age'] = third_tensor

In [11]:
hetero_graph.nodes['user']

NodeSpace(data={'node_ID': tensor([ 2,  5, 15]), 'age': tensor([[1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 0, 0, 0, 1]])})

In [12]:
hetero_graph.ndata

defaultdict(<class 'dict'>, {'node_ID': {'movie': tensor([ 7, 10]), 'user': tensor([ 2,  5, 15])}, 'age': {'user': tensor([[1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 0, 0, 0, 1]])}})

In [13]:
hetero_graph.nodes['movie'].data['node_ID']

tensor([ 7, 10])

In [14]:
hetero_graph.num_nodes(ntype='movie')

2

In [15]:
# NOTE(review): the `hetero_graph.ndata` display in the following cell shows no 'x'
# entry, so this assignment does not attach a feature to the graph. The shape (3, 5)
# also has 3 rows, while num_nodes(ntype='movie') reported 2 in the previous cell.
# Compare with the `hetero_graph.nodes['user'].data['age'] = ...` form used earlier,
# which does show up in ndata.
hetero_graph.ndata['x']['movie'] = torch.zeros((3, 5))

In [16]:
hetero_graph.ndata

defaultdict(<class 'dict'>, {'node_ID': {'movie': tensor([ 7, 10]), 'user': tensor([ 2,  5, 15])}, 'age': {'user': tensor([[1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 0, 0, 0, 1]])}})

In [17]:
hetero_graph.nodes['movie'].data['node_ID']

tensor([ 7, 10])

# Attempt on real dataset

In [18]:
from dgl.data.utils import get_download_dir
import os
import pandas as pd
import numpy as np
import dgl

In [19]:
# Build the path <download_dir>/ml-1m/ml-1m from DGL's download directory.
# NOTE(review): no visible cell downloads or extracts the dataset — this assumes the
# files (users.dat, movies.dat, ratings.dat) are already present there; confirm.
download_dir = get_download_dir()
_dir = os.path.join(download_dir, "ml-1m", "ml-1m")

In [21]:
def _load_rating_info(file_path, sep):

        rating_info = pd.read_csv(
            file_path, sep=sep, header=None,
            names=['user_id', 'movie_id', 'rating', 'timestamp'],
            dtype={'user_id': np.int32, 'movie_id' : np.int32,
                   'ratings': np.float32, 'timestamp': np.int64}, engine='python')
        return rating_info

def _load_raw_user_info(file_path):

    user_info = pd.read_csv(file_path, sep='::', header=None,
                            names=['id', 'gender', 'age', 'occupation', 'zip_code'], engine='python')

    return user_info
    
def _load_raw_movie_info(file_path):
    """Read a headerless, '::'-delimited movie file and add one 0/1 column per genre.

    Parameters
    ----------
    file_path : str, path or file-like
        Source handed to ``pandas.read_csv`` (decoded as iso-8859-1).

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``title``, ``genres`` (the raw '|'-separated string) plus
        one float32 indicator column for every name in ``GENRES_ML_1M``.
    """
    GENRES = GENRES_ML_1M

    # A multi-character separator needs the python parser; requesting it explicitly
    # avoids the fallback warning that the default engine emits for sep='::'.
    movie_info = pd.read_csv(file_path, sep='::', header=None,
                             names=['id', 'title', 'genres'],
                             encoding='iso-8859-1', engine='python')

    genre_map = {ele: i for i, ele in enumerate(GENRES)}
    # Spelling variants of 'Children' share its column.
    genre_map['Children\'s'] = genre_map['Children']
    genre_map['Childrens'] = genre_map['Children']
    # GENRES_ML_1M has no 'unknown' entry, so indexing genre_map['unknown']
    # unconditionally would raise KeyError on the first unrecognised genre.
    unknown_idx = genre_map.get('unknown')

    movie_genres = np.zeros(shape=(movie_info.shape[0], len(GENRES)), dtype=np.float32)
    for i, genres in enumerate(movie_info['genres']):
        for ele in genres.split('|'):
            if ele in genre_map:
                movie_genres[i, genre_map[ele]] = 1.0
            elif unknown_idx is not None:
                print('genres not found, filled with unknown: {}'.format(genres))
                movie_genres[i, unknown_idx] = 1.0
            else:
                print('genres not found, skipped (no unknown column): {}'.format(genres))

    for idx, genre_name in enumerate(GENRES):
        assert idx == genre_map[genre_name]
        movie_info[genre_name] = movie_genres[:, idx]

    # NOTE(review): the previous `movie_info.drop(columns=["genres"])` discarded its
    # result, so the raw 'genres' column was never removed. The dead call is gone and
    # the column is still returned, which keeps the returned frame unchanged.
    return movie_info

In [22]:
user_data = _load_raw_user_info(os.path.join(_dir, 'users.dat'))
movie_data = _load_raw_movie_info(os.path.join(_dir, 'movies.dat'))
ratings_data = _load_rating_info(os.path.join(_dir, 'ratings.dat'), '::')

  movie_info = pd.read_csv(file_path, sep='::', header=None,


# We create a contiguous index on the movie and user data so that we can use it to remap the ids in the ratings dataset

In [25]:
# Use each table's row index as the new id; the raw 'id' column is left in place
# so the raw -> new mapping can be built from the two columns.
movie_data["movie_id"] = movie_data.index
user_data["user_id"] = user_data.index

In [26]:
# Lookup tables: raw dataset id -> row-index id assigned to that movie / user.
HM_movie_ID = dict(zip(movie_data.id,movie_data.movie_id))
HM_user_ID = dict(zip(user_data.id,user_data.user_id))

In [27]:
# Replace the raw ids in the ratings table with the row-index ids.
# NOTE(review): this overwrites the very columns it reads, so running the cell a
# second time would push already-remapped ids through the raw-id lookup again.
# Re-load ratings_data before re-running.
ratings_data["user_id"] = ratings_data["user_id"].map(HM_user_ID)
ratings_data["movie_id"] = ratings_data["movie_id"].map(HM_movie_ID)

In [28]:
ratings_data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,1176,5,978300760
1,0,655,3,978302109
2,0,902,3,978301968
3,0,3339,4,978300275
4,0,2286,5,978824291


In [29]:
ratings_data['user_id'].nunique()

6040

In [30]:
ratings_data['movie_id'].nunique()

3706

In [42]:
# Edge endpoints as plain arrays, shared by the forward and reverse relation.
user_ids = ratings_data['user_id'].to_numpy()
movie_ids = ratings_data['movie_id'].to_numpy()

graph_data = {
    ('user','rates','movie') : (user_ids, movie_ids),
    ('movie','rev-rates','user') : (movie_ids, user_ids)
}

# Pin the node counts to the feature tables instead of letting them be inferred from
# the largest id on an edge: if the last rows of user_data / movie_data had no
# ratings, the inferred graph would be smaller than the tables and the per-node
# 'node_ID' assignments that follow would fail on a length mismatch.
movie_hetero_graph = dgl.heterograph(
    graph_data,
    num_nodes_dict={'user': len(user_data), 'movie': len(movie_data)},
)

In [43]:
# Store each movie node's table id as a node feature. Going through .to_numpy()
# avoids label-based element access on the Series (which only works when the index
# is 0..n-1); .squeeze() is dropped because the column is already 1-D.
movie_hetero_graph.nodes['movie'].data['node_ID'] = torch.tensor(movie_data["movie_id"].to_numpy())

In [44]:
# Store each user node's table id as a node feature. Going through .to_numpy()
# avoids label-based element access on the Series (which only works when the index
# is 0..n-1); .squeeze() is dropped because the column is already 1-D.
movie_hetero_graph.nodes['user'].data['node_ID'] = torch.tensor(user_data["user_id"].to_numpy())

In [45]:
movie_hetero_graph

Graph(num_nodes={'movie': 3883, 'user': 6040},
      num_edges={('movie', 'rev-rates', 'user'): 1000209, ('user', 'rates', 'movie'): 1000209},
      metagraph=[('movie', 'user', 'rev-rates'), ('user', 'movie', 'rates')])

In [46]:
movie_hetero_graph

Graph(num_nodes={'movie': 3883, 'user': 6040},
      num_edges={('movie', 'rev-rates', 'user'): 1000209, ('user', 'rates', 'movie'): 1000209},
      metagraph=[('movie', 'user', 'rev-rates'), ('user', 'movie', 'rates')])

In [47]:
# Positions of movie nodes that receive no 'rates' edge, as a 1-D index tensor.
no_rating_mask = movie_hetero_graph.in_degrees(etype='rates') == 0
isolated_nodes = torch.nonzero(no_rating_mask, as_tuple=True)[0]

In [48]:
# isolated_nodes =  (movie_hetero_graph.out_degrees(etype='rev-rates') == 0).nonzero().squeeze(1)

In [49]:
isolated_nodes

tensor([  50,  107,  113,  141,  281,  282,  391,  395,  396,  399,  600,  616,
         620,  624,  631,  648,  669,  670,  676,  684,  690,  704,  712,  714,
         718,  729,  730,  742,  758,  760,  762,  763,  767,  784,  785,  787,
         802,  806,  808,  811,  814,  834,  844,  845,  846,  860,  862,  878,
         882,  967,  971,  988, 1032, 1038, 1051, 1059, 1090, 1092, 1093, 1094,
        1106, 1121, 1124, 1125, 1127, 1130, 1139, 1140, 1141, 1142, 1143, 1150,
        1288, 1289, 1294, 1298, 1299, 1347, 1379, 1400, 1418, 1423, 1435, 1440,
        1488, 1518, 1520, 1528, 1536, 1537, 1585, 1650, 1651, 1657, 1658, 1660,
        1662, 1667, 1674, 1687, 1689, 1691, 1704, 1710, 1712, 1718, 1719, 1723,
        1730, 1753, 1778, 1961, 2130, 2147, 2151, 2153, 2155, 2156, 2159, 2160,
        2161, 2201, 2205, 2250, 2420, 2439, 2478, 2495, 2519, 2526, 2532, 2534,
        2535, 2611, 2615, 2629, 2763, 2769, 2841, 2885, 2888, 2889, 2911, 2940,
        2954, 2990, 3011, 3101, 3122, 31

In [50]:
# Remove the movie nodes that have no incoming 'rates' edge. The graph summary in the
# next cell shows 3706 movie nodes afterwards (3883 before) with both edge counts
# unchanged at 1000209.
# NOTE(review): this mutates the graph in place, so `isolated_nodes` is stale once the
# cell has run — recompute it before re-running this cell.
movie_hetero_graph.remove_nodes(isolated_nodes.clone().detach(), ntype='movie')

In [51]:
movie_hetero_graph

Graph(num_nodes={'movie': 3706, 'user': 6040},
      num_edges={('movie', 'rev-rates', 'user'): 1000209, ('user', 'rates', 'movie'): 1000209},
      metagraph=[('movie', 'user', 'rev-rates'), ('user', 'movie', 'rates')])

In [53]:
movie_data.head(1)

Unnamed: 0,id,title,genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,movie_id
0,1,Toy Story (1995),Animation|Children's|Comedy,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [54]:
movie_hetero_graph.successors(0, etype='rates').shape

torch.Size([53])

In [55]:
ratings_data[ratings_data.user_id == 0].shape

(53, 4)

In [56]:
def _process_user_fea(user_data):
    HM = {}
    
    for index, row in user_data.iterrows():
        
        age = row['age']
        gender = (row['gender'] == 'F')
        
        HM[row['user_id']] = torch.FloatTensor([age, gender])
        
    return HM

In [57]:
def _process_movie_fea(movie_data):
    
    import re
    
    HM = {}
    p = re.compile(r'(.+)\s*\((\d+)\)')
    
    for index, row in movie_data.iterrows():
        match_res = p.match(row['title'])
        
        if match_res is None:
            print('{} cannot be matched, index={}, name={}'.format(title, i, self._name))
            title_context, year = title, 1950
        else:
            title_context, year = match_res.groups()
        
        HM[row['movie_id']] = torch.FloatTensor([ (float(year)- 1950.0) / 100.0])

    return HM

In [58]:
movieid_to_feat = _process_movie_fea(movie_data)

In [59]:
userid_to_feat = _process_user_fea(user_data)

# Create User and Movie Features

In [60]:
movie_hetero_graph.nodes['movie'].data['node_ID']

tensor([   0,    1,    2,  ..., 3880, 3881, 3882])

In [61]:
# Fetch each movie node's feature tensor by its stored original id, in node order.
movie_node_ids = movie_hetero_graph.nodes['movie'].data['node_ID'].tolist()
mapped_movie_features = [movieid_to_feat[node_id] for node_id in movie_node_ids]

In [62]:
# Fetch each user node's feature tensor by its stored original id, in node order.
user_node_ids = movie_hetero_graph.nodes['user'].data['node_ID'].tolist()
mapped_user_features = [userid_to_feat[node_id] for node_id in user_node_ids]

In [63]:
movie_hetero_graph.nodes['user'].data['features'] = torch.stack(mapped_user_features, axis=0)

In [64]:
movie_hetero_graph.nodes['movie'].data['features'] = torch.stack(mapped_movie_features, axis=0)

In [65]:
movie_hetero_graph.nodes['user'].data['features']

tensor([[ 1.,  1.],
        [56.,  0.],
        [25.,  0.],
        ...,
        [56.,  1.],
        [45.,  1.],
        [25.,  0.]])

In [66]:
user_data[user_data.user_id == 1]

Unnamed: 0,id,gender,age,occupation,zip_code,user_id
1,2,M,56,16,70072,1


## We can see that the second node (user_id 1) has an age value of 56, which matches the user table. Hence we can confirm that features are mapped to the correct node.

In [67]:
movie_hetero_graph.nodes['movie'].data['features']

tensor([[0.4500],
        [0.4500],
        [0.4500],
        ...,
        [0.5000],
        [0.5000],
        [0.5000]])

In [68]:
movie_data[movie_data.movie_id == 1]

Unnamed: 0,id,title,genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,movie_id
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [69]:
(1995-1950)/100

0.45

## We can see that the second node (movie_id 1, Jumanji, released in 1995) has a movie feature value of 0.45, which equals (1995 - 1950) / 100. Hence we can confirm that features are mapped to the correct node.