In [1]:
import torch
import pandas as pd
import numpy as np
def load_user_node_csv(encoders=None, fp="data/u.user"):
    """Read the pipe-separated user table and build the user node inputs.

    Args:
        encoders: Optional mapping ``{column name: callable}``. Each callable
            is invoked as ``encoder(df[column])`` (a pandas Series) and must
            return a tensor; the results are concatenated along the last
            dimension in the mapping's iteration order. ``None`` or an empty
            mapping means "no features".
        fp: Path of the user file. Defaults to ``"data/u.user"``, the
            location this function previously had hard-coded.

    Returns:
        A tuple ``(x, mapping)``. ``x`` is the concatenated feature tensor,
        or ``None`` when no encoders were supplied. ``mapping`` maps every
        distinct ``userID`` to a consecutive integer, numbered in the order
        ``df.index.unique()`` yields them.
    """
    colnames = ["userID", "age", "gender", "occupation", "zip"]
    # userID becomes the index, so it is not available to encoders as a column.
    df = pd.read_csv(fp, sep="|", names=colnames, index_col="userID")
    mapping = {user_id: row for row, user_id in enumerate(df.index.unique())}

    x = None
    # Truthiness check (not ``is not None``): torch.cat on an empty list
    # raises, so an empty mapping is handled like "no encoders".
    if encoders:
        encoded = [encoder(df[col]) for col, encoder in encoders.items()]
        x = torch.cat(encoded, dim=-1)

    return x, mapping

In [2]:
from sentence_transformers import SentenceTransformer
class SequenceEncoder(object):
    """Callable that embeds a pandas Series of text with a SentenceTransformer.

    The model named ``model_name`` (default ``'all-MiniLM-L6-v2'``) is loaded
    once in the constructor; calling the instance encodes a whole Series and
    hands back the embeddings as a CPU tensor.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2', device=None):
        # The same device is passed to the model here and to encode() later.
        self.device = device
        self.model = SentenceTransformer(model_name, device=device)

    @torch.no_grad()
    def __call__(self, text_series):
        """Encode ``text_series.values`` and return the result moved to CPU."""
        embeddings = self.model.encode(
            text_series.values,
            show_progress_bar=True,
            convert_to_tensor=True,
            device=self.device,
        )
        return embeddings.cpu()

In [3]:
class IdentityEncoder(object):
    """Pass a numeric column through unchanged as an ``(N, 1)`` tensor.

    The series can be supplied at construction time (the original usage) or
    at call time, which is the ``encoder(df[col])`` form that
    ``load_user_node_csv`` uses for every encoder it is given.

    Args:
        val_series: Optional pandas Series (or array-like) bound to the
            encoder and used when the instance is called without arguments.
        dtype: Optional torch dtype the result is cast to; ``None`` keeps
            the dtype of the underlying values.
    """

    def __init__(self, val_series=None, dtype=None):
        self.dtype = dtype
        self.val_series = val_series
        # The constructor used to store the series only under this name while
        # __call__ read ``val_series``; both names are kept populated.
        self.value_series = val_series

    def __call__(self, val_series=None):
        """Return the values as a column tensor.

        Args:
            val_series: Series to encode; falls back to the one given to the
                constructor when omitted.

        Raises:
            ValueError: If no series was provided in either place.
        """
        series = self.val_series if val_series is None else val_series
        if series is None:
            raise ValueError(
                "IdentityEncoder needs a series, either at construction or at call time"
            )
        # torch cannot consume a pandas Series directly; use its array.
        values = getattr(series, "values", series)
        x = torch.as_tensor(values).reshape(-1, 1)
        return x if self.dtype is None else x.to(self.dtype)

In [4]:
class GenresEncoder(object):
    """Callable that turns a DataFrame of genre columns into a tensor."""

    def __call__(self, dfg):
        """Wrap ``dfg.values`` in a tensor; values and dtype are not altered here."""
        return torch.from_numpy(dfg.values)



In [5]:
from torch_geometric.data import HeteroData

# Empty heterogeneous-graph container; a later cell attaches the movie
# feature matrix to it as data['movie'].x.
data = HeteroData()

In [6]:
# No encoders are passed, so the feature slot of the returned tuple is None
# and is discarded; only the userID -> consecutive-index mapping is kept.
_, user_mapping = load_user_node_csv()

In [7]:
# One indicator column per genre, in file order.
# NOTE: "G_MUISICAL" is kept exactly as spelled, since other code may
# reference the column by this name.
genre_cols = [
    "G_UNKNOWN", "G_ACTION", "G_ADVENTURE", "G_ANIMATION", "G_CHILDREN",
    "G_COMEDY", "G_CRIME", "G_DOCUMENTARY", "G_DRAMA", "G_FANTASY",
    "G_FILM_NOIR", "G_HORROR", "G_MUISICAL", "G_MYSTERY", "G_ROMANCE",
    "G_SCIFI", "G_THRILLER", "G_WAR", "G_WESTERN",
]

# Full column layout of the item table: four descriptive columns, then the
# genre indicators, then the two rating columns.
movie_attr_cols = [
    "itemID", "title", "release_date", "IMDB_URL",
    *genre_cols,
    "rating", "poor_rating",
]
# Load the item table with read_csv defaults, then replace whatever column
# labels pandas produced with the names listed in movie_attr_cols.
fp = "data/u.item.fe"
df = pd.read_csv(fp)
df.columns = movie_attr_cols

In [8]:
# Display the relabelled item table to eyeball the column assignment.
df

Unnamed: 0,itemID,title,release_date,IMDB_URL,G_UNKNOWN,G_ACTION,G_ADVENTURE,G_ANIMATION,G_CHILDREN,G_COMEDY,...,G_HORROR,G_MUISICAL,G_MYSTERY,G_ROMANCE,G_SCIFI,G_THRILLER,G_WAR,G_WESTERN,rating,poor_rating
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,3.878319,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,1,0,0,3.206107,0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,3.033333,0
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,3.550239,0
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,3.302326,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.000000,1
1678,1679,B. Monkey (1998),06-Feb-1998,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,3.000000,0
1679,1680,Sliding Doors (1998),01-Jan-1998,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,2.000000,1
1680,1681,You So Crazy (1994),01-Jan-1994,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,3.000000,0


In [9]:
# Preview only: embeds the titles and displays the tensor without storing it.
# The stored copy (movie_title_x) is computed in a later cell.
SequenceEncoder()(df["title"])

Batches:   0%|          | 0/53 [00:00<?, ?it/s]

tensor([[-0.0828,  0.0530,  0.0536,  ...,  0.0226,  0.0538,  0.1030],
        [-0.0888,  0.0044,  0.0044,  ...,  0.0418, -0.0063,  0.0269],
        [ 0.0404, -0.0017, -0.0039,  ...,  0.0935, -0.1068, -0.0159],
        ...,
        [-0.0513,  0.0196, -0.0042,  ...,  0.0649, -0.0345, -0.0004],
        [-0.0559,  0.0123, -0.0664,  ...,  0.0943, -0.0927, -0.0248],
        [-0.0114,  0.0354, -0.0673,  ..., -0.0342, -0.0023, -0.0419]])

In [11]:
# Keep only the genre indicator columns, in genre_cols order.
dfg = df.loc[:, genre_cols]

In [12]:
# Preview only: shows the genre tensor without storing it.
# The stored copy (movie_genre_x) is computed in a later cell.
GenresEncoder()(dfg)

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 1, 1,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])

In [13]:
# Title embeddings, one row per row of df; a fresh SequenceEncoder (and thus
# a fresh model load) is created for this call.
movie_title_x = SequenceEncoder()(df["title"])

Batches:   0%|          | 0/53 [00:00<?, ?it/s]

In [14]:
# Genre indicator columns of dfg as a tensor, one row per row of dfg.
movie_genre_x = GenresEncoder()(dfg)

In [15]:
# Movie feature matrix: title embedding columns followed by genre indicator
# columns. The genre tensor is built straight from DataFrame values, so it is
# cast to the embeddings' dtype explicitly instead of relying on torch.cat to
# reconcile the two dtypes.
movie_x = torch.cat(
    [movie_title_x, movie_genre_x.to(movie_title_x.dtype)],
    dim=-1,
)

In [16]:
# Attach the combined feature matrix as `x` on the 'movie' node type.
data['movie'].x = movie_x

In [17]:
# Sanity check: the last dimension is the title-embedding width plus the
# number of genre columns, since the two were concatenated along dim=-1.
movie_x.shape

torch.Size([1682, 403])