In [1]:
# built-in imports
import os
import re
import argparse
import pickle
import sys

# third-party imports
import pandas as pd
import numpy as np
import scipy.sparse as ssp
import dgl
import torch
import torchtext

# local imports
sys.path.insert(0, '../src/pinsage')
import builder 
# from pinsage.data_utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# get directory of data files
directory = './ml-1m'
# get output file path
output_path = 'processed.pkl'

## Get Users

In [3]:
users = []
with open(os.path.join(directory, 'users.dat'), encoding='latin1') as f:
    # UserID::Gender::Age::Occupation::Zip-code
    for l in f:
        id_, gender, age, occupation, zip_ = l.strip().split('::')
        users.append({
            'user_id': int(id_),
            'gender': gender,
            'age': age,
            'occupation': occupation,
            'zip': zip_,
            })
users = pd.DataFrame(users).astype('category')

In [4]:
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
print(users.shape)
print(users.dtypes)

(6040, 5)
user_id       category
gender        category
age           category
occupation    category
zip           category
dtype: object


## Get Movies

In [6]:
movies = []
with open(os.path.join(directory, 'movies.dat'), encoding='latin1') as f:
    for l in f:
        # MovieID::Title::Genres
        id_, title, genres = l.strip().split('::')
        
        # get unique genres
        genres_set = set(genres.split('|'))

        # extract year
        assert re.match(r'.*\([0-9]{4}\)$', title)
        year = title[-5:-1]
        title = title[:-6].strip()

        data = {'movie_id': int(id_), 'title': title, 'year': year}
        for g in genres_set:
            data[g] = True
        movies.append(data)
movies = pd.DataFrame(movies).astype({'year': 'category'})

In [7]:
movies.head()

Unnamed: 0,movie_id,title,year,Children's,Animation,Comedy,Fantasy,Adventure,Romance,Drama,...,Action,Crime,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,Toy Story,1995,True,True,True,,,,,...,,,,,,,,,,
1,2,Jumanji,1995,True,,,True,True,,,...,,,,,,,,,,
2,3,Grumpier Old Men,1995,,,True,,,True,,...,,,,,,,,,,
3,4,Waiting to Exhale,1995,,,True,,,,True,...,,,,,,,,,,
4,5,Father of the Bride Part II,1995,,,True,,,,,...,,,,,,,,,,


In [8]:
print(movies.shape)
print(movies.dtypes)

(3883, 21)
movie_id          int64
title            object
year           category
Children's       object
Animation        object
Comedy           object
Fantasy          object
Adventure        object
Romance          object
Drama            object
Thriller         object
Action           object
Crime            object
Horror           object
Sci-Fi           object
Documentary      object
War              object
Musical          object
Mystery          object
Film-Noir        object
Western          object
dtype: object


## Get Ratings (Users-Movies)

In [9]:
ratings = []
with open(os.path.join(directory, 'ratings.dat'), encoding='latin1') as f:
    # UserID::MovieID::Rating::Timestamp
    for l in f:
        user_id, movie_id, rating, timestamp = [int(_) for _ in l.split('::')]
        ratings.append({
            'user_id': user_id,
            'movie_id': movie_id,
            'rating': rating,
            'timestamp': timestamp,
            })
ratings = pd.DataFrame(ratings)

In [10]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [11]:
print(ratings.shape)
print(ratings.dtypes)

(1000209, 4)
user_id      int64
movie_id     int64
rating       int64
timestamp    int64
dtype: object


## Get distinct users and movies

In [12]:
# Filter the users and items that never appear in the rating table.
distinct_users_in_ratings = ratings['user_id'].unique()
distinct_movies_in_ratings = ratings['movie_id'].unique()
users = users.copy()[users['user_id'].isin(distinct_users_in_ratings)]
movies = movies.copy()[movies['movie_id'].isin(distinct_movies_in_ratings)]

In [13]:
# users have a minimum of 20 movies
ratings.user_id.value_counts().min()

20

In [14]:
# movies have a minimum of 1 user
ratings.movie_id.value_counts().min()

1

In [15]:
print(users.shape)

(6040, 5)


In [16]:
print(movies.shape)

(3706, 21)


In [17]:
movies.columns

Index(['movie_id', 'title', 'year', 'Children's', 'Animation', 'Comedy',
       'Fantasy', 'Adventure', 'Romance', 'Drama', 'Thriller', 'Action',
       'Crime', 'Horror', 'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery',
       'Film-Noir', 'Western'],
      dtype='object')

In [18]:
# Group the movie features into genres (a vector), year (a category), title (a string)
genre_columns = movies.columns.drop(['movie_id', 'title', 'year'])
movies.loc[:, genre_columns] = movies.loc[:, genre_columns].fillna(False).astype(bool)
movies_categorical = movies.drop('title', axis=1)

In [19]:
# Build graph
graph_builder = builder.PandasGraphBuilder()
graph_builder.add_entities(users, 'user_id', 'user')
graph_builder.add_entities(movies_categorical, 'movie_id', 'movie')
graph_builder.add_binary_relations(ratings, 'user_id', 'movie_id', 'watched')
graph_builder.add_binary_relations(ratings, 'movie_id', 'user_id', 'watched-by')

In [20]:
g = graph_builder.build()

In [21]:
g

Graph(num_nodes={'movie': 3706, 'user': 6040},
      num_edges={('movie', 'watched-by', 'user'): 1000209, ('user', 'watched', 'movie'): 1000209},
      metagraph=[('movie', 'user', 'watched-by'), ('user', 'movie', 'watched')])

## Add user node features

In [22]:
# Assign features.
# Note that variable-sized features such as texts or images are handled elsewhere.
g.nodes['user'].data['gender'] = torch.LongTensor(users['gender'].cat.codes.values.copy())
g.nodes['user'].data['age'] = torch.LongTensor(users['age'].cat.codes.values.copy())
g.nodes['user'].data['occupation'] = torch.LongTensor(users['occupation'].cat.codes.values.copy())
g.nodes['user'].data['zip'] = torch.LongTensor(users['zip'].cat.codes.values.copy())

## Add movie node features

In [23]:
g.nodes['movie'].data['year'] = torch.LongTensor(movies['year'].cat.codes.values.copy())
g.nodes['movie'].data['genre'] = torch.FloatTensor(movies[genre_columns].values)

## Add user-movie edge features

In [24]:
g.edges['watched'].data['rating'] = torch.LongTensor(ratings['rating'].values)
g.edges['watched'].data['timestamp'] = torch.LongTensor(ratings['timestamp'].values)
g.edges['watched-by'].data['rating'] = torch.LongTensor(ratings['rating'].values)
g.edges['watched-by'].data['timestamp'] = torch.LongTensor(ratings['timestamp'].values)

## Train, Validation, Split by Time

In [25]:
train_indices, val_indices, test_indices = train_test_split_by_time(ratings, 'timestamp', 'user_id')

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  .apply(train_test_split) \


In [26]:
print("train", train_indices.shape)
print("val", val_indices.shape)
print("test", test_indices.shape)

train (988129,)
val (6040,)
test (6040,)


### Build Train Graph

In [27]:
train_g = build_train_graph(g, train_indices, 'user', 'movie', 'watched', 'watched-by')
assert train_g.out_degrees(etype='watched').min() > 0



In [28]:
train_g

Graph(num_nodes={'movie': 3706, 'user': 6040},
      num_edges={('movie', 'watched-by', 'user'): 988129, ('user', 'watched', 'movie'): 988129},
      metagraph=[('movie', 'user', 'watched-by'), ('user', 'movie', 'watched')])

In [29]:
val_matrix, test_matrix = build_val_test_matrix(g, val_indices, test_indices, 'user', 'movie', 'watched')

In [30]:
val_matrix

<6040x3706 sparse matrix of type '<class 'numpy.int64'>'
	with 6040 stored elements in COOrdinate format>

In [31]:
test_matrix

<6040x3706 sparse matrix of type '<class 'numpy.int64'>'
	with 6040 stored elements in COOrdinate format>

## Text Features

In [32]:
movie_textual_dataset = {'title': movies['title'].values}
movie_textual_dataset

{'title': array(['Toy Story', 'Jumanji', 'Grumpier Old Men', ..., 'Tigerland',
        'Two Family House', 'Contender, The'], dtype=object)}

## Save processed data

In [33]:
dataset = {
    'train-graph': train_g,
    'val-matrix': val_matrix,
    'test-matrix': test_matrix,
    'item-texts': movie_textual_dataset,
    'item-images': None,
    'user-type': 'user',
    'item-type': 'movie',
    'user-to-item-type': 'watched',
    'item-to-user-type': 'watched-by',
    'timestamp-edge-column': 'timestamp'}

In [34]:
with open(output_path, 'wb') as f:
    pickle.dump(dataset, f)

In [35]:
# create a field
# field = torchtext.data.Field(include_lengths=True, lower=True, batch_first=True)

# create an example
# examples = [torchtext.data.Example.fromlist([t], [('title', title_field)]) for t in texts]

# craete a title dataset
# titleset = torchtext.data.Dataset(examples, [('title', title_field)])

# build vocab based on title dataset
# field.build_vocab(titleset.title, vectors='fasttext.simple.300d

# get token_ids, and lengths
# token_ids, lengths = field.process([examples[0].title, examples[1].title])