In [3]:
import torch
import random as rd
from os.path import join
from os.path import isfile
import numpy as np
import pandas as pd
import itertools
from collections import Counter
import tqdm
import pickle
import sys
sys.path.insert(0, "/Users/sayamsingla/Desktop/Warwick/PEAGNN/graph_recsys_benchmark/datasets")
sys.path.insert(1, "/Users/sayamsingla/Desktop/Warwick/PEAGNN/graph_recsys_benchmark/parser")

from dataset import Dataset
from ml import parse_ml25m, parse_mlsmall

In [None]:
class MovieLens(Dataset):
    """MovieLens dataset wrapper for the heterogeneous-graph ('hete') setting.

    Subclasses the project's ``Dataset`` base class. Construction stores the
    filtering / sampling options, delegates to the base-class constructor and
    then loads a pickled property dictionary from ``processed_paths[0]``,
    copying every entry onto the instance.

    NOTE(review): the base class presumably calls ``process()`` when the
    processed file is missing -- confirm against ``Dataset``.
    """

    url = 'http://files.grouplens.org/datasets/movielens/'

    def __init__(self,
                 root,
                 name,
                 transform=None,
                 pre_transform=None,
                 pre_filter=None,
                 **kwargs):
        """Store the dataset options and load the processed property dict.

        Args:
            root: Forwarded unchanged to the base-class constructor.
            name: Dataset variant; lower-cased and required to be ``'25m'``
                or ``'latest-small'``.
            transform, pre_transform, pre_filter: Forwarded unchanged to the
                base-class constructor.
            **kwargs: Must contain ``type`` (only ``'hete'`` is accepted),
                ``num_core``, ``num_feat_core``, ``entity_aware``,
                ``num_negative_samples`` and ``sampling_strategy``.

        Raises:
            KeyError: If one of the required keyword options is missing.
            AssertionError: If ``name`` or ``type`` is not a supported value.
        """
        self.name = name.lower()
        self.type = kwargs['type']
        assert self.name in ['25m', 'latest-small']
        assert self.type in ['hete']
        self.num_core = kwargs['num_core']
        self.num_feat_core = kwargs['num_feat_core']

        self.entity_aware = kwargs['entity_aware']

        self.num_negative_samples = kwargs['num_negative_samples']
        self.sampling_strategy = kwargs['sampling_strategy']
        # self.cf_loss_type = kwargs['cf_loss_type']

        # The attributes above are set first because the file-name properties
        # below read them and may be used by the base-class constructor.
        super(MovieLens, self).__init__(root, transform, pre_transform, pre_filter)

        # SECURITY: pickle.load can execute arbitrary code; only point this at
        # processed files that this project produced itself.
        with open(self.processed_paths[0], 'rb') as f:  # Read the class property
            dataset_property_dict = pickle.load(f)
        # NOTE(review): item assignment on ``self`` relies on the base class
        # providing ``__setitem__`` -- confirm against ``Dataset``.
        for k, v in dataset_property_dict.items():
            self[k] = v

        print('Dataset loaded!')

    @property
    def raw_file_names(self):
        """Name of the raw archive for this variant, e.g. ``ml-latest-small.zip``."""
        return 'ml-{}.zip'.format(self.name.lower())

    @property
    def processed_file_names(self):
        """Single-element list with the processed pickle's file name."""
        return ['ml_{}_{}.pkl'.format(self.name, self.build_suffix())]

    # def download(self):
    #     path = download_url(self.url + self.raw_file_names, self.raw_dir)
    #     extract_zip(path, self.raw_dir)

    def build_suffix(self):
        """Return the ``core_<num_core>_type_<type>`` suffix used in file names."""
        return 'core_{}_type_{}'.format(self.num_core, self.type)

    def process(self):
        """Filter and discretise the 'latest-small' data frames.

        Only the ``'latest-small'`` variant is handled; for any other name the
        method does nothing. The movies / ratings / tags CSV files are read
        from hard-coded absolute paths, de-duplicated, restricted to movies,
        users and tags that occur more than the configured core thresholds,
        and the release years are clamped and bucketed into decades.

        NOTE(review): the results stay in local variables -- nothing is
        written to ``processed_paths[0]``, which ``__init__`` reads afterwards.
        NOTE(review): ``reindex_df_mlsmall`` and
        ``drop_infrequent_concept_from_str`` are not defined or imported in
        the visible notebook cells; they must be in scope before this runs.
        """
        if self.name == 'latest-small':
            # NOTE(review): absolute, machine-specific paths; the frames are
            # expected to already expose 'iid', 'uid', 'tag' and 'year' columns.
            movies = pd.read_csv('/Users/sayamsingla/Downloads/ml-latest-small-2/movies.csv').fillna('')
            ratings = pd.read_csv('/Users/sayamsingla/Downloads/ml-latest-small-2/ratings.csv')
            tagging = pd.read_csv('/Users/sayamsingla/Downloads/ml-latest-small-2/tags.csv')
            # print('Read data frame from {}!'.format(self.processed_dir))

            # Remove duplicates
            movies = movies.drop_duplicates()
            ratings = ratings.drop_duplicates()
            tagging = tagging.drop_duplicates()

            # Sync: keep only movies/users that appear in every frame involved
            movies = movies[movies.iid.isin(ratings.iid.unique())]
            ratings = ratings[ratings.iid.isin(movies.iid.unique())]
            tagging = tagging[tagging.iid.isin(ratings.iid.unique())]
            tagging = tagging[tagging.uid.isin(ratings.uid.unique())]

            # Remove infrequent movies (strictly more than num_core ratings survive)
            movie_count = ratings['iid'].value_counts()
            movie_count.name = 'movie_count'
            ratings = ratings[ratings.join(movie_count, on='iid').movie_count > self.num_core]

            # Remove infrequent users (counted after the movie filter above)
            user_count = ratings['uid'].value_counts()
            user_count.name = 'user_count'
            ratings = ratings[ratings.join(user_count, on='uid').user_count > self.num_core]

            # Sync again now that ratings has shrunk
            movies = movies[movies.iid.isin(ratings.iid.unique())]
            tagging = tagging[tagging.iid.isin(ratings.iid.unique())]
            tagging = tagging[tagging.uid.isin(ratings.uid.unique())]

            # Remove infrequent tags
            tag_count = tagging['tag'].value_counts()
            tag_count.name = 'tag_count'
            tagging = tagging[tagging.join(tag_count, on='tag').tag_count > self.num_feat_core]

            # filter the years: everything before 1950 is clamped to 1950
            years = movies.year.to_numpy()
            years[years < 1950] = 1950
            movies['year'] = years
            if self.type == 'hete':
                # Builtin ``int`` replaces the ``np.int`` alias, which was
                # deprecated in NumPy 1.20 and removed in 1.24.
                years = movies.year.to_numpy().astype(int)
                min_year = min(years)
                max_year = max(years)
                num_years = (max_year - min_year) // 10
                # Decade boundaries starting at the earliest year.
                discretized_years = [min_year + i * 10 for i in range(num_years + 1)]
                for i in range(len(discretized_years) - 1):
                    # Map every year in [boundary_i, boundary_i+1) to boundary_i.
                    # The value is assigned as an integer; no string round-trip
                    # is needed for an integer array.
                    in_bucket = (discretized_years[i] <= years) & (years < discretized_years[i + 1])
                    years[in_bucket] = discretized_years[i]
                years[years < discretized_years[0]] = discretized_years[0]
                # Years at or beyond the last boundary collapse onto it.
                years[years >= discretized_years[-1]] = discretized_years[-1]

                movies['year'] = years

            # Reindex the uid and iid in case of missing values
            movies, ratings, tagging, tags = reindex_df_mlsmall(
                movies, ratings, tagging)

            # Drop the infrequent writer, actor and directors
            movies = drop_infrequent_concept_from_str(movies, 'writers', self.num_feat_core)
            movies = drop_infrequent_concept_from_str(movies, 'directors', self.num_feat_core)
            movies = drop_infrequent_concept_from_str(movies, 'actors', self.num_feat_core)



In [None]:
# Build the 'latest-small' heterogeneous MovieLens dataset with 10-core
# filtering for both interactions and features, and 4 negative samples.
MovieLens(
    "/Users/sayamsingla/Downloads/ml-latest-small-2",
    'latest-small',
    type='hete',
    num_core=10,
    num_feat_core=10,
    entity_aware=False,
    num_negative_samples=4,
    sampling_strategy='unseen',
)

In [None]:
# Load the processed MovieLens property pickle into `df`.
# NOTE(review): despite the name, `df` is indexed by string key further down,
# so it is presumably a dict rather than a DataFrame -- confirm.
# SECURITY: pickle.load can execute arbitrary code; only open trusted files.
with open(
    '/Users/sayamsingla/Desktop/Warwick/PEAGNN/experiments/checkpoint/data/Movielenslatest-small/processed/ml_latest-small_core_10_type_hete.pkl',
    'rb',
) as f:
    df = pickle.load(f)
        

In [None]:
# Show which entries are stored under 'edge_index_nps' in the loaded pickle.
df['edge_index_nps'].keys()

In [None]:
def update_pea_graph_input(dataset_args, train_args, dataset):
    """Build the per-meta-path edge-index lists for Movielens 'latest-small'.

    Each meta-path is a two-element list of edge-index tensors, one per step.
    The relation arrays are read from ``dataset.edge_index_nps``, converted
    from NumPy to ``long`` tensors and moved to ``train_args['device']``.
    Reversed relations are obtained with ``torch.flip(..., dims=[0])``
    (assumes each array is laid out as (2, num_edges) -- TODO confirm).

    Args:
        dataset_args: Mapping with at least ``'dataset'``; ``'name'`` is read
            only when ``'dataset'`` is ``"Movielens"``.
        train_args: Mapping with the target ``'device'``.
        dataset: Object exposing an ``edge_index_nps`` mapping with the keys
            ``user2item``, ``year2item``, ``actor2item``, ``director2item``,
            ``writer2item``, ``genre2item``, ``tag2item`` and ``tag2user``.

    Returns:
        A list of nine meta-paths, each a list of two edge-index tensors.

    Raises:
        NotImplementedError: For any dataset/name other than
            Movielens/latest-small. Previously this case fell through to the
            ``return`` and failed with ``UnboundLocalError``.
        KeyError: If a required key is missing from one of the mappings.
    """
    if dataset_args['dataset'] != "Movielens" or dataset_args['name'] != "latest-small":
        raise NotImplementedError(
            'update_pea_graph_input only supports the Movielens latest-small dataset.')

    device = train_args['device']

    def load_edges(relation):
        # NumPy array -> int64 tensor on the training device.
        return torch.from_numpy(dataset.edge_index_nps[relation]).long().to(device)

    def reverse(edge_index):
        # Flip along dim 0 to reverse the relation's direction; a fresh tensor
        # is produced on every call, so no two meta-paths share a flipped tensor.
        return torch.flip(edge_index, dims=[0])

    user2item = load_edges('user2item')
    year2item = load_edges('year2item')
    actor2item = load_edges('actor2item')
    director2item = load_edges('director2item')
    writer2item = load_edges('writer2item')
    genre2item = load_edges('genre2item')
    tag2item = load_edges('tag2item')
    tag2user = load_edges('tag2user')

    # Order matters: it has to line up with the per-channel configuration
    # (e.g. one entry of meta_path_steps per meta-path).
    return [
        [user2item, reverse(user2item)],
        [reverse(user2item), user2item],
        [year2item, reverse(user2item)],
        [actor2item, reverse(user2item)],
        [writer2item, reverse(user2item)],
        [director2item, reverse(user2item)],
        [genre2item, reverse(user2item)],
        [tag2item, reverse(user2item)],
        [tag2user, user2item],
    ]

In [4]:
import torch
from torch_geometric.nn import GCNConv
import sys

sys.path.insert(0, "/Users/sayamsingla/Desktop/Warwick/PEAGNN/graph_recsys_benchmark/models")
from base import PEABaseChannel, PEABaseRecsysModel


class PEAGCNChannel(PEABaseChannel):
    """A single meta-path channel built from a stack of ``GCNConv`` layers.

    Keyword options read here: ``num_steps``, ``num_nodes``, ``dropout``,
    ``emb_dim``, ``repr_dim`` and, unless ``num_steps`` is 1, ``hidden_size``.
    """

    def __init__(self, **kwargs):
        super().__init__()
        n_steps = kwargs['num_steps']
        self.num_steps = n_steps
        self.num_nodes = kwargs['num_nodes']
        self.dropout = kwargs['dropout']

        # Describe each layer by the kwargs keys holding its input and output
        # sizes; the sizes themselves are looked up when the layer is built.
        if n_steps == 1:
            layer_keys = [('emb_dim', 'repr_dim')]
        else:
            layer_keys = [('emb_dim', 'hidden_size')]
            # A non-positive repeat count yields no intermediate layers.
            layer_keys += [('hidden_size', 'hidden_size')] * (n_steps - 2)
            layer_keys.append(('hidden_size', 'repr_dim'))

        self.gnn_layers = torch.nn.ModuleList()
        for in_key, out_key in layer_keys:
            self.gnn_layers.append(GCNConv(kwargs[in_key], kwargs[out_key]))

        # reset_parameters is not defined in this class; it is expected to be
        # provided by the base class.
        self.reset_parameters()


class PEAGCNRecsysModel(PEABaseRecsysModel):
    """PEA recommender model whose channels are ``PEAGCNChannel`` instances."""

    def __init__(self, **kwargs):
        # Always use the GCN channel, replacing any caller-supplied value.
        kwargs.update(channel_class=PEAGCNChannel)
        super().__init__(**kwargs)

In [6]:
import torch
import os
import sys

sys.path.insert(0,"/Users/sayamsingla/Desktop/Warwick/PEAGNN/graph_recsys_benchmark")

from graph_recsys_benchmark.models import PEAGCNRecsysModel
from graph_recsys_benchmark.utils import get_folder_path, update_pea_graph_input
from graph_recsys_benchmark.solvers import BaseSolver

# Fixed labels for this experiment: model family, loss, model name, graph type.
MODEL_TYPE = 'Graph'
LOSS_TYPE = 'BPR'
MODEL = 'PEAGCN'
GRAPH_TYPE = 'hete'

# NOTE(review): `data_folder` and `args` are not defined in any visible cell;
# they must exist in the kernel (e.g. from an argparse cell) before this runs.

# Options describing which dataset to build and how to filter / sample it.
dataset_args = {
    'root': data_folder,
    'dataset': args.dataset,
    'name': args.dataset_name,
    # String flags arrive as text; anything other than 'true' means False.
    'if_use_features': args.if_use_features.lower() == 'true',
    'num_negative_samples': args.num_negative_samples,
    'num_core': args.num_core,
    'num_feat_core': args.num_feat_core,
    'cf_loss_type': LOSS_TYPE,
    'type': GRAPH_TYPE,
    'sampling_strategy': args.sampling_strategy,
    'entity_aware': args.entity_aware.lower() == 'true',
    'model': MODEL
}

# Hyper-parameters for the model itself.
model_args = {
    'model_type': MODEL_TYPE,
    'if_use_features': args.if_use_features.lower() == 'true',
    'emb_dim': args.emb_dim,
    'hidden_size': args.hidden_size,
    'repr_dim': args.repr_dim,
    'dropout': args.dropout,
    # Comma-separated text such as "2,2,2" becomes a list of ints.
    'meta_path_steps': [int(i) for i in args.meta_path_steps.split(',')],
    'channel_aggr': args.channel_aggr,
    'entity_aware': args.entity_aware.lower() == 'true',
    'entity_aware_coff': args.entity_aware_coff
}

In [None]:
python3 peagat_solver_bpr.py --dataset=Movielens --dataset_name=latest-small 
--num_core=10 --num_feat_core=10 --sampling_strategy=unseen 
--entity_aware=false --dropout=0 --emb_dim=64 --repr_dim=16 
--hidden_size=64 --meta_path_steps=2,2,2,2,2,2,2,2,2 --entity_aware_coff=0.1 
--init_eval=true --gpu_idx=0 --runs=5 --epochs=30 --batch_size=1024 --save_every_epoch=26 --metapath_test=false
