In [1]:
import os
import copy
import glob
import shutil
import pandas as pd
import numpy as np
from scipy.io import savemat

import torch
from torch_geometric.utils import one_hot
from torch_scatter import scatter_add
from torch_geometric.data import InMemoryDataset, Data, download_url, extract_zip

In [2]:
def create_df(csv_path):
    """Load a tab-separated rating file and summarise its size.

    The file is read without a header row as the four columns
    ``user_id``, ``item_id``, ``relation`` and ``ts``; the ``ts`` column is
    discarded.

    Parameters
    ----------
    csv_path : str
        Path of the tab-separated file to read.

    Returns
    -------
    tuple
        ``(frame, nums)`` where ``frame`` holds the ``user_id``, ``item_id``
        and ``relation`` columns exactly as stored in the file (ids are NOT
        shifted), and ``nums`` is a dict with the keys ``'user'``, ``'item'``,
        ``'node'`` and ``'edge'``: the largest user id, the largest item id,
        their sum, and the number of rows.
    """
    raw = pd.read_csv(csv_path, sep='\t',
                      names=['user_id', 'item_id', 'relation', 'ts'])
    raw = raw.drop('ts', axis=1)

    # The counts are derived from zero-based ids; the caller still receives
    # the untouched frame.
    zero_based = raw - 1
    col_max = zero_based.max()
    top_user = col_max['user_id']
    top_item = col_max['item_id']

    nums = {'user': top_user + 1,
            'item': top_item + 1,
            'node': top_user + top_item + 2,
            'edge': len(zero_based)}
    return raw, nums

# Build the path with os.path.join instead of hard-coded backslashes so the
# notebook also runs on non-Windows hosts (on Windows the result is unchanged).
csv_path = os.path.join("data", "ml-100k", "raw", "u1.base")  # TODO: attach it somewhere
user_item_df, nums = create_df(csv_path)
user_item_df.head()

Unnamed: 0,user_id,item_id,relation
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3


In [13]:
# Seed the random number generators so the run is reproducible.
# The train/test split in generate_item_genre_df draws from numpy's global
# generator (np.random.choice), which torch.manual_seed does NOT affect, so
# numpy has to be seeded explicitly as well.
torch.manual_seed(0)
np.random.seed(0)

def generate_item_genre_df(item_path, user_item_df,
                           out_path='./data/ml-100k/processed/u1_genre.mat',
                           key_prefix='u1_genre', test_fraction=0.3, seed=None):
    """Build a per-rating genre matrix, split it and save it as a .mat file.

    Every row of ``user_item_df`` is joined with the genre flags of its item
    (read from ``item_path``). The non-zero entries of the resulting matrix
    are split at random into a training and a test matrix, and all three
    matrices are written to ``out_path``.

    Parameters
    ----------
    item_path : str
        Path of the ``|``-separated item file. It is read without a header as
        ``item_id, title, release_date, vid_release_date, url`` followed by
        19 genre columns.
    user_item_df : pandas.DataFrame
        Frame with the columns ``user_id``, ``item_id`` and ``relation``;
        ``item_id`` must use the same ids as the item file.
    out_path : str, optional
        Destination .mat file. Missing parent directories are created.
    key_prefix : str, optional
        Prefix of the variable names stored in the .mat file
        (``<prefix>_base``, ``<prefix>_training``, ``<prefix>_test``).
    test_fraction : float, optional
        Fraction (0..1) of the non-zero entries moved to the test matrix;
        the count is truncated towards zero.
    seed : int or None, optional
        When given, the split is drawn from a private
        ``numpy.random.RandomState(seed)``; otherwise numpy's global generator
        is used (seed it with ``np.random.seed`` for reproducibility).

    Returns
    -------
    numpy.ndarray
        The full (un-split) genre matrix, one row per row of ``user_item_df``.

    Raises
    ------
    ValueError
        If ``test_fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= test_fraction <= 1:
        raise ValueError("test_fraction must be within [0, 1], got %r" % (test_fraction,))

    meta_cols = ['item_id', 'title', 'release_date', 'vid_release_date', 'url']
    genre_cols = ['unknown', 'action', 'adventure', 'animation', 'children', 'comedy',
                  'crime', 'documentary', 'drama', 'fantasy', 'noir', 'horror', 'musical',
                  'mystery', 'romance', 'scifi', 'thriller', 'war', 'western']

    # extract data from csv
    item_genre_df = pd.read_csv(item_path, sep='|', names=meta_cols + genre_cols)

    # combine 2 dfs, then keep only the genre flags
    df = user_item_df.join(item_genre_df.set_index('item_id'), on='item_id')
    df = df.drop(['user_id', 'item_id', 'relation', 'title', 'release_date',
                  'vid_release_date', 'url'], axis=1)
    base_data = df.to_numpy()

    # (row, col) coordinates of every non-zero entry, in row-major order
    nonzero_idx = torch.nonzero(torch.tensor(base_data, dtype=torch.long)).numpy()
    n_total = len(nonzero_idx)
    n_test = int(test_fraction * n_total)

    # A boolean mask gives O(n) membership instead of scanning the chosen
    # positions once per entry.
    rng = np.random if seed is None else np.random.RandomState(seed)
    held_out = np.zeros(n_total, dtype=bool)
    if n_test:
        held_out[rng.choice(n_total, n_test, replace=False)] = True
    train_idx = nonzero_idx[~held_out]
    test_idx = nonzero_idx[held_out]

    train_ratio = len(train_idx) / n_total if n_total else float('nan')
    print("training percentage: ", train_ratio)

    # float32 matches the dtype the matrices were previously stored with
    training_data = np.zeros(base_data.shape, dtype=np.float32)
    training_data[train_idx[:, 0], train_idx[:, 1]] = 1
    testing_data = np.zeros(base_data.shape, dtype=np.float32)
    testing_data[test_idx[:, 0], test_idx[:, 1]] = 1

    # Store into matlab file
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    savemat(out_path, mdict={key_prefix + '_base': base_data,
                             key_prefix + '_training': training_data,
                             key_prefix + '_test': testing_data})

    return base_data

# Build the path with os.path.join instead of hard-coded backslashes so the
# notebook also runs on non-Windows hosts (on Windows the result is unchanged).
item_path = os.path.join("data", "ml-100k", "raw", "u.item")  # TODO: attach it somewhere
data = generate_item_genre_df(item_path, user_item_df)
data

training percentage:  0.7000005867270604


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [87]:

# Scratch cell: a small int64 tensor plus a tensor of index pairs, used to
# experiment with assigning values by index.
x = [[1, 0, 0],
     [0, 1, 0],
     [0, 1, 1]]
indices = torch.tensor([[1, 1], [2, 2]])
a = torch.tensor(x, dtype=torch.int64)

print(x)
a
# Disabled experiment, kept for reference. As written the index shapes
# (3, 1) and (2, 2) do not broadcast, so enabling it would raise.
# a[torch.arange(a.size(0)).unsqueeze(1), indices] = 1.


[[1, 0, 0], [0, 1, 0], [0, 1, 1]]


tensor([[1, 0, 0],
        [0, 0, 0],
        [0, 1, 0]])

In [73]:
# Scratch cell: for every row r, set the columns listed in index[r] to 1.
index = torch.tensor([[0, 1], [1, 2], [4, 1]], dtype=torch.long)
a = torch.zeros(index.size(0), 6)

# Iterating over `a` yields row views, so the in-place fill writes into `a`.
for row, cols in zip(a, index):
    row.index_fill_(0, cols, 1)

a

tensor([[1., 1., 0., 0., 0., 0.],
        [0., 1., 1., 0., 0., 0.],
        [0., 1., 0., 0., 1., 0.]])