In [1]:
import os
import copy
import glob
import shutil
import pandas as pd
import numpy as np
from scipy.io import savemat
import h5py
import hdf5storage

import torch
from torch_geometric.utils import one_hot
from torch_scatter import scatter_add
from torch_geometric.data import InMemoryDataset, Data, download_url, extract_zip

In [2]:
def cleaning_df(file_path, col_removed, col_features, separator=r'|', encoding='latin-1', ):
    """Load a header-less delimited file and keep only the feature columns.

    The columns of the file are named ``col_removed`` followed by
    ``col_features`` (in that order); the ``col_removed`` columns are then
    dropped from the resulting frame.

    Args:
        file_path: Path of the delimited text file to read.
        col_removed: Names given to the leading columns that are discarded.
            May be empty.
        col_features: Names given to the remaining columns, which are kept.
        separator: Field delimiter forwarded to ``pandas.read_csv``.
        encoding: Text encoding forwarded to ``pandas.read_csv``.

    Returns:
        tuple: ``(df, col_length)`` where ``df`` is the frame without the
        removed columns and ``col_length`` is the number of columns left.
    """
    # Plain list concatenation instead of np.concatenate: an empty
    # ``col_removed`` would otherwise become a float64 array and the column
    # names would only survive through number -> string dtype promotion.
    removed = list(col_removed)
    col = removed + list(col_features)

    # extract data from csv
    df = pd.read_csv(file_path, sep=separator, names=col, header=None, engine='python', encoding=encoding)

    # delete unwanted columns
    df = df.drop(columns=removed)
    col_length = len(df.columns)

    return df, col_length

def store_into_matlab(data, dataset, mat_file_name, main_location='../raw_data/',):
    """Write a base matrix and its train/test masks through ``hdf5storage``.

    Args:
        data: Mapping with the keys ``'base_data'``, ``'training_data'`` and
            ``'testing_data'``. The two mask entries must provide ``.numpy()``.
        dataset: Sub-directory name appended to ``main_location``.
        mat_file_name: File name appended after the dataset directory.
        main_location: Prefix of the output path (expected to end with a
            path separator, since the parts are joined by concatenation).

    The stored dictionary uses the keys ``'M'`` (base data), ``'Otraining'``
    and ``'Otest'``.
    """
    # Look the entries up by key. Sorting the items and unpacking them
    # positionally yields base/testing/training (alphabetical key order),
    # which silently stored the test mask as 'Otraining' and vice versa.
    matdict = {
        'M': data['base_data'],
        'Otraining': data['training_data'].numpy(),
        'Otest': data['testing_data'].numpy(),
    }

    path_store =  main_location + dataset + '/'+ mat_file_name
    hdf5storage.write(matdict, '.', path_store, matlab_compatible=True)
    print('Sucessfully created '+ mat_file_name+ '.mat file at' + path_store)

In [3]:
def split_data(clean_df, col_length, test_fraction=0.2, seed=None):
    """Randomly split the non-zero entries of a matrix into train/test masks.

    Args:
        clean_df: 2-D array-like accepted by ``torch.tensor``; it is returned
            unchanged as ``'base_data'``.
        col_length: Number of columns of the generated masks.
        test_fraction: Share of the non-zero entries assigned to the test
            mask (``int(test_fraction * count)`` entries). Defaults to 0.2.
        seed: Optional seed. When given, a private ``RandomState`` is used so
            the split is reproducible; when ``None`` the global NumPy RNG is
            used, exactly as before.

    Returns:
        dict: ``{'base_data', 'training_data', 'testing_data'}`` where the two
        masks are float tensors of shape ``(len(clean_df), col_length)`` with
        ones at the selected positions.

    Raises:
        ValueError: If ``test_fraction`` is outside ``[0, 1]``.
    """
    if not 0.0 <= test_fraction <= 1.0:
        raise ValueError('test_fraction must be within [0, 1]')

    base_data = clean_df
    # NOTE(review): the cast to long truncates fractional values, so entries
    # strictly between 0 and 1 are treated as empty when locating non-zeros.
    # Kept as-is to preserve existing results - confirm this is intended.
    torch_base = torch.tensor(clean_df, dtype=torch.long)
    idx_base = torch.nonzero(torch_base)
    num_observed = idx_base.shape[0]

    # Select the requested share of the non-zero positions for testing.
    n = int(test_fraction * num_observed)
    print('number of test:', n)

    # Boolean membership mask instead of repeated `i in idx` scans.
    is_test = np.zeros(num_observed, dtype=bool)
    if num_observed:
        rng = np.random if seed is None else np.random.RandomState(seed)
        idx = rng.choice(num_observed, n, replace=False)
        is_test[idx] = True
    test_sel = torch.from_numpy(is_test)

    # Everything that was not drawn for testing is used for training.
    idx_training = idx_base[~test_sel]
    idx_testing = idx_base[test_sel]

    # Guard the ratio: a matrix without non-zero entries has nothing to split.
    train_share = len(idx_training) / num_observed if num_observed else 0.0
    print("training percentage: ", train_share)

    row_length = len(base_data)
    training_data = torch.zeros(row_length, col_length)
    training_data[idx_training[:, 0], idx_training[:, 1]] = 1.0

    testing_data = torch.zeros(row_length, col_length)
    testing_data[idx_testing[:, 0], idx_testing[:, 1]] = 1.0

    print('base data shape:',base_data.shape)
    print('train data shape:',training_data.shape)
    print('test data shape:',testing_data.shape)

    data = {'base_data': base_data, 'training_data': training_data, 'testing_data': testing_data}
    return data

In [4]:
def generate_user_feature_df(raw_df, col_length):
    """Build a dense per-user feature matrix from a user table.

    Feature layout of every row:
        column 0   - age divided by the maximum age in the table
        column 1   - gender (``'M'`` -> 0.0, ``'F'`` -> 1.0)
        column 2.. - one-hot occupation, one column per distinct occupation
                     in sorted order

    Rows are ordered by sorted distinct ``'user id'``.

    Args:
        raw_df: DataFrame with the columns ``'user id'``, ``'age'``,
            ``'gender'`` and ``'occupation'``.
        col_length: Unused; kept so existing callers keep working.

    Returns:
        tuple: ``(u_features, num_feats)`` - a float32 array with one row per
        input row and ``num_feats`` columns, and ``num_feats`` itself.

    Raises:
        KeyError: If a gender value other than ``'M'``/``'F'`` is found.
    """
    users_df = raw_df
    num_users = users_df.shape[0]
    if num_users == 0:
        # Nothing to encode: only the age and gender columns exist.
        return np.zeros((0, 2), dtype=np.float32), 2

    user_ids = users_df['user id'].values.tolist()
    occupations = users_df['occupation'].values.tolist()
    genders = users_df['gender'].values.tolist()
    ages = users_df['age'].values.tolist()

    # Built-in float: the `np.float` alias no longer exists in current NumPy.
    age_max = float(users_df['age'].values.max())

    gender_dict = {'M': 0., 'F': 1.}
    # Sort before enumerating: iteration order of a set of strings changes
    # between interpreter runs, which made the column layout non-reproducible.
    u_dict = {u: i for i, u in enumerate(sorted(set(user_ids)), start=0)}
    occupation_dict = {o: i for i, o in enumerate(sorted(set(occupations)), start=2)}

    num_feats = 2 + len(occupation_dict)

    u_features = np.zeros((num_users, num_feats), dtype=np.float32)
    for u_id, age, gender, occupation in zip(user_ids, ages, genders, occupations):
        row = u_dict[u_id]
        # age
        u_features[row, 0] = age / age_max
        # gender
        u_features[row, 1] = gender_dict[gender]
        # occupation
        u_features[row, occupation_dict[occupation]] = 1.
    return u_features, num_feats



In [5]:
# create item_features dataset in matlab
item_path = "../raw_data/ml_100k/u.item" # TODO: attach it somewhere
# NOTE(review): only four leading columns are named here. If u.item carries
# one more leading column than that, pandas uses the first file column as the
# index and these names shift by one - the 19 genre columns kept below are
# unaffected either way. Confirm against the dataset README.
col_removed = ['item_id','title','date','url']
col_features = [
    'unknown', 'action', 'adventure', 'animation', 'children', 'comedy',
    'crime', 'documentary', 'drama', 'fantasy', 'noir', 'horror', 'musical',
    'mystery', 'romance', 'scifi', 'thriller', 'war', 'western',
]
df, col_length = cleaning_df(item_path, col_removed, col_features)
clean_df = df.to_numpy()

# Seed NumPy's global RNG so the random train/test split (and therefore the
# written file) is the same on every Restart & Run All.
np.random.seed(0)
data = split_data(clean_df, col_length)
store_into_matlab(data, 'ml_100k', 'item_features')

number of test: 578
training percentage:  0.8002073971655721
base data shape: (1682, 19)
train data shape: torch.Size([1682, 19])
test data shape: torch.Size([1682, 19])
Sucessfully created item_features.mat file at../raw_data/ml_100k/item_features


In [6]:
# create user_features dataset in matlab
# NOTE: despite its name, `item_path` points at the *user* file in this cell
# (the name is kept so code relying on the existing global keeps working).
item_path = "../raw_data/ml_100k/u.user" # TODO: attach it somewhere
col_removed = []
col_features = ['user id', 'age', 'gender', 'occupation', 'zip code']

df, col_length = cleaning_df(item_path, col_removed, col_features)
clean_df, num_feats = generate_user_feature_df(df, col_length)

# Seed NumPy's global RNG so the random train/test split (and therefore the
# written file) is the same on every Restart & Run All.
np.random.seed(0)
data = split_data(clean_df, num_feats)
store_into_matlab(data, 'ml_100k', 'user_features')

number of test: 243
training percentage:  0.8003286770747741
base data shape: (943, 23)
train data shape: torch.Size([943, 23])
test data shape: torch.Size([943, 23])
Sucessfully created user_features.mat file at../raw_data/ml_100k/user_features


# UNUSED BLOCKS!

In [9]:
# NOT USED!
def create_df(csv_path):
    """Read a tab-separated (user, item, relation, timestamp) file.

    Returns:
        tuple: ``(frame, nums)``. ``frame`` holds the ``user_id``, ``item_id``
        and ``relation`` columns exactly as read (timestamp dropped). ``nums``
        maps ``'user'``, ``'item'``, ``'node'`` and ``'edge'`` to counts
        derived from the largest ids and the number of rows.
    """
    ratings = pd.read_csv(csv_path, sep='\t', names=['user_id', 'item_id', 'relation', 'ts'])
    ratings = ratings.drop('ts', axis=1)

    # Counts come from a shifted-by-one view of the values; the returned frame
    # itself keeps the values found in the file.
    top = (ratings - 1).max()
    nums = {
        'user': top['user_id'] + 1,
        'item': top['item_id'] + 1,
        'node': top['user_id'] + top['item_id'] + 2,
        'edge': len(ratings),
    }
    return ratings, nums

# Load one ratings file through create_df and preview the first rows of the
# returned frame; `nums` keeps the derived counts.
csv_path = "../raw_data/ml_100k/u1.base" # TODO: attach it somewhere
user_item_df, nums = create_df(csv_path)
user_item_df.head()

Unnamed: 0,user_id,item_id,relation
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3


In [87]:

# Scratch experiment: a small integer matrix plus an index tensor.
# NOTE(review): the tensor shown in the saved output does not match `x`; it
# appears to come from an earlier version of this cell.
x = [
    [1, 0, 0],
    [0, 1, 0],
    [0, 1, 1],
]
indices = torch.tensor([[1, 1], [2, 2]])
a = torch.tensor(x, dtype=torch.long)

print(x)
# Disabled experiment (advanced-index assignment), kept for reference:
# a[torch.arange(a.size(0)).unsqueeze(1), indices] = 1.
a


[[1, 0, 0], [0, 1, 0], [0, 1, 1]]


tensor([[1, 0, 0],
        [0, 0, 0],
        [0, 1, 0]])

In [73]:
# Scratch experiment: for every row of `a`, set to 1 the columns listed in the
# matching row of `index`.
index = torch.tensor([[0, 1], [1, 2], [4, 1]], dtype=torch.long)
a = torch.zeros(3, 6)

for i, ind in enumerate(index):
    a[i, ind] = 1

a

tensor([[1., 1., 0., 0., 0., 0.],
        [0., 1., 1., 0., 0., 0.],
        [0., 1., 0., 0., 1., 0.]])