In [1]:
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
from torch.autograd import Variable
import random
import torch.optim as optim
import pickle
import torch.utils.data
from torch.backends import cudnn
from scipy.sparse import csr_matrix
import math
import bottleneck as bn
import time
import matplotlib.pyplot as plt

In [2]:
# Locations of the raw train/test rating files (tab-separated; read in the
# next cell) and the fixed user/item counts used to size the structures below.
train_file, test_file = (
    "yahooR3/ydata-ymusic-rating-study-v1_0-train.txt",
    "yahooR3/ydata-ymusic-rating-study-v1_0-test.txt",
)

# Number of users / items; used for loop bounds and dense-matrix shapes.
num_user, num_item = 15400, 1000

In [3]:
# Both files share one layout: no header row, tab-separated, three columns
# that are named here as user ('u'), item ('i') and rating ('r').
_read_opts = dict(header=None, sep='\t', names=['u', 'i', 'r'])
train_table = pd.read_csv(train_file, **_read_opts)
test_table = pd.read_csv(test_file, **_read_opts)

In [4]:
# Filtering: keep only the training rows whose rating exceeds 3.5.
# The explicit .copy() makes the filtered frame independent of the frame it
# was sliced from, so the column assignments below do not trigger pandas'
# SettingWithCopyWarning (assigning into a boolean-mask slice).
train_table = train_table.loc[train_table['r'] > 3.5].copy()

# Shift user/item ids down by one so that indices start from 0.
# NOTE: this cell is not idempotent -- running it twice subtracts 1 twice.
# Re-run the loading cell first if this cell has to be executed again.
train_table['u'] = train_table['u'] - 1
train_table['i'] = train_table['i'] - 1
test_table['u'] = test_table['u'] - 1
test_table['i'] = test_table['i'] - 1

In [5]:
# Convert the tables to numpy arrays.
# train_pairs holds only the first two columns of the training table
# (named 'u' and 'i' when the file is read); test_triples keeps all columns.
_pair_columns = train_table.columns[0:2]
train_bi = train_table.loc[:, _pair_columns]
train_pairs = train_bi.to_numpy()
test_triples = test_table.to_numpy()

In [6]:
# train_mat: user id -> list of that user's item ids, in the order the
# (user, item) rows appear in train_pairs.
train_mat = {}
for u, i in train_pairs:
    # setdefault creates the empty list on a user's first occurrence.
    train_mat.setdefault(u, []).append(i)

In [7]:
# Validation split: hold out about 10% of each user's training items.
# - Users absent from train_mat are skipped entirely.
# - Users with 4 or fewer items keep all of them in the training split.
# - Otherwise the LAST num_val items of the user's list go to validation
#   (at least one item), and the rest stay in training.
train2_mat = {}
val_mat = {}
for u in range(num_user):
    if u not in train_mat:
        continue

    items = train_mat[u]
    if len(items) <= 4:
        train2_mat[u] = items
        continue

    num_val = max(round(len(items) * 0.1), 1)
    split_at = len(items) - num_val
    train2_mat[u], val_mat[u] = items[:split_at], items[split_at:]

In [8]:
# Flatten the per-user dicts back into (user, item) pair lists, visiting
# users in ascending id order and keeping each user's item order.
train2_pairs = []
val_pairs = []
for u in range(num_user):
    # Users missing from a dict contribute no pairs to that list.
    train2_pairs.extend((u, i) for i in train2_mat.get(u, ()))
    val_pairs.extend((u, i) for i in val_mat.get(u, ()))

# Array versions of the pair lists (one row per pair).
train2_pair = np.asarray(train2_pairs)
val_pair = np.asarray(val_pairs)

In [9]:
# Dense 0/1 interaction matrices of shape (num_user, num_item).
#
# The pairs are written with a single vectorized integer-array assignment
# instead of one Python-level `matrix[u][i] = 1` per pair. The reshape to
# (-1, 2) with an integer dtype keeps an empty pair array (shape (0,))
# working, as it did with the original loops.

# train_matrix: 1 where (user, item) is in the training split.
train_matrix = np.zeros((num_user, num_item))
train_idx = np.asarray(train2_pair, dtype=np.int64).reshape(-1, 2)
train_matrix[train_idx[:, 0], train_idx[:, 1]] = 1

# trainval_matrix: training entries plus validation entries. It starts from
# a copy of train_matrix rather than replaying every training pair again.
trainval_matrix = train_matrix.copy()
val_idx = np.asarray(val_pair, dtype=np.int64).reshape(-1, 2)
trainval_matrix[val_idx[:, 0], val_idx[:, 1]] = 1

In [10]:
# Build two per-user dicts from the test triples in a single pass:
#   test_cdd: user id -> every item id that appears for the user in the test set
#   test_mat: user id -> only the items whose rating exceeds 3.5
test_mat = {}
test_cdd = {}
for u, i, r in test_triples:
    test_cdd.setdefault(u, []).append(i)
    if r > 3.5:
        test_mat.setdefault(u, []).append(i)

In [11]:
# Write every preprocessed artifact under yahooR3/ (np.save appends ".npy").
# The dict objects are not plain numeric arrays, so np.save stores them as
# pickled object arrays; reading them back needs np.load(..., allow_pickle=True).
# Note that 'trainval_dic' is written from train_mat, the per-user dict built
# before the validation split.
_outputs = [
    ('yahooR3/train', train2_pair),
    ('yahooR3/train_dic', train2_mat),
    ('yahooR3/train_mat', train_matrix),
    ('yahooR3/val', val_pair),
    ('yahooR3/val_dic', val_mat),
    ('yahooR3/trainval_dic', train_mat),
    ('yahooR3/trainval_mat', trainval_matrix),
    ('yahooR3/test', test_triples),
    ('yahooR3/test_dic', test_mat),
    ('yahooR3/test_cdd', test_cdd),
]
for _path, _obj in _outputs:
    np.save(_path, _obj)