In [6]:
import json
import pandas as pd
import numpy as np
import scipy.sparse as sp
import scipy.linalg as la
import seaborn as sns
import pickle

# Load Data

In [7]:
# Read the three user/movie interaction tables, smallest first.
# `df` (no suffix) is the full 1.5M-row table.
df100k = pd.read_csv("../data/user_movie_100k.csv")
df500k = pd.read_csv("../data/user_movie_500k.csv")
df = pd.read_csv("../data/user_movie_1500k.csv")

# Train-Val-Test Split

In [1]:
def map_ids(row, mapper):
    """Translate one raw id into its mapped value.

    ``row`` is a single element (the function is applied element-wise to a
    Series) and ``mapper`` is any subscriptable lookup such as a dict.
    Whatever ``mapper[row]`` raises for an unknown key propagates unchanged.
    """
    mapped = mapper[row]
    return mapped

In [3]:
def matrix_data(df):
    """Build a sparse user x movie interaction matrix from a long-format frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose ``user_id`` and ``movie_id`` columns; every row is one
        interaction.

    Returns
    -------
    X_sp : scipy.sparse.csr_matrix
        float64 matrix of shape ``(n_unique_users, n_unique_movies)``. Each
        row of ``df`` contributes 1.0 at ``(user, movie)``; a pair that occurs
        more than once is summed by the COO -> CSR conversion, so such entries
        are greater than 1.
    idx_to_mid : dict
        Column index -> original ``movie_id``.
    uid_to_idx : dict
        Original ``user_id`` -> row index.

    Notes
    -----
    Row/column indices are assigned in order of first appearance in ``df``.
    """
    # ``unique()`` keeps first-appearance order; ``tolist()`` turns the values
    # into plain Python scalars before they become dict keys/values.
    movie_ids = df.movie_id.unique().tolist()
    user_ids = df.user_id.unique().tolist()

    mid_to_idx = {mid: idx for idx, mid in enumerate(movie_ids)}
    idx_to_mid = dict(enumerate(movie_ids))
    uid_to_idx = {uid: idx for idx, uid in enumerate(user_ids)}

    # Vectorised dict lookup instead of one Python-level call per row.
    row_idx = df.user_id.map(uid_to_idx).to_numpy()
    col_idx = df.movie_id.map(mid_to_idx).to_numpy()
    values = np.ones(row_idx.shape[0])

    # Pass the shape explicitly so it is tied to the id maps rather than
    # inferred from the largest index present.
    X_sp = sp.coo_matrix(
        (values, (row_idx, col_idx)),
        shape=(len(user_ids), len(movie_ids)),
        dtype=np.float64,
    )

    return X_sp.tocsr(), idx_to_mid, uid_to_idx

In [4]:
def train_test_split(X, split_count=1, test_fraction=0.2, seed=None):
    """Move ``split_count`` interactions per sampled user into a test matrix.

    Parameters
    ----------
    X : scipy.sparse.csr_matrix
        User x item interaction matrix (rows are users). CSR is expected
        because per-user columns are read via ``X.getrow(user).indices``.
        ``X`` itself is not modified.
    split_count : int, default 1
        Number of stored entries moved from train to test for each sampled
        user. Only users with at least ``2 * split_count`` stored entries are
        eligible, so every sampled user keeps at least ``split_count``
        entries in train.
    test_fraction : float, default 0.2
        The number of sampled users is ``int(test_fraction * X.sum())``, i.e.
        it is derived from the total of the matrix, not from the user count.
    seed : int or None, default None
        If given, sampling uses a private ``np.random.RandomState(seed)`` and
        is reproducible. If None, the global ``np.random`` state is used.

    Returns
    -------
    train : scipy.sparse.csr_matrix
        Copy of ``X`` with the held-out entries set to zero.
    test : scipy.sparse.csr_matrix
        Same shape as ``X``; contains only the held-out entries.
    user_index : list of int
        Row indices of the sampled users, in sampling order.

    Raises
    ------
    ValueError
        If more users are requested than are eligible.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    train = X.copy().tocoo()
    test = sp.lil_matrix(train.shape)

    n_holdout_users = int(test_fraction * train.sum())

    # COO row ids appear once per stored entry, so bincount gives the number
    # of stored entries per user.
    eligible_users = np.where(np.bincount(train.row) >= split_count * 2)[0]

    # Fail loudly here: sampling without replacement cannot return more users
    # than exist, and swallowing that error would leave nothing to split.
    if n_holdout_users > eligible_users.size:
        raise ValueError(
            "test_fraction={} requests {} held-out users but only {} users "
            "have at least {} interactions; lower test_fraction or "
            "split_count".format(
                test_fraction, n_holdout_users, eligible_users.size, split_count * 2
            )
        )

    user_index = rng.choice(
        eligible_users, replace=False, size=n_holdout_users
    ).tolist()

    # LIL supports cheap element-wise assignment.
    train = train.tolil()

    for user in user_index:
        test_X = rng.choice(X.getrow(user).indices, size=split_count, replace=False)
        train[user, test_X] = 0.
        test[user, test_X] = X[user, test_X]

    # No entry may be present in both matrices.
    assert train.multiply(test).nnz == 0, "train and test overlap"
    return train.tocsr(), test.tocsr(), user_index

In [5]:
def val_test_split(X, user_index, seed=None):
    """Split a held-out matrix into validation and test parts by user.

    The first half of ``user_index`` (``len // 2`` users) is returned as the
    validation users and the remainder as the test users. For every test user
    exactly one stored entry of that user's row is moved out of the
    validation matrix into the test matrix; everything else in ``X`` stays in
    the validation matrix.

    Parameters
    ----------
    X : scipy.sparse.csr_matrix
        Held-out user x item matrix. CSR is expected because per-user columns
        are read via ``X.getrow(user).indices``. ``X`` is not modified.
    user_index : list of int
        Row indices of the users that have held-out entries in ``X``.
    seed : int or None, default None
        If given, sampling uses a private ``np.random.RandomState(seed)`` and
        is reproducible. If None, the global ``np.random`` state is used.

    Returns
    -------
    val : scipy.sparse.csr_matrix
    test : scipy.sparse.csr_matrix
    val_user_index : list of int
    test_user_index : list of int
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    # LIL supports cheap element-wise assignment.
    val = X.copy().tolil()
    test = sp.lil_matrix(val.shape)

    half = len(user_index) // 2
    val_user_index = user_index[:half]
    test_user_index = user_index[half:]

    for user in test_user_index:
        test_X = rng.choice(X.getrow(user).indices, size=1, replace=False)
        val[user, test_X] = 0.
        test[user, test_X] = X[user, test_X]

    # No entry may be present in both matrices.
    assert val.multiply(test).nnz == 0, "val and test overlap"
    return val.tocsr(), test.tocsr(), val_user_index, test_user_index

### 100k dataset

In [8]:
# The split helpers sample with NumPy's global RNG. Seed it (the value is
# arbitrary) so the sampled users/items, and the files saved from them, are
# identical on every re-run of this cell.
np.random.seed(42)

# Interaction matrix plus id lookup tables for the 100k dataset.
X_100k, idx_to_mid, uid_to_idx = matrix_data(df100k)
# Default test_fraction (0.2) held out, then the held-out users are halved
# into validation and test.
train_100k, temp_100k, user_index_100k = train_test_split(X_100k)
val_100k, test_100k, val_user_index_100k, test_user_index_100k = val_test_split(temp_100k, user_index_100k)
print("Train data: {}".format(train_100k.sum()))
print("Val data: {}".format(val_100k.sum()))
print("Test data: {}".format(test_100k.sum()))

Train data: 87803.0
Val data: 10975.0
Test data: 10975.0


In [10]:
# Sparse split matrices -> .npz
sp.save_npz('../data/train_100k.npz', train_100k)
sp.save_npz('../data/val_100k.npz', val_100k)
sp.save_npz('../data/test_100k.npz', test_100k)

# Held-out user row indices -> .npy (np.save takes the target path directly)
np.save('../data/val_user_index_100k.npy', np.array(val_user_index_100k))
np.save('../data/test_user_index_100k.npy', np.array(test_user_index_100k))

# Id lookup tables -> pickle. `idx_to_mid` / `uid_to_idx` are rebound by each
# matrix_data() call, so they must still hold the 100k mappings here.
with open('../data/idx_to_mid_100k.pkl', 'wb') as f:
    pickle.dump(idx_to_mid, f)
with open('../data/uid_to_idx_100k.pkl', 'wb') as f:
    pickle.dump(uid_to_idx, f)

In [9]:
# (number of distinct users, number of distinct movies) in the 100k matrix.
X_100k.shape

(28348, 3720)

### 500k dataset

In [11]:
# Build the 500k interaction matrix. NOTE: this rebinds `idx_to_mid` and
# `uid_to_idx`, replacing the mappings produced for the previous dataset.
X_500k, idx_to_mid, uid_to_idx = matrix_data(df500k)

In [12]:
# (number of distinct users, number of distinct movies) in the 500k matrix.
X_500k.shape

(73093, 6736)

In [13]:
# The split helpers sample with NumPy's global RNG. Seed it (the value is
# arbitrary) so the sampled users/items, and the files saved from them, are
# identical on every re-run of this cell.
np.random.seed(42)

# test_fraction=0.10 held out, then the held-out users are halved into
# validation and test.
train_500k, temp_500k, user_index_500k = train_test_split(X_500k, test_fraction=0.10)
val_500k, test_500k, val_user_index_500k, test_user_index_500k = val_test_split(temp_500k, user_index_500k)
print("Train data: {}".format(train_500k.sum()))
print("Val data: {}".format(val_500k.sum()))
print("Test data: {}".format(test_500k.sum()))

Train data: 453415.0
Val data: 25189.0
Test data: 25190.0


In [14]:
# Sparse split matrices -> .npz
sp.save_npz('../data/train_500k.npz', train_500k)
sp.save_npz('../data/val_500k.npz', val_500k)
sp.save_npz('../data/test_500k.npz', test_500k)

# Held-out user row indices -> .npy (np.save takes the target path directly)
np.save('../data/val_user_index_500k.npy', np.array(val_user_index_500k))
np.save('../data/test_user_index_500k.npy', np.array(test_user_index_500k))

# Id lookup tables -> pickle. `idx_to_mid` / `uid_to_idx` are rebound by each
# matrix_data() call, so they must still hold the 500k mappings here.
with open('../data/idx_to_mid_500k.pkl', 'wb') as f:
    pickle.dump(idx_to_mid, f)
with open('../data/uid_to_idx_500k.pkl', 'wb') as f:
    pickle.dump(uid_to_idx, f)

### 1.5M dataset

In [15]:
# Build the 1.5M interaction matrix from the full table `df`. NOTE: this
# rebinds `idx_to_mid` and `uid_to_idx`, replacing the previous dataset's maps.
X_1500k, idx_to_mid, uid_to_idx = matrix_data(df)

In [16]:
# (number of distinct users, number of distinct movies) in the 1.5M matrix.
X_1500k.shape

(94641, 15224)

In [17]:
# The split helpers sample with NumPy's global RNG. Seed it (the value is
# arbitrary) so the sampled users/items, and the files saved from them, are
# identical on every re-run of this cell.
np.random.seed(42)

# test_fraction=0.05 held out, then the held-out users are halved into
# validation and test.
train_1500k, temp_1500k, user_index_1500k = train_test_split(X_1500k, test_fraction=0.05)
val_1500k, test_1500k, val_user_index_1500k, test_user_index_1500k = val_test_split(temp_1500k, user_index_1500k)
print("Train data: {}".format(train_1500k.sum()))
print("Val data: {}".format(val_1500k.sum()))
print("Test data: {}".format(test_1500k.sum()))

Train data: 1417693.0
Val data: 37307.0
Test data: 37308.0


In [18]:
# Sparse split matrices -> .npz
sp.save_npz('../data/train_1500k.npz', train_1500k)
sp.save_npz('../data/val_1500k.npz', val_1500k)
sp.save_npz('../data/test_1500k.npz', test_1500k)

# Held-out user row indices -> .npy (np.save takes the target path directly)
np.save('../data/val_user_index_1500k.npy', np.array(val_user_index_1500k))
np.save('../data/test_user_index_1500k.npy', np.array(test_user_index_1500k))

# Id lookup tables -> pickle. `idx_to_mid` / `uid_to_idx` are rebound by each
# matrix_data() call, so they must still hold the 1.5M mappings here.
with open('../data/idx_to_mid_1500k.pkl', 'wb') as f:
    pickle.dump(idx_to_mid, f)
with open('../data/uid_to_idx_1500k.pkl', 'wb') as f:
    pickle.dump(uid_to_idx, f)

In [19]:
# Also persist the complete (unsplit) 1.5M interaction matrix.
sp.save_npz('../data/X_1500k.npz', X_1500k)