In [1]:
pwd

u'/Users/melshrif/Documents/Matrix Completion Project/neural-matrix-completion-master'

In [2]:
import os
import random
import sys
import time

import numpy as np
import scipy.sparse

class DataLoader(object):
    ''' Loads a sparse rating matrix together with its train / validation
        masks and serves mini-batches for a two-branch (row / column) model.

        Typical usage: construct, then call load_data(), then split(), then
        next_batch(...) repeatedly.

        A batch is the tuple (x, y, R, mask, flag):
            x:    dense training rows selected for the batch
            y:    dense training columns selected for the batch
            R:    target block for every (row, column) pair of the batch,
                  linearly rescaled with min_val / max_val
            mask: block of the requested mask for the same pairs
            flag: True when the row or the column index list wrapped around
    '''

    def __init__(self, data_dir):
        ''' data_dir: dataset directory; it is expected to contain
            rating.npz, train_mask.npz and val_mask.npz.

            Nothing is read from disk here; see load_data().
        '''
        self.data_dir = data_dir
        # os.path.join keeps the paths valid whether or not data_dir ends
        # with a path separator.
        self.rating_fname = os.path.join(data_dir, 'rating.npz')
        self.tr_m_fname = os.path.join(data_dir, 'train_mask.npz')
        self.v_m_fname = os.path.join(data_dir, 'val_mask.npz')
        # N rows and M columns of the rating matrix; filled in by load_data().
        self.N = 0
        self.M = 0
        # The next four attributes are not used by this class; they are kept
        # because external code may read them.
        self.n_X = 0
        self.n_Y = 0
        self.X = None
        self.Y = None
        self.R = None
        self.max_val = None
        self.min_val = None
        self.train_mask = None
        self.val_mask = None
        # Cursors into the row / column index lists, per dataset.
        self.current_X_tr_ind = 0
        self.current_Y_tr_ind = 0
        self.current_X_val_ind = 0
        self.current_Y_val_ind = 0

    def load_data(self):
        ''' Read the rating matrix and both masks from disk, record the
            smallest / largest stored rating and reset the index lists.
        '''
        self.R = scipy.sparse.load_npz(self.rating_fname)
        self.N, self.M = self.R.shape
        print('Original data: %d x %d' %(self.R.shape[0], self.R.shape[1]))
        # np.unique returns the sorted distinct stored values, so the first
        # and last entries are the minimum and maximum stored rating.
        val_set = np.unique(self.R.data)
        self.min_val = float(val_set[0])
        self.max_val = float(val_set[-1])
        self.train_mask = scipy.sparse.load_npz(self.tr_m_fname).astype(np.float32)
        self.val_mask = scipy.sparse.load_npz(self.v_m_fname).astype(np.float32)
        print('Finished loading data')
        self.X_tr_indices = np.arange(self.N)
        self.Y_tr_indices = np.arange(self.M)
        self.X_val_indices = np.arange(self.N)
        self.Y_val_indices = np.arange(self.M)
        print('Finished initializing indices')

    def split(self):
        ''' Apply the masks to the ratings and build the model inputs.

            X_tr holds the training ratings row-wise, Y_tr is its transpose
            (column-wise). Both next_batch() and next_full_batch() read their
            inputs from these training views for every dataset.
        '''
        self.R_tr_unnormalized = self.R.multiply(self.train_mask)
        self.R_val_unnormalized = self.R.multiply(self.val_mask)
        self.X_tr = self.R_tr_unnormalized.copy()
        self.Y_tr = self.R_tr_unnormalized.copy().T.tocsr()

    def shuffle_indices(self, for_x=False, for_y=False):
        ''' Draw a new random order for the training row and / or column
            indices.
        '''
        if for_x:
            print('Shuffle train X indices')
            self.X_tr_indices = np.random.permutation(self.N)
        if for_y:
            print('Shuffle train Y indices')
            self.Y_tr_indices = np.random.permutation(self.M)

    def get_num_samples(self, dataset):
        ''' Number of row samples served for `dataset` ('train' or 'val').

            Both datasets iterate over all N rows of the rating matrix, so
            the answer is N in either case. The previous implementation read
            attributes that are never assigned (x_train / x_val) and could
            only raise AttributeError.

            Raises ValueError for any other dataset name.
        '''
        if dataset in ('train', 'val'):
            return self.N
        raise ValueError('Invalid dataset')

    def get_X_dim(self):
        ''' Length of one row-branch input vector. '''
        return self.X_tr.shape[1]

    def get_Y_dim(self):
        ''' Length of one column-branch input vector. '''
        return self.Y_tr.shape[1]

    def next_full_batch(self, dataset):
        ''' Return the whole matrix as one batch: (X_tr, Y_tr, X_tr, mask, True).

            The inputs and the target are the (sparse, unscaled) training
            matrix for both datasets; only the mask depends on `dataset`.

            Raises ValueError for an unknown dataset name.
        '''
        if dataset == 'train':
            mask = self.train_mask
        elif dataset == 'val':
            mask = self.val_mask
        else:
            raise ValueError('Invalid dataset')
        return self.X_tr, self.Y_tr, self.X_tr, mask.astype(np.float32), True

    def get_toread_indices(self, current, all_indices, batch_size):
        ''' Get indices of samples to-be-read from all_indices

            Returns (indices, next_cursor, wrapped). When the requested range
            runs past the end of all_indices, the missing entries are taken
            from the front of all_indices, the next cursor is reset to 0 and
            wrapped is True. A batch that ends exactly on the last entry does
            not count as wrapped; the following call does.
        '''
        n_samples = all_indices.shape[0]
        end = current + batch_size
        if end <= n_samples:
            return all_indices[current:end], end, False
        overflow = end - n_samples
        to_read_indices = np.append(all_indices[current:n_samples],
                                    all_indices[0:overflow])
        return to_read_indices, 0, True

    def get_elements(self, R, x_indices, y_indices):
        ''' get values of all pairs of selected rows and columns

            Element-by-element reference implementation; see
            get_elements_vectorized() for the one used by next_batch().
        '''
        n = x_indices.shape[0]
        m = y_indices.shape[0]
        values = np.zeros((n,m))
        for i in range(n):
            for j in range(m):
                values[i,j] = R[x_indices[i],y_indices[j]]
        return values

    def get_elements_vectorized(self, R, x_indices, y_indices):
        ''' Dense float32 block R[x_indices, y_indices] for every pair of the
            selected rows and columns; result shape is
            (len(x_indices), len(y_indices)).

            R may be an indexable scipy sparse matrix or a 2-D numpy array.
            The block is cut out with two slicing steps (rows, then columns)
            instead of sampling every (row, column) pair individually.
        '''
        x_indices = np.asarray(x_indices)
        y_indices = np.asarray(y_indices)
        block = R[x_indices, :][:, y_indices]
        if scipy.sparse.issparse(block):
            block = block.toarray()
        return np.asarray(block, dtype=np.float32)

    def next_batch(self, bs_x, bs_y, dataset, verbose=False):
        ''' read next batch of input

            bs_x, bs_y: number of rows / columns in the batch
            dataset:    'train' or 'val'
            verbose:    print how long densifying x and y took

            Returns (x, y, R, mask, flag) as described on the class.

            x and y always come from the training views (X_tr / Y_tr); only
            the target block R and the mask depend on `dataset`.

            R is rescaled so that min_val maps to -1 and max_val to +1.
            Entries that are 0 in the masked rating matrix are rescaled too
            and therefore end up below -1 (e.g. -1.5 for ratings in 1..5);
            `mask` tells which entries of R are real ratings.

            Raises ValueError for an unknown dataset name.
        '''
        if dataset == 'train':
            start_x = self.current_X_tr_ind
            start_y = self.current_Y_tr_ind
            all_x_indices = self.X_tr_indices
            all_y_indices = self.Y_tr_indices
            full_mask = self.train_mask
            full_R = self.R_tr_unnormalized
        elif dataset == 'val':
            start_x = self.current_X_val_ind
            start_y = self.current_Y_val_ind
            all_x_indices = self.X_val_indices
            all_y_indices = self.Y_val_indices
            full_mask = self.val_mask
            full_R = self.R_val_unnormalized
        else:
            # An explicit exception (not `assert`) so the check survives -O.
            raise ValueError('Invalid dataset')
        x_indices, start_x, flag_x = self.get_toread_indices(start_x, all_x_indices, bs_x)
        y_indices, start_y, flag_y = self.get_toread_indices(start_y, all_y_indices, bs_y)

        start = time.time()
        x = self.X_tr[x_indices,:].todense()
        y = self.Y_tr[y_indices,:].todense()
        if verbose:
            print('Load dense x and y takes %f s' %(time.time() - start))

        R = self.get_elements_vectorized(full_R, x_indices, y_indices)
        mask = self.get_elements_vectorized(full_mask, x_indices, y_indices)

        # scale R to be in range [-1,1]
        mid = (self.max_val + self.min_val) / 2
        R = (R - mid) / (mid - self.min_val)

        # Persist the cursors so the next call continues where this one ended.
        if dataset == 'train':
            self.current_X_tr_ind = start_x
            self.current_Y_tr_ind = start_y
            # A wrap-around ends a pass over the rows / columns: reshuffle
            # that axis before the next pass (training only).
            if flag_x:
                self.shuffle_indices(for_x=True)
            if flag_y:
                self.shuffle_indices(for_y=True)
        else:
            self.current_X_val_ind = start_x
            self.current_Y_val_ind = start_y
        flag = flag_x or flag_y
        return x, y, R, mask, flag


In [3]:
import tensorflow as tf

class ModelConfig(object):
    """Default architecture hyperparameters, used as flag defaults below.

    Derives from ``object`` like ``TrainConfig`` so both configuration
    classes are new-style classes on Python 2 as well.
    """
    u_hidden_sizes = [2048,1024]    # hidden layers' sizes for the row branch
    v_hidden_sizes = [2048,1024]    # hidden layers' sizes for the column branch
    # NOTE(review): a keep probability of 0.1 means 90% of the units are
    # dropped -- confirm this is intended and not a dropout *rate* of 0.1.
    dropout_keep_prob = 0.1         # dropout rate = 1.0 - dropout_keep_prob
    use_bn = True                   # use batch normalization after fully-connected layer
    activation_fn = 'relu'          # activation function
    summarization = False           # use summarization layers
    n_u_summ_filters = [32]         # no. conv filters in summarization layers in the row branch
    n_v_summ_filters = [32]         # no. conv filters in summarization layers in the column branch
    u_summ_layer_sizes = [20]       # conv filter sizes in summarization layers in the row branch
    v_summ_layer_sizes = [10]       # conv filter sizes in summarization layers in the column branch

class TrainConfig(object):
    """Default hyperparameters of the training loop (used as flag defaults)."""

    # Mini-batch shape: rows x columns sampled per step. The column count
    # should follow the row/column ratio of the original matrix.
    batch_size_x = 100
    batch_size_y = 170

    # Learning-rate schedule: start value, multiplicative decay factor and
    # the number of epochs between two decays.
    initial_lr = 0.01
    lr_decay_factor = 0.65
    num_epochs_per_decay = 50

    # Run length and bookkeeping. One epoch is one full pass through all the
    # rows or columns.
    n_epochs = 1000
    save_every_n_epochs = 500       # checkpoint interval, in epochs
    log_every_n_steps = 20          # training-log interval, in steps

    # Weight of the L2 regularization term.
    weight_decay = 0.0

def arr_to_string(arr):
    """Join the items of ``arr`` into one comma-separated string.

    Each item is converted with ``str``. The input is left untouched: the
    previous version overwrote every element of ``arr`` with its string form,
    which silently turned the integer lists stored on ``ModelConfig`` into
    lists of strings as soon as the flags were defined.

    An empty sequence yields an empty string.
    """
    return ','.join(str(item) for item in arr)

# Model flags, registered in the order of the ModelConfig attributes.
# Each row: (DEFINE_* function, flag name, whether the default is a list that
# has to be serialised to a comma-separated string).
for _define, _name, _is_list in (
        (tf.flags.DEFINE_string, 'u_hidden_sizes', True),
        (tf.flags.DEFINE_string, 'v_hidden_sizes', True),
        (tf.flags.DEFINE_float, 'dropout_keep_prob', False),
        (tf.flags.DEFINE_boolean, 'use_bn', False),
        (tf.flags.DEFINE_string, 'activation_fn', False),
        (tf.flags.DEFINE_boolean, 'summarization', False),
        (tf.flags.DEFINE_string, 'n_u_summ_filters', True),
        (tf.flags.DEFINE_string, 'n_v_summ_filters', True),
        (tf.flags.DEFINE_string, 'u_summ_layer_sizes', True),
        (tf.flags.DEFINE_string, 'v_summ_layer_sizes', True)):
    _default = getattr(ModelConfig, _name)
    if _is_list:
        _default = arr_to_string(_default)
    _define(_name, _default, '')

# Training flags, registered in the order of the TrainConfig attributes.
for _define, _name in (
        (tf.flags.DEFINE_integer, 'batch_size_x'),
        (tf.flags.DEFINE_integer, 'batch_size_y'),
        (tf.flags.DEFINE_float, 'initial_lr'),
        (tf.flags.DEFINE_float, 'lr_decay_factor'),
        (tf.flags.DEFINE_integer, 'num_epochs_per_decay'),
        (tf.flags.DEFINE_integer, 'n_epochs'),
        (tf.flags.DEFINE_integer, 'save_every_n_epochs'),
        (tf.flags.DEFINE_integer, 'log_every_n_steps'),
        (tf.flags.DEFINE_float, 'weight_decay')):
    _define(_name, getattr(TrainConfig, _name), '')

# Drop the loop temporaries so the cell leaves no extra names behind.
del _define, _name, _is_list, _default

# Handle on the global flag values defined above.
CONFIGS = tf.app.flags.FLAGS

In [5]:
import tensorflow as tf
import numpy as np 
# import configs.configs_ML100K as configs
# from model import NMC
# from data_loader import DataLoader
import time 

# Global flag container; the path flags below are added to it.
FLAGS = tf.app.flags.FLAGS

# Path-related string flags: (name, default, help text).
for _name, _default, _help in (
        ("data_dir", "./data/MovieLens100K/", "Data directory."),
        ("output_basedir", "./outputs/", "Directory for saving and loading model checkpoints."),
        ("pretrained_fname", "", "Name of the pretrained model checkpoints (to resume from)"),
        ("output_dir", "", "Model output directory.")):
    tf.flags.DEFINE_string(_name, _default, _help)
del _name, _default, _help

# The snapshot directory always lives under output_basedir; plain string
# concatenation, so output_basedir must end with a path separator.
FLAGS.output_dir = FLAGS.output_basedir + 'snapshots/snapshot'

In [6]:
# Build the loader for the directory given by the data_dir flag; the
# constructor only records file paths, nothing is read from disk yet.
dl = DataLoader(FLAGS.data_dir)

In [7]:
# Read the rating matrix and both masks from disk and initialise the
# row / column index lists.
dl.load_data()

Original data: 943 x 1682
Finished loading data
Finished initializing indices


In [13]:
# Apply the train / val masks to the ratings and build the row-wise (X_tr)
# and column-wise (Y_tr) training inputs.
dl.split()

In [22]:
# Length of one row-branch input vector (second dimension of X_tr).
x_dim = dl.get_X_dim()

In [23]:
# Length of one column-branch input vector (second dimension of Y_tr).
y_dim = dl.get_Y_dim()

In [25]:
# Y_tr is the transposed training matrix, so this equals the number of rows
# of the original rating matrix.
y_dim

943

In [27]:
# Short alias for the flags object that carries the model / training settings.
cfgs = CONFIGS

In [30]:
# Fail early if no output directory is configured.
assert FLAGS.output_dir, "--output_dir is required"
# Create training directory.
output_dir = FLAGS.output_dir
if not tf.gfile.IsDirectory(output_dir):
    tf.logging.info("Creating training directory: %s", output_dir)
    tf.gfile.MakeDirs(output_dir)

INFO:tensorflow:Creating training directory: ./outputs/snapshots/snapshot


In [32]:
# Draw one training batch: dense row inputs x, dense column inputs y, the
# rescaled target block R, the matching train-mask block and the
# wrap-around flag.
x, y, R, mask, flag = dl.next_batch(cfgs.batch_size_x, cfgs.batch_size_y, 'train')

In [36]:
# Expected: (batch_size_x, number of columns of the rating matrix).
x.shape

(100, 1682)

In [37]:
# Expected: (batch_size_y, number of rows of the rating matrix).
y.shape

(170, 943)

In [38]:
# Expected: (batch_size_x, batch_size_y) -- one target per (row, column) pair.
R.shape

(100, 170)

In [39]:
# Same shape as R: one mask entry per (row, column) pair of the batch.
mask.shape

(100, 170)

In [42]:
# True only when the row or column index list wrapped around in this batch.
flag

False

In [62]:
# First 50 unscaled training ratings of the first batch row; 0 means the
# entry is not part of the masked training matrix.
x[0,0:50]

matrix([[ 5.,  3.,  4.,  3.,  0.,  5.,  4.,  1.,  5.,  0.,  0.,  0.,  5.,
          5.,  0.,  5.,  3.,  4.,  0.,  0.,  1.,  0.,  4.,  3.,  4.,  0.,
          2.,  4.,  1.,  3.,  3.,  0.,  4.,  2.,  0.,  2.,  2.,  3.,  4.,
          3.,  2.,  5.,  0.,  0.,  0.,  0.,  4.,  5.,  3.,  0.]], dtype=float32)

In [63]:
# First 50 unscaled training ratings of the first batch column (a row of the
# transposed training matrix).
y[0,0:50]

matrix([[ 5.,  4.,  0.,  0.,  4.,  4.,  0.,  0.,  0.,  4.,  0.,  0.,  3.,
          0.,  1.,  0.,  4.,  0.,  0.,  3.,  5.,  0.,  0.,  0.,  5.,  3.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  5.,  0.,
          0.,  0.,  5.,  5.,  4.,  5.,  0.,  0.,  0.,  2.,  0.]], dtype=float32)

In [64]:
# Rescaled targets of the first batch row. Entries that are 0 in the masked
# rating matrix are rescaled as well and show up below -1 (-1.5 here); they
# are not ratings and correspond to 0 in `mask`.
R[0,0:50]

array([ 1. ,  0. ,  0.5,  0. , -1.5,  1. ,  0.5, -1. ,  1. , -1.5, -1.5,
       -1.5,  1. ,  1. , -1.5,  1. ,  0. ,  0.5, -1.5, -1.5, -1. , -1.5,
        0.5,  0. ,  0.5, -1.5, -0.5,  0.5, -1. ,  0. ,  0. , -1.5,  0.5,
       -0.5, -1.5, -0.5, -0.5,  0. ,  0.5,  0. , -0.5,  1. , -1.5, -1.5,
       -1.5, -1.5,  0.5,  1. ,  0. , -1.5], dtype=float32)

In [65]:
# Train-mask entries of the first batch row: 1 marks a training rating,
# 0 marks an entry to ignore in R.
mask[0,0:50]

array([ 1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  1.,
        1.,  0.,  1.,  1.,  1.,  0.,  0.,  1.,  0.,  1.,  1.,  1.,  0.,
        1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  0.], dtype=float32)