In [None]:
import os
project_name = "reco-tut-cris"; branch = "main"; account = "sparsh-ai"
project_path = os.path.join('/content', project_name)

if not os.path.exists(project_path):
    !cp /content/drive/MyDrive/mykeys.py /content
    import mykeys
    !rm /content/mykeys.py
    path = "/content/" + project_name; 
    !mkdir "{path}"
    %cd "{path}"
    import sys; sys.path.append(path)
    !git config --global user.email "recotut@recohut.com"
    !git config --global user.name  "reco-tut"
    !git init
    !git remote add origin https://"{mykeys.git_token}":x-oauth-basic@github.com/"{account}"/"{project_name}".git
    !git pull origin "{branch}"
    !git checkout main
else:
    %cd "{project_path}"

/content/reco-tut-cris
Initialized empty Git repository in /content/reco-tut-cris/.git/
remote: Enumerating objects: 36, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 36 (delta 7), reused 31 (delta 3), pack-reused 0[K
Unpacking objects: 100% (36/36), done.
From https://github.com/sparsh-ai/reco-tut-cris
 * branch            main       -> FETCH_HEAD
 * [new branch]      main       -> origin/main
Branch 'main' set up to track remote branch 'main' from 'origin'.
Switched to a new branch 'main'


### Dataloader for interest modeling

In [None]:
import os
import csv
import pdb
import time
import pickle
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import LabelEncoder
from torch.utils import data
from torch.utils.data.dataloader import default_collate

In [None]:
def toymd(time):
    """Convert a Unix timestamp (seconds) into a naive ``datetime`` in UTC."""
    utc_moment = datetime.utcfromtimestamp(time)
    return utc_moment

In [None]:
class Dataset(data.Dataset):
    """Item-level dataset for interest modelling.

    Every input row is read as ``[iid, label, feature_0, feature_1, ...]``:
    column 0 is the item id, column 1 the label and the remaining columns are
    cast to ``int`` and stored in ``timediffs``.

    Attributes:
        iids: Array of item ids, one per row.
        timediffs: Integer array holding columns 2.. of every row.
        labels: Integer array, 1 where the row's label is true, else 0.
        most_oldtime: Always ``None``; nothing in this class assigns it.
    """

    def __init__(self, data):
        """Split ``data`` into ids, labels and features.

        Args:
            data: Iterable of rows (a 2-D array or a list of sequences). The
                label may be a boolean or its string form (``'True'`` /
                ``'False'``), which is what NumPy produces when a row mixing
                strings, booleans and numbers is stored in one array.
        """
        st = time.time()

        self.most_oldtime = None

        iids, labels, timediffs = [], [], []
        for row in data:
            iids.append(row[0])
            labels.append(row[1])
            timediffs.append(row[2:])

        self.iids = np.array(iids)
        self.timediffs = np.array(timediffs).astype(int)
        # Compare the string form of every label so that both real booleans
        # and the strings 'True'/'False' are recognised; comparing the raw
        # array against 'True' only worked for string-typed input.
        self.labels = np.array([str(label) == 'True' for label in labels], dtype=bool).astype(int)

        print('Data building time : %.1fs' % (time.time()-st))

    def __getitem__(self, index):
        """Return ``(iid, features, label)`` for the row at ``index``."""
        return self.iids[index], self.timediffs[index], self.labels[index]

    def __len__(self):
        """Returns the number of rows (one per item) in the dataset."""
        return len(self.timediffs)

In [None]:
def build_loader(eachdata, batch_size, shuffle=True, num_workers=0):
    """Builds and returns Dataloader.

    Args:
        eachdata: Rows handed to ``Dataset`` (``[iid, label, features...]``).
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the data every epoch.
        num_workers: Number of worker processes used by the loader.

    Returns:
        A ``torch.utils.data.DataLoader`` over ``Dataset(eachdata)``.
    """

    def my_collate(batch):
        # Drop samples that are None before handing the batch to the default collate.
        kept = [sample for sample in batch if sample is not None]
        return default_collate(kept)

    dataset = Dataset(eachdata)

    data_loader = data.DataLoader(dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=my_collate)

    return data_loader

In [None]:
def build_data_directly(dpath, period, binsize):
    """Build item-level train/valid/test instances from the CSV splits.

    Reads ``train.csv``, ``valid.csv`` and ``test.csv`` from ``dpath``. Every
    row is unpacked as four columns; the second is the item id and the last is
    a Unix timestamp (integer or float formatted). The code reads the first and
    last rows as start/end times, so it assumes the rows are in chronological
    order -- TODO confirm against the data preparation step.

    The training rows are split at ``max(train time) - period`` weeks. Rows
    before the split feed the features used for training; rows from the split
    onwards define the training labels. Validation and test instances use
    features computed on the whole training file and are labelled by whether
    the item occurs in ``valid.csv`` / ``test.csv``.

    Args:
        dpath: Prefix of the CSV files. It is concatenated with the file names
            as-is, so it must end with a path separator.
        period: Length of the label window at the end of the training data, in weeks.
        binsize: Width of each time bin, in weeks.

    Returns:
        ``(trndata, vlddata, tstdata)``: arrays whose rows are
        ``[iid, label, count_bin_0, count_bin_1, ...]``.

    Raises:
        IndexError: If no training row is older than the split time.
    """
    # Fixed-width week bins only need the standard library.
    from datetime import timedelta as _timedelta

    max_bins = 1000  # cap on the number of bins, kept from the original implementation

    def toymd(time):
        return datetime.utcfromtimestamp(time)

    def read_rows(filename):
        # Close the file deterministically instead of leaking the handle.
        with open(dpath + filename, newline='') as handle:
            return np.array(list(csv.reader(handle)))

    def build_data(true_items, item_feature):
        # One instance per item: [iid, label, features]
        output = [[iid, iid in true_items] + list(feature)
                  for iid, feature in item_feature.items()]
        return np.array(output)

    def get_item_feature(data):
        times = data[:,-1].astype(float).astype(int)
        mintime, maxtime = toymd(min(times)), toymd(max(times))

        # Binning training time (D_f) with fixed-sized bins: left edges that
        # start at the earliest event and stay strictly before the latest one.
        bin_width = _timedelta(weeks=binsize)
        edges = [mintime + bin_width * k for k in range(max_bins)]
        bins = np.array([edge for edge in edges if edge < maxtime])
        if len(bins) == 0:
            # All events share one timestamp: keep a single bin so the lookup
            # below has something to index.
            bins = np.array([mintime])

        # Collect the event times of every item
        stamps_by_item = {}
        for u, i, r, t in data:
            stamps_by_item.setdefault(i, []).append(toymd(int(float(t))))

        # Count, per item, how many events fall into each bin. An event goes
        # to the last bin whose left edge is not after it.
        item_feature = {}
        for iid, stamps in stamps_by_item.items():
            counts = np.zeros(len(bins), dtype=int)
            for stamp in stamps:
                counts[np.where(bins <= stamp)[0][-1]] += 1
            item_feature[iid] = counts

        return item_feature

    rawtrn = read_rows('train.csv')
    rawvld = read_rows('valid.csv')
    rawtst = read_rows('test.csv')

    # Parse through float first so that timestamps written as '1400803200.0'
    # are accepted here just like everywhere else in this function.
    times_trn = rawtrn[:,-1].astype(float).astype(int)

    # Split data by period (unit: week)
    # [trn_start - trnfront - vld_start - tst_start - tst_end]
    trnfront_time = times_trn.max() - 60 * 60 * 24 * 7 * period
    # Last training row that is older than the split time.
    # NOTE(review): that row itself ends up in the label window (D_b) through
    # the slices below, not in the feature window -- confirm this is intended.
    trnfront_idx = np.where(times_trn < trnfront_time)[0][-1]
    trn_start_time = int(times_trn[0])
    trnfront_start_time = int(float(rawtrn[trnfront_idx][-1])) # -1 denotes the time index
    vld_start_time = int(float(rawvld[0][-1]))
    tst_start_time = int(float(rawtst[0][-1]))
    tst_end_time = int(float(rawtst[-1][-1]))

    print('\n📋 Data loaded from: {}\n'.format(dpath))

    print('Trn start time:\t{}'.format(toymd(trn_start_time)))
    print('Trn front time:\t{}'.format(toymd(trnfront_start_time)))
    print('Vld start time:\t{}'.format(toymd(vld_start_time)))
    print('Tst start time:\t{}'.format(toymd(tst_start_time)))
    print('Tst end time:\t{}'.format(toymd(tst_end_time)))

    trn_4feature = rawtrn[:trnfront_idx]
    feature_trn = get_item_feature(trn_4feature) # features for training
    feature_eval = get_item_feature(rawtrn) # features for evaluation (to get ISS for training RS)

    trn_4label = rawtrn[trnfront_idx:] # D_b

    trndata = build_data(set(trn_4label[:,1]), feature_trn)
    vlddata = build_data(set(rawvld[:,1]), feature_eval)
    tstdata = build_data(set(rawtst[:,1]), feature_eval)

    return trndata, vlddata, tstdata

In [None]:
class DataLoader:
    """Builds the train/valid/test loaders for interest modelling.

    The constructor reads ``dataset_path``, ``batch_size``, ``period`` and
    ``binsize`` from ``opt``, builds the three splits with
    ``build_data_directly`` and wraps each one with ``build_loader`` (only the
    training loader shuffles).

    NOTE: a later cell of this notebook defines another class that is also
    called ``DataLoader`` (for recommendation modelling) and replaces this one
    in the kernel namespace; re-run this cell before using it afterwards.
    """

    # `get_embedding` reads this attribute but nothing in this class assigns
    # it; the class-level default keeps the accessor from raising AttributeError.
    input_embedding = None

    def __init__(self, opt):
        self.dpath = opt.dataset_path + '/'
        self.batch_size = opt.batch_size

        trndata, vlddata, tstdata = build_data_directly(self.dpath, opt.period, opt.binsize)

        self.trn_loader = build_loader(trndata, opt.batch_size, shuffle=True)
        self.vld_loader = build_loader(vlddata, opt.batch_size, shuffle=False)
        self.tst_loader = build_loader(tstdata, opt.batch_size, shuffle=False)

        print(("train/val/test/ divided by batch size {:d}/{:d}/{:d}".format(len(self.trn_loader), len(self.vld_loader),len(self.tst_loader))))
        print("==================================================================================")

    def get_loaders(self):
        """Return ``(trn_loader, vld_loader, tst_loader)``."""
        return self.trn_loader, self.vld_loader, self.tst_loader

    def get_embedding(self):
        """Return ``input_embedding`` (``None`` unless a caller has set it)."""
        return self.input_embedding

### Unit testing

In [None]:
import argparse

# (flag, default, type) of every option used by the interest-modelling cells.
_interest_option_specs = (
    ('--dataset', 'amazon_tools', str),
    ('--period', 16, float),
    ('--binsize', 8, int),
    ('--learning_rate', 1e-2, float),
    ('--l2reg', 1e-4, float),
    ('--num_epoch', 100, int),
    ('--batch_size', 128, int),
    ('--hidden_dim', 64, int),
    ('--pos_weight', 1e-2, float),
    ('--gpu', 3, int),
)

parser = argparse.ArgumentParser()
for _flag, _default, _kind in _interest_option_specs:
    parser.add_argument(_flag, default=_default, type=_kind)

# Parse an empty argument list so that only the defaults above are used and
# the kernel's own command line is ignored.
opt = parser.parse_args(args=[])

# Location of the prepared ("silver") data for the selected dataset.
dataset_path = './data/silver/{}'.format(opt.dataset)
opt.dataset_path = dataset_path
opt

Namespace(batch_size=128, binsize=8, dataset='amazon_tools', dataset_path='./data/silver/amazon_tools', gpu=3, hidden_dim=64, l2reg=0.0001, learning_rate=0.01, num_epoch=100, period=16, pos_weight=0.01)

In [None]:
from collections import Counter

# Build the three loaders from the options defined in the previous cell.
data_loader = DataLoader(opt)
trn_loader, vld_loader, tst_loader = data_loader.get_loaders()

# Second dimension of the training feature matrix.
trnlen = trn_loader.dataset.timediffs.shape[1]

# Print the label distribution of every split.
for _template, _loader in (('TRN labels: {}', trn_loader),
                           ('VLD labels: {}', vld_loader),
                           ('TST labels: {}', tst_loader)):
    print(_template.format(Counter(_loader.dataset.labels)))


📋 Data loaded from: ./data/silver/amazon_tools/

Trn start time:	1999-11-08 00:00:00
Trn front time:	2014-01-30 00:00:00
Vld start time:	2014-05-23 00:00:00
Tst start time:	2014-06-23 00:00:00
Tst end time:	2014-07-22 00:00:00
Data building time : 0.4s
Data building time : 0.4s
Data building time : 0.4s
train/val/test/ divided by batch size 79/80/80
TRN labels: Counter({1: 6158, 0: 3891})
VLD labels: Counter({0: 7945, 1: 2232})
TST labels: Counter({0: 8092, 1: 2085})


### Dataloader for recommendation modeling

In [None]:
import os
import pdb
import time
import torch
import pickle
import random
import numpy as np
import pandas as pd
from torch.utils import data
from torch.utils.data.dataloader import default_collate

# Seed both generators: negative sampling in `ML_Dataset.train_collate` draws
# from `np.random`, which `random.seed` alone does not affect.
random.seed(2020)
np.random.seed(2020)

In [None]:
class ML_Dataset(data.Dataset):
    """User-item dataset for metric-learning recommendation.

    The split is taken from the file name (the part before its first dot) and
    must be ``train``, ``valid`` or ``test``:

    * ``train``: every row is unpacked as three columns ``[user, item, rating]``;
      the last column is overwritten with 1. Samples are
      ``(user, positive item, 0.0)`` and ``train_collate`` replaces the
      placeholder with sampled negative items.
    * ``valid`` / ``test``: every row is read as
      ``[user, true item, candidate_0, candidate_1, ...]`` and expanded into
      ``[user, item, label]`` rows, label 1 for the true item and 0 for the rest.
    """

    def build_consumption_history(self, uir):
        """Index which items each user consumed.

        Also fills ``self.ui_cand_dict`` with, per user, an array of all items
        the user has not consumed (the pool for negative sampling).

        Returns:
            ``(uidict, allitems)``: user -> set of consumed items, and the set
            of all items.
        """
        uir = uir.astype(int)
        uidict = {}
        allitems = set()
        for u, i, _ in uir:
            uidict.setdefault(u, set()).add(i)
            allitems.add(i)

        self.ui_cand_dict = {u: np.array(list(allitems - consumed))
                             for u, consumed in uidict.items()}

        return uidict, allitems

    def __init__(self, path, trn_numneg):
        """Load the ``.npy`` file at ``path``.

        Args:
            path: File whose base name starts with ``train.``, ``valid.`` or ``test.``.
            trn_numneg: Negatives sampled per positive; stored only for the train split.

        Raises:
            ValueError: If the split cannot be recognised from the file name.
        """
        # Only the file name matters here; the directory part is not needed,
        # so a bare file name such as 'train.npy' is fine too.
        dtype = os.path.basename(path).split('.')[0]
        if dtype not in ('train', 'valid', 'test'):
            # Without a known split none of the sample arrays would be built
            # and the dataset could be neither indexed nor measured.
            raise ValueError("Cannot infer split (train/valid/test) from file name: {!r}".format(path))

        st = time.time()

        if dtype == 'train': self.numneg = trn_numneg
        self.uir = np.load(path)

        if dtype == 'train':
            self.uir[:,-1] = 1 # Mark explicit feedback as implicit feedback

            self.first = self.uir[:,0].astype(int)
            self.second = self.uir[:,1].astype(int)
            self.third = np.zeros(self.uir.shape[0]) # This will be replaced in 'train_collate'

            self.numuser = len(set(self.uir[:,0].astype(int)))
            self.numitem = len(set(self.uir[:,1].astype(int)))

            self.uidict, self.allitems = self.build_consumption_history(self.uir)

        else:
            # Build validation/test data for ranking evaluation
            newuir = []
            for row in self.uir:
                user = row[0]
                true_item = row[1]
                newuir.append([user, true_item, 1]) # a true consumption
                for item in row[2:]: newuir.append([user, item, 0]) # negative candidates
            self.uir = np.array(newuir) # User, Item, Rating

            self.first, self.second, self.third = self.uir[:,0], self.uir[:,1], self.uir[:,2]

        print('Data building time : %.1fs' % (time.time()-st))

    def __getitem__(self, index):
        # Training: [user, positive, placeholder]
        # Testing: [user, candidate item, label]
        return self.first[index], self.second[index], self.third[index]

    def __len__(self):
        """Returns the total number of user-item pairs."""
        return len(self.first)

    def train_collate(self, batch):
        """Attach ``self.numneg`` sampled negative items to every positive pair.

        Input samples are ``[user, positive item, dummy]``; the collated output
        holds ``[user, positive item, negative item]`` triples. Negatives are
        drawn with replacement from the user's unconsumed items.
        """
        batch = [sample for sample in batch if sample is not None]

        # Negative sampling for each batch
        outputs = []
        for u, pi, dummy in batch:
            # NOTE(review): a user who consumed every item has an empty pool,
            # which `np.random.randint` cannot sample from.
            rand_idx = np.random.randint(len(self.ui_cand_dict[u]), size=self.numneg)
            neg_items = self.ui_cand_dict[u][rand_idx]

            for ni in neg_items:
                outputs.append([u, pi, ni])

        return default_collate(outputs)

In [None]:
def test_collate(batch):
    batch = [i for i in filter(lambda x:x is not None, batch)]
    return default_collate(batch)

In [None]:
def get_each_loader(data_path, batch_size, trn_negnum, shuffle=True, num_workers=0):
    """Create the ``DataLoader`` for one split file.

    A path ending in ``train.npy`` gets the dataset's own ``train_collate``
    (which adds sampled negatives); any other path uses ``test_collate``.
    """
    dataset = ML_Dataset(data_path, trn_negnum)

    is_train_split = data_path.endswith('train.npy')
    collate = dataset.train_collate if is_train_split else test_collate

    loader_kwargs = dict(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=collate,
    )
    return data.DataLoader(**loader_kwargs)

In [None]:
class DataLoader:
    """Builds the train/valid/test loaders for recommendation modelling.

    The constructor reads ``dataset_path``, ``batch_size`` and ``numneg`` from
    ``opt`` and loads ``train.npy``, ``valid.npy`` and ``test.npy`` from that
    directory through ``get_each_loader``.

    NOTE: this class has the same name as the interest-modelling loader
    defined earlier in this notebook and replaces it in the kernel namespace.
    """

    # `get_embedding` reads this attribute but nothing in this class assigns
    # it; the class-level default keeps the accessor from raising AttributeError.
    input_embedding = None

    def __init__(self, opt):
        self.dpath = opt.dataset_path + '/'
        self.batch_size = opt.batch_size
        self.trn_numneg = opt.numneg

        self.trn_loader, self.vld_loader, self.tst_loader = self.get_loaders_for_metric_learning(self.trn_numneg)

        print(("train/val/test/ divided by batch size {:d}/{:d}/{:d}".format(len(self.trn_loader), len(self.vld_loader),len(self.tst_loader))))
        print("=" * 80)

    def get_loaders_for_metric_learning(self, trn_numneg):
        """Load the three split files; only the training loader shuffles.

        Args:
            trn_numneg: Number of negatives per positive, forwarded to every
                ``get_each_loader`` call.

        Returns:
            ``(trn_loader, vld_loader, tst_loader)``.
        """
        print("\n📋 Loading data...\n")
        trn_loader = get_each_loader(self.dpath+'train.npy', self.batch_size, trn_numneg, shuffle=True)
        print('\tTraining data loaded')

        vld_loader = get_each_loader(self.dpath+'valid.npy', self.batch_size, trn_numneg, shuffle=False)
        print('\tValidation data loaded')

        tst_loader = get_each_loader(self.dpath+'test.npy', self.batch_size, trn_numneg, shuffle=False)
        print('\tTest data loaded')

        return trn_loader, vld_loader, tst_loader

    def get_loaders(self):
        """Return ``(trn_loader, vld_loader, tst_loader)``."""
        return self.trn_loader, self.vld_loader, self.tst_loader

    def get_embedding(self):
        """Return ``input_embedding`` (``None`` unless a caller has set it)."""
        return self.input_embedding

### Unit testing

In [None]:
import argparse

# (flag, default, type) of every option used by the recommendation cells.
_reco_option_specs = (
    ('--dataset', 'amazon_tools', str),
    ('--batch_size', 4096, int),
    ('--numneg', 10, int),
)

parser = argparse.ArgumentParser()
for _flag, _default, _kind in _reco_option_specs:
    parser.add_argument(_flag, default=_default, type=_kind)

# Parse an empty argument list so that only the defaults above are used and
# the kernel's own command line is ignored.
opt = parser.parse_args(args=[])

# Location of the prepared ("gold") data for the selected dataset.
dataset_path = './data/gold/{}'.format(opt.dataset)
opt.dataset_path = dataset_path
opt

Namespace(batch_size=4096, dataset='amazon_tools', dataset_path='./data/gold/amazon_tools', learning_rate=0.01, num_epoch=50, numneg=10)

In [None]:
# Build the three loaders from the options defined in the previous cell.
data_loader = DataLoader(opt)
trn_loader, vld_loader, tst_loader = data_loader.get_loaders()

# Copy the user/item counts of the training split onto the options object.
_trn_dataset = trn_loader.dataset
opt.numuser, opt.numitem = _trn_dataset.numuser, _trn_dataset.numitem


📋 Loading data...

Data building time : 21.9s
	Training data loaded
Data building time : 0.8s
	Validation data loaded
Data building time : 0.7s
	Test data loaded
train/val/test/ divided by batch size 31/85/73


## Exporting the methods

In [None]:
%%writefile ./code/dataloader_interest.py
import os
import csv
import pdb
import time
import pickle
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import LabelEncoder
from torch.utils import data
from torch.utils.data.dataloader import default_collate


def toymd(time):
    """Convert a Unix timestamp (seconds) into a naive ``datetime`` in UTC."""
    utc_moment = datetime.utcfromtimestamp(time)
    return utc_moment


class Dataset(data.Dataset):
    """Item-level dataset for interest modelling.

    Every input row is read as ``[iid, label, feature_0, feature_1, ...]``:
    column 0 is the item id, column 1 the label and the remaining columns are
    cast to ``int`` and stored in ``timediffs``.

    Attributes:
        iids: Array of item ids, one per row.
        timediffs: Integer array holding columns 2.. of every row.
        labels: Integer array, 1 where the row's label is true, else 0.
        most_oldtime: Always ``None``; nothing in this class assigns it.
    """

    def __init__(self, data):
        """Split ``data`` into ids, labels and features.

        Args:
            data: Iterable of rows (a 2-D array or a list of sequences). The
                label may be a boolean or its string form (``'True'`` /
                ``'False'``), which is what NumPy produces when a row mixing
                strings, booleans and numbers is stored in one array.
        """
        st = time.time()

        self.most_oldtime = None

        iids, labels, timediffs = [], [], []
        for row in data:
            iids.append(row[0])
            labels.append(row[1])
            timediffs.append(row[2:])

        self.iids = np.array(iids)
        self.timediffs = np.array(timediffs).astype(int)
        # Compare the string form of every label so that both real booleans
        # and the strings 'True'/'False' are recognised; comparing the raw
        # array against 'True' only worked for string-typed input.
        self.labels = np.array([str(label) == 'True' for label in labels], dtype=bool).astype(int)

        print('Data building time : %.1fs' % (time.time()-st))

    def __getitem__(self, index):
        """Return ``(iid, features, label)`` for the row at ``index``."""
        return self.iids[index], self.timediffs[index], self.labels[index]

    def __len__(self):
        """Returns the number of rows (one per item) in the dataset."""
        return len(self.timediffs)


def build_loader(eachdata, batch_size, shuffle=True, num_workers=0):
    """Builds and returns Dataloader.

    Args:
        eachdata: Rows handed to ``Dataset`` (``[iid, label, features...]``).
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the data every epoch.
        num_workers: Number of worker processes used by the loader.

    Returns:
        A ``torch.utils.data.DataLoader`` over ``Dataset(eachdata)``.
    """

    def my_collate(batch):
        # Drop samples that are None before handing the batch to the default collate.
        kept = [sample for sample in batch if sample is not None]
        return default_collate(kept)

    dataset = Dataset(eachdata)

    data_loader = data.DataLoader(dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=my_collate)

    return data_loader


def build_data_directly(dpath, period, binsize):
    """Build item-level train/valid/test instances from the CSV splits.

    Reads ``train.csv``, ``valid.csv`` and ``test.csv`` from ``dpath``. Every
    row is unpacked as four columns; the second is the item id and the last is
    a Unix timestamp (integer or float formatted). The code reads the first and
    last rows as start/end times, so it assumes the rows are in chronological
    order -- TODO confirm against the data preparation step.

    The training rows are split at ``max(train time) - period`` weeks. Rows
    before the split feed the features used for training; rows from the split
    onwards define the training labels. Validation and test instances use
    features computed on the whole training file and are labelled by whether
    the item occurs in ``valid.csv`` / ``test.csv``.

    Args:
        dpath: Prefix of the CSV files. It is concatenated with the file names
            as-is, so it must end with a path separator.
        period: Length of the label window at the end of the training data, in weeks.
        binsize: Width of each time bin, in weeks.

    Returns:
        ``(trndata, vlddata, tstdata)``: arrays whose rows are
        ``[iid, label, count_bin_0, count_bin_1, ...]``.

    Raises:
        IndexError: If no training row is older than the split time.
    """
    # Fixed-width week bins only need the standard library.
    from datetime import timedelta as _timedelta

    max_bins = 1000  # cap on the number of bins, kept from the original implementation

    def toymd(time):
        return datetime.utcfromtimestamp(time)

    def read_rows(filename):
        # Close the file deterministically instead of leaking the handle.
        with open(dpath + filename, newline='') as handle:
            return np.array(list(csv.reader(handle)))

    def build_data(true_items, item_feature):
        # One instance per item: [iid, label, features]
        output = [[iid, iid in true_items] + list(feature)
                  for iid, feature in item_feature.items()]
        return np.array(output)

    def get_item_feature(data):
        times = data[:,-1].astype(float).astype(int)
        mintime, maxtime = toymd(min(times)), toymd(max(times))

        # Binning training time (D_f) with fixed-sized bins: left edges that
        # start at the earliest event and stay strictly before the latest one.
        bin_width = _timedelta(weeks=binsize)
        edges = [mintime + bin_width * k for k in range(max_bins)]
        bins = np.array([edge for edge in edges if edge < maxtime])
        if len(bins) == 0:
            # All events share one timestamp: keep a single bin so the lookup
            # below has something to index.
            bins = np.array([mintime])

        # Collect the event times of every item
        stamps_by_item = {}
        for u, i, r, t in data:
            stamps_by_item.setdefault(i, []).append(toymd(int(float(t))))

        # Count, per item, how many events fall into each bin. An event goes
        # to the last bin whose left edge is not after it.
        item_feature = {}
        for iid, stamps in stamps_by_item.items():
            counts = np.zeros(len(bins), dtype=int)
            for stamp in stamps:
                counts[np.where(bins <= stamp)[0][-1]] += 1
            item_feature[iid] = counts

        return item_feature

    rawtrn = read_rows('train.csv')
    rawvld = read_rows('valid.csv')
    rawtst = read_rows('test.csv')

    # Parse through float first so that timestamps written as '1400803200.0'
    # are accepted here just like everywhere else in this function.
    times_trn = rawtrn[:,-1].astype(float).astype(int)

    # Split data by period (unit: week)
    # [trn_start - trnfront - vld_start - tst_start - tst_end]
    trnfront_time = times_trn.max() - 60 * 60 * 24 * 7 * period
    # Last training row that is older than the split time.
    # NOTE(review): that row itself ends up in the label window (D_b) through
    # the slices below, not in the feature window -- confirm this is intended.
    trnfront_idx = np.where(times_trn < trnfront_time)[0][-1]
    trn_start_time = int(times_trn[0])
    trnfront_start_time = int(float(rawtrn[trnfront_idx][-1])) # -1 denotes the time index
    vld_start_time = int(float(rawvld[0][-1]))
    tst_start_time = int(float(rawtst[0][-1]))
    tst_end_time = int(float(rawtst[-1][-1]))

    print('\n📋 Data loaded from: {}\n'.format(dpath))

    print('Trn start time:\t{}'.format(toymd(trn_start_time)))
    print('Trn front time:\t{}'.format(toymd(trnfront_start_time)))
    print('Vld start time:\t{}'.format(toymd(vld_start_time)))
    print('Tst start time:\t{}'.format(toymd(tst_start_time)))
    print('Tst end time:\t{}'.format(toymd(tst_end_time)))

    trn_4feature = rawtrn[:trnfront_idx]
    feature_trn = get_item_feature(trn_4feature) # features for training
    feature_eval = get_item_feature(rawtrn) # features for evaluation (to get ISS for training RS)

    trn_4label = rawtrn[trnfront_idx:] # D_b

    trndata = build_data(set(trn_4label[:,1]), feature_trn)
    vlddata = build_data(set(rawvld[:,1]), feature_eval)
    tstdata = build_data(set(rawtst[:,1]), feature_eval)

    return trndata, vlddata, tstdata


class DataLoader:
    """Builds the train/valid/test loaders for interest modelling.

    The constructor reads ``dataset_path``, ``batch_size``, ``period`` and
    ``binsize`` from ``opt``, builds the three splits with
    ``build_data_directly`` and wraps each one with ``build_loader`` (only the
    training loader shuffles).
    """

    # `get_embedding` reads this attribute but nothing in this class assigns
    # it; the class-level default keeps the accessor from raising AttributeError.
    input_embedding = None

    def __init__(self, opt):
        self.dpath = opt.dataset_path + '/'
        self.batch_size = opt.batch_size

        trndata, vlddata, tstdata = build_data_directly(self.dpath, opt.period, opt.binsize)

        self.trn_loader = build_loader(trndata, opt.batch_size, shuffle=True)
        self.vld_loader = build_loader(vlddata, opt.batch_size, shuffle=False)
        self.tst_loader = build_loader(tstdata, opt.batch_size, shuffle=False)

        print(("train/val/test/ divided by batch size {:d}/{:d}/{:d}".format(len(self.trn_loader), len(self.vld_loader),len(self.tst_loader))))
        print("==================================================================================")

    def get_loaders(self):
        """Return ``(trn_loader, vld_loader, tst_loader)``."""
        return self.trn_loader, self.vld_loader, self.tst_loader

    def get_embedding(self):
        """Return ``input_embedding`` (``None`` unless a caller has set it)."""
        return self.input_embedding

Writing ./code/dataloader_interest.py


In [None]:
%%writefile ./code/dataloader_recommendation.py
import os
import pdb
import time
import torch
import pickle
import random
import numpy as np
import pandas as pd
from torch.utils import data
from torch.utils.data.dataloader import default_collate

# Seed both generators: negative sampling in `ML_Dataset.train_collate` draws
# from `np.random`, which `random.seed` alone does not affect.
random.seed(2020)
np.random.seed(2020)


class ML_Dataset(data.Dataset):
    """User-item dataset for metric-learning recommendation.

    The split is taken from the file name (the part before its first dot) and
    must be ``train``, ``valid`` or ``test``:

    * ``train``: every row is unpacked as three columns ``[user, item, rating]``;
      the last column is overwritten with 1. Samples are
      ``(user, positive item, 0.0)`` and ``train_collate`` replaces the
      placeholder with sampled negative items.
    * ``valid`` / ``test``: every row is read as
      ``[user, true item, candidate_0, candidate_1, ...]`` and expanded into
      ``[user, item, label]`` rows, label 1 for the true item and 0 for the rest.
    """

    def build_consumption_history(self, uir):
        """Index which items each user consumed.

        Also fills ``self.ui_cand_dict`` with, per user, an array of all items
        the user has not consumed (the pool for negative sampling).

        Returns:
            ``(uidict, allitems)``: user -> set of consumed items, and the set
            of all items.
        """
        uir = uir.astype(int)
        uidict = {}
        allitems = set()
        for u, i, _ in uir:
            uidict.setdefault(u, set()).add(i)
            allitems.add(i)

        self.ui_cand_dict = {u: np.array(list(allitems - consumed))
                             for u, consumed in uidict.items()}

        return uidict, allitems

    def __init__(self, path, trn_numneg):
        """Load the ``.npy`` file at ``path``.

        Args:
            path: File whose base name starts with ``train.``, ``valid.`` or ``test.``.
            trn_numneg: Negatives sampled per positive; stored only for the train split.

        Raises:
            ValueError: If the split cannot be recognised from the file name.
        """
        # Only the file name matters here; the directory part is not needed,
        # so a bare file name such as 'train.npy' is fine too.
        dtype = os.path.basename(path).split('.')[0]
        if dtype not in ('train', 'valid', 'test'):
            # Without a known split none of the sample arrays would be built
            # and the dataset could be neither indexed nor measured.
            raise ValueError("Cannot infer split (train/valid/test) from file name: {!r}".format(path))

        st = time.time()

        if dtype == 'train': self.numneg = trn_numneg
        self.uir = np.load(path)

        if dtype == 'train':
            self.uir[:,-1] = 1 # Mark explicit feedback as implicit feedback

            self.first = self.uir[:,0].astype(int)
            self.second = self.uir[:,1].astype(int)
            self.third = np.zeros(self.uir.shape[0]) # This will be replaced in 'train_collate'

            self.numuser = len(set(self.uir[:,0].astype(int)))
            self.numitem = len(set(self.uir[:,1].astype(int)))

            self.uidict, self.allitems = self.build_consumption_history(self.uir)

        else:
            # Build validation/test data for ranking evaluation
            newuir = []
            for row in self.uir:
                user = row[0]
                true_item = row[1]
                newuir.append([user, true_item, 1]) # a true consumption
                for item in row[2:]: newuir.append([user, item, 0]) # negative candidates
            self.uir = np.array(newuir) # User, Item, Rating

            self.first, self.second, self.third = self.uir[:,0], self.uir[:,1], self.uir[:,2]

        print('Data building time : %.1fs' % (time.time()-st))

    def __getitem__(self, index):
        # Training: [user, positive, placeholder]
        # Testing: [user, candidate item, label]
        return self.first[index], self.second[index], self.third[index]

    def __len__(self):
        """Returns the total number of user-item pairs."""
        return len(self.first)

    def train_collate(self, batch):
        """Attach ``self.numneg`` sampled negative items to every positive pair.

        Input samples are ``[user, positive item, dummy]``; the collated output
        holds ``[user, positive item, negative item]`` triples. Negatives are
        drawn with replacement from the user's unconsumed items.
        """
        batch = [sample for sample in batch if sample is not None]

        # Negative sampling for each batch
        outputs = []
        for u, pi, dummy in batch:
            # NOTE(review): a user who consumed every item has an empty pool,
            # which `np.random.randint` cannot sample from.
            rand_idx = np.random.randint(len(self.ui_cand_dict[u]), size=self.numneg)
            neg_items = self.ui_cand_dict[u][rand_idx]

            for ni in neg_items:
                outputs.append([u, pi, ni])

        return default_collate(outputs)


def test_collate(batch):
    batch = [i for i in filter(lambda x:x is not None, batch)]
    return default_collate(batch)


def get_each_loader(data_path, batch_size, trn_negnum, shuffle=True, num_workers=0):
    """Create the ``DataLoader`` for one split file.

    A path ending in ``train.npy`` gets the dataset's own ``train_collate``
    (which adds sampled negatives); any other path uses ``test_collate``.
    """
    dataset = ML_Dataset(data_path, trn_negnum)

    is_train_split = data_path.endswith('train.npy')
    collate = dataset.train_collate if is_train_split else test_collate

    loader_kwargs = dict(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=collate,
    )
    return data.DataLoader(**loader_kwargs)


class DataLoader:
    """Builds the train/valid/test loaders for recommendation modelling.

    The constructor reads ``dataset_path``, ``batch_size`` and ``numneg`` from
    ``opt`` and loads ``train.npy``, ``valid.npy`` and ``test.npy`` from that
    directory through ``get_each_loader``.
    """

    # `get_embedding` reads this attribute but nothing in this class assigns
    # it; the class-level default keeps the accessor from raising AttributeError.
    input_embedding = None

    def __init__(self, opt):
        self.dpath = opt.dataset_path + '/'
        self.batch_size = opt.batch_size
        self.trn_numneg = opt.numneg

        self.trn_loader, self.vld_loader, self.tst_loader = self.get_loaders_for_metric_learning(self.trn_numneg)

        print(("train/val/test/ divided by batch size {:d}/{:d}/{:d}".format(len(self.trn_loader), len(self.vld_loader),len(self.tst_loader))))
        print("=" * 80)

    def get_loaders_for_metric_learning(self, trn_numneg):
        """Load the three split files; only the training loader shuffles.

        Args:
            trn_numneg: Number of negatives per positive, forwarded to every
                ``get_each_loader`` call.

        Returns:
            ``(trn_loader, vld_loader, tst_loader)``.
        """
        print("\n📋 Loading data...\n")
        trn_loader = get_each_loader(self.dpath+'train.npy', self.batch_size, trn_numneg, shuffle=True)
        print('\tTraining data loaded')

        vld_loader = get_each_loader(self.dpath+'valid.npy', self.batch_size, trn_numneg, shuffle=False)
        print('\tValidation data loaded')

        tst_loader = get_each_loader(self.dpath+'test.npy', self.batch_size, trn_numneg, shuffle=False)
        print('\tTest data loaded')

        return trn_loader, vld_loader, tst_loader

    def get_loaders(self):
        """Return ``(trn_loader, vld_loader, tst_loader)``."""
        return self.trn_loader, self.vld_loader, self.tst_loader

    def get_embedding(self):
        """Return ``input_embedding`` (``None`` unless a caller has set it)."""
        return self.input_embedding

Writing ./code/dataloader_recommendation.py


In [None]:
# List what the export cells above changed in the working tree before committing.
!git status

On branch main
Your branch is up to date with 'origin/main'.

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mcode/dataloader_interest.py[m
	[31mcode/dataloader_recommendation.py[m

nothing added to commit but untracked files present (use "git add" to track)


In [None]:
# Stage everything in the working tree, commit it and push to origin/main in a
# single shell command; `&&` stops the chain at the first step that fails.
!git add . && git commit -m 'ADD code dataloaders for interest modeling and recommendations' && git push origin main

[main e7a3c51] ADD code dataloaders for interest modeling and recommendations
 2 files changed, 312 insertions(+)
 create mode 100644 code/dataloader_interest.py
 create mode 100644 code/dataloader_recommendation.py
Counting objects: 5, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 3.80 KiB | 3.80 MiB/s, done.
Total 5 (delta 1), reused 0 (delta 0)
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/sparsh-ai/reco-tut-cris.git
   aa75f54..e7a3c51  main -> main
