In [None]:
import os
# Repository coordinates: repo name, working branch, and GitHub account.
project_name = "recobase"; branch = "US567625"; account = "recohut"
# The working copy is placed under /content in a folder named after the branch.
project_path = os.path.join('/content', branch)

# First run (folder missing): install DVC, copy credentials, and fetch the branch.
# Later runs only change into the existing folder.
if not os.path.exists(project_path):
    !pip install -U -q dvc dvc[gdrive]
    # Copies the contents of the Drive credentials folder into the home directory.
    # NOTE(review): assumes Google Drive is already mounted at /content/drive — confirm.
    !cp -r /content/drive/MyDrive/git_credentials/. ~
    !mkdir "{project_path}"
    %cd "{project_path}"
    # Start an empty repository, pull only the target branch, then create a local
    # branch with the same name.
    !git init
    !git remote add origin https://github.com/"{account}"/"{project_name}".git
    !git pull origin "{branch}"
    !git checkout -b "{branch}"
else:
    %cd "{project_path}"

In [None]:
# Show the working-tree state of the checked-out branch.
!git status

In [None]:
# Stage everything in the working tree, commit it with a fixed message, and push
# to the branch named by `branch` (set in the setup cell).
!git add .
!git commit -m 'commit'
!git push origin "{branch}"

In [None]:
# Fetch only the DVC-tracked MovieLens-1M ratings file from remote storage.
!dvc pull data/bronze/ml-1m/ratings.dat

In [None]:
# Re-run the DVC pipeline; stages whose inputs did not change are skipped.
!dvc repro

!If DVC froze, see `hardlink_lock` in <[36mhttps://man.dvc.org/config#core[39m>                                                                      'data/bronze/ml-1m/ratings.dat.dvc' didn't change, skipping
Running stage 'preprocess':
> python src/preprocess.py
Filtering triplets
Densifying index
Splitting
100% 6040/6040 [00:07<00:00, 860.16it/s]
Computing file/dir hashes (only done once)          |0.00 [00:00,      ?md5/s]
![A
          |0.00 [00:00,       ?it/s][A
Transferring:   0% 0/2 [00:00<?, ?file/s{'info': ''}]
676bf10661c9f231027a2ca0715450.dir:   0% 0/71 [00:00<?, ?it/s][A
676bf10661c9f231027a2ca0715450.dir:   0% 0/71 [00:00<?, ?it/s{'info': ''}][A
Use `dvc push` to send your updates to remote storage.
[0m

In [None]:
%%writefile ./src/negative_sampling.py
from abc import *
from pathlib import Path
import pickle
import os
from tqdm import trange
import numpy as np
from collections import Counter


class AbstractNegativeSampler(metaclass=ABCMeta):
    """Base class for negative samplers.

    A concrete sampler supplies a registry key via :meth:`code` and a sampling
    strategy via :meth:`generate_negative_samples`. This base class stores the
    configuration and persists the generated samples as a pickle file.
    """

    def __init__(self, train, val, test, user_count, item_count, sample_size, seed, flag, save_folder):
        """Store the dataset splits and the sampling configuration.

        Args:
            train, val, test: Per-user interaction containers. The samplers in
                this file index them as ``split[user]``.
            user_count: Number of users to generate samples for.
            item_count: Number of items in the catalogue.
            sample_size: Number of negative items to draw per user.
            seed: Seed for the random number generator.
            flag: Tag embedded in the output file name (e.g. ``'val'``).
            save_folder: Directory in which the pickle file is written.
        """
        self.train = train
        self.val = val
        self.test = test
        self.user_count = user_count
        self.item_count = item_count
        self.sample_size = sample_size
        self.seed = seed
        self.flag = flag
        # Use the constructor argument. The previous code read an undefined name
        # `save_path`, which raised NameError unless a module-level global of
        # that name happened to exist.
        self.save_path = save_folder

    @classmethod
    @abstractmethod
    def code(cls):
        """Return the string key identifying this sampler."""
        pass

    @abstractmethod
    def generate_negative_samples(self):
        """Return a ``(seen_samples, negative_samples)`` pair."""
        pass

    def get_negative_samples(self):
        """Generate the samples, pickle them to disk, and return them.

        The samples are regenerated on every call; no existing file is read,
        and a file already at the target path is overwritten.

        Returns:
            The ``(seen_samples, negative_samples)`` pair produced by
            :meth:`generate_negative_samples`.
        """
        savefile_path = self._get_save_path()
        print("Negative samples don't exist. Generating.")
        seen_samples, negative_samples = self.generate_negative_samples()
        with savefile_path.open('wb') as f:
            pickle.dump([seen_samples, negative_samples], f)
        return seen_samples, negative_samples

    def _get_save_path(self):
        """Return the output pickle path, creating the target folder if needed."""
        folder = Path(self.save_path)
        # Creates missing parents and is a no-op when the directory already exists.
        folder.mkdir(parents=True, exist_ok=True)
        filename = 'negative_samples_{}.pkl'.format(self.flag)
        return folder.joinpath(filename)


class RandomNegativeSampler(AbstractNegativeSampler):
    """Sampler that draws negatives uniformly from item ids ``1..item_count``."""

    @classmethod
    def code(cls):
        """Return the registry key for this sampler."""
        return 'random'

    def generate_negative_samples(self):
        """Draw ``sample_size`` unseen, distinct items for each user.

        A pool of ``2 * user_count * sample_size`` item ids is drawn once from
        the seeded global NumPy generator and then consumed in order (wrapping
        around), skipping ids the user has in train/val/test or that were
        already picked for that user.

        Returns:
            ``(seen_samples, negative_samples)``: dicts keyed by user id
            ``1..user_count``, mapping to the set of seen items and the list of
            sampled negatives respectively.

        Raises:
            ValueError: If the pre-drawn pool does not contain ``sample_size``
                distinct unseen items for some user. This case previously
                looped forever.
        """
        assert self.seed is not None, 'Specify seed for random sampling'
        np.random.seed(self.seed)
        num_samples = 2 * self.user_count * self.sample_size
        # np.random.choice(n, k) yields values in 0..n-1; shift to 1-based ids.
        all_samples = np.random.choice(self.item_count, num_samples) + 1

        seen_samples = {}
        negative_samples = {}
        print('Sampling negative items randomly...')
        j = 0
        for i in trange(self.user_count):
            user = i + 1
            seen = set(self.train[user])
            seen.update(self.val[user])
            seen.update(self.test[user])
            seen_samples[user] = seen

            samples = []
            picked = set()  # O(1) duplicate check; `samples` keeps the draw order
            first_draw = j
            while len(samples) < self.sample_size:
                # After one full pass over the pool every distinct value has
                # been tried, so further cycling can never complete the list.
                if j - first_draw >= num_samples:
                    raise ValueError(
                        'Cannot draw {} negative samples for user {}: '
                        'not enough unseen items in the sampled pool'.format(
                            self.sample_size, user))
                item = all_samples[j % num_samples]
                j += 1
                if item in seen or item in picked:
                    continue
                samples.append(item)
                picked.add(item)
            negative_samples[user] = samples

        return seen_samples, negative_samples


class PopularNegativeSampler(AbstractNegativeSampler):
    """Sampler that draws negatives with probability proportional to item popularity."""

    @classmethod
    def code(cls):
        """Return the registry key for this sampler."""
        return 'popular'

    def generate_negative_samples(self):
        """Draw ``sample_size`` unseen, distinct items for each user, weighted by popularity.

        Item weights are the interaction counts from :meth:`items_by_popularity`
        normalised to sum to one. A pool of ``2 * user_count * sample_size``
        items is drawn once from the seeded global NumPy generator and then
        consumed in order (wrapping around), skipping items the user has in
        train/val/test or that were already picked for that user.

        Returns:
            ``(seen_samples, negative_samples)``: dicts keyed by user id
            ``1..user_count``, mapping to the set of seen items and the list of
            sampled negatives respectively.

        Raises:
            ValueError: If the pre-drawn pool does not contain ``sample_size``
                distinct unseen items for some user. This case previously
                looped forever.
        """
        assert self.seed is not None, 'Specify seed for random sampling'
        np.random.seed(self.seed)
        popularity = self.items_by_popularity()
        items = list(popularity.keys())
        total = sum(popularity.values())
        probs = [count / total for count in popularity.values()]
        num_samples = 2 * self.user_count * self.sample_size
        all_samples = np.random.choice(items, num_samples, p=probs)

        seen_samples = {}
        negative_samples = {}
        print('Sampling negative items by popularity...')
        j = 0
        for i in trange(self.user_count):
            user = i + 1
            seen = set(self.train[user])
            seen.update(self.val[user])
            seen.update(self.test[user])
            seen_samples[user] = seen

            samples = []
            picked = set()  # O(1) duplicate check; `samples` keeps the draw order
            first_draw = j
            while len(samples) < self.sample_size:
                # After one full pass over the pool every distinct value has
                # been tried, so further cycling can never complete the list.
                if j - first_draw >= num_samples:
                    raise ValueError(
                        'Cannot draw {} negative samples for user {}: '
                        'not enough unseen items in the sampled pool'.format(
                            self.sample_size, user))
                item = all_samples[j % num_samples]
                j += 1
                if item in seen or item in picked:
                    continue
                samples.append(item)
                picked.add(item)
            negative_samples[user] = samples

        return seen_samples, negative_samples

    def items_by_popularity(self):
        """Count item occurrences across train/val/test and order them by count.

        Also stores the sorted training user ids on ``self.users``.

        Returns:
            A dict mapping item -> occurrence count, ordered from most to least
            frequent.
        """
        popularity = Counter()
        self.users = sorted(self.train.keys())
        for user in self.users:
            popularity.update(self.train[user])
            popularity.update(self.val[user])
            popularity.update(self.test[user])

        popularity = dict(popularity)
        popularity = {k: v for k, v in sorted(popularity.items(), key=lambda item: item[1], reverse=True)}
        return popularity


# Registry mapping each sampler's code() string to the sampler class.
NEGATIVE_SAMPLERS = {
    sampler_cls.code(): sampler_cls
    for sampler_cls in (PopularNegativeSampler, RandomNegativeSampler)
}

def negative_sampler_factory(code, train, val, test,
                             user_count, item_count,
                             sample_size, seed, flag,
                             save_path):
    """Instantiate the sampler registered under ``code``.

    Looks the class up in ``NEGATIVE_SAMPLERS`` (an unknown ``code`` raises
    ``KeyError``) and forwards the remaining arguments positionally to its
    constructor.
    """
    sampler_cls = NEGATIVE_SAMPLERS[code]
    init_args = (train, val, test, user_count, item_count,
                 sample_size, seed, flag, save_path)
    return sampler_cls(*init_args)


if __name__ == '__main__':
    # Script entry point: load the preprocessed ML-1M dataset, generate random
    # negative samples for the 'val' flag, and write them under data/gold.
    PREP_DATASET_ROOT_FOLDER = 'data/silver'
    FEATURES_ROOT_FOLDER = 'data/gold'
    source_filepath = Path(os.path.join(PREP_DATASET_ROOT_FOLDER, 'ml-1m/dataset.pkl'))
    # Open via a context manager so the file handle is closed after loading.
    # NOTE: unpickling can execute arbitrary code; only load trusted dataset files.
    with source_filepath.open('rb') as dataset_file:
        dataset = pickle.load(dataset_file)
    code = 'random'
    train = dataset['train']
    val = dataset['val']
    test = dataset['test']
    umap = dataset['umap']
    smap = dataset['smap']
    # User and item counts are taken from the sizes of the two maps.
    user_count = len(umap)
    item_count = len(smap)
    sample_size = 100
    seed = 0
    flag = 'val'
    save_path = os.path.join(FEATURES_ROOT_FOLDER, 'ml-1m', 'negative_samples')
    negative_sampler = negative_sampler_factory(code, train, val, test, 
                             user_count, item_count,
                             sample_size, seed, flag,
                             save_path)
    _, _ = negative_sampler.get_negative_samples()

Overwriting ./src/negative_sampling.py


In [None]:
# Run the freshly written module as a script to check it end to end.
!python ./src/negative_sampling.py

In [None]:
# Register and run a DVC stage named `negative_sampling`: it depends on the
# script and the silver dataset pickle, and produces the gold negative_samples folder.
!dvc run -n negative_sampling \
          -d src/negative_sampling.py -d data/silver/ml-1m/dataset.pkl \
          -o data/gold/ml-1m/negative_samples \
          python src/negative_sampling.py

Running stage 'negative_sampling':
> python src/negative_sampling.py
Negative samples don't exist. Generating.
Sampling negative items randomly...
100% 6040/6040 [00:01<00:00, 4987.64it/s]
Computing file/dir hashes (only done once)          |0.00 [00:00,      ?md5/s]
![A
          |0.00 [00:00,       ?it/s][A
Transferring:   0% 0/2 [00:00<?, ?file/s{'info': ''}]
fee17715b501f9e3580d9031307e38.dir:   0% 0/84 [00:00<?, ?it/s][A
fee17715b501f9e3580d9031307e38.dir:   0% 0/84 [00:00<?, ?it/s{'info': ''}][A
Adding stage 'negative_sampling' in 'dvc.yaml'
Updating lock file 'dvc.lock'

To track the changes with git, run:

	git add dvc.yaml dvc.lock data/gold/ml-1m/.gitignore
[0m

In [None]:
# Show the working-tree state, listing untracked files individually.
!git status -u

On branch US567625
Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

	[31mdeleted:    data/bronze/ml-1m/movies.dat.dvc[m
	[31mmodified:   dvc.lock[m
	[31mmodified:   dvc.yaml[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mdata/gold/ml-1m/.gitignore[m
	[31msrc/negative_sampling.py[m

no changes added to commit (use "git add" and/or "git commit -a")


In [None]:
# Check whether any pipeline stage or tracked data is out of date.
!dvc status

Data and pipelines are up to date.
[0m

In [None]:
# Record current outputs in DVC, then upload them to remote storage.
# In the recorded run, `dvc commit` prompted for confirmation and `dvc push`
# asked for a Google OAuth verification code, so this cell is interactive.
!dvc commit
!dvc push

outputs ['data/bronze/ml-1m/users.dat'] of stage: 'data/bronze/ml-1m/users.dat.dvc' changed. Are you sure you want to commit it? [y/n] y
[31mERROR[39m: failed to commit - output 'data/bronze/ml-1m/users.dat' does not exist
Querying remote cache:   0% 0/2 [00:00<?, ?file/s{'info': ''}]Go to the following link in your browser:

    https://accounts.google.com/o/oauth2/auth?client_id=710796635688-iivsgbgsb6uv1fap6635dhvuei09o66c.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.appdata&access_type=offline&response_type=code&approval_prompt=force

Enter verification code: <redacted>
Authentication successful.
Transferring:   0% 0/2 [00:00<?, ?file/s{'info': ''}]
![A
  0%|          |0ae55228423ad7072cdf931fbe6997     0.00/? [00:00<?,        ?B/s][A
0ae55228423ad7072cdf931fbe6997:   0% 0.00/24.3M [00:00<?, ?B/s

In [None]:
# Stage everything (the new script plus the DVC metadata files), commit with a
# fixed message, and push to the branch named by `branch`.
!git add .
!git commit -m 'commit'
!git push origin "{branch}"

[US567625 c5abe33] commit
 5 files changed, 195 insertions(+), 11 deletions(-)
 delete mode 100644 data/bronze/ml-1m/movies.dat.dvc
 create mode 100644 data/gold/ml-1m/.gitignore
 create mode 100644 src/negative_sampling.py
Counting objects: 12, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (8/8), done.
Writing objects: 100% (12/12), 2.65 KiB | 2.65 MiB/s, done.
Total 12 (delta 3), reused 0 (delta 0)
remote: Resolving deltas: 100% (3/3), completed with 3 local objects.[K
To https://github.com/recohut/recobase.git
   8a641e0..c5abe33  US567625 -> US567625
