In [None]:
import os
# Repository coordinates: project name, working branch, and GitHub account.
project_name = "recobase"; branch = "US567625"; account = "recohut"
# Local checkout directory, named after the branch.
project_path = os.path.join('/content', branch)

# First run (checkout directory missing): install DVC, copy git credentials,
# and fetch the branch into a freshly initialised repository.
if not os.path.exists(project_path):
    !pip install -U -q dvc dvc[gdrive]
    # Copies the contents of the Drive `git_credentials` folder into the home
    # directory. NOTE(review): presumably requires Google Drive to be mounted
    # at /content/drive beforehand -- confirm.
    !cp -r /content/drive/MyDrive/git_credentials/. ~
    !mkdir "{project_path}"
    %cd "{project_path}"
    !git init
    !git remote add origin https://github.com/"{account}"/"{project_name}".git
    # Pull the remote branch first, then create a local branch of the same name.
    !git pull origin "{branch}"
    !git checkout -b "{branch}"
else:
    # Checkout directory already exists: only change into it.
    %cd "{project_path}"

[K     |████████████████████████████████| 666 kB 7.1 MB/s 
[K     |████████████████████████████████| 170 kB 61.1 MB/s 
[K     |████████████████████████████████| 49 kB 6.5 MB/s 
[K     |████████████████████████████████| 211 kB 66.3 MB/s 
[K     |████████████████████████████████| 530 kB 51.2 MB/s 
[K     |████████████████████████████████| 119 kB 58.5 MB/s 
[K     |████████████████████████████████| 40 kB 18 kB/s 
[K     |████████████████████████████████| 109 kB 54.4 MB/s 
[K     |████████████████████████████████| 296 kB 57.1 MB/s 
[K     |████████████████████████████████| 44 kB 2.9 MB/s 
[K     |████████████████████████████████| 4.6 MB 54.7 MB/s 
[K     |████████████████████████████████| 1.3 MB 64.0 MB/s 
[K     |████████████████████████████████| 63 kB 2.0 MB/s 
[K     |████████████████████████████████| 201 kB 48.6 MB/s 
[K     |████████████████████████████████| 64 kB 3.2 MB/s 
[K     |████████████████████████████████| 2.6 MB 39.5 MB/s 
[K     |███████████████████████████

In [None]:
# Inspect the working tree before committing.
!git status

On branch US567625
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

	[31mmodified:   Makefile[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mdata/bronze/[m

no changes added to commit (use "git add" and/or "git commit -a")


In [None]:
# Stage everything under the current directory, commit it, and push to the
# remote branch named by the `branch` variable.
!git add .
!git commit -m 'commit'
!git push origin "{branch}"

In [None]:
%%writefile ./src/preprocess.py
import pickle
import shutil
import tempfile
import os
from datetime import date
from pathlib import Path
import gzip
import argparse
from abc import *

import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()


class AbstractDataset(metaclass=ABCMeta):
    """Base class for turning a raw ratings table into a pickled data split.

    Subclasses implement ``code()``, ``preprocess()`` and
    ``load_ratings_df()``. The helpers defined here work on a DataFrame with
    ``uid`` and ``sid`` columns (plus ``timestamp`` for splitting).

    The path helpers read the module-level names ``RAW_DATASET_ROOT_FOLDER``
    and ``PREP_DATASET_ROOT_FOLDER``; both must be defined before any path
    helper is called.
    """

    def __init__(self, args):
        """Copy the preprocessing options from ``args``.

        Args:
            args: object exposing ``min_rating``, ``min_uc``, ``min_sc`` and
                ``split`` attributes.

        Raises:
            AssertionError: if ``min_uc`` is smaller than 2.
        """
        # Stored only; none of the methods in this class reads min_rating.
        self.min_rating = args.min_rating
        self.min_uc = args.min_uc
        self.min_sc = args.min_sc
        self.split = args.split

        assert self.min_uc >= 2, 'Need at least 2 ratings per user for validation and test'

    @classmethod
    @abstractmethod
    def code(cls):
        """Return the dataset's short identifier (used to build folder names)."""
        pass

    @classmethod
    def raw_code(cls):
        """Return the identifier of the raw data folder; defaults to ``code()``."""
        return cls.code()

    @abstractmethod
    def preprocess(self):
        """Build the preprocessed dataset file. Implemented by subclasses."""
        pass

    @abstractmethod
    def load_ratings_df(self):
        """Load the raw ratings table. Implemented by subclasses."""
        pass

    def load_dataset(self):
        """Run ``preprocess()`` and return the unpickled dataset object."""
        self.preprocess()
        dataset_path = self._get_preprocessed_dataset_path()
        # NOTE: pickle can execute arbitrary code while loading; only read
        # files that this pipeline itself produced.
        # The context manager closes the handle even if unpickling fails.
        with dataset_path.open('rb') as f:
            return pickle.load(f)

    def filter_triplets(self, df):
        """Drop rare items, then users with too few remaining interactions.

        Items with fewer than ``min_sc`` rows are removed first; users with
        fewer than ``min_uc`` rows in the remaining frame are removed next.
        A threshold of 0 (or less) disables the corresponding filter.

        Args:
            df: DataFrame with ``uid`` and ``sid`` columns.

        Returns:
            The filtered DataFrame.
        """
        print('Filtering triplets')
        if self.min_sc > 0:
            item_sizes = df.groupby('sid').size()
            good_items = item_sizes.index[item_sizes >= self.min_sc]
            df = df[df['sid'].isin(good_items)]

        if self.min_uc > 0:
            user_sizes = df.groupby('uid').size()
            good_users = user_sizes.index[user_sizes >= self.min_uc]
            df = df[df['uid'].isin(good_users)]
        return df

    def densify_index(self, df):
        """Replace user and item ids with contiguous integers starting at 1.

        Ids are numbered in sorted order so that the mapping is the same on
        every run, independent of set iteration order.

        Args:
            df: DataFrame with ``uid`` and ``sid`` columns. It is not modified.

        Returns:
            Tuple ``(df, umap, smap)``: a new DataFrame with remapped ``uid``
            and ``sid`` columns, and the original-id -> dense-id dictionaries
            for users and items.
        """
        print('Densifying index')
        umap = {u: i for i, u in enumerate(sorted(set(df['uid'])), start=1)}
        smap = {s: i for i, s in enumerate(sorted(set(df['sid'])), start=1)}
        # assign() builds a new frame, so a filtered slice passed in by the
        # caller is neither mutated nor a source of SettingWithCopyWarning.
        df = df.assign(uid=df['uid'].map(umap), sid=df['sid'].map(smap))
        return df, umap, smap

    def split_df(self, df, user_count):
        """Split each user's item sequence into train / validation / test.

        Only the ``'leave_one_out'`` strategy is supported: per user, items
        are ordered by ``timestamp`` (ties broken by ``sid``); the last item
        goes to test, the second to last to validation, the rest to train.

        Args:
            df: DataFrame with ``uid``, ``sid`` and ``timestamp`` columns,
                where user ids are the integers ``1..user_count``.
            user_count: number of users.

        Returns:
            Tuple ``(train, val, test)`` of dicts mapping user id to a list
            of item ids.

        Raises:
            NotImplementedError: if ``self.split`` is not ``'leave_one_out'``.
        """
        if self.split != 'leave_one_out':
            raise NotImplementedError('Unsupported split: {}'.format(self.split))

        print('Splitting')
        user_group = df.groupby('uid')
        # progress_apply is only available after tqdm.pandas() has been called.
        user2items = user_group.progress_apply(
            lambda d: list(d.sort_values(by=['timestamp', 'sid'])['sid']))
        train, val, test = {}, {}, {}
        for user in range(1, user_count + 1):
            items = user2items[user]
            train[user], val[user], test[user] = items[:-2], items[-2:-1], items[-1:]
        return train, val, test

    def _get_rawdata_root_path(self):
        """Return the root folder of all raw datasets."""
        return Path(RAW_DATASET_ROOT_FOLDER)

    def _get_rawdata_folder_path(self):
        """Return the raw data folder of this dataset (root / raw_code())."""
        root = self._get_rawdata_root_path()
        return root.joinpath(self.raw_code())

    def _get_preprocessed_root_path(self):
        """Return the preprocessed data folder of this dataset (root / raw_code())."""
        root = Path(PREP_DATASET_ROOT_FOLDER)
        return root.joinpath(self.raw_code())

    def _get_preprocessed_folder_path(self):
        """Return the folder that holds the preprocessed dataset file.

        This is the dataset's preprocessed root itself, so every parameter
        combination writes to the same location.
        """
        return self._get_preprocessed_root_path()

    def _get_preprocessed_dataset_path(self):
        """Return the path of the pickled dataset file (``dataset.pkl``)."""
        folder = self._get_preprocessed_folder_path()
        return folder.joinpath('dataset.pkl')


class ML1MDataset(AbstractDataset):
    """Dataset implementation for the files stored under the ``ml-1m`` folder."""

    @classmethod
    def code(cls):
        """Return the identifier used for this dataset's folders."""
        return 'ml-1m'

    def preprocess(self):
        """Load, filter, re-index and split the ratings, then pickle the result.

        The pickled object is a dict with the keys ``train``, ``val``,
        ``test``, ``umap`` and ``smap``.
        """
        out_file = self._get_preprocessed_dataset_path()
        out_dir = out_file.parent
        # Create the output folder (and any missing parents) on first use.
        if not out_dir.is_dir():
            out_dir.mkdir(parents=True)

        ratings = self.filter_triplets(self.load_ratings_df())
        ratings, umap, smap = self.densify_index(ratings)
        train, val, test = self.split_df(ratings, len(umap))

        payload = dict(train=train, val=val, test=test, umap=umap, smap=smap)
        with out_file.open('wb') as handle:
            pickle.dump(payload, handle)

    def load_ratings_df(self):
        """Read ``ratings.dat`` from the raw data folder into a DataFrame.

        Returns:
            DataFrame with the columns ``uid``, ``sid``, ``rating`` and
            ``timestamp``.
        """
        ratings_file = self._get_rawdata_folder_path() / 'ratings.dat'
        # The file has no header row and uses the two-character separator
        # '::', which needs the python parsing engine.
        frame = pd.read_csv(ratings_file, sep='::', header=None, engine='python')
        frame.columns = ['uid', 'sid', 'rating', 'timestamp']
        return frame


# Default storage roots, relative to the working directory. They are defined
# at module level (not under the __main__ guard) because the dataset path
# helpers read them as globals; importing this module must define them too.
RAW_DATASET_ROOT_FOLDER = 'data/bronze'
PREP_DATASET_ROOT_FOLDER = 'data/silver'


def parse_args(argv=None):
    """Parse the preprocessing options.

    With no arguments the result carries the defaults
    ``min_rating=0, min_uc=5, min_sc=5, split='leave_one_out'``.

    Args:
        argv: list of argument strings; ``None`` means ``sys.argv[1:]``.

    Returns:
        ``argparse.Namespace`` with ``min_rating``, ``min_uc``, ``min_sc``
        and ``split`` attributes.
    """
    parser = argparse.ArgumentParser(
        description='Preprocess the ml-1m ratings into a pickled data split.')
    parser.add_argument('--min_rating', type=int, default=0)
    parser.add_argument('--min_uc', type=int, default=5,
                        help='minimum number of ratings per user')
    parser.add_argument('--min_sc', type=int, default=5,
                        help='minimum number of ratings per item')
    parser.add_argument('--split', default='leave_one_out',
                        choices=['leave_one_out'])
    return parser.parse_args(argv)


if __name__ == '__main__':
    args = parse_args()
    dataset = ML1MDataset(args)
    dataset.preprocess()

Overwriting ./src/preprocess.py


In [None]:
# Run the preprocessing script once directly, as a smoke test.
!python ./src/preprocess.py

Filtering triplets
Densifying index
Splitting
100% 6040/6040 [00:07<00:00, 852.56it/s]


In [None]:
!dvc run -n preprocess \
          -d src/preprocess.py -d data/bronze/ml-1m/ratings.dat \
          -o data/silver/ml-1m \
          python src/preprocess.py

!If DVC froze, see `hardlink_lock` in <[36mhttps://man.dvc.org/config#core[39m>                                                                      Running stage 'preprocess':
> python src/preprocess.py
Filtering triplets
Densifying index
Splitting
100% 6040/6040 [00:07<00:00, 853.95it/s]
Computing file/dir hashes (only done once)          |0.00 [00:00,      ?md5/s]
![A
          |0.00 [00:00,       ?it/s][A
Transferring:   0% 0/2 [00:00<?, ?file/s{'info': ''}]
676bf10661c9f231027a2ca0715450.dir:   0% 0/71 [00:00<?, ?it/s][A
676bf10661c9f231027a2ca0715450.dir:   0% 0/71 [00:00<?, ?it/s{'info': ''}][A
Creating 'dvc.yaml'
Adding stage 'preprocess' in 'dvc.yaml'
Generating lock file 'dvc.lock'
Updating lock file 'dvc.lock'

To track the changes with git, run:

	git add dvc.yaml dvc.lock data/silver/.gitignore
[0m

In [None]:
# List the working-tree state, showing each untracked file individually (-u).
!git status -u

On branch US567625
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

	[31mmodified:   Makefile[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mdata/bronze/ml-1m/.gitignore[m
	[31mdata/bronze/ml-1m/movies.dat.dvc[m
	[31mdata/bronze/ml-1m/ratings.dat.dvc[m
	[31mdata/bronze/ml-1m/users.dat.dvc[m
	[31mdata/silver/.gitignore[m
	[31mdvc.lock[m
	[31mdvc.yaml[m
	[31msrc/preprocess.py[m

no changes added to commit (use "git add" and/or "git commit -a")


In [None]:
# Check whether the DVC-tracked data and pipeline stages are up to date.
!dvc status

Data and pipelines are up to date.
[0m

In [None]:
# Record the current state of DVC-tracked files, then upload them to the
# DVC remote. The push may ask for an interactive Google authentication.
!dvc commit
!dvc push

Querying remote cache:   0% 0/1 [00:00<?, ?file/s{'info': ''}]Go to the following link in your browser:

    https://accounts.google.com/o/oauth2/auth?client_id=710796635688-iivsgbgsb6uv1fap6635dhvuei09o66c.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.appdata&access_type=offline&response_type=code&approval_prompt=force

Enter verification code: <redacted>
Authentication successful.
Transferring:   0% 0/2 [00:00<?, ?file/s{'info': ''}]
![A
  0%|          |62c4ea047f847a78a46ab3dce58bcd     0.00/? [00:00<?,        ?B/s][A
62c4ea047f847a78a46ab3dce58bcd:   0% 0.00/3.02M [00:00<?, ?B/s{'info': ''}]     [A
  0% 8.00k/3.02M [00:01<06:51, 7.68kB/s{'info': ''}]                       [A
 93% 2.82M/3.02M [00:01<00:00, 3.31MB/s{'info': ''}][A
Transferring:  50% 1/2 [00:03<00:03,  3.58s/file{'info': ''}]
![A
 

In [None]:
# Commit the DVC metadata (dvc.yaml, dvc.lock, .dvc and .gitignore files) and
# the new script, then push to the remote branch named by `branch`.
!git add .
!git commit -m 'commit'
!git push origin "{branch}"

[US567625 8a641e0] commit
 9 files changed, 220 insertions(+)
 create mode 100644 data/bronze/ml-1m/.gitignore
 create mode 100644 data/bronze/ml-1m/movies.dat.dvc
 create mode 100644 data/bronze/ml-1m/ratings.dat.dvc
 create mode 100644 data/bronze/ml-1m/users.dat.dvc
 create mode 100644 data/silver/.gitignore
 create mode 100644 dvc.lock
 create mode 100644 dvc.yaml
 create mode 100644 src/preprocess.py
Counting objects: 16, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (12/12), done.
Writing objects: 100% (16/16), 3.43 KiB | 3.43 MiB/s, done.
Total 16 (delta 1), reused 0 (delta 0)
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/recohut/recobase.git
   a49eed5..8a641e0  US567625 -> US567625
