In [1]:
# Reload imported modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2
# NOTE(review): presumably enables black auto-formatting of cells — confirm
# the lab_black extension is installed in the kernel environment.
%load_ext lab_black

In [2]:
import json
import pickle
import shutil
from pathlib import Path
from tempfile import TemporaryDirectory

#### 1. Set up mock repositories

##### Set up the remote git repo for the application used to train the model (this would usually be hosted in the cloud, e.g. GitHub)

In [3]:
# Temporary directory that plays the role of the hosted git remote.
# It is created as a bare repository (no working tree), like a server-side repo.
git_remote = TemporaryDirectory()
!git init --bare {git_remote.name}

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /private/var/folders/l9/lc2j46w15bxdzz_cwhq2lx8w0000gn/T/tmpxyv1aezm/


##### Set up local git repo for application used to train the model

In [4]:
# Temporary directory that plays the role of the local application checkout.
app_dir = TemporaryDirectory()
!git init {app_dir.name}
# Point the local repo's `origin` at the mock remote created for this demo.
!cd {app_dir.name} && git remote add origin {git_remote.name}

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /private/var/folders/l9/lc2j46w15bxdzz_cwhq2lx8w0000gn/T/tmp_558rbu0/.git/


##### Set up DVC remote to store the data (this would usually be hosted in the cloud e.g. S3, Azure Blob, Google Cloud Storage)

In [5]:
# Temporary directory that plays the role of cloud blob storage for DVC.
dvc_blob_storage = TemporaryDirectory()
# Initialise DVC inside the application repo (`--cd` runs DVC from that directory).
!dvc --cd {app_dir.name} init
# Install DVC's git hooks into the application repo.
!dvc --cd {app_dir.name} install
# Register the blob-storage directory as the default (`-d`) DVC remote.
!dvc --cd {app_dir.name} remote add -d dvc_blob_storage {dvc_blob_storage.name}

Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org/doc/user-guide/analytics[39m>              [31m|[0m
[31m|[0m                                                                     [31m|[0m
[31m+---------------------------------------------------------------------+
[0m
[33mWhat's next?[39m
[33m------------[39m
- Check out the documentation: <[36mhttps://dvc.org/doc[39m>
- Get help and share ideas: <[36mhttps://dvc.org/chat[39m>
- Star us on GitHub: <[36mhttps://github.com/iterative/dvc[39m>
[0m[0mSetting 'dvc_blob_storage' as a default remote.
[0m

#### 2. Track existing data

##### Copying data to application directory to mimic existing data already in the repository

In [6]:
# Seed the application repo with the first two samples so it looks like a
# project that already contains training data.
INPUT_DATA_SAVE_PATH = Path(app_dir.name) / "data" / "generated_samples"
for sample_name in ("sample_1", "sample_2"):
    copied_sample_path = shutil.copytree(
        Path("data", "generated_samples", sample_name),
        INPUT_DATA_SAVE_PATH / sample_name,
    )
# Display the destination of the last copy as the cell's result.
copied_sample_path

PosixPath('/var/folders/l9/lc2j46w15bxdzz_cwhq2lx8w0000gn/T/tmp_558rbu0/data/generated_samples/sample_2')

##### Track the data that was initially being used for training the model

In [7]:
# Start tracking the sample directory with DVC; `--file` sets where the
# resulting .dvc metadata file is written.
!dvc --cd {app_dir.name} add data/generated_samples --file data/generated_samples.dvc
# The metadata file must exist after `dvc add` succeeds.
assert (Path(app_dir.name) / "data" / "generated_samples.dvc").exists()

[?25l                                                                          [32m⠋[0m Checking graph
Adding...                                                                       
![A
Computing file/dir hashes (only done once)            |0.00 [00:00,      ?md5/s][A

![A[A

  0%|          |                                   0.00/? [00:00<?,        ?B/s][A[A

                                                                                [A[A

![A[A

  0%|          |                                   0.00/? [00:00<?,        ?B/s][A[A

                                                                                [A[A
                                                                                [A
![A
  0%|          |                                 0.00/211 [00:00<?,        ?B/s][A
  0%|          |                                 0.00/211 [00:00<?,        ?B/s][A
                                                                                [A
![A
  0%|   

In [8]:
# List the data directory, including hidden files.
!ls -la {Path(app_dir.name) / "data"}

total 16
drwxr-xr-x  5 kevinlu  staff  160 Jan 22 16:24 [1m[36m.[m[m
drwx------  6 kevinlu  staff  192 Jan 22 16:24 [1m[36m..[m[m
-rw-r--r--  1 kevinlu  staff   19 Jan 22 16:24 .gitignore
drwxr-xr-x  4 kevinlu  staff  128 Jan 22 16:24 [1m[36mgenerated_samples[m[m
-rw-r--r--  1 kevinlu  staff  104 Jan 22 16:24 generated_samples.dvc


In [9]:
# Show the .gitignore found in the data directory.
# `read_text()` opens, reads and closes the file, so no handle is left open.
print((Path(app_dir.name) / "data" / ".gitignore").read_text())

/generated_samples



##### Read what's inside generated_samples.dvc

In [10]:
# Show the metadata DVC recorded for the tracked directory.
# `read_text()` opens, reads and closes the file, so no handle is left open.
print((Path(app_dir.name) / "data" / "generated_samples.dvc").read_text())

outs:
- md5: 005b0633ea0f5b53e2dbf3cdbae70261.dir
  size: 3587101
  nfiles: 2
  path: generated_samples



#### 3. Train with config file

##### Set up the mock `training_inputs` folder and the `training_outputs` path

In [11]:
# Inputs of the training stage: a JSON config stored under training_inputs/configs.
TRAINING_INPUT_PATH = Path(app_dir.name) / "training_inputs"
CONFIG_FILE_PATH = TRAINING_INPUT_PATH / "configs" / "config.json"
CONFIG_FILE_PATH.parent.mkdir(exist_ok=True, parents=True)
config = {"model_name": "resnet"}
# Write through a context manager so the config is flushed and the handle is
# closed before anything else reads the file.
with CONFIG_FILE_PATH.open("w") as config_file:
    json.dump(config, config_file)
# Directory the training script writes its model artifact to (not created here).
TRAINING_OUTPUT_PATH = Path(app_dir.name) / "training_outputs"

##### Set up the mock training script

In [12]:
# Source of the mock training script. It is an f-string, so the config path,
# application directory and output directory are baked into the script text as
# absolute paths when this cell runs.
# The script loads the config, lists the entries of data/generated_samples,
# "trains" (returns fixed weights) and pickles the model name, the weights and
# the number of training entries to model.p in the output directory.
TRAIN_FILE = f"""
import json
import pickle
from pathlib import Path

def train_resnet_model(training_data):
    model_weights = [1, 1, 1]
    return model_weights
    
# load the config file & training data
config = json.load(open("{CONFIG_FILE_PATH}", "r"))
training_data = [str(p) for p in (Path("{app_dir.name}") / "data" / "generated_samples").glob("*")]

model_weights = train_resnet_model(training_data)

Path("{TRAINING_OUTPUT_PATH}").mkdir(exist_ok=True, parents=True)
pickle.dump(dict(model_name=config["model_name"], model_weights=model_weights, n_training_data_used=len(training_data)), open(f"{TRAINING_OUTPUT_PATH}/model.p", "wb"))
"""
# Write the script into the application repo as train.py.
with (Path(app_dir.name) / "train.py").open("w") as f:
    f.write(TRAIN_FILE)

##### Run the train script with DVC

In [13]:
# Define and execute the `train` stage: the script, the config folder and the
# tracked data are declared as dependencies, training_outputs as the output.
# NOTE(review): newer DVC releases replace `dvc run` with `dvc stage add` —
# confirm against the installed DVC version.
!dvc --cd {app_dir.name} run --name train --deps train.py --deps training_inputs --deps data/generated_samples --outs training_outputs python train.py

Running stage 'train':                                                          
> python train.py
Computing file/dir hashes (only done once)            |0.00 [00:00,      ?md5/s]
![A
  0%|          |                                   0.00/? [00:00<?,        ?B/s][A
  0% Transferring|                                   |0/2 [00:00<?,     ?file/s][A
![A
  0%|          |02fd9deb759a6ff05d736c9f41b0d7.dir 0.00/? [00:00<?,        ?B/s][A
  0%|          |02fd9deb759a6ff05d736c9f41b0d7.d0.00/67.0 [00:00<?,        ?B/s][A
Creating 'dvc.yaml'                                                             [A
Adding stage 'train' in 'dvc.yaml'
Generating lock file 'dvc.lock'
Updating lock file 'dvc.lock'

To track the changes with git, run:

    git add dvc.lock dvc.yaml .gitignore

To enable auto staging, run:

	dvc config core.autostage true
[0m

In [14]:
# The stage must have produced the model artifact and DVC's pipeline files.
# These are checked first so a missing file fails as a clear assertion.
assert (TRAINING_OUTPUT_PATH / "model.p").exists()
assert (Path(app_dir.name) / "dvc.lock").exists()
assert (Path(app_dir.name) / "dvc.yaml").exists()

# Load the artifact through a context manager so the handle is closed.
with (TRAINING_OUTPUT_PATH / "model.p").open("rb") as model_file:
    model = pickle.load(model_file)
assert model["model_name"] == "resnet"
assert model["model_weights"] == [1, 1, 1]
assert model["n_training_data_used"] == 2

#### 4. Reproduce the workflow with DVC and see how changing a stage's dependencies triggers retraining of the model

##### First run `dvc repro`

In [15]:
# Nothing has changed since the stage last ran, so this is the baseline repro.
!dvc --cd {app_dir.name} repro

'data/generated_samples.dvc' didn't change, skipping                  core[39m>
Stage 'train' didn't change, skipping                                           
Data and pipelines are up to date.
[0m

##### Change config file

In [16]:
# Change a dependency of the `train` stage: rewrite the config file.
config = {"model_name": "ResNeXt"}
# Close the file explicitly so the new content is on disk before the next
# cell runs DVC against it.
with CONFIG_FILE_PATH.open("w") as config_file:
    json.dump(config, config_file)

##### Run `dvc repro`

In [17]:
# Reproduce the pipeline after the config file was rewritten.
!dvc --cd {app_dir.name} repro

'data/generated_samples.dvc' didn't change, skipping                  core[39m>
Running stage 'train':                                                          
> python train.py
Computing file/dir hashes (only done once)            |0.00 [00:00,      ?md5/s]
![A
  0%|          |                                   0.00/? [00:00<?,        ?B/s][A
  0% Transferring|                                   |0/2 [00:00<?,     ?file/s][A
![A
  0%|          |9f84b7ec502831c8451d727025e315.dir 0.00/? [00:00<?,        ?B/s][A
  0%|          |9f84b7ec502831c8451d727025e315.d0.00/67.0 [00:00<?,        ?B/s][A
Updating lock file 'dvc.lock'                                                   [A

To track the changes with git, run:

    git add dvc.lock

To enable auto staging, run:

	dvc config core.autostage true
Use `dvc push` to send your updates to remote storage.
[0m

##### Check if the `train.py` script was run

In [18]:
# Reload the artifact; a changed model name proves train.py ran again with the
# updated config. The context manager closes the handle after loading.
with (TRAINING_OUTPUT_PATH / "model.p").open("rb") as model_file:
    model = pickle.load(model_file)
assert model["model_name"] == "ResNeXt"
assert model["n_training_data_used"] == 2
assert model["model_weights"] == [1, 1, 1]

##### Add new data

In [19]:
# Change another dependency of the `train` stage: add a third sample to the
# DVC-tracked data directory.
new_sample_name = "sample_3"
shutil.copytree(
    Path("data", "generated_samples", new_sample_name),
    INPUT_DATA_SAVE_PATH / new_sample_name,
)

PosixPath('/var/folders/l9/lc2j46w15bxdzz_cwhq2lx8w0000gn/T/tmp_558rbu0/data/generated_samples/sample_3')

##### Run `dvc repro`

In [20]:
# Reproduce the pipeline after a new sample was added to the tracked data.
!dvc --cd {app_dir.name} repro

Computing file/dir hashes (only done once)            |0.00 [00:00,      ?md5/s]
![A
  0%|          |                                   0.00/? [00:00<?,        ?B/s][A

  0%|          |                                   0.00/? [00:00<?,        ?B/s][A[A
                                                                                [A

                                                                                [A[A
![A
  0%|          |                                   0.00/? [00:00<?,        ?B/s][A
Verifying data sources in stage: 'data/generated_samples.dvc'                   [A
  0% Transferring|                                   |0/2 [00:00<?,     ?file/s]
![A
  0%|          |6c61d04dc5ba4307ffa47684979360.dir 0.00/? [00:00<?,        ?B/s][A
  0%|          |6c61d04dc5ba4307ffa47684979360.di0.00/312 [00:00<?,        ?B/s][A
                                                                                [A
Running stage 'train':                                     

##### Check if the `train.py` script was run

In [21]:
# Reload the artifact; three training entries prove train.py ran again on the
# enlarged data set. The context manager closes the handle after loading.
with (TRAINING_OUTPUT_PATH / "model.p").open("rb") as model_file:
    model = pickle.load(model_file)
assert model["model_name"] == "ResNeXt"
assert model["n_training_data_used"] == 3
assert model["model_weights"] == [1, 1, 1]

##### Update `train.py`

In [22]:
# Second version of the mock training script. The only difference from the
# first version written earlier in this notebook is that the "trained" weights
# are [2, 2, 2] instead of [1, 1, 1].
# As before, this is an f-string: the paths are baked into the script text as
# absolute paths when this cell runs.
TRAIN_FILE = f"""
import json
import pickle
from pathlib import Path

def train_resnet_model(training_data):
    model_weights = [2, 2, 2]
    return model_weights
    
# load the config file & training data
config = json.load(open("{CONFIG_FILE_PATH}", "r"))
training_data = [str(p) for p in (Path("{app_dir.name}") / "data" / "generated_samples").glob("*")]

model_weights = train_resnet_model(training_data)

Path("{TRAINING_OUTPUT_PATH}").mkdir(exist_ok=True, parents=True)
pickle.dump(dict(model_name=config["model_name"], model_weights=model_weights, n_training_data_used=len(training_data)), open(f"{TRAINING_OUTPUT_PATH}/model.p", "wb"))
"""
# Overwrite train.py in the application repo with the new version.
with (Path(app_dir.name) / "train.py").open("w") as f:
    f.write(TRAIN_FILE)

##### Run `dvc repro`

In [23]:
# Reproduce the pipeline after train.py itself was rewritten.
!dvc --cd {app_dir.name} repro

'data/generated_samples.dvc' didn't change, skipping                  core[39m>
Running stage 'train':                                                          
> python train.py
Computing file/dir hashes (only done once)            |0.00 [00:00,      ?md5/s]
![A
  0%|          |                                   0.00/? [00:00<?,        ?B/s][A
  0% Transferring|                                   |0/2 [00:00<?,     ?file/s][A
![A
  0%|          |0ca6334ac2dbf1af5432e6f453fdcc.dir 0.00/? [00:00<?,        ?B/s][A
  0%|          |0ca6334ac2dbf1af5432e6f453fdcc.d0.00/67.0 [00:00<?,        ?B/s][A
Updating lock file 'dvc.lock'                                                   [A

To track the changes with git, run:

    git add dvc.lock

To enable auto staging, run:

	dvc config core.autostage true
Use `dvc push` to send your updates to remote storage.
[0m

In [24]:
# Reload the artifact; the new weights prove the updated train.py was run.
# The context manager closes the handle after loading.
with (TRAINING_OUTPUT_PATH / "model.p").open("rb") as model_file:
    model = pickle.load(model_file)
assert model["model_name"] == "ResNeXt"
assert model["n_training_data_used"] == 3
assert model["model_weights"] == [2, 2, 2]

#### 5. Checking the file sizes of the repositories and blob storage

##### Push to remote (notice how the DVC-tracked files are automatically pushed to the DVC remote too, via the git hooks installed by `dvc install`)

In [25]:
!cd {app_dir.name} && git add .
!cd {app_dir.name} && git commit -m 'initial commit'
!cd {app_dir.name} && git push origin master

Data and pipelines are up to date.                                              
[0m[master (root-commit) 460e3dc] initial commit
 16 files changed, 574 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvc/plots/confusion.json
 create mode 100644 .dvc/plots/confusion_normalized.json
 create mode 100644 .dvc/plots/linear.json
 create mode 100644 .dvc/plots/scatter.json
 create mode 100644 .dvc/plots/simple.json
 create mode 100644 .dvc/plots/smooth.json
 create mode 100644 .dvcignore
 create mode 100644 .gitignore
 create mode 100644 data/.gitignore
 create mode 100644 data/generated_samples.dvc
 create mode 100644 dvc.lock
 create mode 100644 dvc.yaml
 create mode 100644 train.py
 create mode 100644 training_inputs/configs/config.json
6 files pushed                                                                  
[0mEnumerating objects: 23, done.
Counting objects: 100% (23/23), done.
Delta compression using up to 10 threads
Compr

##### Check files in the application repo

In [26]:
# Show the application working copy (tree's default listing omits hidden entries).
!tree {app_dir.name}

[01;34m/var/folders/l9/lc2j46w15bxdzz_cwhq2lx8w0000gn/T/tmp_558rbu0[00m
├── [01;34mdata[00m
│   ├── [01;34mgenerated_samples[00m
│   │   ├── [01;34msample_1[00m
│   │   │   └── [01;35mpreethi-viswanathan-h5fsy4E4FMY-unsplash.jpg[00m
│   │   ├── [01;34msample_2[00m
│   │   │   └── [01;35mnasa-Q1p7bh3SHj8-unsplash.jpg[00m
│   │   └── [01;34msample_3[00m
│   │       └── [01;35mjj-ying-8bghKxNU1j0-unsplash.jpg[00m
│   └── generated_samples.dvc
├── dvc.lock
├── dvc.yaml
├── train.py
├── [01;34mtraining_inputs[00m
│   └── [01;34mconfigs[00m
│       └── config.json
└── [01;34mtraining_outputs[00m
    └── model.p

8 directories, 9 files


In [27]:
# Total size of the application directory; with no operand `du` measures the
# current directory, hidden .git and .dvc folders included.
!cd {app_dir.name} && du -sh --

8.8M	.


##### Check the size of the git remote

In [28]:
# Total size of the bare git remote.
!cd {git_remote.name} && du -sh --

172K	.


##### Check the size of the dvc remote

In [29]:
# Total size of the directory used as the DVC remote.
!cd {dvc_blob_storage.name} && du -sh --

4.2M	.


#### 6. Using `dvc pull` to only pull down the model artifacts

##### Clone the git repository into a mock CI environment

In [30]:
ci_env_dir = TemporaryDirectory()

!cd {ci_env_dir.name} && git clone {git_remote.name}
cloned_project_path = f"{ci_env_dir.name}/{Path(git_remote.name).stem}"

Cloning into 'tmpxyv1aezm'...
done.


##### Check that only the files tracked in git were pulled down

In [31]:
# Show the fresh clone before anything has been pulled with DVC.
!tree {cloned_project_path}

[01;34m/var/folders/l9/lc2j46w15bxdzz_cwhq2lx8w0000gn/T/tmp05qql2dm/tmpxyv1aezm[00m
├── [01;34mdata[00m
│   └── generated_samples.dvc
├── dvc.lock
├── dvc.yaml
├── train.py
└── [01;34mtraining_inputs[00m
    └── [01;34mconfigs[00m
        └── config.json

3 directories, 5 files


##### Pull down the model file

In [32]:
# Pull only what the `train` target refers to, not everything DVC tracks.
!dvc --cd {cloned_project_path} pull train

  0% Checkout|                                       |0/1 [00:00<?,     ?file/s]
![A
  0%|          |.eio9jzvnrBGYUNUjT6Ss87.tmp        0.00/? [00:00<?,        ?B/s][A
  0%|          |.eio9jzvnrBGYUNUjT6Ss87.tmp     0.00/4.00 [00:00<?,        ?B/s][A
[32mA[0m       training_outputs/                                              [A
1 file added and 1 file fetched
[0m

##### Check that only the model file is pulled, but not the data

In [33]:
# Show the clone again after the targeted `dvc pull`.
!tree {cloned_project_path}

[01;34m/var/folders/l9/lc2j46w15bxdzz_cwhq2lx8w0000gn/T/tmp05qql2dm/tmpxyv1aezm[00m
├── [01;34mdata[00m
│   └── generated_samples.dvc
├── dvc.lock
├── dvc.yaml
├── train.py
├── [01;34mtraining_inputs[00m
│   └── [01;34mconfigs[00m
│       └── config.json
└── [01;34mtraining_outputs[00m
    └── model.p

4 directories, 6 files


#### 7. Mimic developer environment

##### First pull down the git files

In [34]:
dev_dir = TemporaryDirectory()
!cd {dev_dir.name} && git clone {git_remote.name}
cloned_project_path = f"{dev_dir.name}/{Path(git_remote.name).stem}"

Cloning into 'tmpxyv1aezm'...
done.


##### Check the files

In [35]:
# Show the developer's fresh clone before anything has been pulled with DVC.
!tree {cloned_project_path}

[01;34m/var/folders/l9/lc2j46w15bxdzz_cwhq2lx8w0000gn/T/tmpwcgmjg8c/tmpxyv1aezm[00m
├── [01;34mdata[00m
│   └── generated_samples.dvc
├── dvc.lock
├── dvc.yaml
├── train.py
└── [01;34mtraining_inputs[00m
    └── [01;34mconfigs[00m
        └── config.json

3 directories, 5 files


##### Pull down everything tracked by DVC (data and model)

In [36]:
# Pull with no target, i.e. not restricted to a single stage.
!dvc --cd {cloned_project_path} pull

  0% Checkout|                                       |0/4 [00:00<?,     ?file/s]
![A
  0%|          |.bUb6nf4rmnpxq2UDBKWYNT.tmp        0.00/? [00:00<?,        ?B/s][A
  0%|          |.bUb6nf4rmnpxq2UDBKWYNT.tmp     0.00/4.00 [00:00<?,        ?B/s][A
                                                                                [A
![A
  0%|          |.Mc5ik9dwo8kFmbmwbNApWP.tmp        0.00/? [00:00<?,        ?B/s][A
  0%|          |.Mc5ik9dwo8kFmbmwbNApWP.tmp     0.00/4.00 [00:00<?,        ?B/s][A
[32mA[0m       data/generated_samples/                                        [A
[32mA[0m       training_outputs/
2 files added and 4 files fetched
[0m

##### Check the files after DVC pull

In [37]:
# Show the developer's clone again after the full `dvc pull`.
!tree {cloned_project_path}

[01;34m/var/folders/l9/lc2j46w15bxdzz_cwhq2lx8w0000gn/T/tmpwcgmjg8c/tmpxyv1aezm[00m
├── [01;34mdata[00m
│   ├── [01;34mgenerated_samples[00m
│   │   ├── [01;34msample_1[00m
│   │   │   └── [01;35mpreethi-viswanathan-h5fsy4E4FMY-unsplash.jpg[00m
│   │   ├── [01;34msample_2[00m
│   │   │   └── [01;35mnasa-Q1p7bh3SHj8-unsplash.jpg[00m
│   │   └── [01;34msample_3[00m
│   │       └── [01;35mjj-ying-8bghKxNU1j0-unsplash.jpg[00m
│   └── generated_samples.dvc
├── dvc.lock
├── dvc.yaml
├── train.py
├── [01;34mtraining_inputs[00m
│   └── [01;34mconfigs[00m
│       └── config.json
└── [01;34mtraining_outputs[00m
    └── model.p

8 directories, 9 files
