In [1]:
# Load IPython's autoreload extension and reload imported modules before every
# cell execution, so edits to library code are picked up without a restart.
%load_ext autoreload
%autoreload 2
# lab_black: presumably the extension that formats cells with Black on execution.
%load_ext lab_black

In [2]:
# default_exp dvc

In [3]:
# export
from pathlib import Path
from roheboam.engine.utils.convenience import is_notebook, run_shell_command
from tempfile import TemporaryDirectory

# Directory that holds this notebook/module; sample data is resolved against it.
# In a notebook `__file__` is undefined, so the first entry of `_dh` is used
# instead (presumably IPython's directory history, i.e. the kernel's start dir).
ROOT_PATH = Path(globals()["_dh"][0]) if is_notebook() else Path(__file__).parent

In [4]:
# export
from roheboam.engine.integrations.dvc import DVC

  m_new_contract('Container', ist(collections.Container))


#### 0. Setup

In [5]:
# Stand-in for a hosted git server: a bare repository in a throwaway directory.
print(
    "Creating git remote for application (this would usually be hosted in the cloud e.g. Github)"
)
git_remote = TemporaryDirectory()
run_shell_command(f"git init --bare {git_remote.name}")

# The application's working repository, with the bare repository as `origin`.
print("Creating git for application")
app_dir = TemporaryDirectory()
run_shell_command(f"git init {app_dir.name}")
run_shell_command(f"cd {app_dir.name} && git remote add origin {git_remote.name}")


# Stand-in for cloud object storage: a throwaway directory used as the DVC remote.
print(
    "Creating DVC remote (this would usually be hosted in the cloud e.g. S3, Azure Blob, Google Cloud Storage)"
)
dvc = DVC()
dvc_remote = TemporaryDirectory()

# Every DVC call below is pointed at the application repository via dvc_run_path.
dvc.init(dvc_run_path=app_dir.name)
# NOTE(review): presumably wraps `dvc install` (git hooks) — confirm in the DVC wrapper.
dvc.install(dvc_run_path=app_dir.name)
dvc.add_remote(
    remote_name="local_remote",
    remote_path=dvc_remote.name,
    dvc_run_path=app_dir.name,
)

  and should_run_async(code)


Creating git remote for application (this would usually be hosted in the cloud e.g. Github)
git init --bare /tmp/tmppp14pdmm
hint: Using 'master' as the name for the initial branch. This default branch name
hint: is subject to change. To configure the initial branch name to use in all
hint: 
hint: 	git config --global init.defaultBranch <name>
hint: 
hint: Names commonly chosen instead of 'master' are 'main', 'trunk' and
hint: 'development'. The just-created branch can be renamed via this command:
hint: 
hint: 	git branch -m <name>
Initialized empty Git repository in /tmp/tmppp14pdmm/
Creating git for application
git init /tmp/tmp8fkr3ksl
hint: Using 'master' as the name for the initial branch. This default branch name
hint: is subject to change. To configure the initial branch name to use in all
hint: 
hint: 	git config --global init.defaultBranch <name>
hint: 
hint: Names commonly chosen instead of 'master' are 'main', 'trunk' and
hint: 'development'. The just-created branch can be r

#### 1. Track existing data

In [6]:
import shutil

print("Copying data to application directory to mock existing data")
# Mirror each pre-generated sample folder into the application repository.
for sample_name in ("sample_1", "sample_2"):
    shutil.copytree(
        ROOT_PATH / "data" / "generated_samples" / sample_name,
        Path(app_dir.name) / "data" / "generated_samples" / sample_name,
    )

print("Track data")
app_data_path = Path(app_dir.name) / "data"
# Track the whole samples folder with DVC; the .dvc file is written next to it.
dvc.add(
    add_path=app_data_path / "generated_samples",
    dvc_output_path=app_data_path / "generated_samples.dvc",
    dvc_run_path=Path(app_dir.name),
)
assert (app_data_path / "generated_samples.dvc").exists()

Copying data to application directory to mock existing data
Track data
dvc --cd /tmp/tmp8fkr3ksl add /tmp/tmp8fkr3ksl/data/generated_samples --file /tmp/tmp8fkr3ksl/data/generated_samples.dvc

To track the changes with git, run:

    git add data/.gitignore data/generated_samples.dvc

To enable auto staging, run:

	dvc config core.autostage true


In [7]:
# Show the .dvc file DVC generated for the tracked data folder.
# read_text() opens, reads and closes the file, so no handle is left open.
print((Path(app_dir.name) / "data" / "generated_samples.dvc").read_text())

outs:
- md5: 005b0633ea0f5b53e2dbf3cdbae70261.dir
  size: 3587101
  nfiles: 2
  path: generated_samples



#### 2. Train with config file

In [8]:
import json
import pickle

print(
    "Setup mock training_inputs folder, input config files and training_outputs folder"
)
CONFIG_FILE_PATH = Path(app_dir.name) / "training_inputs" / "configs"
CONFIG_FILE_PATH.mkdir(exist_ok=True, parents=True)
config = {"model_name": "model_1"}
# Write through a context manager so the config is flushed and closed before
# the DVC stage reads it.
with (CONFIG_FILE_PATH / "config.json").open("w") as f:
    json.dump(config, f)

MODEL_OUTPUT_PATH = Path(app_dir.name) / "training_outputs"
# Mock training script: reads the config, lists the sample folders, and pickles
# both into training_outputs/model.p.
TRAIN_FILE = f"""
import json
import pickle
from pathlib import Path
config = json.load(open("{app_dir.name}/training_inputs/configs/config.json", "r"))
training_data = [str(p) for p in (Path("{app_dir.name}") / "data" / "generated_samples").glob("*")]
(Path.cwd() / "training_outputs").mkdir(exist_ok=True, parents=True)
pickle.dump(dict(model_name=config["model_name"], training_data=training_data), open(f"{MODEL_OUTPUT_PATH}/model.p", "wb"))
"""
with (Path(app_dir.name) / "train.py").open("w") as f:
    f.write(TRAIN_FILE)

# Run with DVC
dvc.run(
    "train",
    "python train.py",
    dependency_paths=["training_inputs", "data/generated_samples"],
    output_paths=["training_outputs"],
    dvc_run_path=Path(app_dir.name),
)

# Test with assertions
# The pickle was produced by the stage above, so loading it is safe here.
with (MODEL_OUTPUT_PATH / "model.p").open("rb") as f:
    model = pickle.load(f)
assert model["model_name"] == "model_1"
assert len(model["training_data"]) == 2
assert (MODEL_OUTPUT_PATH / "model.p").exists()
assert (Path(app_dir.name) / "dvc.lock").exists()
assert (Path(app_dir.name) / "dvc.yaml").exists()

Setup mock training_inputs folder, input config files and training_outputs folder
dvc --cd /tmp/tmp8fkr3ksl run -n train -d training_inputs -d data/generated_samples -o training_outputs python train.py
Running stage 'train':
> python train.py
Creating 'dvc.yaml'
Adding stage 'train' in 'dvc.yaml'
Generating lock file 'dvc.lock'
Updating lock file 'dvc.lock'

To track the changes with git, run:

    git add .gitignore dvc.yaml dvc.lock

To enable auto staging, run:

	dvc config core.autostage true


#### 3. Change config and use DVC repro

In [9]:
import pickle

# Change config file
config = {"model_name": "model_1_changed"}
with (CONFIG_FILE_PATH / "config.json").open("w") as f:
    json.dump(config, f)

# Repro: the config is a dependency of the train stage, so DVC should rerun it.
dvc.repro(dvc_run_path=Path(app_dir.name))

# See if script was run
with (MODEL_OUTPUT_PATH / "model.p").open("rb") as f:
    model = pickle.load(f)
model_name_was_updated = model["model_name"] == "model_1_changed"
# Fail loudly on Run All instead of only displaying False.
assert model_name_was_updated, "train stage did not pick up the changed config"
model_name_was_updated

dvc --cd /tmp/tmp8fkr3ksl repro
'data/generated_samples.dvc' didn't change, skipping
Running stage 'train':
> python train.py
Updating lock file 'dvc.lock'

To track the changes with git, run:

    git add dvc.lock

To enable auto staging, run:

	dvc config core.autostage true
Use `dvc push` to send your updates to remote storage.


True

#### 4. Update data and use DVC repro

In [10]:
import pickle

# Add a third sample to the tracked data folder (the config is left unchanged)
shutil.copytree(
    ROOT_PATH / "data" / "generated_samples" / "sample_3",
    Path(app_dir.name) / "data" / "generated_samples" / "sample_3",
)

# Repro: the data folder is a dependency of the train stage, so DVC should rerun it.
dvc.repro(dvc_run_path=Path(app_dir.name))

# See if script was run
with (MODEL_OUTPUT_PATH / "model.p").open("rb") as f:
    model = pickle.load(f)
assert model["model_name"] == "model_1_changed"
assert len(model["training_data"]) == 3

dvc --cd /tmp/tmp8fkr3ksl repro
Verifying data sources in stage: 'data/generated_samples.dvc'

Running stage 'train':
> python train.py
Updating lock file 'dvc.lock'

To track the changes with git, run:

    git add data/generated_samples.dvc dvc.lock

To enable auto staging, run:

	dvc config core.autostage true
Use `dvc push` to send your updates to remote storage.


#### 5. Add everything and push to remote

In [11]:
print("Notice here that all the data that has been added by DVC is ignored")
run_shell_command(f"cd {app_dir.name} && git add .")
run_shell_command(f"cd {app_dir.name} && git commit -m 'initial commit'")
# Push whatever branch is checked out rather than hard-coding "master": the
# initial branch name depends on the user's `init.defaultBranch` setting.
run_shell_command(f"cd {app_dir.name} && git push origin HEAD")

Notice here that all the data that has been added by DVC is ignored
cd /tmp/tmp8fkr3ksl && git add .
cd /tmp/tmp8fkr3ksl && git commit -m 'initial commit'
Data and pipelines are up to date.
[master (root-commit) 1ade05e] initial commit
 16 files changed, 561 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvc/plots/confusion.json
 create mode 100644 .dvc/plots/confusion_normalized.json
 create mode 100644 .dvc/plots/linear.json
 create mode 100644 .dvc/plots/scatter.json
 create mode 100644 .dvc/plots/simple.json
 create mode 100644 .dvc/plots/smooth.json
 create mode 100644 .dvcignore
 create mode 100644 .gitignore
 create mode 100644 data/.gitignore
 create mode 100644 data/generated_samples.dvc
 create mode 100644 dvc.lock
 create mode 100644 dvc.yaml
 create mode 100644 train.py
 create mode 100644 training_inputs/configs/config.json
cd /tmp/tmp8fkr3ksl && git push origin master
6 files pushed
To /tmp/tmppp14pdmm
 * [new branch

In [12]:
# Blank separator line, then the total size of the application repository.
print()
run_shell_command(f"cd {app_dir.name} && du -sh --")


cd /tmp/tmp8fkr3ksl && du -sh --
9.1M	.


In [13]:
# Get the 5 largest entries; `du -ah` lists directories as well as files, so
# the tail of the size-sorted listing includes folder totals.
run_shell_command(f"cd {app_dir.name} && du -ah | sort -h | tail -n 5")

cd /tmp/tmp8fkr3ksl && du -ah | sort -h | tail -n 5
4.2M	./data/generated_samples
4.3M	./.dvc/cache
4.3M	./data
4.5M	./.dvc
9.1M	.


In [14]:
# List the git remotes configured for the application repository.
run_shell_command(f"cd {app_dir.name} && git remote -v")

cd /tmp/tmp8fkr3ksl && git remote -v
origin	/tmp/tmppp14pdmm (fetch)
origin	/tmp/tmppp14pdmm (push)


In [15]:
# Total size of the directory registered as the DVC remote.
run_shell_command(f"cd {dvc_remote.name} && du -sh --")

cd /tmp/tmpmj8geip_ && du -sh --
4.3M	.


In [16]:
# Total size of the bare git remote, for comparison with the DVC remote.
run_shell_command(f"cd {git_remote.name} && du -sh --")

cd /tmp/tmppp14pdmm && du -sh --
296K	.


#### 6. Pull down only the model file (useful during CI when we need to test the model and deploy it, no need for data files)

In [17]:
print("Clone down our model file")
tmp_dir_ci_env = TemporaryDirectory()
run_shell_command(f"cd {tmp_dir_ci_env.name} && git clone {git_remote.name}")
# git clones into a folder named after the remote directory.
cloned_project_path = f"{tmp_dir_ci_env.name}/{Path(git_remote.name).stem}"

print("Check we don't have our model or data files")
run_shell_command(f"cd {cloned_project_path} && du -sh -- *")
# Pull only the `train` stage rather than everything DVC tracks.
dvc.pull(stage="train", dvc_run_path=cloned_project_path)

print(
    "Check what has changed, note that even though the data folder exists, no data resides in it"
)
run_shell_command(f"cd {cloned_project_path} && du -sh -- *")

print("Check that the model file exists")
run_shell_command(f"cd {cloned_project_path}/training_outputs && du -sh -- *")

Clone down our model file
cd /tmp/tmpuu9o38nq && git clone /tmp/tmppp14pdmm
Cloning into 'tmppp14pdmm'...
done.
Check we don't have our model or data files
cd /tmp/tmpuu9o38nq/tmppp14pdmm && du -sh -- *
12K	data
4.0K	dvc.lock
4.0K	dvc.yaml
4.0K	train.py
12K	training_inputs
dvc --cd /tmp/tmpuu9o38nq/tmppp14pdmm pull train
A       training_outputs/
1 file added and 1 file fetched
Check what has changed, not that even though the data folder exists, not data resides in it
cd /tmp/tmpuu9o38nq/tmppp14pdmm && du -sh -- *
12K	data
4.0K	dvc.lock
4.0K	dvc.yaml
4.0K	train.py
12K	training_inputs
8.0K	training_outputs
Check that the model file exists
cd /tmp/tmpuu9o38nq/tmppp14pdmm/training_outputs && du -sh -- *
4.0K	model.p


#### 7. Mimic developer environment

In [18]:
print("Clone down all files needed for development")
tmp_dir_dev_environment = TemporaryDirectory()
run_shell_command(f"cd {tmp_dir_dev_environment.name} && git clone {git_remote.name}")
# git clones into a folder named after the remote directory.
cloned_project_path = "/".join(
    [tmp_dir_dev_environment.name, Path(git_remote.name).stem]
)
# Same size listing is taken before and after the pull for comparison.
disk_usage_command = f"cd {cloned_project_path} && du -sh -- *"

print("Check the files before DVC pull")
run_shell_command(disk_usage_command)

print("Check the files after DVC pull")
# No stage is given here, unlike the stage="train" pull in the CI example.
dvc.pull(dvc_run_path=cloned_project_path)
run_shell_command(disk_usage_command)

Clone down all files needed for development
cd /tmp/tmpz67_mmwv && git clone /tmp/tmppp14pdmm
Cloning into 'tmppp14pdmm'...
done.
Check the files before DVC pull
cd /tmp/tmpz67_mmwv/tmppp14pdmm && du -sh -- *
12K	data
4.0K	dvc.lock
4.0K	dvc.yaml
4.0K	train.py
12K	training_inputs
Check the files after DVC pull
dvc --cd /tmp/tmpz67_mmwv/tmppp14pdmm pull
A       training_outputs/
A       data/generated_samples/
2 files added and 4 files fetched
cd /tmp/tmpz67_mmwv/tmppp14pdmm && du -sh -- *
4.3M	data
4.0K	dvc.lock
4.0K	dvc.yaml
4.0K	train.py
12K	training_inputs
8.0K	training_outputs
