Skip to content

Commit

Permalink
feat(template): complete cifar10 classification (#118)
Browse files Browse the repository at this point in the history
* feat(template): complete cifar10 classification

* feat: add log_every_iters in config.yaml and omegaconf req

* chore: use idist.Parallel, spawn_kwargs -> kwargs

* feat: add support for argparse and hydra

Add configuration library selection in tab Templates

* fix: merge config in json file

* fix!: keep yaml/hydra combo, add logger pkg in requirements.txt

remove the json/argparse combo, since json.load will result in an error for a plain `int`, and argparse will fail when asked to call the string "int" on the given inputs.

* fix: fix cmd run script in readme, add node_rank

* fix: disable hydra logging and outputs path
  • Loading branch information
ydcjeff committed May 21, 2021
1 parent 3afba9e commit 41e2d14
Show file tree
Hide file tree
Showing 21 changed files with 554 additions and 193 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ dist-ssr
*.local
__pycache__
*.log
.vscode
.vscode
*.tar.gz
21 changes: 21 additions & 0 deletions scripts/check_copies.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,26 @@ def check_utils():
print(red, "Unmatched", file, reset)


def check_readme():
    """Verify each template README embeds the shared template-common README.

    Reads the common README once, then scans every README.md under
    ./src/templates/ and reports (in color) whether the common content
    appears verbatim inside it.  Mirrors the reporting style of
    ``check_utils``.  Exits normally either way; output is informational.
    """
    red = "\033[31m"
    green = "\033[32m"
    reset = "\033[0m"

    # Read with an explicit encoding, consistent with the per-file reads below
    # (the original used open() with the platform default encoding here).
    common_utils = Path("./src/templates/template-common/README.md").read_text("utf-8")

    path = Path("./src/templates/")

    for file in path.rglob("**/README.md"):
        utils = file.read_text("utf-8")
        # `in` is the idiomatic substring test (was: utils.find(...) > -1).
        # NOTE(review): template-common/README.md matches itself trivially —
        # presumably intentional; confirm if it should be skipped.
        if common_utils in utils:
            print(green, "Matched", file, reset)
        else:
            print(red, "Unmatched", file, reset)


# Entry point: run both copy-consistency checks, separating their
# colored reports with a blank line for readability.
if __name__ == "__main__":
    check_utils()
    print()
    check_readme()
    print()
2 changes: 2 additions & 0 deletions src/components/CodeBlock.vue
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
<script>
import { highlight, languages } from 'prismjs'
import 'prismjs/components/prism-json'
import 'prismjs/components/prism-yaml'
import 'prismjs/components/prism-python'
import 'prismjs/components/prism-markdown'
import 'prismjs/themes/prism-tomorrow.css'
Expand Down Expand Up @@ -162,6 +163,7 @@ div[class~='language-bash']::before {
content: 'sh';
}
div[class~='language-yml']::before,
div[class~='language-yaml']::before {
content: 'yaml';
}
Expand Down
5 changes: 4 additions & 1 deletion src/components/NavBar.vue
Original file line number Diff line number Diff line change
Expand Up @@ -95,16 +95,19 @@ import { ref } from 'vue'
export default {
components: { IconDiscord, IconDownload, IconGitHub, IconTwitter },
setup() {
let zip = new JSZip()
const showDownloadMsg = ref(false)
const currentCommit = __COMMIT__ /* from vite.config.js */
const downloadProject = () => {
const zip = new JSZip()
if (store.code && Object.keys(store.code).length) {
msg.color = '#ff0000'
if (!store.config.output_dir) {
msg.showMsg = true
msg.content = `Output directory is required. Please input in Loggers tab.`
} else if (!store.config.log_every_iters) {
msg.showMsg = true
msg.content = `Logging interval is required. Please input in Loggers tab.`
} else {
for (const filename in store.code) {
zip.file(filename, store.code[filename])
Expand Down
18 changes: 8 additions & 10 deletions src/components/PaneRight.vue
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
<template>
<div v-if="tabs">
<div v-if="tabs()">
<div class="right-pane-tabs">
<div
v-for="tab in tabs"
v-for="tab in tabs()"
:key="tab"
class="right-pane-tab"
:class="{ active: currentTab === tab }"
Expand Down Expand Up @@ -38,22 +38,20 @@ export default {
components: { CodeBlock, Instruction },
setup() {
const currentTab = ref('README.md')
const tabs = computed(() => {
const tabs = () => {
if (store.config.template) {
const tabsArr = Object.keys(templates[store.config.template])
if (import.meta.env.DEV) {
tabsArr.push(__DEV_CONFIG_FILE__)
}
return tabsArr
return Object.keys(store.code)
}
})
}
// search more file types mapping on
// https://icones.js.org/collection/vscode-icons
const fileTypes = {
py: 'python',
md: 'markdown',
json: 'json',
txt: 'text'
txt: 'text',
yml: 'yaml',
yaml: 'yaml'
}
const getFileType = (tab) => {
Expand Down
5 changes: 5 additions & 0 deletions src/components/TabHandlers.vue
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
:saveKey="filename_prefix.name"
:type="filename_prefix.type"
/>
<FormInput
:label="save_every_iters.description"
:saveKey="save_every_iters.name"
:type="save_every_iters.type"
/>
<FormInput
:label="n_saved.description"
:saveKey="n_saved.name"
Expand Down
6 changes: 6 additions & 0 deletions src/components/TabLoggers.vue
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@
:saveKey="output_dir.name"
required
/>
<FormInput
type="number"
:label="log_every_iters.description"
:saveKey="log_every_iters.name"
required
/>
<FormSelect
:label="logger.description"
:options="logger.options"
Expand Down
6 changes: 5 additions & 1 deletion src/components/TabTemplates.vue
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,11 @@ export default {
const downloadTemplates = () => fetchTemplates(store.config.template)
return { templateLabel, templateOptions, downloadTemplates }
return {
templateLabel,
templateOptions,
downloadTemplates
}
}
}
</script>
25 changes: 18 additions & 7 deletions src/metadata/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"launch": {
"name": "launch",
"type": "radio",
"description": "Run the training with torch.distributed.launch"
"description": "Run the training with torch.distributed.launch (recommended)"
},
"spawn": {
"name": "spawn",
Expand All @@ -18,13 +18,13 @@
"nproc_per_node": {
"name": "nproc_per_node",
"type": "number",
"description": "Number of processes to launch on each node",
"description": "Number of processes to launch on each node (mandatory for single node, multi gpus distributed training)",
"min": 1
},
"nnodes": {
"name": "nnodes",
"type": "number",
"description": "Number of nodes to use for distributed training",
"description": "Number of nodes to use for distributed training (mandatory for multi nodes, multi gpus distributed training)",
"min": 1
},
"master_addr": {
Expand All @@ -43,7 +43,7 @@
"save_training": {
"name": "save_training",
"type": "checkbox",
"description": "Save the training state by every save_every_iters."
"description": "Save the training state (models, optimizers, trainers, ...) by every save_every_iters."
},
"save_evaluation": {
"name": "save_evaluation",
Expand All @@ -69,18 +69,24 @@
"name": "filename_prefix",
"type": "text",
"value": "checkpointing",
"description": "What prefix would you like to put in front of saved checkpoint file?"
"description": "What prefix would you like to put in front of saved checkpoint file? (mandatory for saving training states)"
},
"save_every_iters": {
"name": "save_every_iters",
"type": "number",
"value": "checkpointing",
"description": "Iteration interval for saving training states (mandatory for saving training states)"
},
"n_saved": {
"name": "n_saved",
"type": "number",
"value": "checkpointing",
"description": "How many checkpoint file would you like to keep on disk?"
"description": "How many checkpoint file would you like to keep on disk? (mandatory for saving both training and evaluation)"
},
"limit_sec": {
"name": "limit_sec",
"type": "number",
"description": "How long do you want to run for the training and then terminate?"
"description": "How long do you want to run for the training and then terminate? (in seconds)"
}
},
"loggers": {
Expand All @@ -89,6 +95,11 @@
"type": "text",
"description": "Directory to save all outputs"
},
"log_every_iters": {
"name": "log_every_iters",
"type": "number",
"description": "Logging interval for training statistics"
},
"logger": {
"name": "logger",
"type": "array",
Expand Down
12 changes: 7 additions & 5 deletions src/store.js
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,17 @@ export function saveConfig(key, value) {
}

// render the code if there are fetched files for current selected template
export async function genCode() {
export function genCode() {
const currentFiles = files[store.config.template]
if (currentFiles && Object.keys(currentFiles).length) {
for (const file in currentFiles) {
store.code[file] = ejs.render(currentFiles[file], store.config)
store.code[file] = ejs
.render(currentFiles[file], store.config)
.replaceAll(/(\n\n\n\n)+/gi, '\n')
}
if (isDev) {
store.code[__DEV_CONFIG_FILE__] = JSON.stringify(store.config, null, 2)
}
}
if (isDev) {
store.code[__DEV_CONFIG_FILE__] = JSON.stringify(store.config, null, 2)
}
}

Expand Down
125 changes: 125 additions & 0 deletions src/templates/template-common/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#::: if (it.dist === 'launch') { :::#
#::: if (it.nproc_per_node) { :::#
#::: if (it.nnodes && it.master_addr && it.master_port) { :::#

### Multi Node, Multi GPU Training (`torch.distributed.launch`) (recommended)

- Execute on master node

```sh
python -m torch.distributed.launch \
  --nproc_per_node #:::= it.nproc_per_node :::# \
--nnodes #:::= it.nnodes :::# \
--node_rank 0 \
--master_addr #:::= it.master_addr :::# \
--master_port #:::= it.master_port :::# \
--use_env main.py backend=nccl \
hydra.run.dir=. \
hydra.output_subdir=null \
hydra/job_logging=disabled \
hydra/hydra_logging=disabled
```

- Execute on worker nodes

```sh
python -m torch.distributed.launch \
  --nproc_per_node #:::= it.nproc_per_node :::# \
--nnodes #:::= it.nnodes :::# \
--node_rank <node_rank> \
--master_addr #:::= it.master_addr :::# \
--master_port #:::= it.master_port :::# \
--use_env main.py backend=nccl \
hydra.run.dir=. \
hydra.output_subdir=null \
hydra/job_logging=disabled \
hydra/hydra_logging=disabled
```

#::: } else { :::#

### Multi GPU Training (`torch.distributed.launch`) (recommended)

```sh
python -m torch.distributed.launch \
--nproc_per_node #:::= it.nproc_per_node :::# \
--use_env main.py backend=nccl \
hydra.run.dir=. \
hydra.output_subdir=null \
hydra/job_logging=disabled \
hydra/hydra_logging=disabled
```

#::: } :::#
#::: } :::#
#::: } :::#

#::: if (it.dist === 'spawn') { :::#
#::: if (it.nproc_per_node) { :::#
#::: if (it.nnodes && it.master_addr && it.master_port) { :::#

### Multi Node, Multi GPU Training (`torch.multiprocessing.spawn`)

- Execute on master node

```sh
python main.py \
  nproc_per_node=#:::= it.nproc_per_node :::# \
nnodes=#:::= it.nnodes :::# \
node_rank=0 \
master_addr=#:::= it.master_addr :::# \
master_port=#:::= it.master_port :::# \
backend=nccl \
hydra.run.dir=. \
hydra.output_subdir=null \
hydra/job_logging=disabled \
hydra/hydra_logging=disabled
```

- Execute on worker nodes

```sh
python main.py \
  nproc_per_node=#:::= it.nproc_per_node :::# \
nnodes=#:::= it.nnodes :::# \
node_rank=<node_rank> \
master_addr=#:::= it.master_addr :::# \
master_port=#:::= it.master_port :::# \
backend=nccl \
hydra.run.dir=. \
hydra.output_subdir=null \
hydra/job_logging=disabled \
hydra/hydra_logging=disabled
```

#::: } else { :::#

### Multi GPU Training (`torch.multiprocessing.spawn`)

```sh
python main.py \
nproc_per_node=#:::= it.nproc_per_node :::# \
backend=nccl \
hydra.run.dir=. \
hydra.output_subdir=null \
hydra/job_logging=disabled \
hydra/hydra_logging=disabled
```

#::: } :::#
#::: } :::#
#::: } :::#

#::: if (!it.nproc_per_node) { :::#

### 1 GPU Training

```sh
python main.py \
hydra.run.dir=. \
hydra.output_subdir=null \
hydra/job_logging=disabled \
hydra/hydra_logging=disabled
```

#::: } :::#

0 comments on commit 41e2d14

Please sign in to comment.