Skip to content

Commit

Permalink
feat(template): complete cifar10 classification (#118)
Browse files Browse the repository at this point in the history
* feat(template): complete cifar10 classification

* feat: add log_every_iters in config.yaml and omegaconf req

* chore: use idist.Parallel, spawn_kwargs -> kwargs

* feat: add support for argparse and hydra

Add configuration library selection in tab Templates

* fix: merge config in json file

* fix!: keep yaml/hydra combo, add logger pkg in requirements.txt

remove the json/argparse combo, since json.load will result in an error for a plain `int`, and argparse will fail when asked to call the string "int" on the given inputs.

* fix: fix cmd run script in readme, add node_rank

* fix: disable hydra logging and outputs path
  • Loading branch information
ydcjeff committed May 21, 2021
1 parent 3afba9e commit 41e2d14
Show file tree
Hide file tree
Showing 21 changed files with 554 additions and 193 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ dist-ssr
*.local
__pycache__
*.log
.vscode
.vscode
*.tar.gz
21 changes: 21 additions & 0 deletions scripts/check_copies.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,26 @@ def check_utils():
print(red, "Unmatched", file, reset)


def check_readme():
    """Verify each template README embeds the shared template-common README.

    Reads the common README once, then scans every README.md under
    ./src/templates/ and reports (in color) whether the common content
    appears verbatim inside it.  Mirrors the reporting style of
    ``check_utils``.  Exits normally either way; output is informational.
    """
    red = "\033[31m"
    green = "\033[32m"
    reset = "\033[0m"

    # Read with an explicit encoding, consistent with the per-file reads below
    # (the original used open() with the platform default encoding here).
    common_utils = Path("./src/templates/template-common/README.md").read_text("utf-8")

    path = Path("./src/templates/")

    for file in path.rglob("**/README.md"):
        utils = file.read_text("utf-8")
        # `in` is the idiomatic substring test (was: utils.find(...) > -1).
        # NOTE(review): template-common/README.md matches itself trivially —
        # presumably intentional; confirm if it should be skipped.
        if common_utils in utils:
            print(green, "Matched", file, reset)
        else:
            print(red, "Unmatched", file, reset)


# Entry point: run both copy-consistency checks, separating their
# colored reports with a blank line for readability.
if __name__ == "__main__":
    check_utils()
    print()
    check_readme()
    print()
2 changes: 2 additions & 0 deletions src/components/CodeBlock.vue
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
<script>
import { highlight, languages } from 'prismjs'
import 'prismjs/components/prism-json'
import 'prismjs/components/prism-yaml'
import 'prismjs/components/prism-python'
import 'prismjs/components/prism-markdown'
import 'prismjs/themes/prism-tomorrow.css'
Expand Down Expand Up @@ -162,6 +163,7 @@ div[class~='language-bash']::before {
content: 'sh';
}
div[class~='language-yml']::before,
div[class~='language-yaml']::before {
content: 'yaml';
}
Expand Down
5 changes: 4 additions & 1 deletion src/components/NavBar.vue
Original file line number Diff line number Diff line change
Expand Up @@ -95,16 +95,19 @@ import { ref } from 'vue'
export default {
components: { IconDiscord, IconDownload, IconGitHub, IconTwitter },
setup() {
let zip = new JSZip()
const showDownloadMsg = ref(false)
const currentCommit = __COMMIT__ /* from vite.config.js */
const downloadProject = () => {
const zip = new JSZip()
if (store.code && Object.keys(store.code).length) {
msg.color = '#ff0000'
if (!store.config.output_dir) {
msg.showMsg = true
msg.content = `Output directory is required. Please input in Loggers tab.`
} else if (!store.config.log_every_iters) {
msg.showMsg = true
msg.content = `Logging interval is required. Please input in Loggers tab.`
} else {
for (const filename in store.code) {
zip.file(filename, store.code[filename])
Expand Down
18 changes: 8 additions & 10 deletions src/components/PaneRight.vue
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
<template>
<div v-if="tabs">
<div v-if="tabs()">
<div class="right-pane-tabs">
<div
v-for="tab in tabs"
v-for="tab in tabs()"
:key="tab"
class="right-pane-tab"
:class="{ active: currentTab === tab }"
Expand Down Expand Up @@ -38,22 +38,20 @@ export default {
components: { CodeBlock, Instruction },
setup() {
const currentTab = ref('README.md')
const tabs = computed(() => {
const tabs = () => {
if (store.config.template) {
const tabsArr = Object.keys(templates[store.config.template])
if (import.meta.env.DEV) {
tabsArr.push(__DEV_CONFIG_FILE__)
}
return tabsArr
return Object.keys(store.code)
}
})
}
// search more file types mapping on
// https://icones.js.org/collection/vscode-icons
const fileTypes = {
py: 'python',
md: 'markdown',
json: 'json',
txt: 'text'
txt: 'text',
yml: 'yaml',
yaml: 'yaml'
}
const getFileType = (tab) => {
Expand Down
5 changes: 5 additions & 0 deletions src/components/TabHandlers.vue
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
:saveKey="filename_prefix.name"
:type="filename_prefix.type"
/>
<FormInput
:label="save_every_iters.description"
:saveKey="save_every_iters.name"
:type="save_every_iters.type"
/>
<FormInput
:label="n_saved.description"
:saveKey="n_saved.name"
Expand Down
6 changes: 6 additions & 0 deletions src/components/TabLoggers.vue
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@
:saveKey="output_dir.name"
required
/>
<FormInput
type="number"
:label="log_every_iters.description"
:saveKey="log_every_iters.name"
required
/>
<FormSelect
:label="logger.description"
:options="logger.options"
Expand Down
6 changes: 5 additions & 1 deletion src/components/TabTemplates.vue
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,11 @@ export default {
const downloadTemplates = () => fetchTemplates(store.config.template)
return { templateLabel, templateOptions, downloadTemplates }
return {
templateLabel,
templateOptions,
downloadTemplates
}
}
}
</script>
25 changes: 18 additions & 7 deletions src/metadata/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"launch": {
"name": "launch",
"type": "radio",
"description": "Run the training with torch.distributed.launch"
"description": "Run the training with torch.distributed.launch (recommended)"
},
"spawn": {
"name": "spawn",
Expand All @@ -18,13 +18,13 @@
"nproc_per_node": {
"name": "nproc_per_node",
"type": "number",
"description": "Number of processes to launch on each node",
"description": "Number of processes to launch on each node (mandatory for single node, multi gpus distributed training)",
"min": 1
},
"nnodes": {
"name": "nnodes",
"type": "number",
"description": "Number of nodes to use for distributed training",
"description": "Number of nodes to use for distributed training (mandatory for multi nodes, multi gpus distributed training)",
"min": 1
},
"master_addr": {
Expand All @@ -43,7 +43,7 @@
"save_training": {
"name": "save_training",
"type": "checkbox",
"description": "Save the training state by every save_every_iters."
"description": "Save the training state (models, optimizers, trainers, ...) by every save_every_iters."
},
"save_evaluation": {
"name": "save_evaluation",
Expand All @@ -69,18 +69,24 @@
"name": "filename_prefix",
"type": "text",
"value": "checkpointing",
"description": "What prefix would you like to put in front of saved checkpoint file?"
"description": "What prefix would you like to put in front of saved checkpoint file? (mandatory for saving training states)"
},
"save_every_iters": {
"name": "save_every_iters",
"type": "number",
"value": "checkpointing",
"description": "Iteration interval for saving training states (mandatory for saving training states)"
},
"n_saved": {
"name": "n_saved",
"type": "number",
"value": "checkpointing",
"description": "How many checkpoint file would you like to keep on disk?"
"description": "How many checkpoint file would you like to keep on disk? (mandatory for saving both training and evaluation)"
},
"limit_sec": {
"name": "limit_sec",
"type": "number",
"description": "How long do you want to run for the training and then terminate?"
"description": "How long do you want to run for the training and then terminate? (in seconds)"
}
},
"loggers": {
Expand All @@ -89,6 +95,11 @@
"type": "text",
"description": "Directory to save all outputs"
},
"log_every_iters": {
"name": "log_every_iters",
"type": "number",
"description": "Logging interval for training statistics"
},
"logger": {
"name": "logger",
"type": "array",
Expand Down
12 changes: 7 additions & 5 deletions src/store.js
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,17 @@ export function saveConfig(key, value) {
}

// render the code if there are fetched files for current selected template
export async function genCode() {
export function genCode() {
const currentFiles = files[store.config.template]
if (currentFiles && Object.keys(currentFiles).length) {
for (const file in currentFiles) {
store.code[file] = ejs.render(currentFiles[file], store.config)
store.code[file] = ejs
.render(currentFiles[file], store.config)
.replaceAll(/(\n\n\n\n)+/gi, '\n')
}
if (isDev) {
store.code[__DEV_CONFIG_FILE__] = JSON.stringify(store.config, null, 2)
}
}
if (isDev) {
store.code[__DEV_CONFIG_FILE__] = JSON.stringify(store.config, null, 2)
}
}

Expand Down
125 changes: 125 additions & 0 deletions src/templates/template-common/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#::: if (it.dist === 'launch') { :::#
#::: if (it.nproc_per_node) { :::#
#::: if (it.nnodes && it.master_addr && it.master_port) { :::#

### Multi Node, Multi GPU Training (`torch.distributed.launch`) (recommended)

- Execute on master node

```sh
python -m torch.distributed.launch \
  --nproc_per_node #:::= it.nproc_per_node :::# \
--nnodes #:::= it.nnodes :::# \
--node_rank 0 \
--master_addr #:::= it.master_addr :::# \
--master_port #:::= it.master_port :::# \
--use_env main.py backend=nccl \
hydra.run.dir=. \
hydra.output_subdir=null \
hydra/job_logging=disabled \
hydra/hydra_logging=disabled
```

- Execute on worker nodes

```sh
python -m torch.distributed.launch \
  --nproc_per_node #:::= it.nproc_per_node :::# \
--nnodes #:::= it.nnodes :::# \
--node_rank <node_rank> \
--master_addr #:::= it.master_addr :::# \
--master_port #:::= it.master_port :::# \
--use_env main.py backend=nccl \
hydra.run.dir=. \
hydra.output_subdir=null \
hydra/job_logging=disabled \
hydra/hydra_logging=disabled
```

#::: } else { :::#

### Multi GPU Training (`torch.distributed.launch`) (recommended)

```sh
python -m torch.distributed.launch \
--nproc_per_node #:::= it.nproc_per_node :::# \
--use_env main.py backend=nccl \
hydra.run.dir=. \
hydra.output_subdir=null \
hydra/job_logging=disabled \
hydra/hydra_logging=disabled
```

#::: } :::#
#::: } :::#
#::: } :::#

#::: if (it.dist === 'spawn') { :::#
#::: if (it.nproc_per_node) { :::#
#::: if (it.nnodes && it.master_addr && it.master_port) { :::#

### Multi Node, Multi GPU Training (`torch.multiprocessing.spawn`)

- Execute on master node

```sh
python main.py \
  nproc_per_node=#:::= it.nproc_per_node :::# \
nnodes=#:::= it.nnodes :::# \
node_rank=0 \
master_addr=#:::= it.master_addr :::# \
master_port=#:::= it.master_port :::# \
backend=nccl \
hydra.run.dir=. \
hydra.output_subdir=null \
hydra/job_logging=disabled \
hydra/hydra_logging=disabled
```

- Execute on worker nodes

```sh
python main.py \
  nproc_per_node=#:::= it.nproc_per_node :::# \
nnodes=#:::= it.nnodes :::# \
node_rank=<node_rank> \
master_addr=#:::= it.master_addr :::# \
master_port=#:::= it.master_port :::# \
backend=nccl \
hydra.run.dir=. \
hydra.output_subdir=null \
hydra/job_logging=disabled \
hydra/hydra_logging=disabled
```

#::: } else { :::#

### Multi GPU Training (`torch.multiprocessing.spawn`)

```sh
python main.py \
nproc_per_node=#:::= it.nproc_per_node :::# \
backend=nccl \
hydra.run.dir=. \
hydra.output_subdir=null \
hydra/job_logging=disabled \
hydra/hydra_logging=disabled
```

#::: } :::#
#::: } :::#
#::: } :::#

#::: if (!it.nproc_per_node) { :::#

### 1 GPU Training

```sh
python main.py \
hydra.run.dir=. \
hydra.output_subdir=null \
hydra/job_logging=disabled \
hydra/hydra_logging=disabled
```

#::: } :::#

0 comments on commit 41e2d14

Please sign in to comment.