fix CHANGELOG.md CONFLICT
antgonza committed Feb 27, 2024
2 parents 57d1b14 + 62292ca commit 13eafdf
Showing 39 changed files with 832 additions and 267 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/qiita-ci.yml
@@ -154,6 +154,8 @@ jobs:
echo "5. Setting up qiita"
conda activate qiita
# adapt environment_script for private qiita plugins from travis to github actions.
sed 's#export PATH="/home/travis/miniconda3/bin:$PATH"; source #source /home/runner/.profile; conda #' -i qiita_db/support_files/patches/54.sql
qiita-env make --no-load-ontologies
qiita-test-install
qiita plugins update
14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,18 @@
# Qiita changelog

Version 2024.02
---------------

Deployed on February 27th, 2024

* Default workflows now accept commands with multiple inputs.
* The loading time of the main study page was improved [#3350](https://github.com/qiita-spots/qiita/pull/3350).
* SPP improvements - mainly @charles-cowart, thank you! Errors are now shown to the user in the GUI [#127](https://github.com/biocore/mg-scripts/pull/127), admins can restart jobs [#129](https://github.com/biocore/mg-scripts/pull/129), adapter-trimmed files are now stored and their sequence counts are part of the prep-info [#126](https://github.com/biocore/mg-scripts/pull/126), and per-instrument/data-type configuration is now supported [#123](https://github.com/biocore/mg-scripts/pull/123).
* The internal Sequence Processing Pipeline is now using the human transcripts v44 from https://www.gencodegenes.org for Metatranscriptomic data - in addition to the human pan-genome reference, the GRCh38 genome + PhiX, and the T2T-CHM13v2.0 genome - for human host filtering.
* Added a command to qp-woltka: 'Calculate RNA Copy Counts'.
* Other fixes - mainly by @sjanssen2, thank you!: [#3345](https://github.com/qiita-spots/qiita/pull/3345), [#3224](https://github.com/qiita-spots/qiita/pull/3224), [#3357](https://github.com/qiita-spots/qiita/pull/3357), [#3358](https://github.com/qiita-spots/qiita/pull/3358), [#3359](https://github.com/qiita-spots/qiita/pull/3359), [#3362](https://github.com/qiita-spots/qiita/pull/3362), [#3364](https://github.com/qiita-spots/qiita/pull/3364).


Version 2023.12
---------------

@@ -14,6 +27,7 @@ Deployed on January 8th, 2024
* Updated the Adapter and host filtering plugin (qp-fastp-minimap2) to v2023.12 addressing a bug in adapter filtering; [more information](https://qiita.ucsd.edu/static/doc/html/processingdata/qp-fastp-minimap2.html).
* Other fixes: [3334](https://github.com/qiita-spots/qiita/pull/3334), [3338](https://github.com/qiita-spots/qiita/pull/3338). Thank you @sjanssen2.
* The internal Sequence Processing Pipeline is now using the human pan-genome reference, together with the GRCh38 genome + PhiX and T2T-CHM13v2.0 genome for human host filtering.
* Added two new commands to qp-woltka: 'SynDNA Woltka' & 'Calculate Cell Counts'.


Version 2023.10
20 changes: 10 additions & 10 deletions INSTALL.md
@@ -162,9 +162,9 @@ Navigate to the cloned directory and ensure your conda environment is active:
```bash
cd qiita
source activate qiita
```
If you are using Ubuntu or a Windows Subsystem for Linux (WSL), you will need to ensure that you have a C++ compiler and that development libraries and include files for PostgreSQL are available. Type `cc` in your terminal to ensure that it doesn't result in `program not found`. If you use the GNU Compiler Collection, make sure to have `gcc` and `g++` available. The following commands will install a C++ compiler and `libpq-dev`:
```bash
sudo apt install gcc g++ # alternatively, you can install clang instead
sudo apt-get install libpq-dev
```
Install Qiita (this occurs through setuptools' `setup.py` file in the qiita directory):
@@ -178,7 +178,7 @@ At this point, Qiita will be installed and the system will start. However,
you will need to install plugins in order to process any kind of data. For a list
of available plugins, visit the [Qiita Spots](https://github.com/qiita-spots)
github organization. Each of the plugins has its own installation instructions; we
suggest looking at each individual .github/workflows/qiita-plugin-ci.yml file to see detailed installation
instructions. Note that the most common plugins are:
- [qtp-biom](https://github.com/qiita-spots/qtp-biom)
- [qtp-sequencing](https://github.com/qiita-spots/qtp-sequencing)
@@ -224,15 +224,15 @@ export REDBIOM_HOST=http://my_host.com:7379

## Configure NGINX and supervisor

[NGINX](https://www.nginx.com/) is not a requirement for Qiita development but it's highly recommended for deploys as this will allow us
to have multiple workers. Note that we are already installing [NGINX](https://www.nginx.com/) within the Qiita conda environment; also,
that Qiita comes with an example [NGINX](https://www.nginx.com/) config file: `qiita_pet/nginx_example.conf`, which is used in the Travis builds.

Now, [supervisor](https://github.com/Supervisor/supervisor) will allow us to start all the workers we want based on its configuration file; we
need the [NGINX](https://www.nginx.com/) and [supervisor](https://github.com/Supervisor/supervisor) config files to match. For our Travis
testing we are creating 3 workers: 21174 for the master and 21175-6 as regular workers.

If you are using [NGINX](https://www.nginx.com/) via conda, you are going to need to create the NGINX folder within the environment; thus run:

```bash
mkdir -p ${CONDA_PREFIX}/var/run/nginx/
```
@@ -256,7 +256,7 @@ Start the qiita server:
```bash
qiita pet webserver start
```

If all the above commands executed correctly, you should be able to access Qiita by pointing your browser to https://localhost:21174 if you are not using NGINX, or https://localhost:8383 if you are using NGINX. To log in, use `test@foo.bar` and `password` as the credentials. (Log in as `admin@foo.bar` with `password` to see the admin functionality. In the future, we will have a *single user mode* that will allow you to use a local Qiita server without logging in. You can track progress on this on issue [#920](https://github.com/biocore/qiita/issues/920).)
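As an optional smoke test (hypothetical, not part of these docs), you can check that the webserver answers before opening a browser; this assumes the non-NGINX port above, the `requests` package, and the development server's self-signed certificate:

```python
import requests

# verify=False because the development server uses a self-signed certificate
r = requests.get('https://localhost:21174', verify=False)
print(r.status_code)
```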



239 changes: 239 additions & 0 deletions notebooks/resource-allocation/generate-allocation-summary-arrays.py
@@ -0,0 +1,239 @@
from qiita_core.util import MaxRSS_helper
from qiita_db.software import Software
import datetime
from io import StringIO
from subprocess import check_output
import pandas as pd
from os.path import join

# This is an example script to collect the data we need from SLURM; the plan
# is that in the near future we will clean up and add these to Qiita's main
# code and then have cronjobs to run them.

# at time of writing we have:
# qp-spades spades
# (*) qp-woltka Woltka v0.1.4
# qp-woltka SynDNA Woltka
# qp-woltka Calculate Cell Counts
# (*) qp-meta Sortmerna v2.1b
# (*) qp-fastp-minimap2 Adapter and host filtering v2023.12
# ... and the admin plugin
# (*) qp-klp
# Here we are only going to create summaries for (*)


sacct = ['sacct', '-p',
         '--format=JobName,JobID,ElapsedRaw,MaxRSS,ReqMem', '-j']
# for the non-admin jobs, we will use jobs from the last six months
six_months = datetime.date.today() - datetime.timedelta(weeks=6*4)

print('The current "software - commands" that use job-arrays are:')
for s in Software.iter():
    if 'ENVIRONMENT="' in s.environment_script:
        for c in s.commands:
            print(f"{s.name} - {c.name}")

# 1. Command: woltka

fn = join('/panfs', 'qiita', 'jobs_woltka.tsv.gz')
print(f"Generating the summary for the woltka jobs: {fn}.")

cmds = [c for s in Software.iter(False)
        if 'woltka' in s.name for c in s.commands]
jobs = [j for c in cmds for j in c.processing_jobs if j.status == 'success' and
        j.heartbeat.date() > six_months and j.input_artifacts]

data = []
for j in jobs:
    size = sum([fp['fp_size'] for fp in j.input_artifacts[0].filepaths])
    jid, mjid = j.external_id.strip().split()
    rvals = StringIO(check_output(sacct + [jid]).decode('ascii'))
    _d = pd.read_csv(rvals, sep='|')
    jmem = _d.MaxRSS.apply(lambda x: x if type(x) is not str
                           else MaxRSS_helper(x)).max()
    jwt = _d.ElapsedRaw.max()

    rvals = StringIO(check_output(sacct + [mjid]).decode('ascii'))
    _d = pd.read_csv(rvals, sep='|')
    mmem = _d.MaxRSS.apply(lambda x: x if type(x) is not str
                           else MaxRSS_helper(x)).max()
    mwt = _d.ElapsedRaw.max()

    data.append({
        'jid': j.id, 'sjid': jid, 'mem': jmem, 'wt': jwt, 'type': 'main',
        'db': j.parameters.values['Database'].split('/')[-1]})
    data.append(
        {'jid': j.id, 'sjid': mjid, 'mem': mmem, 'wt': mwt, 'type': 'merge',
         'db': j.parameters.values['Database'].split('/')[-1]})
df = pd.DataFrame(data)
df.to_csv(fn, sep='\t', index=False)

# 2. qp-meta Sortmerna

fn = join('/panfs', 'qiita', 'jobs_sortmerna.tsv.gz')
print(f"Generating the summary for the sortmerna jobs: {fn}.")

# here we will also only use jobs from the last 6 months
cmds = [c for s in Software.iter(False)
        if 'meta' in s.name.lower() for c in s.commands]
jobs = [j for c in cmds if 'sortmerna' in c.name.lower()
        for j in c.processing_jobs if j.status == 'success' and
        j.heartbeat.date() > six_months and j.input_artifacts]

data = []
for j in jobs:
    size = sum([fp['fp_size'] for fp in j.input_artifacts[0].filepaths])
    jid, mjid = j.external_id.strip().split()
    rvals = StringIO(check_output(sacct + [jid]).decode('ascii'))
    _d = pd.read_csv(rvals, sep='|')
    jmem = _d.MaxRSS.apply(lambda x: x if type(x) is not str
                           else MaxRSS_helper(x)).max()
    jwt = _d.ElapsedRaw.max()

    rvals = StringIO(check_output(sacct + [mjid]).decode('ascii'))
    _d = pd.read_csv(rvals, sep='|')
    mmem = _d.MaxRSS.apply(lambda x: x if type(x) is not str
                           else MaxRSS_helper(x)).max()
    mwt = _d.ElapsedRaw.max()

    data.append({
        'jid': j.id, 'sjid': jid, 'mem': jmem, 'wt': jwt, 'type': 'main'})
    data.append(
        {'jid': j.id, 'sjid': mjid, 'mem': mmem, 'wt': mwt, 'type': 'merge'})
df = pd.DataFrame(data)
df.to_csv(fn, sep='\t', index=False)


# 3. Adapter and host filtering. Note that there is a new version deployed on
# Jan 2024 so the current results will not be the most accurate

fn = join('/panfs', 'qiita', 'jobs_adapter_host.tsv.gz')
print(f"Generating the summary for the adapter and host filtering jobs: {fn}.")

# here we will also only use jobs from the last 6 months
cmds = [c for s in Software.iter(False)
        if 'minimap2' in s.name.lower() for c in s.commands]
jobs = [j for c in cmds for j in c.processing_jobs if j.status == 'success' and
        j.heartbeat.date() > six_months and j.input_artifacts]

data = []
for j in jobs:
    size = sum([fp['fp_size'] for fp in j.input_artifacts[0].filepaths])
    jid, mjid = j.external_id.strip().split()
    rvals = StringIO(check_output(sacct + [jid]).decode('ascii'))
    _d = pd.read_csv(rvals, sep='|')
    jmem = _d.MaxRSS.apply(lambda x: x if type(x) is not str
                           else MaxRSS_helper(x)).max()
    jwt = _d.ElapsedRaw.max()

    rvals = StringIO(check_output(sacct + [mjid]).decode('ascii'))
    _d = pd.read_csv(rvals, sep='|')
    mmem = _d.MaxRSS.apply(lambda x: x if type(x) is not str
                           else MaxRSS_helper(x)).max()
    mwt = _d.ElapsedRaw.max()

    data.append({
        'jid': j.id, 'sjid': jid, 'mem': jmem, 'wt': jwt, 'type': 'main'})
    data.append(
        {'jid': j.id, 'sjid': mjid, 'mem': mmem, 'wt': mwt, 'type': 'merge'})
df = pd.DataFrame(data)
df.to_csv(fn, sep='\t', index=False)


# 4. The SPP!

fn = join('/panfs', 'qiita', 'jobs_spp.tsv.gz')
print(f"Generating the summary for the SPP jobs: {fn}.")

# for the SPP we will look at jobs from the last year
year = datetime.date.today() - datetime.timedelta(days=365)
cmds = [c for s in Software.iter(False)
        if s.name == 'qp-klp' for c in s.commands]
jobs = [j for c in cmds for j in c.processing_jobs if j.status == 'success' and
        j.heartbeat.date() > year]

# for the SPP we need to find the jobs that were actually run, this means
# looping through the existing slurm jobs and finding them
max_inter = 2000

data = []
for job in jobs:
    jei = int(job.external_id)
    rvals = StringIO(
        check_output(sacct + [str(jei)]).decode('ascii'))
    _d = pd.read_csv(rvals, sep='|')
    mem = _d.MaxRSS.apply(
        lambda x: x if type(x) is not str else MaxRSS_helper(x)).max()
    wt = _d.ElapsedRaw.max()
    # the current "easy" way to determine if amplicon or other is to check
    # the file extension of the filename
    stype = 'other'
    if job.parameters.values['sample_sheet']['filename'].endswith('.txt'):
        stype = 'amplicon'
    rid = job.parameters.values['run_identifier']
    data.append(
        {'jid': job.id, 'sjid': jei, 'mem': mem, 'stype': stype, 'wt': wt,
         'type': 'main', 'rid': rid, 'name': _d.JobName[0]})

    # let's look for the convert job
    for jid in range(jei + 1, jei + max_inter):
        rvals = StringIO(check_output(sacct + [str(jid)]).decode('ascii'))
        _d = pd.read_csv(rvals, sep='|')
        if [1 for x in _d.JobName.values if x.startswith(job.id)]:
            cjid = int(_d.JobID[0])
            mem = _d.MaxRSS.apply(
                lambda x: x if type(x) is not str else MaxRSS_helper(x)).max()
            wt = _d.ElapsedRaw.max()

            data.append(
                {'jid': job.id, 'sjid': cjid, 'mem': mem, 'stype': stype,
                 'wt': wt, 'type': 'convert', 'rid': rid,
                 'name': _d.JobName[0]})

            # now let's look for the next step, if amplicon that's fastqc but
            # if other that's qc/nuqc
            for jid in range(cjid + 1, cjid + max_inter):
                rvals = StringIO(
                    check_output(sacct + [str(jid)]).decode('ascii'))
                _d = pd.read_csv(rvals, sep='|')
                if [1 for x in _d.JobName.values if x.startswith(job.id)]:
                    qc_jid = _d.JobIDRaw.apply(
                        lambda x: int(x.split('.')[0])).max()
                    qcmem = _d.MaxRSS.apply(
                        lambda x: x if type(x) is not str
                        else MaxRSS_helper(x)).max()
                    qcwt = _d.ElapsedRaw.max()

                    if stype == 'amplicon':
                        data.append(
                            {'jid': job.id, 'sjid': qc_jid, 'mem': qcmem,
                             'stype': stype, 'wt': qcwt, 'type': 'fastqc',
                             'rid': rid, 'name': _d.JobName[0]})
                    else:
                        data.append(
                            {'jid': job.id, 'sjid': qc_jid, 'mem': qcmem,
                             'stype': stype, 'wt': qcwt, 'type': 'qc',
                             'rid': rid, 'name': _d.JobName[0]})
                        for jid in range(qc_jid + 1, qc_jid + max_inter):
                            rvals = StringIO(check_output(
                                sacct + [str(jid)]).decode('ascii'))
                            _d = pd.read_csv(rvals, sep='|')
                            if [1 for x in _d.JobName.values if x.startswith(
                                    job.id)]:
                                fqc_jid = _d.JobIDRaw.apply(
                                    lambda x: int(x.split('.')[0])).max()
                                fqcmem = _d.MaxRSS.apply(
                                    lambda x: x if type(x) is not str
                                    else MaxRSS_helper(x)).max()
                                fqcwt = _d.ElapsedRaw.max()
                                data.append(
                                    {'jid': job.id, 'sjid': fqc_jid,
                                     'mem': fqcmem, 'stype': stype,
                                     'wt': fqcwt, 'type': 'fastqc',
                                     'rid': rid, 'name': _d.JobName[0]})
                                break
                    break
            break

df = pd.DataFrame(data)
df.to_csv(fn, sep='\t', index=False)
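Not part of this commit, but as a hypothetical illustration of how the TSVs written above might be consumed, the worst-case memory and wall time per job type could be pulled out with pandas, assuming the column names used above:

```python
import pandas as pd

# read one of the summaries written above; pandas infers gzip from the filename
df = pd.read_csv('jobs_woltka.tsv.gz', sep='\t')

# worst-case memory (bytes) and wall time (seconds) per job type and database
print(df.groupby(['type', 'db'])[['mem', 'wt']].max())
```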
14 changes: 2 additions & 12 deletions notebooks/resource-allocation/generate-allocation-summary.py
@@ -5,6 +5,7 @@
from json import loads
from os.path import join

from qiita_core.util import MaxRSS_helper
from qiita_db.exceptions import QiitaDBUnknownIDError
from qiita_db.processing_job import ProcessingJob
from qiita_db.software import Software
@@ -117,19 +118,8 @@
print('Make sure that only 0/K/M exist', set(
    df.MaxRSS.apply(lambda x: str(x)[-1])))


def _helper(x):
    if x[-1] == 'K':
        y = float(x[:-1]) * 1000
    elif x[-1] == 'M':
        y = float(x[:-1]) * 1000000
    else:
        y = float(x)
    return y


# Generating new columns
df['MaxRSSRaw'] = df.MaxRSS.apply(lambda x: MaxRSS_helper(str(x)))
df['ElapsedRawTime'] = df.ElapsedRaw.apply(
    lambda x: timedelta(seconds=float(x)))
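Judging by the `_helper` shown above (which this commit removes in favor of the shared `MaxRSS_helper` imported from `qiita_core.util`) and the unit tests added in `qiita_core/tests/test_util.py` below, `MaxRSS_helper` presumably behaves along these lines; this is only an illustrative sketch, the real implementation lives in `qiita_core/util.py`:

```python
def MaxRSS_helper(x):
    # convert sacct MaxRSS strings such as '6.9K'/'6.9M'/'6.9G' to plain numbers;
    # illustrative sketch only, inferred from the removed _helper and the tests
    if x[-1] == 'K':
        y = float(x[:-1]) * 1000
    elif x[-1] == 'M':
        y = float(x[:-1]) * 1000000
    elif x[-1] == 'G':
        y = float(x[:-1]) * 1000000000
    else:
        y = float(x)
    return y
```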

2 changes: 1 addition & 1 deletion qiita_core/__init__.py
@@ -6,4 +6,4 @@
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------

__version__ = "2023.12"
__version__ = "2024.02"
16 changes: 15 additions & 1 deletion qiita_core/tests/test_util.py
@@ -10,7 +10,7 @@

from qiita_core.util import (
    qiita_test_checker, execute_as_transaction, get_qiita_version,
    is_test_environment, get_release_info, MaxRSS_helper)
from qiita_db.meta_util import (
    generate_biom_and_metadata_release, generate_plugin_releases)
import qiita_db as qdb
@@ -82,6 +82,20 @@ def test_get_release_info(self):
        self.assertEqual(biom_metadata_release, ('', '', ''))
        self.assertNotEqual(archive_release, ('', '', ''))

    def test_MaxRSS_helper(self):
        tests = [
            ('6', 6.0),
            ('6K', 6000),
            ('6M', 6000000),
            ('6G', 6000000000),
            ('6.9', 6.9),
            ('6.9K', 6900),
            ('6.9M', 6900000),
            ('6.9G', 6900000000),
        ]
        for x, y in tests:
            self.assertEqual(MaxRSS_helper(x), y)


if __name__ == '__main__':
    main()
