Merge pull request #755 from sequana/dev

Dev
sequana · Jul 1, 2022 · 7c178f9 · 7c178f9
2 parents a37349a + 511bab8
commit 7c178f9
Show file tree

Hide file tree

Showing 173 changed files with 3,180 additions and 1,150 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -7,6 +7,8 @@ on:
       - dev
   pull_request:
     branches-ignore: []
+  schedule:
+    - cron: '0 0 * * SUN'
 
 jobs:
   build-linux:
@@ -19,28 +21,31 @@ jobs:
 
 
     steps:
-    - uses: actions/checkout@v2
+    - name: checkout git repo
+      uses: actions/checkout@v2
+
     - name: Set up Python 3.X
       uses: actions/setup-python@v2
       with:
         python-version: ${{ matrix.python }}
-    - name: conda
+
+    - name: Add conda to system path
       run: |
-        # $CONDA is an environment variable pointing to the root of the miniconda directory
         echo $CONDA/bin >> $GITHUB_PATH
-    - name: Install dependencies conda
+
+    - name: conda
       run: |
-        conda install -c conda-forge --quiet mamba python=${{ matrix.python }}
-        mamba install -c conda-forge -c bioconda --quiet -y kraken2 shustring samtools "snpeff<5.1" bwa cd-hit 
-        conda list | grep ruamel
-        conda update ruamel_yaml
+        conda install -c conda-forge --quiet 'mamba<0.24' python=${{ matrix.python }}
+        mamba install -c conda-forge -c bioconda --quiet -y kraken2 shustring 'samtools<1.15.1' "snpeff" bwa cd-hit
+
     - name: Install sequana with pip
       run: |
         pip install .[testing]
         pip install multiqc==1.11.0
-    - name: Test with pytest
+    - name: testing
       run: |
-        pytest --cov-report term --cov=sequana
+        pytest --cov-report term-missing --cov=sequana
+
     - name: coveralls
       run: |
         pip install coveralls

diff --git a/README.rst b/README.rst
@@ -106,7 +106,7 @@ up-to-date status and documentation.
 Contributors
 ============
 
-Maintaining BioServices would not have been possible without users and contributors. 
+Maintaining Sequana would not have been possible without users and contributors.
 Each contribution has been an encouragement to pursue this project. Thanks to all:
 
 .. image:: https://contrib.rocks/image?repo=sequana/sequana
@@ -117,9 +117,12 @@ Each contribution has been an encouragement to pursue this project. Thanks to al
 Changelog
 ~~~~~~~~~
 
-========= ========================================================================
+========= ==========================================================================
 Version   Description
-========= ========================================================================
+========= ==========================================================================
+0.14.1    * Kegg enrichment: add gene list 'all' and fix incomplete annotation case
+          * New uniprot module for GO term enrichment and enrichment 
+            refactorisation (transparent for users)
 0.14.0    * pinned click>=8.1.0 due to API change (autocomplete)
           * moved tests around to decrease packaging from 16 to 4Mb
           * ribodesigner: new plots, clustering and notebook         
@@ -131,5 +134,5 @@ Version   Description
 0.12.6    * remove some rules now in https://github.com/sequana/sequana-wrappers
 0.12.5    * refactorisation of VCF tools/modules to use vcfpy instead of pyVCF
 0.12.4    * complete change log before 0.12.4 on readthedocs.org
-========= ========================================================================
+========= ==========================================================================
 
diff --git a/doc/references.rst b/doc/references.rst
@@ -6,7 +6,7 @@ References
 
 .. contents::
 
-Assembly related
+Assembly and contigs related
 -----------------------------
 
 .. automodule:: sequana.assembly

diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
 adjusttext
-bioservices>=1.7.8
+bioservices>=1.10.0
 brokenaxes
 bx-python
 click>=8.1.0
@@ -15,7 +15,7 @@ lxml
 matplotlib
 matplotlib-venn
 mock
-multiqc
+multiqc<=1.11
 packaging
 pandas>=0.22
 plotly
@@ -29,5 +29,6 @@ scikit-learn
 scipy
 seaborn
 statsmodels
+tqdm
 xlrd
 UpSetPlot
diff --git a/sequana/__init__.py b/sequana/__init__.py
@@ -34,6 +34,8 @@
 from .coverage import Coverage
 from .fastq import FastQ, FastQC, Identifier
 from .fasta import FastA
+# contig import is after fasta due to cycling imports
+from .contigs import Contigs
 from .gff3 import GFF3
 from .freebayes_vcf_filter import VCF_freebayes
 from .freebayes_bcf_filter import BCF_freebayes

diff --git a/sequana/assembly.py b/sequana/assembly.py
@@ -10,16 +10,20 @@
 #  documentation: http://sequana.readthedocs.io
 #
 ##############################################################################
+import textwrap
+
 from sequana.lazy import pylab
 from sequana.lazy import pandas as pd
+
 import colorlog
 
 logger = colorlog.getLogger(__name__)
 
+
 __all__ = ["BUSCO"]
 
 
-class BUSCO(object):
+class BUSCO:
     """Wrapper of the BUSCO output
 
     "BUSCO provides a quantitative measures for the assessment
@@ -33,19 +37,32 @@ class BUSCO(object):
     percentage in the range 0-100.
 
     :reference: http://busco.ezlab.org/
+
+    .. note:: support version 3.0.1 and new formats from v5.X
     """
 
-    def __init__(self, filename="full_table_testbusco.tsv"):
+    def __init__(self, filename="full_table_test.tsv"):
         """.. rubric:: constructor
 
         :filename: a valid BUSCO input file (full table). See example in sequana
             code source (testing)
 
         """
-        self.df = pd.read_csv(filename, sep="\t", skiprows=4)
+        # version 3.0.1
+        try:
+            self.df = pd.read_csv(filename, sep="\t", skiprows=4)
+
+            if "Status" not in self.df.columns: #pragma: no cover
+                # version 5.2.2
+                self.df = pd.read_csv(filename, sep="\t", skiprows=2)
+        except pd.io.parsers.python_parser.ParserError: #pragma: no cover
+            # it may happen that the parsing is incorrect with some files
+            self.df = pd.read_csv(filename, sep="\t", skiprows=2)
+
+        self.df.rename({"# Busco id": "ID"}, inplace=True, axis=1)
 
     def pie_plot(self, filename=None, hold=False):
-        """Plot PIE plot of the status (complete / fragment / missed)
+        """Pie plot of the status (completed / fragment / missed)
 
         .. plot::
             :include-source:
@@ -57,7 +74,7 @@ def pie_plot(self, filename=None, hold=False):
         """
         if hold is False:
             pylab.clf()
-        self.df.groupby("Status").count()["# Busco id"].plot(kind="pie")
+        self.df.groupby("Status").count()["ID"].plot(kind="pie")
         pylab.ylabel("")
         # pylab.title("Distribution Complete/Fragmented/Missing")
         # pylab.legend()
@@ -104,7 +121,7 @@ def summary(self):
         orthologs
 
         """
-        df = self.df.drop_duplicates(subset=["# Busco id"])
+        df = self.df.drop_duplicates(subset=["ID"])
         data = {}
         data["S"] = sum(df.Status == "Complete")
         data["F"] = sum(df.Status == "Fragmented")
@@ -155,3 +172,59 @@ def __str__(self):
     {} Total BUSCO groups searched
     """
         return string.format(self.get_summary_string(), C, S, D, F, M, N)
+
+
+    def save_core_genomes(self, contig_file, output_fasta_file='core.fasta'):
+        """Save the core genome based on busco and assembly output
+
+        The busco file must have been generated from an input contig file.
+        In the example below, the busco file was obtained from the **data.contigs.fasta**
+        file::
+
+            from sequana import BUSCO
+            b = BUSCO("busco_full_table.tsv")
+            b.save_core_genomes("data.contigs.fasta", "core.fasta")
+
+        If a gene from the core genome is missing, the extracted gene is made of 100 N's
+        If a gene is duplicated, only the best entry (based on the score) is kept.
+
+        If there are 130 genes in the core genomes, the output will 
+        be a multi-sequence FASTA file made of 130 sequences.
+
+        """
+        # local import due to cyclic import
+        from sequana import FastA
+        f = FastA(contig_file)
+
+        # if we have duplicated hits, we group them and take the best score
+        # we then drop the ID to keep the index.
+        # Note the fillna set to 0 to include 'Missing' entries
+        indices = self.df.fillna(0).groupby('ID')['Score'].nlargest(1).reset_index(level=0, drop=True).index
+        subdf = self.df.loc[indices]
+
+        # we sort the entries by gene (core genome) name
+        # useful if we want to merge the sequences for a multiple alignment
+
+        with open(output_fasta_file, 'w') as fout:
+            for record in subdf.to_dict('records'):
+                type_ = record['Status']
+                if type_ == 'Missing':
+                    data = 'N' * 100
+                    fout.write(f">{ID}\t{type_}:{seqname}:{start}:{end}:{end-start}\n{data}\n")
+                else:
+                    # get gene/contig information
+                    start = int(record['Gene Start'])
+                    ID = record['ID']    
+                    end = int(record['Gene End'])
+                    seqname = record['Sequence']
+
+                    # save the core gene sequence
+                    ctg_index = f.names.index(seqname)
+                    data = f.sequences[ctg_index][start:end]
+                    data = "\n".join(textwrap.wrap(data,80))
+                    fout.write(f">{ID}\t{type_}:{seqname}:{start}:{end}:{end-start}\n{data}\n")
+
+
+
+
+
diff --git a/sequana/bamtools.py b/sequana/bamtools.py
@@ -27,7 +27,6 @@
     developers to convert their SAM to BAM.
 
 """
-import os
 import json
 import math
 

diff --git a/sequana/compare.py b/sequana/compare.py
@@ -243,13 +243,13 @@ def plot_common_major_counts(
 
         if mode in ["down"]:
             # Negative values !
-            gl1 = set(self.r1.gene_lists["down"])
-            gl2 = set(self.r2.gene_lists["down"])
+            gl1 = list(set(self.r1.gene_lists["down"]))
+            gl2 = list(set(self.r2.gene_lists["down"]))
             A = self.r1.df.loc[gl1].sort_values(by=sortby)
             B = self.r2.df.loc[gl1].sort_values(by=sortby)
         else:
-            gl1 = set(self.r1.gene_lists[mode])
-            gl2 = set(self.r2.gene_lists[mode])
+            gl1 = list(set(self.r1.gene_lists[mode]))
+            gl2 = list(set(self.r2.gene_lists[mode]))
             A = self.r1.df.loc[gl1].sort_values(by=sortby, ascending=False)
             B = self.r2.df.loc[gl1].sort_values(by=sortby, ascending=False)
         # sometimes, up and down may be inverted as compared to the other
@@ -309,9 +309,10 @@ def plot_foldchange(self):
 
         A = self.r1.df.loc[self.r1.gene_lists[mode]]
         B = self.r2.df.loc[self.r2.gene_lists[mode]]
-        AB = set(A.index).intersection(set(B.index))
-        Ao = A.loc[set(A.index).difference(set(B.index))]
-        Bo = B.loc[set(B.index).difference(set(A.index))]
+        # cast set to list to avoid future error in pandas (june 2022)
+        AB = list(set(A.index).intersection(set(B.index)))
+        Ao = A.loc[list(set(A.index).difference(set(B.index)))]
+        Bo = B.loc[list(set(B.index).difference(set(A.index)))]
         Ac = A.loc[AB]
         Bc = B.loc[AB]
 
@@ -329,9 +330,10 @@ def plot_volcano_differences(self, mode="all"):
         labels = [cond1, cond2]
         A = self.r1.df.loc[self.r1.gene_lists[mode]]
         B = self.r2.df.loc[self.r2.gene_lists[mode]]
-        AB = set(A.index).intersection(set(B.index))
-        Aonly = A.loc[set(A.index).difference(set(B.index))]
-        Bonly = B.loc[set(B.index).difference(set(A.index))]
+        # cast set to list to avoid future error in pandas (june 2022)
+        AB = list(set(A.index).intersection(set(B.index)))
+        Aonly = A.loc[list(set(A.index).difference(set(B.index)))]
+        Bonly = B.loc[list(set(B.index).difference(set(A.index)))]
         Acommon = A.loc[AB]
         Bcommon = B.loc[AB]