Skip to content

Commit

Permalink
Merge pull request #755 from sequana/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
cokelaer committed Jul 1, 2022
2 parents a37349a + 511bab8 commit 7c178f9
Show file tree
Hide file tree
Showing 173 changed files with 3,180 additions and 1,150 deletions.
25 changes: 15 additions & 10 deletions .github/workflows/main.yml
Expand Up @@ -7,6 +7,8 @@ on:
- dev
pull_request:
branches-ignore: []
schedule:
- cron: '0 0 * * SUN'

jobs:
build-linux:
Expand All @@ -19,28 +21,31 @@ jobs:


steps:
- uses: actions/checkout@v2
- name: checkout git repo
uses: actions/checkout@v2

- name: Set up Python 3.X
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python }}
- name: conda

- name: Add conda to system path
run: |
# $CONDA is an environment variable pointing to the root of the miniconda directory
echo $CONDA/bin >> $GITHUB_PATH
- name: Install dependencies conda
- name: conda
run: |
conda install -c conda-forge --quiet mamba python=${{ matrix.python }}
mamba install -c conda-forge -c bioconda --quiet -y kraken2 shustring samtools "snpeff<5.1" bwa cd-hit
conda list | grep ruamel
conda update ruamel_yaml
conda install -c conda-forge --quiet 'mamba<0.24' python=${{ matrix.python }}
mamba install -c conda-forge -c bioconda --quiet -y kraken2 shustring 'samtools<1.15.1' "snpeff" bwa cd-hit
- name: Install sequana with pip
run: |
pip install .[testing]
pip install multiqc==1.11.0
- name: Test with pytest
- name: testing
run: |
pytest --cov-report term --cov=sequana
pytest --cov-report term-missing --cov=sequana
- name: coveralls
run: |
pip install coveralls
Expand Down
11 changes: 7 additions & 4 deletions README.rst
Expand Up @@ -106,7 +106,7 @@ up-to-date status and documentation.
Contributors
============

Maintaining BioServices would not have been possible without users and contributors.
Maintaining Sequana would not have been possible without users and contributors.
Each contribution has been an encouragement to pursue this project. Thanks to all:

.. image:: https://contrib.rocks/image?repo=sequana/sequana
Expand All @@ -117,9 +117,12 @@ Each contribution has been an encouragement to pursue this project. Thanks to al
Changelog
~~~~~~~~~

========= ========================================================================
========= ==========================================================================
Version Description
========= ========================================================================
========= ==========================================================================
0.14.1 * Kegg enrichment: add gene list 'all' and fix incomplete annotation case
* New uniprot module for GO term enrichment and enrichment
refactorisation (transparent for users)
0.14.0 * pinned click>=8.1.0 due to API change (autocomplete)
* moved tests around to decrease packaging from 16 to 4Mb
* ribodesigner: new plots, clustering and notebook
Expand All @@ -131,5 +134,5 @@ Version Description
0.12.6 * remove some rules now in https://github.com/sequana/sequana-wrappers
0.12.5 * refactorisation of VCF tools/modules to use vcfpy instead of pyVCF
0.12.4 * complete change log before 0.12.4 on readthedocs.org
========= ========================================================================
========= ==========================================================================

2 changes: 1 addition & 1 deletion doc/references.rst
Expand Up @@ -6,7 +6,7 @@ References

.. contents::

Assembly related
Assembly and contigs related
-----------------------------

.. automodule:: sequana.assembly
Expand Down
5 changes: 3 additions & 2 deletions requirements.txt
@@ -1,5 +1,5 @@
adjusttext
bioservices>=1.7.8
bioservices>=1.10.0
brokenaxes
bx-python
click>=8.1.0
Expand All @@ -15,7 +15,7 @@ lxml
matplotlib
matplotlib-venn
mock
multiqc
multiqc<=1.11
packaging
pandas>=0.22
plotly
Expand All @@ -29,5 +29,6 @@ scikit-learn
scipy
seaborn
statsmodels
tqdm
xlrd
UpSetPlot
2 changes: 2 additions & 0 deletions sequana/__init__.py
Expand Up @@ -34,6 +34,8 @@
from .coverage import Coverage
from .fastq import FastQ, FastQC, Identifier
from .fasta import FastA
# contig import is after fasta due to cycling imports
from .contigs import Contigs
from .gff3 import GFF3
from .freebayes_vcf_filter import VCF_freebayes
from .freebayes_bcf_filter import BCF_freebayes
Expand Down
85 changes: 79 additions & 6 deletions sequana/assembly.py
Expand Up @@ -10,16 +10,20 @@
# documentation: http://sequana.readthedocs.io
#
##############################################################################
import textwrap

from sequana.lazy import pylab
from sequana.lazy import pandas as pd

import colorlog

logger = colorlog.getLogger(__name__)


__all__ = ["BUSCO"]


class BUSCO(object):
class BUSCO:
"""Wrapper of the BUSCO output
"BUSCO provides a quantitative measures for the assessment
Expand All @@ -33,19 +37,32 @@ class BUSCO(object):
percentage in the range 0-100.
:reference: http://busco.ezlab.org/
.. note:: support version 3.0.1 and new formats from v5.X
"""

def __init__(self, filename="full_table_testbusco.tsv"):
def __init__(self, filename="full_table_test.tsv"):
""".. rubric:: constructor
:filename: a valid BUSCO input file (full table). See example in sequana
code source (testing)
"""
self.df = pd.read_csv(filename, sep="\t", skiprows=4)
# version 3.0.1
try:
self.df = pd.read_csv(filename, sep="\t", skiprows=4)

if "Status" not in self.df.columns: #pragma: no cover
# version 5.2.2
self.df = pd.read_csv(filename, sep="\t", skiprows=2)
except pd.io.parsers.python_parser.ParserError: #pragma: no cover
# it may happen that the parsing is incorrect with some files
self.df = pd.read_csv(filename, sep="\t", skiprows=2)

self.df.rename({"# Busco id": "ID"}, inplace=True, axis=1)

def pie_plot(self, filename=None, hold=False):
"""Plot PIE plot of the status (complete / fragment / missed)
"""Pie plot of the status (completed / fragment / missed)
.. plot::
:include-source:
Expand All @@ -57,7 +74,7 @@ def pie_plot(self, filename=None, hold=False):
"""
if hold is False:
pylab.clf()
self.df.groupby("Status").count()["# Busco id"].plot(kind="pie")
self.df.groupby("Status").count()["ID"].plot(kind="pie")
pylab.ylabel("")
# pylab.title("Distribution Complete/Fragmented/Missing")
# pylab.legend()
Expand Down Expand Up @@ -104,7 +121,7 @@ def summary(self):
orthologs
"""
df = self.df.drop_duplicates(subset=["# Busco id"])
df = self.df.drop_duplicates(subset=["ID"])
data = {}
data["S"] = sum(df.Status == "Complete")
data["F"] = sum(df.Status == "Fragmented")
Expand Down Expand Up @@ -155,3 +172,59 @@ def __str__(self):
{} Total BUSCO groups searched
"""
return string.format(self.get_summary_string(), C, S, D, F, M, N)


def save_core_genomes(self, contig_file, output_fasta_file='core.fasta'):
"""Save the core genome based on busco and assembly output
The busco file must have been generated from an input contig file.
In the example below, the busco file was obtained from the **data.contigs.fasta**
file::
from sequana import BUSCO
b = BUSCO("busco_full_table.tsv")
b.save_core_genomes("data.contigs.fasta", "core.fasta")
If a gene from the core genome is missing, the extracted gene is made of 100 N's
If a gene is duplicated, only the best entry (based on the score) is kept.
If there are 130 genes in the core genomes, the output will
be a multi-sequence FASTA file made of 130 sequences.
"""
# local import due to cyclic import
from sequana import FastA
f = FastA(contig_file)

# if we have duplicated hits, we group them and take the best score
# we then drop the ID to keep the index.
# Note the fillna set to 0 to include 'Missing' entries
indices = self.df.fillna(0).groupby('ID')['Score'].nlargest(1).reset_index(level=0, drop=True).index
subdf = self.df.loc[indices]

# we sort the entries by gene (core genome) name
# useful if we want to merge the sequences for a multiple alignment

with open(output_fasta_file, 'w') as fout:
for record in subdf.to_dict('records'):
type_ = record['Status']
if type_ == 'Missing':
data = 'N' * 100
fout.write(f">{ID}\t{type_}:{seqname}:{start}:{end}:{end-start}\n{data}\n")
else:
# get gene/contig information
start = int(record['Gene Start'])
ID = record['ID']
end = int(record['Gene End'])
seqname = record['Sequence']

# save the core gene sequence
ctg_index = f.names.index(seqname)
data = f.sequences[ctg_index][start:end]
data = "\n".join(textwrap.wrap(data,80))
fout.write(f">{ID}\t{type_}:{seqname}:{start}:{end}:{end-start}\n{data}\n")





1 change: 0 additions & 1 deletion sequana/bamtools.py
Expand Up @@ -27,7 +27,6 @@
developers to convert their SAM to BAM.
"""
import os
import json
import math

Expand Down
22 changes: 12 additions & 10 deletions sequana/compare.py
Expand Up @@ -243,13 +243,13 @@ def plot_common_major_counts(

if mode in ["down"]:
# Negative values !
gl1 = set(self.r1.gene_lists["down"])
gl2 = set(self.r2.gene_lists["down"])
gl1 = list(set(self.r1.gene_lists["down"]))
gl2 = list(set(self.r2.gene_lists["down"]))
A = self.r1.df.loc[gl1].sort_values(by=sortby)
B = self.r2.df.loc[gl1].sort_values(by=sortby)
else:
gl1 = set(self.r1.gene_lists[mode])
gl2 = set(self.r2.gene_lists[mode])
gl1 = list(set(self.r1.gene_lists[mode]))
gl2 = list(set(self.r2.gene_lists[mode]))
A = self.r1.df.loc[gl1].sort_values(by=sortby, ascending=False)
B = self.r2.df.loc[gl1].sort_values(by=sortby, ascending=False)
# sometimes, up and down may be inverted as compared to the other
Expand Down Expand Up @@ -309,9 +309,10 @@ def plot_foldchange(self):

A = self.r1.df.loc[self.r1.gene_lists[mode]]
B = self.r2.df.loc[self.r2.gene_lists[mode]]
AB = set(A.index).intersection(set(B.index))
Ao = A.loc[set(A.index).difference(set(B.index))]
Bo = B.loc[set(B.index).difference(set(A.index))]
# cast set to list to avoid future error in pandas (june 2022)
AB = list(set(A.index).intersection(set(B.index)))
Ao = A.loc[list(set(A.index).difference(set(B.index)))]
Bo = B.loc[list(set(B.index).difference(set(A.index)))]
Ac = A.loc[AB]
Bc = B.loc[AB]

Expand All @@ -329,9 +330,10 @@ def plot_volcano_differences(self, mode="all"):
labels = [cond1, cond2]
A = self.r1.df.loc[self.r1.gene_lists[mode]]
B = self.r2.df.loc[self.r2.gene_lists[mode]]
AB = set(A.index).intersection(set(B.index))
Aonly = A.loc[set(A.index).difference(set(B.index))]
Bonly = B.loc[set(B.index).difference(set(A.index))]
# cast set to list to avoid future error in pandas (june 2022)
AB = list(set(A.index).intersection(set(B.index)))
Aonly = A.loc[list(set(A.index).difference(set(B.index)))]
Bonly = B.loc[list(set(B.index).difference(set(A.index)))]
Acommon = A.loc[AB]
Bcommon = B.loc[AB]

Expand Down

0 comments on commit 7c178f9

Please sign in to comment.