refactor VCF code #849

Closed · wants to merge 9 commits
6 changes: 6 additions & 0 deletions README.rst
@@ -243,6 +243,12 @@ Changelog
========= ==========================================================================
Version Description
========= ==========================================================================
0.17.0 * viz submodules: remove easydev and clean up the scipy imports
* remove the substractor utility (use the sequana_depletion pipeline instead)
* remove the unused get_max_gc_correlation function from bedtools
* Major change in the VCF reader (freebayes): drop freebayes_bcf_filter,
redundant with freebayes_vcf_filter; replace the scipy Fisher test with an
in-house implementation; remove unused VCF code.
0.16.9 * Major fix on PCA and add batch effect plots in RNAdiff analysis
* count matrix and DESeq2 output files' headers fixed with missing index
(no impact on analysis but only for those willing to use the CSV files
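The changelog entry above mentions replacing the scipy Fisher test with an in-house implementation, but that code is not part of this diff. As a point of reference only, a minimal dependency-free two-sided Fisher exact test on a 2x2 contingency table could look like the sketch below (function name and API are illustrative, not the actual sequana code):

    from math import comb

    def fisher_exact_2x2(a, b, c, d):
        """Two-sided Fisher exact test for the 2x2 table [[a, b], [c, d]].

        Pure-Python hypergeometric enumeration, no scipy required.
        """
        r1, r2 = a + b, c + d            # row totals
        c1, n = a + c, a + b + c + d     # first column total, grand total

        def pmf(x):
            # probability of x in the top-left cell with all margins fixed
            return comb(r1, x) * comb(r2, c1 - x) / comb(n, c1)

        p_obs = pmf(a)
        lo, hi = max(0, c1 - r2), min(r1, c1)
        # two-sided p-value: sum over all tables at least as extreme as observed
        return sum(pmf(x) for x in range(lo, hi + 1) if pmf(x) <= p_obs * (1 + 1e-12))

The result can be checked against scipy.stats.fisher_exact for small tables.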
9 changes: 2 additions & 7 deletions pyproject.toml
@@ -4,11 +4,11 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "sequana"
version = "0.16.9"
version = "0.16.10"
description = "A set of standalone application and pipelines dedicated to NGS analysis"
authors = ["Thomas Cokelaer <thomas.cokelaer@pasteur.fr>"]
license = "new BSD"
readme = ["README.rst", "LICENSE"]
readme = "README.rst"
keywords = ["snakemake", "sequana", "NGS"]
classifiers = [
"Development Status :: 5 - Production/Stable",
@@ -136,8 +136,3 @@ sphinx = ">3"
sphinx-rtd-theme = "^2.0.0"
sphinx-gallery = "^0.15.0"
sequana-sphinxext = "^1.0.0"
#docutils = "^0.20.1"




1 change: 0 additions & 1 deletion requirements.txt
@@ -26,5 +26,4 @@ snakemake>=7.16,<8
statsmodels
tqdm
UpSetPlot
vcfpy
xlrd
2 changes: 1 addition & 1 deletion sequana/__init__.py
@@ -55,7 +55,6 @@ def get_package_version(package_name):
from .fasta import FastA
from .fastq import FastQ, FastQC, Identifier
from .featurecounts import FeatureCount
from .freebayes_bcf_filter import BCF_freebayes
from .freebayes_vcf_filter import VCF_freebayes
from .gff3 import GFF3
from .homer import Homer
@@ -80,3 +79,4 @@ def get_package_version(package_name):
from .sequence import DNA, RNA, Repeats, Sequence
from .snpeff import SnpEff
from .trf import TRF
from .variants import VariantFile
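The new sequana.variants.VariantFile import, together with the removal of vcfpy from requirements.txt, suggests the VCF parsing now sits on top of pysam. Assuming that is the case, the underlying building block is plain pysam VCF iteration; how the new sequana class wraps it is not visible in this diff:

    import pysam

    # plain pysam usage; treat this only as the assumed underlying layer,
    # not as the sequana.variants.VariantFile API
    vcf = pysam.VariantFile("variants.vcf")
    for record in vcf:
        # chromosome, 1-based position, reference and alternate alleles
        print(record.chrom, record.pos, record.ref, record.alts, record.qual)
        # INFO fields (e.g. freebayes DP, AO, RO) are exposed as a mapping
        info = dict(record.info)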
36 changes: 0 additions & 36 deletions sequana/bedtools.py
@@ -1586,42 +1586,6 @@ def get_gc_correlation(self):
gc.collect()
return C

def get_max_gc_correlation(self, guess=100):
"""Plot correlation between coverage and GC content by varying the GC window

The GC content uses a moving window of size W. This parameter affects
the correlation bewteen coverage and GC. This function find the
*optimal* window length.

"""
from scipy.optimize import fmin

pylab.clf()
corrs = []
wss = []

def func(params):
ws = int(round(params[0]))
if ws < 10:
return 0

self._compute_gc_content(ws)
corr = self.get_gc_correlation()
corrs.append(corr)
wss.append(ws)
return corr

res = fmin(func, guess, xtol=1, disp=False) # guess is 200
pylab.plot(wss, corrs, "o")
pylab.xlabel("GC window size")
pylab.ylabel("Correlation")
pylab.grid()

# reset to the default value
self._compute_gc_content()

return res[0]

def _get_hist_data(self, bins=30):
data = self.df["cov"].dropna()
m = data.quantile(0.01)
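get_max_gc_correlation is removed without a replacement. Users who still want to explore how the GC window size affects the coverage/GC correlation can script a small scan themselves; the sketch below assumes a chromosome object that still exposes the kept _compute_gc_content() and get_gc_correlation() methods, and uses a plain grid search instead of scipy.optimize.fmin:

    def scan_gc_windows(chrom, windows=range(20, 401, 20)):
        """Coverage/GC correlation for several GC window sizes.

        `chrom` is assumed to expose _compute_gc_content(ws) and
        get_gc_correlation(), as in the sequana coverage classes.
        """
        correlations = {}
        for ws in windows:
            chrom._compute_gc_content(ws)        # recompute GC with window ws
            correlations[ws] = chrom.get_gc_correlation()
        chrom._compute_gc_content()              # restore the default window
        # pick the window with the strongest (absolute) correlation
        best = max(correlations, key=lambda w: abs(correlations[w]))
        return best, correlations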
21 changes: 10 additions & 11 deletions sequana/fasta.py
@@ -16,10 +16,9 @@

import colorlog
import tqdm
from pysam import FastxFile

from sequana.lazy import numpy as np
from sequana.lazy import pylab
from sequana.lazy import pylab, pysam
from sequana.stats import L50, N50

logger = colorlog.getLogger(__name__)
@@ -58,7 +57,7 @@ class FastA:
"""

def __init__(self, filename, verbose=False):
self._fasta = FastxFile(filename)
self._fasta = pysam.FastxFile(filename)
self.filename = filename
self._N = None

@@ -77,38 +76,38 @@ def next(self): # python 2
# This should allow developers to break a loop that takes too long
# through the reads to run forever
self._fasta.close()
self._fasta = FastxFile(self._fasta.filename)
self._fasta = pysam.FastxFile(self._fasta.filename)
except:
self._fasta.close()
self._fasta = FastxFile(self._fasta.filename)
self._fasta = pysam.FastxFile(self._fasta.filename)
raise StopIteration

def __len__(self):
if self._N is None:
logger.debug("Reading input fasta file...please wait")
self._N = sum(1 for x in FastxFile(self.filename))
self._N = sum(1 for x in pysam.FastxFile(self.filename))
return self._N

def _get_names(self):
_fasta = FastxFile(self.filename)
_fasta = pysam.FastxFile(self.filename)
return [this.name for this in _fasta]

names = property(_get_names)

def _get_sequences(self):
_fasta = FastxFile(self.filename)
_fasta = pysam.FastxFile(self.filename)
return [this.sequence for this in _fasta]

sequences = property(_get_sequences)

def _get_comment(self):
_fasta = FastxFile(self.filename)
_fasta = pysam.FastxFile(self.filename)
return [this.comment for this in _fasta]

comments = property(_get_comment)

def _get_lengths(self):
_fasta = FastxFile(self.filename)
_fasta = pysam.FastxFile(self.filename)
return [len(this.sequence) for this in _fasta]

lengths = property(_get_lengths)
@@ -236,7 +235,7 @@ def select_random_reads(self, N=None, output_filename="random.fasta"):
cherries = N
elif isinstance(N, list):
cherries = set(N)
fasta = FastxFile(self.filename)
fasta = pysam.FastxFile(self.filename)

with open(output_filename, "w") as fh:
for i, read in enumerate(tqdm.tqdm(fasta)):
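The fasta.py change swaps the direct `from pysam import FastxFile` for the project's lazy import layer (`from sequana.lazy import pylab, pysam`), so pysam is only imported when first used. A generic proxy of the kind such a layer typically provides could look like this (illustrative sketch, not the actual sequana.lazy implementation):

    import importlib

    class _LazyModule:
        """Import-on-first-use proxy; illustrative only."""

        def __init__(self, name):
            self._name = name
            self._module = None

        def __getattr__(self, attr):
            # only reached for attributes other than _name/_module;
            # import the real module on first access
            if self._module is None:
                self._module = importlib.import_module(self._name)
            return getattr(self._module, attr)

    pysam = _LazyModule("pysam")   # pysam.FastxFile triggers the real import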
8 changes: 1 addition & 7 deletions sequana/fastq.py
@@ -20,7 +20,6 @@
from itertools import islice

import pysam
from easydev import Progress
from tqdm import tqdm

from sequana.lazy import numpy as np
@@ -945,15 +944,13 @@ def _get_info(self):

total_length = 0
C = defaultdict(int)
if self.verbose:
pb = Progress(self.N)

sequences = []
mean_qualities = []
qualities = []

ff = pysam.FastxFile(self.filename)
for i, record in enumerate(ff):
for i, record in tqdm(enumerate(ff), disable=not self.verbose):
if i < self.skip_nrows:
continue
if i > self.max_sample + self.skip_nrows:
@@ -987,9 +984,6 @@

total_length += len(record.sequence)

if self.verbose:
pb.animate(i + 1)

# other data
self.qualities = qualities
self.mean_qualities = mean_qualities
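In fastq.py the easydev Progress bar is replaced by tqdm wrapping the record iterator; the `disable=not self.verbose` flag keeps the previous behaviour of staying silent unless verbose output is requested. The same pattern in isolation (illustrative function, not sequana code):

    import pysam
    from tqdm import tqdm

    def iter_records(filename, verbose=True):
        # tqdm prints a live counter while iterating; disable=True silences it
        ff = pysam.FastxFile(filename)
        for i, record in tqdm(enumerate(ff), disable=not verbose):
            yield i, record.name, len(record.sequence)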
3 changes: 0 additions & 3 deletions sequana/find_motif.py
@@ -91,9 +91,7 @@ def find_motif_fasta(self, filename, motif, window=200, local_threshold=None, gl

data = FastA(filename)
N = len(data)
from easydev import Progress

pb = Progress(N)
df = {"query_name": [], "hit": [], "length": [], "start": [], "end": []}
for i, item in enumerate(data):
X1, S = self.find_motif_from_sequence(item.sequence, motif, window=window, local_threshold=local_threshold)
@@ -103,7 +101,6 @@
df["end"].append(len(item.sequence))
df["length"].append(len(item.sequence))
df["hit"].append(S)
pb.animate(i + 1)
df = pd.DataFrame(df)
return df
