refactor VCF code #849

Closed · wants to merge 9 commits
6 changes: 6 additions & 0 deletions README.rst
@@ -243,6 +243,12 @@ Changelog
========= ==========================================================================
Version Description
========= ==========================================================================
0.17.0 * viz submodules: remove easydev and clean up the scipy imports
* remove the substractor utility (use the sequana_depletion pipeline instead)
* remove the unused get_max_gc_correlation function from bedtools
* Major change in the VCF reader (freebayes): drop freebayes_bcf_filter,
redundant with freebayes_vcf_filter; replace the scipy Fisher test with an
in-house implementation; remove unused VCF code.
0.16.9 * Major fix on PCA and add batch effect plots in RNAdiff analysis
* count matrix and DESeq2 output files' headers fixed with missing index
(no impact on analysis but only for those willing to use the CSV files
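The changelog entry above mentions replacing the scipy Fisher test with an in-house implementation, but that code is not part of this diff. As a point of reference only, a minimal dependency-free two-sided Fisher exact test on a 2x2 contingency table could look like the sketch below (function name and API are illustrative, not the actual sequana code):

    from math import comb

    def fisher_exact_2x2(a, b, c, d):
        """Two-sided Fisher exact test for the 2x2 table [[a, b], [c, d]].

        Pure-Python hypergeometric enumeration, no scipy required.
        """
        r1, r2 = a + b, c + d            # row totals
        c1, n = a + c, a + b + c + d     # first column total, grand total

        def pmf(x):
            # probability of x in the top-left cell with all margins fixed
            return comb(r1, x) * comb(r2, c1 - x) / comb(n, c1)

        p_obs = pmf(a)
        lo, hi = max(0, c1 - r2), min(r1, c1)
        # two-sided p-value: sum over all tables at least as extreme as observed
        return sum(pmf(x) for x in range(lo, hi + 1) if pmf(x) <= p_obs * (1 + 1e-12))

The result can be checked against scipy.stats.fisher_exact for small tables.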
9 changes: 2 additions & 7 deletions pyproject.toml
@@ -4,11 +4,11 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "sequana"
version = "0.16.9"
version = "0.16.10"
description = "A set of standalone application and pipelines dedicated to NGS analysis"
authors = ["Thomas Cokelaer <thomas.cokelaer@pasteur.fr>"]
license = "new BSD"
readme = ["README.rst", "LICENSE"]
readme = "README.rst"
keywords = ["snakemake", "sequana", "NGS"]
classifiers = [
"Development Status :: 5 - Production/Stable",
@@ -136,8 +136,3 @@ sphinx = ">3"
sphinx-rtd-theme = "^2.0.0"
sphinx-gallery = "^0.15.0"
sequana-sphinxext = "^1.0.0"
#docutils = "^0.20.1"




1 change: 0 additions & 1 deletion requirements.txt
@@ -26,5 +26,4 @@ snakemake>=7.16,<8
statsmodels
tqdm
UpSetPlot
vcfpy
xlrd
2 changes: 1 addition & 1 deletion sequana/__init__.py
@@ -55,7 +55,6 @@ def get_package_version(package_name):
from .fasta import FastA
from .fastq import FastQ, FastQC, Identifier
from .featurecounts import FeatureCount
from .freebayes_bcf_filter import BCF_freebayes
from .freebayes_vcf_filter import VCF_freebayes
from .gff3 import GFF3
from .homer import Homer
@@ -80,3 +79,4 @@ def get_package_version(package_name):
from .sequence import DNA, RNA, Repeats, Sequence
from .snpeff import SnpEff
from .trf import TRF
from .variants import VariantFile
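The new sequana.variants.VariantFile import, together with the removal of vcfpy from requirements.txt, suggests the VCF parsing now sits on top of pysam. Assuming that is the case, the underlying building block is plain pysam VCF iteration; how the new sequana class wraps it is not visible in this diff:

    import pysam

    # plain pysam usage; treat this only as the assumed underlying layer,
    # not as the sequana.variants.VariantFile API
    vcf = pysam.VariantFile("variants.vcf")
    for record in vcf:
        # chromosome, 1-based position, reference and alternate alleles
        print(record.chrom, record.pos, record.ref, record.alts, record.qual)
        # INFO fields (e.g. freebayes DP, AO, RO) are exposed as a mapping
        info = dict(record.info)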
36 changes: 0 additions & 36 deletions sequana/bedtools.py
@@ -1586,42 +1586,6 @@ def get_gc_correlation(self):
gc.collect()
return C

def get_max_gc_correlation(self, guess=100):
"""Plot correlation between coverage and GC content by varying the GC window

The GC content uses a moving window of size W. This parameter affects
the correlation bewteen coverage and GC. This function find the
*optimal* window length.

"""
from scipy.optimize import fmin

pylab.clf()
corrs = []
wss = []

def func(params):
ws = int(round(params[0]))
if ws < 10:
return 0

self._compute_gc_content(ws)
corr = self.get_gc_correlation()
corrs.append(corr)
wss.append(ws)
return corr

res = fmin(func, guess, xtol=1, disp=False) # guess is 200
pylab.plot(wss, corrs, "o")
pylab.xlabel("GC window size")
pylab.ylabel("Correlation")
pylab.grid()

# reset to the default value
self._compute_gc_content()

return res[0]

def _get_hist_data(self, bins=30):
data = self.df["cov"].dropna()
m = data.quantile(0.01)
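get_max_gc_correlation is removed without a replacement. Users who still want to explore how the GC window size affects the coverage/GC correlation can script a small scan themselves; the sketch below assumes a chromosome object that still exposes the kept _compute_gc_content() and get_gc_correlation() methods, and uses a plain grid search instead of scipy.optimize.fmin:

    def scan_gc_windows(chrom, windows=range(20, 401, 20)):
        """Coverage/GC correlation for several GC window sizes.

        `chrom` is assumed to expose _compute_gc_content(ws) and
        get_gc_correlation(), as in the sequana coverage classes.
        """
        correlations = {}
        for ws in windows:
            chrom._compute_gc_content(ws)        # recompute GC with window ws
            correlations[ws] = chrom.get_gc_correlation()
        chrom._compute_gc_content()              # restore the default window
        # pick the window with the strongest (absolute) correlation
        best = max(correlations, key=lambda w: abs(correlations[w]))
        return best, correlations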
21 changes: 10 additions & 11 deletions sequana/fasta.py
@@ -16,10 +16,9 @@

import colorlog
import tqdm
from pysam import FastxFile

from sequana.lazy import numpy as np
from sequana.lazy import pylab
from sequana.lazy import pylab, pysam
from sequana.stats import L50, N50

logger = colorlog.getLogger(__name__)
@@ -58,7 +57,7 @@ class FastA:
"""

def __init__(self, filename, verbose=False):
self._fasta = FastxFile(filename)
self._fasta = pysam.FastxFile(filename)
self.filename = filename
self._N = None

@@ -77,38 +76,38 @@ def next(self): # python 2
# This should allow developers to break a loop that takes too long
# through the reads to run forever
self._fasta.close()
self._fasta = FastxFile(self._fasta.filename)
self._fasta = pysam.FastxFile(self._fasta.filename)
except:
self._fasta.close()
self._fasta = FastxFile(self._fasta.filename)
self._fasta = pysam.FastxFile(self._fasta.filename)
raise StopIteration

def __len__(self):
if self._N is None:
logger.debug("Reading input fasta file...please wait")
self._N = sum(1 for x in FastxFile(self.filename))
self._N = sum(1 for x in pysam.FastxFile(self.filename))
return self._N

def _get_names(self):
_fasta = FastxFile(self.filename)
_fasta = pysam.FastxFile(self.filename)
return [this.name for this in _fasta]

names = property(_get_names)

def _get_sequences(self):
_fasta = FastxFile(self.filename)
_fasta = pysam.FastxFile(self.filename)
return [this.sequence for this in _fasta]

sequences = property(_get_sequences)

def _get_comment(self):
_fasta = FastxFile(self.filename)
_fasta = pysam.FastxFile(self.filename)
return [this.comment for this in _fasta]

comments = property(_get_comment)

def _get_lengths(self):
_fasta = FastxFile(self.filename)
_fasta = pysam.FastxFile(self.filename)
return [len(this.sequence) for this in _fasta]

lengths = property(_get_lengths)
@@ -236,7 +235,7 @@ def select_random_reads(self, N=None, output_filename="random.fasta"):
cherries = N
elif isinstance(N, list):
cherries = set(N)
fasta = FastxFile(self.filename)
fasta = pysam.FastxFile(self.filename)

with open(output_filename, "w") as fh:
for i, read in enumerate(tqdm.tqdm(fasta)):
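The fasta.py change swaps the direct `from pysam import FastxFile` for the project's lazy import layer (`from sequana.lazy import pylab, pysam`), so pysam is only imported when first used. A generic proxy of the kind such a layer typically provides could look like this (illustrative sketch, not the actual sequana.lazy implementation):

    import importlib

    class _LazyModule:
        """Import-on-first-use proxy; illustrative only."""

        def __init__(self, name):
            self._name = name
            self._module = None

        def __getattr__(self, attr):
            # only reached for attributes other than _name/_module;
            # import the real module on first access
            if self._module is None:
                self._module = importlib.import_module(self._name)
            return getattr(self._module, attr)

    pysam = _LazyModule("pysam")   # pysam.FastxFile triggers the real import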
8 changes: 1 addition & 7 deletions sequana/fastq.py
@@ -20,7 +20,6 @@
from itertools import islice

import pysam
from easydev import Progress
from tqdm import tqdm

from sequana.lazy import numpy as np
@@ -945,15 +944,13 @@ def _get_info(self):

total_length = 0
C = defaultdict(int)
if self.verbose:
pb = Progress(self.N)

sequences = []
mean_qualities = []
qualities = []

ff = pysam.FastxFile(self.filename)
for i, record in enumerate(ff):
for i, record in tqdm(enumerate(ff), disable=not self.verbose):
if i < self.skip_nrows:
continue
if i > self.max_sample + self.skip_nrows:
@@ -987,9 +984,6 @@

total_length += len(record.sequence)

if self.verbose:
pb.animate(i + 1)

# other data
self.qualities = qualities
self.mean_qualities = mean_qualities
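In fastq.py the easydev Progress bar is replaced by tqdm wrapping the record iterator; the `disable=not self.verbose` flag keeps the previous behaviour of staying silent unless verbose output is requested. The same pattern in isolation (illustrative function, not sequana code):

    import pysam
    from tqdm import tqdm

    def iter_records(filename, verbose=True):
        # tqdm prints a live counter while iterating; disable=True silences it
        ff = pysam.FastxFile(filename)
        for i, record in tqdm(enumerate(ff), disable=not verbose):
            yield i, record.name, len(record.sequence)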
3 changes: 0 additions & 3 deletions sequana/find_motif.py
@@ -91,9 +91,7 @@ def find_motif_fasta(self, filename, motif, window=200, local_threshold=None, gl

data = FastA(filename)
N = len(data)
from easydev import Progress

pb = Progress(N)
df = {"query_name": [], "hit": [], "length": [], "start": [], "end": []}
for i, item in enumerate(data):
X1, S = self.find_motif_from_sequence(item.sequence, motif, window=window, local_threshold=local_threshold)
@@ -103,7 +101,6 @@
df["end"].append(len(item.sequence))
df["length"].append(len(item.sequence))
df["hit"].append(S)
pb.animate(i + 1)
df = pd.DataFrame(df)
return df
