Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
b990ca2
feat: allow a list of factors in build_design_matrix
Jan 2, 2023
3d889bf
feat: improve multifactor design function and limit number of IRLS it…
Jan 2, 2023
ad43332
docs(DeseqDataSet): update to reflect that the design may take severa…
Jan 2, 2023
7f26c27
docs(DeseqDataSet): update to reflect that the design may take severa…
Jan 2, 2023
d6382b7
feat: support pairwise multifactor tests
Jan 2, 2023
6b2ebec
fix(DeseqStats): handle multi-factor designs in _cooks_filtering
Jan 2, 2023
442bac3
feat: add multifactor support for apeglm lfc shrinkage
Jan 3, 2023
3f688ce
fix: convert design_factor to list if a string is provided
Jan 4, 2023
5c952f3
fix: correct mu initialisation in multi-factor case
Jan 5, 2023
e31af6f
fix: fix design matrix building in multifactor case
Jan 5, 2023
e850e5e
feat: add multifactor for shrinkage
Jan 5, 2023
416c817
docs: fix docstrings
Jan 5, 2023
0c95c24
CI: adapt tests to multi-factor
Jan 5, 2023
3b5f59d
docs: update docstrings and add line comments
Jan 5, 2023
1963663
refactor: change design_factor to design_factors
Jan 5, 2023
ae19ca2
docs: update README
Jan 5, 2023
a7dc5fe
docs: update docstring (list of strings argument)
BorisMuzellec Jan 9, 2023
e536451
refactor: change single-letter variable names to more informative ones
Jan 9, 2023
5e5c318
resolve merge issues
Jan 9, 2023
cd85df8
fix: change attribute name from design_factor to design_factors every…
Jan 9, 2023
a2b39cc
fix: fix design factor test
Jan 9, 2023
8c2197a
docs: expand the description of the "contrast" attribute
Jan 9, 2023
39f4211
feat: add tests on the contrast variable
Jan 9, 2023
b0e35f5
feat: add tests on the contrast variable
Jan 9, 2023
3c0ded6
refactor: change variable names for more meaningful ones and give var…
Jan 9, 2023
4be92a9
refactor: change function/variable names to more meaningful ones
Jan 9, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ It aims to facilitate DEA experiments for python users.

As PyDESeq2 is a re-implementation of [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) from scratch, you may experience some differences in terms of retrieved values or available features.

Currently, available features broadly correspond to the default settings of DESeq2 (v1.34.0) for single-factor analysis,
but we plan to implement more in the near future. In case there is a feature you would particularly like to be implemented, feel free to open an issue.
Currently, available features broadly correspond to the default settings of DESeq2 (v1.34.0) for single-factor
and paired multi-factor analysis, but we plan to implement more in the near future.
In case there is a feature you would particularly like to be implemented, feel free to open an issue.

## Installation

Expand Down
8 changes: 4 additions & 4 deletions notebooks/PyDESeq2_minimal_example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
"\n",
"from pydeseq2.DeseqDataSet import DeseqDataSet\n",
"from pydeseq2.DeseqStats import DeseqStats\n",
"from pydeseq2.utils import load_data"
"from pydeseq2.utils import load_example_data"
]
},
{
Expand Down Expand Up @@ -89,7 +89,7 @@
},
"outputs": [],
"source": [
"counts_df = load_data(\n",
"counts_df = load_example_data(\n",
" modality=\"raw_counts\",\n",
" cancer_type=CANCER,\n",
" debug=False,\n",
Expand All @@ -103,7 +103,7 @@
"metadata": {},
"outputs": [],
"source": [
"clinical_df = load_data(\n",
"clinical_df = load_example_data(\n",
" modality=\"clinical\",\n",
" cancer_type=CANCER,\n",
" debug=False,\n",
Expand Down Expand Up @@ -198,7 +198,7 @@
"dds = DeseqDataSet(\n",
" counts_df,\n",
" clinical_df,\n",
" design_factor=\"condition\" if CANCER == \"synthetic\" else \"high_grade\",\n",
" design_factors=\"condition\" if CANCER == \"synthetic\" else \"high_grade\",\n",
" refit_cooks=True,\n",
" n_cpus=8,\n",
")"
Expand Down
8 changes: 4 additions & 4 deletions notebooks/PyDESeq2_step_by_step_pipeline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
"\n",
"from pydeseq2.DeseqDataSet import DeseqDataSet\n",
"from pydeseq2.DeseqStats import DeseqStats\n",
"from pydeseq2.utils import load_data"
"from pydeseq2.utils import load_example_data"
]
},
{
Expand Down Expand Up @@ -87,7 +87,7 @@
"metadata": {},
"outputs": [],
"source": [
"counts_df = load_data(\n",
"counts_df = load_example_data(\n",
" modality=\"raw_counts\",\n",
" cancer_type=CANCER,\n",
" debug=False,\n",
Expand All @@ -101,7 +101,7 @@
"metadata": {},
"outputs": [],
"source": [
"clinical_df = load_data(\n",
"clinical_df = load_example_data(\n",
" modality=\"clinical\",\n",
" cancer_type=CANCER,\n",
" debug=False,\n",
Expand Down Expand Up @@ -196,7 +196,7 @@
"dds = DeseqDataSet(\n",
" counts_df,\n",
" clinical_df,\n",
" design_factor=\"condition\" if CANCER == \"synthetic\" else \"high_grade\",\n",
" design_factors=\"condition\" if CANCER == \"synthetic\" else \"high_grade\",\n",
" refit_cooks=True,\n",
" n_cpus=8,\n",
")"
Expand Down
145 changes: 89 additions & 56 deletions pydeseq2/DeseqDataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,10 @@ class DeseqDataSet:
DataFrame containing clinical information.
Must be indexed by sample barcodes.

design_factor : str
Name of the column of clinical to be used as a design variable.
(default: 'high_grade').
design_factors : str or list[str]
Name of the columns of clinical to be used as design variables. If a list,
the last factor will be considered the variable of interest by default.
Only bi-level factors are supported. (default: 'high_grade').

reference_level : str
The factor to use as a reference. Must be one of the values taken by the design.
Expand Down Expand Up @@ -155,7 +156,7 @@ def __init__(
self,
counts,
clinical,
design_factor="high_grade",
design_factors="high_grade",
reference_level=None,
min_mu=0.5,
min_disp=1e-8,
Expand Down Expand Up @@ -183,15 +184,20 @@ def __init__(

# Import clinical data and convert design_column to string
self.clinical = clinical.loc[self.counts.index]
self.design_factor = design_factor
if self.clinical[self.design_factor].isna().any():
raise ValueError("NaNs are not allowed in the design factor.")
self.clinical[self.design_factor] = self.clinical[self.design_factor].astype(str)
# Convert design_factors to list if a single string was provided.
self.design_factors = (
[design_factors] if isinstance(design_factors, str) else design_factors
)
if self.clinical[self.design_factors].isna().any().any():
raise ValueError("NaNs are not allowed in the design factors.")
self.clinical[self.design_factors] = self.clinical[self.design_factors].astype(
str
)

# Build the design matrix (splits the dataset in two cohorts)
# Build the design matrix (splits the dataset in cohorts)
self.design_matrix = build_design_matrix(
self.clinical,
design_factor,
clinical_df=self.clinical,
design_factors=self.design_factors,
ref=reference_level,
expanded=False,
intercept=True,
Expand Down Expand Up @@ -250,6 +256,8 @@ def fit_genewise_dispersions(self):
Fits a negative binomial per gene, independently.
"""

num_vars = self.design_matrix.shape[-1]

# Check that size factors are available. If not, compute them.
if not hasattr(self, "size_factors"):
self.fit_size_factors()
Expand All @@ -261,29 +269,54 @@ def fit_genewise_dispersions(self):
non_zero = ~(self.counts == 0).all()
self.non_zero_genes = non_zero[non_zero].index
counts_nonzero = self.counts.loc[:, self.non_zero_genes].values
m = counts_nonzero.shape[1]
num_genes = counts_nonzero.shape[1]

# Convert design_matrix to numpy for speed
X = self.design_matrix.values
rough_disps = self._rough_dispersions.loc[self.non_zero_genes].values
size_factors = self.size_factors.values

with parallel_backend("loky", inner_max_num_threads=1):
mu_hat_ = np.array(
Parallel(
# mu_hat is initialized differently depending on the number of different factor
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you think it might be useful to throw a warning or otherwise inform the user about which fit was used? Or at least add it to the docstring so it is easier to find? (please ignore if you don't think it is useful information to the user)

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good question. Personally I don't think a warning is necessary, as the _mu_hat attribute is not meant to be accessed by the user (it's an intermediate value). DESeq2 does this silently too. If that's OK for you, I'll leave this as is for now.

# groups. If there are as many different factor combinations as design factors
# (intercept included), it is fitted with a linear model, otherwise it is fitted
# with a GLM (using rough dispersion estimates).
if len(self.design_matrix.value_counts()) == num_vars:
with parallel_backend("loky", inner_max_num_threads=1):
mu_hat_ = np.array(
Parallel(
n_jobs=self.n_processes,
verbose=self.joblib_verbosity,
batch_size=self.batch_size,
)(
delayed(fit_lin_mu)(
counts=counts_nonzero[:, i],
size_factors=size_factors,
design_matrix=X,
min_mu=self.min_mu,
)
for i in range(num_genes)
)
)
else:
with parallel_backend("loky", inner_max_num_threads=1):
res = Parallel(
n_jobs=self.n_processes,
verbose=self.joblib_verbosity,
batch_size=self.batch_size,
)(
delayed(fit_lin_mu)(
counts_nonzero[:, i],
size_factors,
X,
self.min_mu,
delayed(irls_solver)(
counts=counts_nonzero[:, i],
size_factors=size_factors,
design_matrix=X,
disp=rough_disps[i],
min_mu=self.min_mu,
beta_tol=self.beta_tol,
)
for i in range(m)
for i in range(num_genes)
)
)

_, mu_hat_, _, _ = zip(*res)
mu_hat_ = np.array(mu_hat_)

self._mu_hat = pd.DataFrame(
mu_hat_.T,
Expand All @@ -296,14 +329,14 @@ def fit_genewise_dispersions(self):
with parallel_backend("loky", inner_max_num_threads=1):
res = Parallel(n_jobs=self.n_processes, batch_size=self.batch_size)(
delayed(fit_alpha_mle)(
counts_nonzero[:, i],
X,
mu_hat_[i, :],
rough_disps[i],
self.min_disp,
self.max_disp,
counts=counts_nonzero[:, i],
design_matrix=X,
mu=mu_hat_[i, :],
alpha_hat=rough_disps[i],
min_disp=self.min_disp,
max_disp=self.max_disp,
)
for i in range(m)
for i in range(num_genes)
)
end = time.time()
print(f"... done in {end - start:.2f} seconds.\n")
Expand Down Expand Up @@ -398,8 +431,8 @@ def fit_dispersion_prior(self):
self.fit_dispersion_trend()

# Exclude genes with all zeroes
m = len(self.non_zero_genes)
p = self.design_matrix.shape[1]
num_genes = len(self.non_zero_genes)
num_vars = self.design_matrix.shape[1]

# Fit dispersions to the curve, and compute log residuals
disp_residuals = np.log(self.genewise_dispersions) - np.log(
Expand All @@ -415,7 +448,7 @@ def fit_dispersion_prior(self):
disp_residuals.loc[self.non_zero_genes][above_min_disp]
).median() ** 2 / norm.ppf(0.75)
self.prior_disp_var = np.maximum(
self._squared_logres - polygamma(1, (m - p) / 2), 0.25
self._squared_logres - polygamma(1, (num_genes - num_vars) / 2), 0.25
)

def fit_MAP_dispersions(self):
Expand All @@ -430,7 +463,7 @@ def fit_MAP_dispersions(self):

# Exclude genes with all zeroes
counts_nonzero = self.counts.loc[:, self.non_zero_genes].values
m = counts_nonzero.shape[1]
num_genes = counts_nonzero.shape[1]

X = self.design_matrix.values
mu_hat = self._mu_hat.values
Expand All @@ -445,17 +478,17 @@ def fit_MAP_dispersions(self):
batch_size=self.batch_size,
)(
delayed(fit_alpha_mle)(
counts_nonzero[:, i],
X,
mu_hat[:, i],
fit_disps[i],
self.min_disp,
self.max_disp,
self.prior_disp_var,
True,
True,
counts=counts_nonzero[:, i],
design_matrix=X,
mu=mu_hat[:, i],
alpha_hat=fit_disps[i],
min_disp=self.min_disp,
max_disp=self.max_disp,
prior_disp_var=self.prior_disp_var,
cr_reg=True,
prior_reg=True,
)
for i in range(m)
for i in range(num_genes)
)
end = time.time()
print(f"... done in {end-start:.2f} seconds.\n")
Expand Down Expand Up @@ -491,7 +524,7 @@ def fit_LFC(self):

# Exclude genes with all zeroes
counts_nonzero = self.counts.loc[:, self.non_zero_genes].values
m = counts_nonzero.shape[1]
num_genes = counts_nonzero.shape[1]

X = self.design_matrix.values
size_factors = self.size_factors.values
Expand All @@ -506,14 +539,14 @@ def fit_LFC(self):
batch_size=self.batch_size,
)(
delayed(irls_solver)(
counts_nonzero[:, i],
size_factors,
X,
disps[i],
self.min_mu,
self.beta_tol,
counts=counts_nonzero[:, i],
size_factors=size_factors,
design_matrix=X,
disp=disps[i],
min_mu=self.min_mu,
beta_tol=self.beta_tol,
)
for i in range(m)
for i in range(num_genes)
)
end = time.time()
print(f"... done in {end-start:.2f} seconds.\n")
Expand Down Expand Up @@ -560,7 +593,7 @@ def calculate_cooks(self):
if not hasattr(self, "dispersions"):
self.fit_MAP_dispersions()

p = self.design_matrix.shape[1]
num_vars = self.design_matrix.shape[1]
# Keep only non-zero genes
normed_counts = self.counts.loc[:, self.non_zero_genes].div(self.size_factors, 0)
dispersions = robust_method_of_moments_disp(normed_counts, self.design_matrix)
Expand All @@ -570,7 +603,7 @@ def calculate_cooks(self):
) ** 2 / V
self.cooks = (
squared_pearson_res
/ p
/ num_vars
* (self._hat_diagonals / (1 - self._hat_diagonals) ** 2)
).T

Expand Down Expand Up @@ -611,7 +644,7 @@ def _replace_outliers(self):
if not hasattr(self, "cooks"):
self.calculate_cooks()

m, p = self.design_matrix.shape
num_samples, num_vars = self.design_matrix.shape

# Check whether cohorts have enough samples to allow refitting
n_or_more = (
Expand All @@ -624,7 +657,7 @@ def _replace_outliers(self):
self.replaceable.index = self.design_matrix.index

# Get positions of counts with cooks above threshold
cooks_cutoff = f.ppf(0.99, p, m - p)
cooks_cutoff = f.ppf(0.99, num_vars, num_samples - num_vars)
idx = (self.cooks > cooks_cutoff).T
self.replaced = idx.any(axis=0)

Expand Down Expand Up @@ -673,7 +706,7 @@ def _refit_without_outliers(
sub_dds = DeseqDataSet(
counts=self.counts_to_refit,
clinical=self.clinical,
design_factor=self.design_factor,
design_factors=self.design_factors,
min_mu=self.min_mu,
min_disp=self.min_disp,
max_disp=self.max_disp,
Expand Down
Loading