Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
b990ca2
feat: allow a list of factors in build_design_matrix
Jan 2, 2023
3d889bf
feat: improve multifactor design function and limit number of IRLS it…
Jan 2, 2023
ad43332
docs(DeseqDataSet): update to reflect that the design may take severa…
Jan 2, 2023
7f26c27
docs(DeseqDataSet): update to reflect that the design may take severa…
Jan 2, 2023
d6382b7
feat: support pairwise multifactor tests
Jan 2, 2023
6b2ebec
fix(DeseqStats): handle multi-factor designs in _cooks_filtering
Jan 2, 2023
442bac3
feat: add multifactor support for apeglm lfc shrinkage
Jan 3, 2023
3f688ce
fix: convert design_factor to list if a string is provided
Jan 4, 2023
5c952f3
fix: correct mu initialisation in multi-factor case
Jan 5, 2023
e31af6f
fix: fix design matrix building in multifactor case
Jan 5, 2023
e850e5e
feat: add multifactor for shrinkage
Jan 5, 2023
416c817
docs: fix docstrings
Jan 5, 2023
0c95c24
CI: adapt tests to multi-factor
Jan 5, 2023
3b5f59d
docs: update docstrings and add line comments
Jan 5, 2023
1963663
refactor: change design_factor to design_factors
Jan 5, 2023
ae19ca2
docs: update README
Jan 5, 2023
a7dc5fe
docs: update docstring (list of strings argument)
BorisMuzellec Jan 9, 2023
e536451
refactor: change single-letter variable names to more informative ones
Jan 9, 2023
5e5c318
resolve merge issues
Jan 9, 2023
cd85df8
fix: change attribute name from design_factor to design_factors every…
Jan 9, 2023
a2b39cc
fix: fix design factor test
Jan 9, 2023
8c2197a
docs: expand the description of the "contrast" attribute
Jan 9, 2023
39f4211
feat: add tests on the contrast variable
Jan 9, 2023
b0e35f5
feat: add tests on the contrast variable
Jan 9, 2023
3c0ded6
refactor: change variable names for more meaningful ones and give var…
Jan 9, 2023
4be92a9
refactor: change function/variable names to more meaningful ones
Jan 9, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ It aims to facilitate DEA experiments for python users.

As PyDESeq2 is a re-implementation of [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) from scratch, you may experience some differences in terms of retrieved values or available features.

Currently, available features broadly correspond to the default settings of DESeq2 (v1.34.0) for single-factor analysis,
but we plan to implement more in the near future. In case there is a feature you would particularly like to be implemented, feel free to open an issue.
Currently, available features broadly correspond to the default settings of DESeq2 (v1.34.0) for single-factor
and paired multi-factor analysis, but we plan to implement more in the near future.
In case there is a feature you would particularly like to be implemented, feel free to open an issue.

## Installation

Expand Down
8 changes: 4 additions & 4 deletions notebooks/PyDESeq2_minimal_example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
"\n",
"from pydeseq2.DeseqDataSet import DeseqDataSet\n",
"from pydeseq2.DeseqStats import DeseqStats\n",
"from pydeseq2.utils import load_data"
"from pydeseq2.utils import load_example_data"
]
},
{
Expand Down Expand Up @@ -89,7 +89,7 @@
},
"outputs": [],
"source": [
"counts_df = load_data(\n",
"counts_df = load_example_data(\n",
" modality=\"raw_counts\",\n",
" cancer_type=CANCER,\n",
" debug=False,\n",
Expand All @@ -103,7 +103,7 @@
"metadata": {},
"outputs": [],
"source": [
"clinical_df = load_data(\n",
"clinical_df = load_example_data(\n",
" modality=\"clinical\",\n",
" cancer_type=CANCER,\n",
" debug=False,\n",
Expand Down Expand Up @@ -198,7 +198,7 @@
"dds = DeseqDataSet(\n",
" counts_df,\n",
" clinical_df,\n",
" design_factor=\"condition\" if CANCER == \"synthetic\" else \"high_grade\",\n",
" design_factors=\"condition\" if CANCER == \"synthetic\" else \"high_grade\",\n",
" refit_cooks=True,\n",
" n_cpus=8,\n",
")"
Expand Down
8 changes: 4 additions & 4 deletions notebooks/PyDESeq2_step_by_step_pipeline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
"\n",
"from pydeseq2.DeseqDataSet import DeseqDataSet\n",
"from pydeseq2.DeseqStats import DeseqStats\n",
"from pydeseq2.utils import load_data"
"from pydeseq2.utils import load_example_data"
]
},
{
Expand Down Expand Up @@ -87,7 +87,7 @@
"metadata": {},
"outputs": [],
"source": [
"counts_df = load_data(\n",
"counts_df = load_example_data(\n",
" modality=\"raw_counts\",\n",
" cancer_type=CANCER,\n",
" debug=False,\n",
Expand All @@ -101,7 +101,7 @@
"metadata": {},
"outputs": [],
"source": [
"clinical_df = load_data(\n",
"clinical_df = load_example_data(\n",
" modality=\"clinical\",\n",
" cancer_type=CANCER,\n",
" debug=False,\n",
Expand Down Expand Up @@ -196,7 +196,7 @@
"dds = DeseqDataSet(\n",
" counts_df,\n",
" clinical_df,\n",
" design_factor=\"condition\" if CANCER == \"synthetic\" else \"high_grade\",\n",
" design_factors=\"condition\" if CANCER == \"synthetic\" else \"high_grade\",\n",
" refit_cooks=True,\n",
" n_cpus=8,\n",
")"
Expand Down
145 changes: 89 additions & 56 deletions pydeseq2/DeseqDataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,10 @@ class DeseqDataSet:
DataFrame containing clinical information.
Must be indexed by sample barcodes.

design_factor : str
Name of the column of clinical to be used as a design variable.
(default: 'high_grade').
design_factors : str or list[str]
Name of the columns of clinical to be used as design variables. If a list,
the last factor will be considered the variable of interest by default.
Only bi-level factors are supported. (default: 'high_grade').

reference_level : str
The factor to use as a reference. Must be one of the values taken by the design.
Expand Down Expand Up @@ -155,7 +156,7 @@ def __init__(
self,
counts,
clinical,
design_factor="high_grade",
design_factors="high_grade",
reference_level=None,
min_mu=0.5,
min_disp=1e-8,
Expand Down Expand Up @@ -183,15 +184,20 @@ def __init__(

# Import clinical data and convert design_column to string
self.clinical = clinical.loc[self.counts.index]
self.design_factor = design_factor
if self.clinical[self.design_factor].isna().any():
raise ValueError("NaNs are not allowed in the design factor.")
self.clinical[self.design_factor] = self.clinical[self.design_factor].astype(str)
# Convert design_factors to list if a single string was provided.
self.design_factors = (
[design_factors] if isinstance(design_factors, str) else design_factors
)
if self.clinical[self.design_factors].isna().any().any():
raise ValueError("NaNs are not allowed in the design factors.")
self.clinical[self.design_factors] = self.clinical[self.design_factors].astype(
str
)

# Build the design matrix (splits the dataset in two cohorts)
# Build the design matrix (splits the dataset in cohorts)
self.design_matrix = build_design_matrix(
self.clinical,
design_factor,
clinical_df=self.clinical,
design_factors=self.design_factors,
ref=reference_level,
expanded=False,
intercept=True,
Expand Down Expand Up @@ -250,6 +256,8 @@ def fit_genewise_dispersions(self):
Fits a negative binomial per gene, independently.
"""

num_vars = self.design_matrix.shape[-1]

# Check that size factors are available. If not, compute them.
if not hasattr(self, "size_factors"):
self.fit_size_factors()
Expand All @@ -261,29 +269,54 @@ def fit_genewise_dispersions(self):
non_zero = ~(self.counts == 0).all()
self.non_zero_genes = non_zero[non_zero].index
counts_nonzero = self.counts.loc[:, self.non_zero_genes].values
m = counts_nonzero.shape[1]
num_genes = counts_nonzero.shape[1]

# Convert design_matrix to numpy for speed
X = self.design_matrix.values
rough_disps = self._rough_dispersions.loc[self.non_zero_genes].values
size_factors = self.size_factors.values

with parallel_backend("loky", inner_max_num_threads=1):
mu_hat_ = np.array(
Parallel(
# mu_hat is initialized differently depending on the number of different factor
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you think it might be useful to throw a warning or otherwise inform the user about which fit was used? Or at least add it to the docstring so it is easier to find? (please ignore if you don't think it is useful information to the user)

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good question. Personally I don't think a warning is necessary, as the _mu_hat attribute is not meant to be accessed by the user (it's an intermediate value). DESeq2 does this silently too. If that's OK for you, I'll leave this as is for now.

# groups. If there are as many different factor combinations as design factors
# (intercept included), it is fitted with a linear model, otherwise it is fitted
# with a GLM (using rough dispersion estimates).
if len(self.design_matrix.value_counts()) == num_vars:
with parallel_backend("loky", inner_max_num_threads=1):
mu_hat_ = np.array(
Parallel(
n_jobs=self.n_processes,
verbose=self.joblib_verbosity,
batch_size=self.batch_size,
)(
delayed(fit_lin_mu)(
counts=counts_nonzero[:, i],
size_factors=size_factors,
design_matrix=X,
min_mu=self.min_mu,
)
for i in range(num_genes)
)
)
else:
with parallel_backend("loky", inner_max_num_threads=1):
res = Parallel(
n_jobs=self.n_processes,
verbose=self.joblib_verbosity,
batch_size=self.batch_size,
)(
delayed(fit_lin_mu)(
counts_nonzero[:, i],
size_factors,
X,
self.min_mu,
delayed(irls_solver)(
counts=counts_nonzero[:, i],
size_factors=size_factors,
design_matrix=X,
disp=rough_disps[i],
min_mu=self.min_mu,
beta_tol=self.beta_tol,
)
for i in range(m)
for i in range(num_genes)
)
)

_, mu_hat_, _, _ = zip(*res)
mu_hat_ = np.array(mu_hat_)

self._mu_hat = pd.DataFrame(
mu_hat_.T,
Expand All @@ -296,14 +329,14 @@ def fit_genewise_dispersions(self):
with parallel_backend("loky", inner_max_num_threads=1):
res = Parallel(n_jobs=self.n_processes, batch_size=self.batch_size)(
delayed(fit_alpha_mle)(
counts_nonzero[:, i],
X,
mu_hat_[i, :],
rough_disps[i],
self.min_disp,
self.max_disp,
counts=counts_nonzero[:, i],
design_matrix=X,
mu=mu_hat_[i, :],
alpha_hat=rough_disps[i],
min_disp=self.min_disp,
max_disp=self.max_disp,
)
for i in range(m)
for i in range(num_genes)
)
end = time.time()
print(f"... done in {end - start:.2f} seconds.\n")
Expand Down Expand Up @@ -398,8 +431,8 @@ def fit_dispersion_prior(self):
self.fit_dispersion_trend()

# Exclude genes with all zeroes
m = len(self.non_zero_genes)
p = self.design_matrix.shape[1]
num_genes = len(self.non_zero_genes)
num_vars = self.design_matrix.shape[1]

# Fit dispersions to the curve, and compute log residuals
disp_residuals = np.log(self.genewise_dispersions) - np.log(
Expand All @@ -415,7 +448,7 @@ def fit_dispersion_prior(self):
disp_residuals.loc[self.non_zero_genes][above_min_disp]
).median() ** 2 / norm.ppf(0.75)
self.prior_disp_var = np.maximum(
self._squared_logres - polygamma(1, (m - p) / 2), 0.25
self._squared_logres - polygamma(1, (num_genes - num_vars) / 2), 0.25
)

def fit_MAP_dispersions(self):
Expand All @@ -430,7 +463,7 @@ def fit_MAP_dispersions(self):

# Exclude genes with all zeroes
counts_nonzero = self.counts.loc[:, self.non_zero_genes].values
m = counts_nonzero.shape[1]
num_genes = counts_nonzero.shape[1]

X = self.design_matrix.values
mu_hat = self._mu_hat.values
Expand All @@ -445,17 +478,17 @@ def fit_MAP_dispersions(self):
batch_size=self.batch_size,
)(
delayed(fit_alpha_mle)(
counts_nonzero[:, i],
X,
mu_hat[:, i],
fit_disps[i],
self.min_disp,
self.max_disp,
self.prior_disp_var,
True,
True,
counts=counts_nonzero[:, i],
design_matrix=X,
mu=mu_hat[:, i],
alpha_hat=fit_disps[i],
min_disp=self.min_disp,
max_disp=self.max_disp,
prior_disp_var=self.prior_disp_var,
cr_reg=True,
prior_reg=True,
)
for i in range(m)
for i in range(num_genes)
)
end = time.time()
print(f"... done in {end-start:.2f} seconds.\n")
Expand Down Expand Up @@ -491,7 +524,7 @@ def fit_LFC(self):

# Exclude genes with all zeroes
counts_nonzero = self.counts.loc[:, self.non_zero_genes].values
m = counts_nonzero.shape[1]
num_genes = counts_nonzero.shape[1]

X = self.design_matrix.values
size_factors = self.size_factors.values
Expand All @@ -506,14 +539,14 @@ def fit_LFC(self):
batch_size=self.batch_size,
)(
delayed(irls_solver)(
counts_nonzero[:, i],
size_factors,
X,
disps[i],
self.min_mu,
self.beta_tol,
counts=counts_nonzero[:, i],
size_factors=size_factors,
design_matrix=X,
disp=disps[i],
min_mu=self.min_mu,
beta_tol=self.beta_tol,
)
for i in range(m)
for i in range(num_genes)
)
end = time.time()
print(f"... done in {end-start:.2f} seconds.\n")
Expand Down Expand Up @@ -560,7 +593,7 @@ def calculate_cooks(self):
if not hasattr(self, "dispersions"):
self.fit_MAP_dispersions()

p = self.design_matrix.shape[1]
num_vars = self.design_matrix.shape[1]
# Keep only non-zero genes
normed_counts = self.counts.loc[:, self.non_zero_genes].div(self.size_factors, 0)
dispersions = robust_method_of_moments_disp(normed_counts, self.design_matrix)
Expand All @@ -570,7 +603,7 @@ def calculate_cooks(self):
) ** 2 / V
self.cooks = (
squared_pearson_res
/ p
/ num_vars
* (self._hat_diagonals / (1 - self._hat_diagonals) ** 2)
).T

Expand Down Expand Up @@ -611,7 +644,7 @@ def _replace_outliers(self):
if not hasattr(self, "cooks"):
self.calculate_cooks()

m, p = self.design_matrix.shape
num_samples, num_vars = self.design_matrix.shape

# Check whether cohorts have enough samples to allow refitting
n_or_more = (
Expand All @@ -624,7 +657,7 @@ def _replace_outliers(self):
self.replaceable.index = self.design_matrix.index

# Get positions of counts with cooks above threshold
cooks_cutoff = f.ppf(0.99, p, m - p)
cooks_cutoff = f.ppf(0.99, num_vars, num_samples - num_vars)
idx = (self.cooks > cooks_cutoff).T
self.replaced = idx.any(axis=0)

Expand Down Expand Up @@ -673,7 +706,7 @@ def _refit_without_outliers(
sub_dds = DeseqDataSet(
counts=self.counts_to_refit,
clinical=self.clinical,
design_factor=self.design_factor,
design_factors=self.design_factors,
min_mu=self.min_mu,
min_disp=self.min_disp,
max_disp=self.max_disp,
Expand Down
Loading