In [1]:
# import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

In [5]:
# Load the gene-count matrix and the sample metadata.
# The counts file is transposed after loading so that rows are samples and
# columns are genes.
prefix = "Trypanosoma"
counts = pd.read_csv(f"analysis/{prefix}/star/ReadsPerGene.csv", index_col=0).T
metadata = pd.read_csv(f"data/{prefix}/metadata.csv", index_col=0)
# fillna returns a new frame; the result must be assigned for it to take effect
counts = counts.fillna(0)

# Restrict both tables to the Trypanosoma brucei brucei samples.
# The count rows are selected through the filtered metadata index so that
# counts and metadata are guaranteed to list the same samples in the same order.
is_tbb = metadata["organism"].isin(["Trypanosoma brucei brucei"])
metadata_s = metadata[is_tbb]
counts_s = counts.loc[metadata_s.index]
# drop genes that have zero counts in every remaining sample
counts_s = counts_s.loc[:, (counts_s != 0).any(axis=0)]

# create deseq2 dataset object
dds = DeseqDataSet(
    counts=counts_s,
    metadata=metadata_s,
    design_factors="condition",  # compare samples by the "condition" metadata column
    refit_cooks=True
)

# Run DeSeq2
dds.deseq2()

Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 1.98 seconds.

Fitting dispersion trend curve...
... done in 0.23 seconds.

Fitting MAP dispersions...
... done in 1.96 seconds.

Fitting LFCs...
... done in 0.85 seconds.

Calculating cook's distance...
... done in 0.01 seconds.

Replacing 0 outlier genes.



In [3]:
! mkdir -p "analysis/Trypanosoma/de"

In [6]:
# Run the statistical tests on the fitted dataset and print the per-gene table.
stat_res = DeseqStats(dds)
stat_res.summary()

# Keep the results table for the export and filtering cells below.
res = stat_res.results_df

Running Wald tests...


Log2 fold change & Wald test p-value: condition peak vs ascending
                                  baseMean  log2FoldChange     lfcSE  \
gene                                                                   
Tb04.24M18.150                  197.292416        0.217076  0.190053   
Tb04.3I12.100                   218.408392        0.124410  0.171674   
Tb05.30F7.410                    99.278007       -1.824686  0.655682   
Tb05.5K5.100                     16.771503        0.644534  0.565982   
Tb05.5K5.110                    329.781049       -0.045490  0.139694   
...                                    ...             ...       ...   
Tb927_10_v4.snoRNA.0063:snoRNA    3.790393        0.122767  1.088272   
Tb927_10_v4.snoRNA.0064:snoRNA    0.153151        0.709747  4.425350   
Tb927_10_v4.snoRNA.0073:snoRNA    0.183423       -1.213827  4.425356   
Tb927_10_v4.snoRNA.0078:snoRNA  133.952072        0.242593  0.196860   
tmp.1.100                        48.341400        0.781146  0.795903  

... done in 0.44 seconds.



In [7]:
# Save the complete, unfiltered results table.
# NOTE(review): the file name says "slender_vs_stumpy" while the summary output
# reports the contrast "condition peak vs ascending" - confirm the naming.
res.to_csv(
    path_or_buf=f"analysis/{prefix}/de/slender_vs_stumpy_tbrucei.full.csv"
)

In [8]:
# Drop genes with baseMean < 10 so that expression levels close to zero
# don't skew the results.
res = res.loc[res["baseMean"] >= 10]

# Keep genes with adjusted p-value < 0.05 and |log2FC| > 1,
# i.e. a fold change FC > 2 or FC < 0.5.
sigs = res.loc[(res["padj"] < 0.05) & (res["log2FoldChange"].abs() > 1)]
sigs

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Tb05.30F7.410,99.278007,-1.824686,0.655682,-2.782883,5.387819e-03,1.327922e-02
Tb05.5K5.270,559.894678,2.175802,0.136297,15.963716,2.287011e-57,2.794969e-55
Tb05.5K5.280,501.262274,-1.617947,0.146860,-11.016920,3.167116e-28,1.081477e-26
Tb05.5K5.290,54.930171,-1.074951,0.320333,-3.355727,7.915657e-04,2.361588e-03
Tb05.5K5.420,307.351030,-3.507154,0.201814,-17.378194,1.206955e-67,2.242039e-65
...,...,...,...,...,...,...
Tb927.9.9410,1057.658097,1.239115,0.099359,12.471079,1.073619e-35,5.478995e-34
Tb927.9.9810,1413.470872,-1.451264,0.100885,-14.385271,6.402719e-47,5.773636e-45
Tb927.9.9820,1610.596099,-1.227299,0.095661,-12.829598,1.119436e-37,6.301404e-36
Tb927.9.9940,2501.918060,-1.650387,0.091116,-18.113086,2.512616e-73,5.427251e-71


In [9]:
# Save only the genes that passed the significance and fold-change filters.
sigs.to_csv(
    path_or_buf=f"analysis/{prefix}/de/slender_vs_stumpy_tbrucei.filtered.csv"
)