In [1]:
# import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

In [41]:
# Load the raw STAR gene counts and the sample metadata for this organism.
# The counts table is transposed right after loading so that it can be
# row-filtered with the same per-sample mask as the metadata.
prefix = "Plasmodium"
counts = pd.read_csv(f"analysis/{prefix}/star/ReadsPerGene.csv", index_col=0).T
metadata = pd.read_csv(f"data/{prefix}/metadata.csv", index_col=0)

# Keep only the wildtype samples from the two timepoints being compared (16 and 24).
# The mask is built once from the metadata and applied to both tables.
is_selected = metadata["timepoint"].isin([16, 24]) & metadata["condition"].isin(["wildtype"])
counts_s = counts[is_selected]
metadata_s = metadata[is_selected]

# Build the DESeq2 dataset object; samples are contrasted on the "timepoint" column.
dds = DeseqDataSet(
    counts=counts_s,
    metadata=metadata_s,
    design_factors="timepoint",
    refit_cooks=True,
)


In [42]:
# Run the DESeq2 pipeline on the dataset built above. As the log printed by
# this cell shows, it fits size factors, dispersions, the dispersion trend
# curve, MAP dispersions and LFCs, then reports how many outlier genes were
# replaced.
dds.deseq2()

Fitting size factors...
... done in 0.03 seconds.

Fitting dispersions...
... done in 1.00 seconds.

Fitting dispersion trend curve...
... done in 0.47 seconds.

Fitting MAP dispersions...
... done in 0.96 seconds.

Fitting LFCs...
... done in 0.76 seconds.

Replacing 0 outlier genes.



In [43]:
# Run the Wald tests on the fitted dataset and print the summary table,
# then keep the per-gene results table in `res` for the cells that follow.
stat_res = DeseqStats(dds)
stat_res.summary()
res = stat_res.results_df

Running Wald tests...


Log2 fold change & Wald test p-value: timepoint 24 vs 16
                    baseMean  log2FoldChange     lfcSE      stat    pvalue  \
gene                                                                         
PBANKA_0000101      0.499719        0.458866  4.079205  0.112489  0.910436   
PBANKA_0000201      0.000000             NaN       NaN       NaN       NaN   
PBANKA_0000301     10.017695       -0.514198  1.045984 -0.491593  0.623007   
PBANKA_0000401     21.489374        2.099218  0.840406  2.497861  0.012495   
PBANKA_0000600     24.726040       -1.279861  0.682595 -1.874992  0.060794   
...                      ...             ...       ...       ...       ...   
PBANKA_MIT03300     0.000000             NaN       NaN       NaN       NaN   
PBANKA_MIT03400     0.000000             NaN       NaN       NaN       NaN   
PBANKA_MIT03500  4967.510592        1.068291  0.586247  1.822255  0.068416   
PBANKA_MIT03600    80.486185        0.527752  0.588252  0.897152  0.369638   
PBANKA_

... done in 0.49 seconds.



In [44]:
# Save the complete results table to CSV. This runs before the baseMean and
# significance filters in the following cells are applied to `res`.
res.to_csv(f"analysis/{prefix}/de/16_vs_24h_wildtype.full.csv")

In [45]:
# Keep only genes with baseMean >= 10, so that genes whose expression is
# close to zero don't skew the results. Note that `res` is rebound to the
# filtered table.
res = res.loc[res["baseMean"] >= 10]
res

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PBANKA_0000301,10.017695,-0.514198,1.045984,-0.491593,0.623007,0.655489
PBANKA_0000401,21.489374,2.099218,0.840406,2.497861,0.012495,0.016503
PBANKA_0000600,24.726040,-1.279861,0.682595,-1.874992,0.060794,0.074576
PBANKA_0000901,20.009201,0.783187,0.733173,1.068215,0.285423,0.317576
PBANKA_0001001,169.147522,-0.278415,0.297155,-0.936937,0.348791,0.383112
...,...,...,...,...,...,...
PBANKA_MIT02700,1717.186751,0.662942,0.304260,2.178863,0.029342,0.037273
PBANKA_MIT02800,129.439561,0.717212,0.523901,1.368983,0.171004,0.197240
PBANKA_MIT03500,4967.510592,1.068291,0.586247,1.822255,0.068416,0.083330
PBANKA_MIT03600,80.486185,0.527752,0.588252,0.897152,0.369638,0.404973


In [46]:
# Significant genes: adjusted p-value below 0.05 AND |log2FoldChange| > 1
# (i.e. fold change FC > 2 or FC < 0.5).
sigs = res.loc[(res["padj"] < 0.05) & (res["log2FoldChange"].abs() > 1)]
sigs

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PBANKA_0000401,21.489374,2.099218,0.840406,2.497861,1.249453e-02,1.650330e-02
PBANKA_0007701,61.267753,2.148682,0.514697,4.174658,2.984343e-05,4.841561e-05
PBANKA_0008101,195.532688,1.897002,0.295459,6.420518,1.358112e-10,2.943911e-10
PBANKA_0100021,1402.834566,3.470364,0.160587,21.610466,1.431964e-103,3.162612e-102
PBANKA_0100041,38.141408,1.635674,0.726840,2.250391,2.442412e-02,3.131204e-02
...,...,...,...,...,...,...
PBANKA_1466121,226.572142,-1.131772,0.267480,-4.231243,2.324037e-05,3.802701e-05
PBANKA_API00095,71.627149,2.234156,0.736556,3.033246,2.419384e-03,3.399823e-03
PBANKA_MIT00800,106.386590,1.004885,0.469840,2.138781,3.245344e-02,4.104335e-02
PBANKA_MIT01000,206.692383,1.021458,0.411696,2.481097,1.309789e-02,1.727806e-02


In [14]:
# Save the significant-gene subset (`sigs`) to CSV next to the full results table.
sigs.to_csv(f"analysis/{prefix}/de/16_vs_24h_wildtype.filtered.csv")

In [47]:
# Display the IDs of the significant genes as one space-separated string.
" ".join(sigs.index.to_list())

'PBANKA_0000401 PBANKA_0007701 PBANKA_0008101 PBANKA_0100021 PBANKA_0100041 PBANKA_0100061 PBANKA_0100100 PBANKA_0100200 PBANKA_0100400 PBANKA_0100600 PBANKA_0100700 PBANKA_0100800 PBANKA_0100900 PBANKA_0101000 PBANKA_0101300 PBANKA_0102500 PBANKA_0102600 PBANKA_0102700 PBANKA_0102800 PBANKA_0102900 PBANKA_0103400 PBANKA_0103700 PBANKA_0103800 PBANKA_0104400 PBANKA_0104800 PBANKA_0104900 PBANKA_0105500 PBANKA_0105600 PBANKA_0106000 PBANKA_0106200 PBANKA_0106300 PBANKA_0106500 PBANKA_0106600 PBANKA_0106700 PBANKA_0107100 PBANKA_0107200 PBANKA_0107300 PBANKA_0107400 PBANKA_0107900 PBANKA_0108100 PBANKA_0108200 PBANKA_0108300 PBANKA_0108400 PBANKA_0108500 PBANKA_0108700 PBANKA_0108800 PBANKA_0108900 PBANKA_0109000 PBANKA_0109100 PBANKA_0109300 PBANKA_0109500 PBANKA_0109900 PBANKA_0110300 PBANKA_0110500 PBANKA_0110600 PBANKA_0110700 PBANKA_0110900 PBANKA_0111000 PBANKA_0111100 PBANKA_0111200 PBANKA_0111300 PBANKA_0111600 PBANKA_0111900 PBANKA_0112100 PBANKA_0112200 PBANKA_0112300 PBANKA_01

In [36]:
# Load the raw STAR gene counts and the sample metadata for Trypanosoma.
# The counts table is transposed right after loading so that it can be
# row-filtered with the same per-sample mask as the metadata.
prefix = "Trypanosoma"
# DataFrame.fillna returns a new frame rather than modifying in place, so it
# is chained into the assignment; a bare `counts.fillna(0)` statement would
# discard the result and leave the missing values in `counts`.
counts = pd.read_csv(f"analysis/{prefix}/star/ReadsPerGene.csv", index_col=0).T.fillna(0)
metadata = pd.read_csv(f"data/{prefix}/metadata.csv", index_col=0)

# Restrict to the "Trypanosoma brucei brucei" samples. The mask is built once
# from the metadata and applied to both tables.
is_tbrucei = metadata["organism"].isin(["Trypanosoma brucei brucei"])
counts_s = counts[is_tbrucei]
# Drop genes whose count is zero in every selected sample.
counts_s = counts_s.loc[:, (counts_s != 0).any(axis=0)]
metadata_s = metadata[is_tbrucei]

# Build the DESeq2 dataset object; samples are contrasted on the "condition" column.
dds = DeseqDataSet(
    counts=counts_s,
    metadata=metadata_s,
    design_factors="condition",
    refit_cooks=True,
)


In [37]:
# Fit the model and run the Wald tests for the Trypanosoma dataset before
# saving. Without this step `res` would still be whatever an earlier cell left
# in the kernel (on a top-to-bottom run: the already-filtered results of the
# previous analysis), and that stale table would be written to this path.
dds.deseq2()
stat_res = DeseqStats(dds)
stat_res.summary()
res = stat_res.results_df

# Save the complete results table to CSV before any filtering is applied.
res.to_csv(f"analysis/{prefix}/de/slender_vs_stumpy_tbrucei.full.csv")

In [38]:
# Keep only genes with baseMean >= 10, so that genes whose expression is
# close to zero don't skew the results. Note that `res` is rebound to the
# filtered table.
res = res.loc[res["baseMean"] >= 10]

# Significant genes: adjusted p-value below 0.05 AND |log2FoldChange| > 1
# (i.e. fold change FC > 2 or FC < 0.5).
sigs = res.loc[(res["padj"] < 0.05) & (res["log2FoldChange"].abs() > 1)]
sigs

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Tb05.30F7.410,99.278007,-1.824686,0.655682,-2.782883,5.387819e-03,1.306693e-02
Tb05.5K5.270,559.894678,2.175802,0.136328,15.960037,2.425868e-57,2.916149e-55
Tb05.5K5.280,501.262274,-1.617947,0.146893,-11.014468,3.254533e-28,1.093140e-26
Tb05.5K5.290,54.930171,-1.074951,0.320382,-3.355215,7.930336e-04,2.327387e-03
Tb05.5K5.420,307.351030,-3.507154,0.201850,-17.375023,1.275546e-67,2.330678e-65
...,...,...,...,...,...,...
Tb927.9.9410,1057.658097,1.239115,0.099392,12.466945,1.130782e-35,5.676277e-34
Tb927.9.9810,1413.470872,-1.451264,0.100918,-14.380611,6.848708e-47,6.074737e-45
Tb927.9.9820,1610.596099,-1.227299,0.095694,-12.825240,1.184189e-37,6.556816e-36
Tb927.9.9940,2501.918060,-1.650387,0.091148,-18.106590,2.827291e-73,6.007006e-71


In [39]:
# Save the significant-gene subset (`sigs`) to CSV next to the full results table.
sigs.to_csv(f"analysis/{prefix}/de/slender_vs_stumpy_tbrucei.filtered.csv")

In [40]:
# Display the IDs of the significant genes as one space-separated string.
" ".join(sigs.index.to_list())

'Tb05.30F7.410 Tb05.5K5.270 Tb05.5K5.280 Tb05.5K5.290 Tb05.5K5.420 Tb05.5K5.500 Tb08.27P2.110 Tb08.27P2.160 Tb08.27P2.260 Tb08.27P2.400 Tb08.27P2.60 Tb08.27P2.90 Tb09.v4.0012 Tb09.v4.0018 Tb09.v4.0039 Tb09.v4.0042 Tb09.v4.0151 Tb09.v4.0152 Tb1.NT.27 Tb10.v4.0101 Tb10.v4.0117 Tb10.v4.0141 Tb10.v4.0163 Tb10.v4.0205 Tb11.01.6241 Tb11.1390 Tb11.1420 Tb11.15.0008 Tb11.1690 Tb11.NT.81 Tb11.NT.84 Tb11.v5.0142 Tb11.v5.0228 Tb11.v5.0242 Tb11.v5.0330 Tb11.v5.0346 Tb11.v5.0348 Tb11.v5.0397 Tb11.v5.0404 Tb11.v5.0417 Tb11.v5.0470 Tb11.v5.0551 Tb11.v5.0563 Tb11.v5.0580 Tb11.v5.0638 Tb11.v5.0664 Tb11.v5.0676 Tb11.v5.0677 Tb11.v5.0688 Tb11.v5.0695 Tb11.v5.0703 Tb11.v5.0726 Tb11.v5.0748 Tb11.v5.0752 Tb11.v5.0785 Tb11.v5.0788 Tb11.v5.0789 Tb11.v5.0879 Tb11.v5.0887 Tb11.v5.0896 Tb11.v5.0926 Tb11.v5.0964 Tb11.v5.1015 Tb11.v5.1026 Tb11.v5.1027 Tb11.v5.1063 Tb2.NT.50 Tb9.NT.34 Tb927.1.1620 Tb927.1.20 Tb927.1.2060 Tb927.1.2070 Tb927.1.2160 Tb927.1.2390 Tb927.1.2470 Tb927.1.2490 Tb927.1.2510 Tb927.1.2670 Tb92