In [23]:
# Identifier of the study being processed (interpolated into the paths below)
study = "mznps_timecourse"


In [24]:
# Project directory layout: everything study-specific lives under scratchdir,
# shared reference files (e.g. annotations) live under commonsdir.
place = 'nemo'
commonsdir = '/plus/data/@data_scott/common_resources'
scratchdir = f"/plus/scratch/users/scott/projects/{study}"
rawdir, outdir, datadir = (
    f'{scratchdir}/raw_data',
    f'{scratchdir}/project_results',
    f'{scratchdir}/resources',
)


In [25]:
# Load libraries
import os
import sklearn
import pickle
import pandas as pd
import numpy as np
import scipy
from scipy.sparse import csr_matrix
from scipy import io

import scanpy as sc
import anndata as ad
import pybiomart as pbm
import leidenalg as la
import scrublet as scr

import graphtools as gt
from pygsp import graphs, filters
import phate
import magic
import scprep
import sklearn
import meld

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import time
import natsort

# Package settings
# Fix the NumPy global seed so any stochastic step is repeatable
np.random.seed(42)

# scanpy: do not auto-save figures; when saved explicitly they go to outdir
sc.settings.autosave = False
sc.settings.figdir = f'{outdir}/'

# matplotlib: base font size, animation embed limit, and pdf.fonttype 42
font = dict(size=14)
mpl.rc('font', **font)
mpl.rcParams.update({
    'animation.embed_limit': 1000,
    'pdf.fonttype': 42,
})

In [26]:
# Show loaded libraries
# Reports the session's imported packages so the environment used for this run
# is recorded next to the results (presumably with versions; see session_info docs).
import session_info
session_info.show()

In [27]:
# Import sequence analysis tools
import pyranges as pr
import pybedtools
import Bio
import pysam


In [28]:
# Import bulk RNA tools
import PyWGCNA
import conorm
import pingouin as pg

### Load bulk timeseries data

In [29]:
# Load the bulk transcriptome count table (first CSV column becomes the index)
bulk_data = pd.read_csv(
    f'{rawdir}/GSE162415_danrer11_normalized_reads_combined.csv',
    index_col=0,
)
# Keep only the part of each row id before the first '.', i.e. drop the
# version suffix so ids can be matched against the annotation later on
bulk_data = bulk_data.set_axis(bulk_data.index.str.split('.').str[0], axis=0)
bulk_data


Unnamed: 0_level_0,MZsox19bnanog_rep2_6hpf,MZsox19bnanog_rep2_5.5hpf,MZsox19bnanog_rep2_5hpf,MZsox19bnanog_rep2_4.5hpf,MZsox19bnanog_rep2_4hpf,MZsox19bnanog_rep2_3.5hpf,MZsox19bnanog_rep2_3hpf,MZsox19bnanog_rep2_2.5hpf,MZsox19bnanog_rep1_5.5hpf,MZsox19bnanog_rep1_5hpf,...,K2_WT_B8_3hpf,K1_WT_B8_2.5hpf,C8_WT_B7_6hpf,C7_WT_B7_5.5hpf,C6_WT_B7_5hpf,C5_WT_B7_4.5hpf,C4_WT_B7_4hpf,C3_WT_B7_3.5hpf,C2_WT_B7_3hpf,C1_WT_B7_2.5hpf
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSDART00000168631,2,1,4,2,8,8,13,5,8,13,...,6,10,0,0,1,0,1,1,0,1
ENSDART00000160440,7,12,9,7,16,15,8,12,11,28,...,29,9,4,1,1,3,1,1,1,0
ENSDART00000144711,0,0,0,2,3,6,6,4,1,5,...,3,4,1175,692,335,28,6,0,2,4
ENSDART00000177225,0,0,0,0,2,1,1,2,0,0,...,1,1,666,422,143,10,0,0,1,3
ENSDART00000075632,945,911,883,722,769,621,623,518,934,910,...,536,409,1051,989,957,782,577,530,480,403
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSDART00000182213,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSDART00000193333,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
ENSDART00000181556,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
ENSDART00000159467,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [30]:
# Rename columns to the short scheme '<Condition>_rep<N>_<time>hpf'.
#
# The input table mixes two naming layouts:
#   1. '<genotype>_rep<N>_<time>hpf'        e.g. 'MZsox19bnanog_rep2_6hpf'
#   2. '<well>_<genotype>_<plate>_<time>hpf' e.g. 'T3_triple_B12_6hpf'
# Layout 1 only needs the genotype abbreviated; layout 2 drops the well id and
# turns the (genotype, plate) pair into a condition and replicate number.

# Layout 1: long genotype name -> short condition label
_GENOTYPE_TO_CONDITION = {
    'MZsox19bnanog': 'MZsn',
    'MZsox19bspg': 'MZsp',
    'MZnanog': 'MZn',
    'MZspg': 'MZp',
    'MZsox19b': 'MZs',
    'WT': 'WT',
}

# Layout 2: (genotype, plate) -> (short condition label, replicate)
# WT replicates continue the numbering after the layout-1 WT replicates (rep1-rep5).
_PLATE_TO_CONDITION_REPLICATE = {
    ('triple', 'B12'): ('MZnps', 'rep1'),
    ('triple', 'B11'): ('MZnps', 'rep2'),
    ('triple', 'B10'): ('MZnps', 'rep3'),
    ('MZnanogspg', 'B9'): ('MZnp', 'rep1'),
    ('MZnanogspg', 'B8'): ('MZnp', 'rep2'),
    ('MZnanogspg', 'B7'): ('MZnp', 'rep3'),
    ('WT', 'B12'): ('WT', 'rep6'),
    ('WT', 'B11'): ('WT', 'rep7'),
    ('WT', 'B10'): ('WT', 'rep8'),
    ('WT', 'B9'): ('WT', 'rep9'),
    ('WT', 'B8'): ('WT', 'rep10'),
    ('WT', 'B7'): ('WT', 'rep11'),
}


def _short_sample_name(name):
    """Return the short '<Condition>_rep<N>_<time>hpf' name for a raw column name.

    Names that match neither known layout are returned unchanged, which mirrors
    how ``DataFrame.rename`` treats keys missing from a mapping.
    """
    fields = name.split('_')
    if len(fields) == 3 and fields[0] in _GENOTYPE_TO_CONDITION:
        genotype, replicate, timepoint = fields
        return f'{_GENOTYPE_TO_CONDITION[genotype]}_{replicate}_{timepoint}'
    if len(fields) == 4 and (fields[1], fields[2]) in _PLATE_TO_CONDITION_REPLICATE:
        condition, replicate = _PLATE_TO_CONDITION_REPLICATE[(fields[1], fields[2])]
        return f'{condition}_{replicate}_{fields[3]}'
    return name


# Old -> new column name mapping for every column currently in the table
names = {column: _short_sample_name(column) for column in bulk_data.columns}
bulk_data = bulk_data.rename(columns=names)
bulk_data

Unnamed: 0_level_0,MZsn_rep2_6hpf,MZsn_rep2_5.5hpf,MZsn_rep2_5hpf,MZsn_rep2_4.5hpf,MZsn_rep2_4hpf,MZsn_rep2_3.5hpf,MZsn_rep2_3hpf,MZsn_rep2_2.5hpf,MZsn_rep1_5.5hpf,MZsn_rep1_5hpf,...,WT_rep10_3hpf,WT_rep10_2.5hpf,WT_rep11_6hpf,WT_rep11_5.5hpf,WT_rep11_5hpf,WT_rep11_4.5hpf,WT_rep11_4hpf,WT_rep11_3.5hpf,WT_rep11_3hpf,WT_rep11_2.5hpf
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSDART00000168631,2,1,4,2,8,8,13,5,8,13,...,6,10,0,0,1,0,1,1,0,1
ENSDART00000160440,7,12,9,7,16,15,8,12,11,28,...,29,9,4,1,1,3,1,1,1,0
ENSDART00000144711,0,0,0,2,3,6,6,4,1,5,...,3,4,1175,692,335,28,6,0,2,4
ENSDART00000177225,0,0,0,0,2,1,1,2,0,0,...,1,1,666,422,143,10,0,0,1,3
ENSDART00000075632,945,911,883,722,769,621,623,518,934,910,...,536,409,1051,989,957,782,577,530,480,403
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSDART00000182213,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSDART00000193333,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
ENSDART00000181556,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
ENSDART00000159467,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [31]:
# Build the per-sample metadata table: one row per column of bulk_data
bulk_data_obs = pd.DataFrame(index = bulk_data.columns.to_list())
bulk_data_obs['Sample_id'] = bulk_data_obs.index.to_list()

# Sample ids are '<Condition>_<Replicate>_<Timepoint>'; split them into columns
bulk_data_obs = bulk_data_obs.join(
    bulk_data_obs['Sample_id'].str.split(r"_", expand=True),
    how='left',
).set_axis(['Sample_id', 'Condition', 'Replicate', 'Timepoint'], axis=1)

bulk_data_obs = bulk_data_obs.assign(
    # Condition and timepoint together identify a group of replicates
    Sample_type=lambda obs: obs.Condition + '_' + obs.Timepoint,
    # Numeric timepoint, e.g. '5.5hpf' -> 5.5
    hpf=lambda obs: obs['Timepoint'].str.removesuffix('hpf').astype(float),
    # Samples whose id is among the first 138 columns are batch 'A', the rest 'B'
    Batch=lambda obs: np.where(obs.index.isin(obs.index[:138]), 'A', 'B').tolist(),
)
bulk_data_obs

Unnamed: 0,Sample_id,Condition,Replicate,Timepoint,Sample_type,hpf,Batch
MZsn_rep2_6hpf,MZsn_rep2_6hpf,MZsn,rep2,6hpf,MZsn_6hpf,6.0,A
MZsn_rep2_5.5hpf,MZsn_rep2_5.5hpf,MZsn,rep2,5.5hpf,MZsn_5.5hpf,5.5,A
MZsn_rep2_5hpf,MZsn_rep2_5hpf,MZsn,rep2,5hpf,MZsn_5hpf,5.0,A
MZsn_rep2_4.5hpf,MZsn_rep2_4.5hpf,MZsn,rep2,4.5hpf,MZsn_4.5hpf,4.5,A
MZsn_rep2_4hpf,MZsn_rep2_4hpf,MZsn,rep2,4hpf,MZsn_4hpf,4.0,A
...,...,...,...,...,...,...,...
WT_rep11_4.5hpf,WT_rep11_4.5hpf,WT,rep11,4.5hpf,WT_4.5hpf,4.5,B
WT_rep11_4hpf,WT_rep11_4hpf,WT,rep11,4hpf,WT_4hpf,4.0,B
WT_rep11_3.5hpf,WT_rep11_3.5hpf,WT,rep11,3.5hpf,WT_3.5hpf,3.5,B
WT_rep11_3hpf,WT_rep11_3hpf,WT,rep11,3hpf,WT_3hpf,3.0,B


In [32]:
# Load the annotation GTF and convert it to a plain DataFrame
GTF = pr.read_gtf(f'{commonsdir}/annotations/Danio_rerio.GRCz11.104.gtf.gz').as_df()

# Transcript-level records only, indexed by transcript id (the column is kept too)
GTF_transcript = (
    GTF
    .loc[GTF.Feature == 'transcript']
    .set_index('transcript_id', drop=False)
)
GTF_transcript

Unnamed: 0_level_0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_version,...,transcript_version,transcript_name,transcript_source,transcript_biotype,exon_number,exon_id,exon_version,protein_id,protein_version,tag
transcript_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSDART00000171868,1,havana,transcript,27984392,27995611,.,+,.,ENSDARG00000100083,2,...,2,sugt1-204,havana,processed_transcript,,,,,,
ENSDART00000166819,1,ensembl_havana,transcript,27977330,28019225,.,+,.,ENSDARG00000100083,2,...,2,sugt1-203,ensembl_havana,protein_coding,,,,,,
ENSDART00000164984,1,havana,transcript,27977369,28001937,.,+,.,ENSDARG00000100083,2,...,2,sugt1-202,havana,nonsense_mediated_decay,,,,,,
ENSDART00000162589,1,havana,transcript,27977388,27995053,.,+,.,ENSDARG00000100083,2,...,2,sugt1-201,havana,retained_intron,,,,,,
ENSDART00000180692,1,ensembl,transcript,27977296,28020042,.,+,.,ENSDARG00000100083,2,...,1,sugt1-205,ensembl,protein_coding,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSDART00000118615,MT,RefSeq,transcript,6352,6423,.,-,.,ENSDARG00000080128,3,...,3,NC_002333.1-201,RefSeq,Mt_tRNA,,,,,,
ENSDART00000117474,MT,RefSeq,transcript,7975,8046,.,-,.,ENSDARG00000081369,3,...,3,NC_002333.9-201,RefSeq,Mt_tRNA,,,,,,
ENSDART00000093623,MT,RefSeq,transcript,14713,15232,.,-,.,ENSDARG00000063922,3,...,3,mt-nd6-201,RefSeq,protein_coding,,,,,,
ENSDART00000116823,MT,RefSeq,transcript,15232,15301,.,-,.,ENSDARG00000083312,3,...,3,NC_002333.21-201,RefSeq,Mt_tRNA,,,,,,


In [33]:
# Collapse transcript-level counts to gene level by summing per gene_id.
# Each transcript row is annotated with its gene_id via the index; rows with no
# match get a missing gene_id and are left out by groupby's default NaN handling.
bulk_data_gene = (
    bulk_data
    .merge(GTF_transcript['gene_id'], how='left', left_index=True, right_index=True)
    .groupby('gene_id')
    .sum()
)
bulk_data_gene

Unnamed: 0_level_0,MZsn_rep2_6hpf,MZsn_rep2_5.5hpf,MZsn_rep2_5hpf,MZsn_rep2_4.5hpf,MZsn_rep2_4hpf,MZsn_rep2_3.5hpf,MZsn_rep2_3hpf,MZsn_rep2_2.5hpf,MZsn_rep1_5.5hpf,MZsn_rep1_5hpf,...,WT_rep10_3hpf,WT_rep10_2.5hpf,WT_rep11_6hpf,WT_rep11_5.5hpf,WT_rep11_5hpf,WT_rep11_4.5hpf,WT_rep11_4hpf,WT_rep11_3.5hpf,WT_rep11_3hpf,WT_rep11_2.5hpf
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSDARG00000000001,308,339,350,417,444,550,591,588,405,388,...,624,577,189,164,234,387,556,697,743,796
ENSDARG00000000002,7,10,18,13,30,29,35,23,4,3,...,9,16,0,12,16,20,26,32,39,21
ENSDARG00000000018,34,37,24,30,26,45,43,24,42,38,...,3,1,0,0,0,0,4,1,1,3
ENSDARG00000000019,8253,8292,8363,8405,7790,7325,7898,9178,5129,5234,...,7602,8842,6663,7230,9397,10243,9739,9008,8907,9677
ENSDARG00000000068,244,257,334,305,356,357,409,417,147,123,...,346,363,503,463,375,356,340,320,347,394
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSDARG00000117202,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSDARG00000117203,5,3,0,0,0,1,0,0,4,3,...,0,0,9,12,4,0,0,0,0,0
ENSDARG00000117204,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSDARG00000117206,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Create var df: one row of gene annotation per row of bulk_data_gene
# Gene-level GTF records, indexed by gene id
GTF_gene = GTF[GTF.Feature == 'gene']
GTF_gene.index = GTF_gene.gene_id

# Left-merge on the index so the row order of bulk_data_gene is kept
bulk_data_var = pd.DataFrame(index = bulk_data_gene.index.to_list())
bulk_data_var = bulk_data_var.merge(GTF_gene[['Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand',
       'Frame', 'gene_id', 'gene_version', 'gene_name', 'gene_source',
       'gene_biotype']], how='left', left_index=True, right_index=True)

# Genes without a name fall back to their gene id. Assign the whole column
# rather than using chained indexing (series[mask] = ...), which raises
# SettingWithCopyWarning and does not update the frame under Copy-on-Write.
bulk_data_var['gene_name'] = bulk_data_var['gene_name'].fillna(bulk_data_var['gene_id'])

# Label both the annotation table and the count matrix by gene name.
# NOTE: this re-indexes bulk_data_gene in place, so re-running this cell on its
# own (without first recomputing bulk_data_gene) will no longer match gene ids.
bulk_data_var.index = bulk_data_var.gene_name.to_list()
bulk_data_gene.index = bulk_data_var.gene_name.to_list()


In [35]:
# Assemble the experiment as an AnnData object: samples are observations,
# genes are variables, so the gene x sample count table is transposed
sdata = ad.AnnData(
    X=bulk_data_gene.T,
    obs=bulk_data_obs,
    var=bulk_data_var,
)
# Several genes can share a name; make the variable names unique
sdata.var_names_make_unique()
sdata

AnnData object with n_obs × n_vars = 232 × 31901
    obs: 'Sample_id', 'Condition', 'Replicate', 'Timepoint', 'Sample_type', 'hpf', 'Batch'
    var: 'Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'gene_id', 'gene_version', 'gene_name', 'gene_source', 'gene_biotype'

### Normalise and filter data

In [36]:
# Store raw counts
# Keep an independent copy: without it the layer and X refer to the same
# array, and any later in-place operation on X would also alter the raw counts.
sdata.layers['X_preNorm'] = sdata.X.copy()

In [37]:
# Calculate library size and normalisation factors before filtering
# Both are derived from the stored raw counts (not from sdata.X), so this cell
# gives the same answer if it is re-run after X has been normalised.
sdata.obs['Library_size'] = sdata.layers['X_preNorm'].sum(axis=1)
# The layer is samples x genes; it is transposed before being handed to conorm
sdata.obs['Normalisation_factor'] = conorm.tmm_norm_factors(sdata.layers['X_preNorm'].transpose())
sdata.obs

Unnamed: 0,Sample_id,Condition,Replicate,Timepoint,Sample_type,hpf,Batch,Library_size,Normalisation_factor
MZsn_rep2_6hpf,MZsn_rep2_6hpf,MZsn,rep2,6hpf,MZsn_6hpf,6.0,A,19500001,0.919670
MZsn_rep2_5.5hpf,MZsn_rep2_5.5hpf,MZsn,rep2,5.5hpf,MZsn_5.5hpf,5.5,A,17245775,0.969629
MZsn_rep2_5hpf,MZsn_rep2_5hpf,MZsn,rep2,5hpf,MZsn_5hpf,5.0,A,16084675,1.005728
MZsn_rep2_4.5hpf,MZsn_rep2_4.5hpf,MZsn,rep2,4.5hpf,MZsn_4.5hpf,4.5,A,14784568,1.067657
MZsn_rep2_4hpf,MZsn_rep2_4hpf,MZsn,rep2,4hpf,MZsn_4hpf,4.0,A,13952695,1.113611
...,...,...,...,...,...,...,...,...,...
WT_rep11_4.5hpf,WT_rep11_4.5hpf,WT,rep11,4.5hpf,WT_4.5hpf,4.5,B,17093157,0.936199
WT_rep11_4hpf,WT_rep11_4hpf,WT,rep11,4hpf,WT_4hpf,4.0,B,15172536,0.984850
WT_rep11_3.5hpf,WT_rep11_3.5hpf,WT,rep11,3.5hpf,WT_3.5hpf,3.5,B,14315022,1.004924
WT_rep11_3hpf,WT_rep11_3hpf,WT,rep11,3hpf,WT_3hpf,3.0,B,14492287,0.993477


In [38]:
# Normalised and transformed expression values: square root of CPM
# Always start from the stored raw counts so that re-running this cell does not
# normalise already-normalised values. The matrix is transposed for conorm and
# transposed back to samples x genes afterwards.
sdata.X = np.sqrt(conorm.cpm(sdata.layers['X_preNorm'].transpose(), sdata.obs['Normalisation_factor']).transpose())


In [39]:
# # (Currently disabled) Filter to genes detected (>=1 CPM) in at least 2 replicates of a given sample type or in at least 5 samples overall
# min_counts = 1
# for i in sdata.obs.Sample_type.unique():
#     sdata.var[f'Detection_{i}'] = list((sdata[sdata.obs.Sample_type == i].X**2 >= min_counts).astype(int).sum(axis=0))

# sdata.var['Keep_filter'] = (sdata.var.loc[:,sdata.var.columns.str.contains('Detection')] >= 2).sum(axis=1) | (sdata.var.loc[:,sdata.var.columns.str.contains('Detection')].sum(axis=1) >= 5)
# sdata = sdata[:,sdata.var['Keep_filter']]
# sdata

In [40]:
# Store normalised counts
# Keep an independent copy so later in-place changes to X (e.g. scaling) do not
# silently change the stored normalised values.
sdata.layers['X_normalised'] = sdata.X.copy()

In [41]:
# Save bulk timecourse anndata
# The output folder is not created anywhere else in this notebook, so make sure
# it exists before writing.
os.makedirs(f'{scratchdir}/preprocessed_data', exist_ok=True)
sdata.write(f'{scratchdir}/preprocessed_data/Anndata_{study}_cells_preprocessed.h5ad')
