In [1]:
import re
import urllib
from zipfile import ZipFile

import anndata
import numpy as np
import scipy
import scipy.sparse

from rp2 import fetch_file, notebooks
from rp2.paths import get_data_path, get_scripts_path

# Initialise the project-specific notebook environment under the name "Data_Setup".
# NOTE(review): what initialise_environment configures is defined in rp2.notebooks,
# not in this notebook — confirm there before relying on nb_env.
nb_env = notebooks.initialise_environment("Data_Setup")

Download supplementary data for Hagai *et al.* (2018)

In [2]:
# Folder that receives the Hagai et al. (2018) supplementary spreadsheet.
hagai_path = get_data_path("hagai_2018")

# Supplementary file (MOESM4) hosted by the publisher.
hagai_supplementary_url = "https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-018-0657-2/MediaObjects/41586_2018_657_MOESM4_ESM.xlsx"

# Trailing semicolon suppresses the cell's output.
fetch_file(hagai_supplementary_url, hagai_path);

Download Hagai *et al.* (2018) datasets from ArrayExpress

In [3]:
# Folder holding the ArrayExpress archives; later cells glob it for "*.zip" files.
ae_path = get_data_path("ArrayExpress")
# NOTE(review): the download below is disabled, so E-MTAB-6754.processed.2.zip must
# already be present in ae_path for the following cells to have anything to extract.
# Confirm why it was switched off (e.g. manual download) before re-enabling.
#fetch_file(
#    "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/files/E-MTAB-6754/E-MTAB-6754.processed.2.zip",
#    ae_path
#);

Extract zipped ArrayExpress datasets

In [4]:
# Unpack every archive in the ArrayExpress folder into a sibling folder named
# after the archive (without the ".zip" suffix). Archives whose target folder
# already exists are left untouched.
for zip_path in ae_path.glob("*.zip"):
    extract_path = ae_path / zip_path.stem

    if not extract_path.exists():
        print("Extracting:", zip_path)

        with ZipFile(zip_path, "r") as zip_file:
            zip_file.extractall(extract_path)

Extracting: C:\Users\wolke\Publication\sc-variability-paper-master-1601 2\Data\ArrayExpress\E-MTAB-6754.processed.2.zip


Download txburst scripts for Larsson *et al.* (2019) burst modelling

In [5]:
# Destination folder for the downloaded txburst scripts.
txburst_path = get_scripts_path("txburst")

# Scripts fetched from the sandberg-lab/txburst repository (master branch).
txburst_filenames = ("txburstML.py", "txburstPL.py", "txburstTEST.py")

for filename in txburst_filenames:
    url = f"https://raw.githubusercontent.com/sandberg-lab/txburst/master/{filename}"
    fetch_file(url, txburst_path)

Collate Hagai *et al.* (2018) UMI counts into a single AnnData object per species (mouse, pig, rabbit and rat)

In [7]:
def extract_species_and_replicate(token):
    """Split a ``<species><replicate>`` token into ``(species, replicate)``.

    The last character of ``token`` is returned as the replicate and
    everything before it as the species, e.g. ``"mouse1"`` gives
    ``("mouse", "1")``.
    """
    species = token[:-1]
    replicate = token[-1]
    return species, replicate


def extract_treatment_and_time_point(token):
    """Split a ``<treatment><time point>`` token into ``(treatment, time_point)``.

    The leading run of lowercase letters is the treatment; whatever follows
    (optional digits plus an optional non-digit suffix) is the time point.
    When nothing follows the treatment, the time point is reported as ``"0"``.
    Examples: ``"lps2"`` -> ``("lps", "2")``, ``"unst"`` -> ``("unst", "0")``,
    ``"pic6A"`` -> ``("pic", "6A")``.
    """
    parsed = re.match(r"^([a-z]+)(\d*[^\d]*)$", token)
    treatment, time_point = parsed.groups()
    if not time_point:
        # No suffix after the treatment name.
        time_point = "0"
    return treatment, time_point


def load_umi_count_adata(file_path):
    """Load one count file into an AnnData object annotated from its file name.

    The first two underscore-separated tokens of the file name are parsed as
    ``<species><replicate>`` and ``<treatment><time point>``. The file is read
    as space-delimited text with ``anndata.read_csv`` and transposed, and
    ``X`` is replaced by an integer CSR sparse matrix.

    Returns the AnnData object with ``species``, ``replicate``, ``treatment``,
    ``time_point`` and ``barcode`` columns added to ``obs``; ``barcode`` holds
    the obs index as read from the file.
    """
    sample_token, condition_token = file_path.name.split("_")[:2]
    species, replicate = extract_species_and_replicate(sample_token)
    treatment, time_point = extract_treatment_and_time_point(condition_token)

    # Transpose after reading (presumably the file is genes x cells — confirm).
    counts = anndata.read_csv(file_path, delimiter=" ").T
    counts.X = scipy.sparse.csr_matrix(counts.X, dtype=int)

    # Constant per-file annotations, added in a fixed column order.
    file_annotations = {
        "species": species,
        "replicate": replicate,
        "treatment": treatment,
        "time_point": time_point,
    }
    for column, value in file_annotations.items():
        counts.obs[column] = value

    # Keep the original obs index as a regular column.
    counts.obs["barcode"] = counts.obs.index

    return counts


def collate_umi_counts(csv_file_paths):
    """Load every count file and concatenate them into one AnnData object.

    Each file is loaded with ``load_umi_count_adata``. Its obs index is
    replaced by consecutive integers (as strings) that continue from the
    previous file, so indices are unique across the collated result. The
    objects are joined with an outer join and the ``batch`` column added by
    the concatenation is dropped.
    """
    loaded = []
    next_index = 0

    for csv_path in csv_file_paths:
        print("Loading", csv_path.name)

        sample = load_umi_count_adata(csv_path)
        end_index = next_index + sample.n_obs
        # Running numeric labels keep obs names unique across files.
        sample.obs.index = list(map(str, range(next_index, end_index)))

        loaded.append(sample)
        next_index = end_index

    print("Collating")

    first = loaded[0]
    remaining = loaded[1:]
    adata = first.concatenate(remaining, join="outer", index_unique=None)
    adata.obs.drop(columns=["batch"], inplace=True)
    return adata

In [8]:
# Species prefixes used in the ArrayExpress file names, mapped to an identifier.
# NOTE(review): the values look like Ensembl/BioMart dataset prefixes — confirm
# against the cells that consume species_id_map.
species_id_map = {
    "mouse": "mmusculus",
    "pig": "sscrofa",
    "rabbit": "ocuniculus",
    "rat": "rnorvegicus",
}

species_of_interest = species_id_map.keys()

# Folder the ArrayExpress archive was extracted into.
umi_files_path = ae_path.joinpath("E-MTAB-6754.processed.2")

for species in species_of_interest:
    # Collated output is written next to the extracted folder as
    # "<folder name>.<species>.h5ad" and acts as a cache: existing files are kept.
    umi_file_path = umi_files_path.parent.joinpath(umi_files_path.name + f".{species}.h5ad")

    if not umi_file_path.exists():
        # Path.glob yields files in a filesystem-dependent order, and
        # collate_umi_counts numbers observations in the order files arrive.
        # Sorting makes the obs indices in the saved file reproducible.
        csv_paths = sorted(umi_files_path.glob(f"{species}*.txt.gz"))

        if not csv_paths:
            # Fail with an actionable message instead of an IndexError from
            # inside collate_umi_counts.
            raise FileNotFoundError(
                f"No '{species}*.txt.gz' files found in {umi_files_path}; "
                "has the ArrayExpress archive been downloaded and extracted?"
            )

        adata = collate_umi_counts(csv_paths)
        adata.write_h5ad(umi_file_path)

Loading mouse1_lps2_filtered_by_cell_cluster0.txt.gz
Loading mouse1_lps4_filtered_by_cell_cluster0.txt.gz
Loading mouse1_lps6_filtered_by_cell_cluster0.txt.gz
Loading mouse1_pic2_filtered_by_cell_cluster0.txt.gz
Loading mouse1_pic4_filtered_by_cell_cluster0.txt.gz
Loading mouse1_unst_filtered_by_cell_cluster0.txt.gz
Loading mouse2_lps2_filtered_by_cell_cluster0.txt.gz
Loading mouse2_lps4_filtered_by_cell_cluster0.txt.gz
Loading mouse2_lps6_filtered_by_cell_cluster0.txt.gz
Loading mouse2_pic2_filtered_by_cell_cluster0.txt.gz
Loading mouse2_pic4_filtered_by_cell_cluster0.txt.gz
Loading mouse2_pic6A_filtered_by_cell_cluster0.txt.gz
Loading mouse2_pic6_filtered_by_cell_cluster0.txt.gz
Loading mouse2_unst_filtered_by_cell_cluster0.txt.gz
Loading mouse3_lps2_filtered_by_cell_cluster0.txt.gz
Loading mouse3_lps4_filtered_by_cell_cluster0.txt.gz
Loading mouse3_lps6_filtered_by_cell_cluster0.txt.gz
Loading mouse3_pic2_filtered_by_cell_cluster0.txt.gz
Loading mouse3_pic4_filtered_by_cell_cluster0

... storing 'species' as categorical
... storing 'replicate' as categorical
... storing 'treatment' as categorical
... storing 'time_point' as categorical
... storing 'barcode' as categorical


Loading pig1_lps2_filtered_by_cell_cluster0.txt.gz
Loading pig1_lps4_filtered_by_cell_cluster0.txt.gz
Loading pig1_lps6_filtered_by_cell_cluster0.txt.gz
Loading pig1_unst_filtered_by_cell_cluster0.txt.gz
Loading pig2_lps2_filtered_by_cell_cluster0.txt.gz
Loading pig2_lps4_filtered_by_cell_cluster0.txt.gz
Loading pig2_lps6_filtered_by_cell_cluster0.txt.gz
Loading pig2_unst_filtered_by_cell_cluster0.txt.gz
Loading pig3_lps2_filtered_by_cell_cluster0.txt.gz
Loading pig3_lps4_filtered_by_cell_cluster0.txt.gz
Loading pig3_lps6_filtered_by_cell_cluster0.txt.gz
Loading pig3_unst_filtered_by_cell_cluster0.txt.gz
Collating


... storing 'species' as categorical
... storing 'replicate' as categorical
... storing 'treatment' as categorical
... storing 'time_point' as categorical
... storing 'barcode' as categorical


Loading rabbit1_lps2_filtered_by_cell_cluster0.txt.gz
Loading rabbit1_lps4_filtered_by_cell_cluster0.txt.gz
Loading rabbit1_lps6_filtered_by_cell_cluster0.txt.gz
Loading rabbit1_unst_filtered_by_cell_cluster0.txt.gz
Loading rabbit2_lps2_filtered_by_cell_cluster0.txt.gz
Loading rabbit2_lps4_filtered_by_cell_cluster0.txt.gz
Loading rabbit2_lps6_filtered_by_cell_cluster0.txt.gz
Loading rabbit2_unst_filtered_by_cell_cluster0.txt.gz
Loading rabbit3_lps2_filtered_by_cell_cluster0.txt.gz
Loading rabbit3_lps4_filtered_by_cell_cluster0.txt.gz
Loading rabbit3_lps6_filtered_by_cell_cluster0.txt.gz
Loading rabbit3_unst_filtered_by_cell_cluster0.txt.gz
Collating


... storing 'species' as categorical
... storing 'replicate' as categorical
... storing 'treatment' as categorical
... storing 'time_point' as categorical
... storing 'barcode' as categorical


Loading rat1_lps2_filtered_by_cell_cluster0.txt.gz
Loading rat1_lps4_filtered_by_cell_cluster0.txt.gz
Loading rat1_lps6_filtered_by_cell_cluster0.txt.gz
Loading rat1_pic2_filtered_by_cell_cluster0.txt.gz
Loading rat1_pic4_filtered_by_cell_cluster0.txt.gz
Loading rat1_pic6_filtered_by_cell_cluster0.txt.gz
Loading rat1_unst_filtered_by_cell_cluster0.txt.gz
Loading rat2_lps2_filtered_by_cell_cluster0.txt.gz
Loading rat2_lps4_filtered_by_cell_cluster0.txt.gz
Loading rat2_lps6_filtered_by_cell_cluster0.txt.gz
Loading rat2_pic2_filtered_by_cell_cluster0.txt.gz
Loading rat2_pic4_filtered_by_cell_cluster0.txt.gz
Loading rat2_pic6_filtered_by_cell_cluster0.txt.gz
Loading rat2_unst_filtered_by_cell_cluster0.txt.gz
Loading rat3_lps2_filtered_by_cell_cluster0.txt.gz
Loading rat3_lps4_filtered_by_cell_cluster0.txt.gz
Loading rat3_lps6_filtered_by_cell_cluster0.txt.gz
Loading rat3_pic2_filtered_by_cell_cluster0.txt.gz
Loading rat3_pic4_filtered_by_cell_cluster0.txt.gz
Loading rat3_pic6_filtered_by_c

... storing 'species' as categorical
... storing 'replicate' as categorical
... storing 'treatment' as categorical
... storing 'time_point' as categorical
... storing 'barcode' as categorical


Download a list of mouse genes (Ensembl ID, symbol and description) from BioMart