In [1]:
import velvet as vt

# general packages
import numpy as np
import pandas as pd
import torch
from scipy.sparse import issparse

# velocity packages
import scanpy as sc
import scvelo as scv
import anndata as ann

# plotting packages
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm, trange
from IPython.display import clear_output

# color palette object
from colors import colorpalette

Global seed set to 0
  new_rank_zero_deprecation(
  return new_rank_zero_deprecation(*args, **kwargs)


In [2]:
#script-specific imports
import os

In [3]:
def aggregate_adatas(
    home, 
    pos_pattern='MAI',
    neg_pattern='tmp',
    method='count'):
    """
    Load and concatenate the per-sample AnnData files written by dynast.

    This function is specifically built for the directory structure of our
    data: ``<home>/<sample>/<method>/adata.h5ad``.

    Parameters
    ----------
    home : str
        Directory whose entries are the per-sample folders. A trailing
        slash is optional.
    pos_pattern : str
        Only entries whose name contains this substring are loaded.
    neg_pattern : str
        Entries whose name contains this substring are skipped.
    method : str
        Name of the sub-folder of each sample that holds ``adata.h5ad``.

    Returns
    -------
    AnnData
        Outer join (missing values filled with 0) of every sample that
        could be read, stacked along the observation axis.

    Raises
    ------
    ValueError
        If no entry of ``home`` matches the patterns, or if none of the
        matching samples contains a readable ``adata.h5ad``.
    """
    # Sorted so that the row order of the result does not depend on the
    # arbitrary order in which os.listdir returns entries.
    samples = sorted(
        entry for entry in os.listdir(home)
        if pos_pattern in entry and neg_pattern not in entry
    )
    if not samples:
        raise ValueError(
            f"No sample matching {pos_pattern!r} (and not {neg_pattern!r}) "
            f"found in {home!r}."
        )

    adata_list = []
    for sample in tqdm(samples):
        adata_path = os.path.join(home, sample, method, 'adata.h5ad')
        try:
            adata = sc.read_h5ad(adata_path)
        except FileNotFoundError:
            print(f"{sample} not found.")
            continue

        # Prefix cell names with the sample so they stay distinguishable
        # after concatenation.
        adata.obs.index = ['_'.join((sample, a)) for a in adata.obs.index]
        adata.obs['sample'] = sample

        # Keep the original var index as an 'ID' column and index by gene name.
        adata.var['ID'] = adata.var.index
        adata.var = adata.var.set_index('gene_name')
        try:
            # Drop genes without a name. Copy so that the edits below act on
            # real data rather than on a view of the unfiltered object.
            adata = adata[:, [name != '' for name in adata.var_names]].copy()
        except KeyError:
            # NOTE(review): kept from the original as best-effort; it is not
            # clear from this file which input triggers the KeyError.
            pass
        adata.var.index = adata.var.index.astype("string")
        adata.var_names_make_unique()
        adata.obs_names_make_unique()
        adata.strings_to_categoricals()
        adata_list.append(adata)
        clear_output(wait=True)

    if not adata_list:
        raise ValueError(
            f"None of the {len(samples)} matching samples in {home!r} "
            f"contains '{method}/adata.h5ad'."
        )
    total_adata = ann.concat(adata_list, join='outer', fill_value=0, axis=0)
    return total_adata

def load_data(path, timepoints, method='estimate', cutoff=1000):
    """
    Aggregate across the different conditions (timepoints) for a replicate.

    Parameters
    ----------
    path : str
        Directory containing one sub-folder per timepoint. A trailing slash
        is optional.
    timepoints : iterable of str
        Names of the timepoint sub-folders to load; each is also written to
        ``obs['timepoint']`` of the cells loaded from it.
    method : str
        Forwarded to ``aggregate_adatas``.
    cutoff : number
        Cells whose row sum in ``layers['total']`` is not strictly greater
        than this are dropped.

    Returns
    -------
    AnnData
        Concatenation of all timepoints, filtered by ``cutoff``.

    Raises
    ------
    ValueError
        If ``timepoints`` is empty.
    """
    timepoints = list(timepoints)
    if not timepoints:
        raise ValueError("load_data needs at least one timepoint.")

    adatas = []
    for tp in timepoints:
        # The trailing '' makes os.path.join end the path with a separator,
        # which aggregate_adatas has historically expected.
        tp_dir = os.path.join(path, tp, '')
        print(tp_dir)
        adata = aggregate_adatas(tp_dir, method=method)
        adata.obs['timepoint'] = tp
        adatas.append(adata)
    adata = ann.concat(adatas)

    # .sum(1) on a sparse layer yields an (n, 1) matrix; flatten it so the
    # boolean mask is an explicit 1-D array of length n_obs.
    total_counts = np.asarray(adata.layers['total'].sum(1)).ravel()
    # Copy so the caller gets a standalone object instead of a view that
    # keeps the unfiltered data alive.
    return adata[total_counts > cutoff].copy()

def save_file(adata, name, X='total'):
    """
    Write ``adata`` to ``name`` after rebuilding its annotations.

    There's an issue with string formatting when writing, which this works
    around: ``obs`` and ``var`` are rebuilt as fresh DataFrames from plain
    Python lists before a new AnnData object is written.

    Only ``obs``, ``var`` and ``layers`` are carried over to the written
    object; its ``X`` is a copy of ``adata.layers[X]``.
    """
    def _from_plain_lists(frame):
        # Same columns and index as `frame`, but every column is a list.
        columns = {label: list(frame[label]) for label in frame.columns}
        return pd.DataFrame(columns, index=list(frame.index.values))

    rebuilt = ann.AnnData(
        X=adata.layers[X].copy(),
        obs=_from_plain_lists(adata.obs),
        var=_from_plain_lists(adata.var),
        layers=adata.layers.copy(),
    )
    rebuilt.write(name)

In [4]:
# Root directory of the sciFATE experiments; one sub-folder per replicate.
home = '/camp/lab/briscoej/working/Rory/transcriptomics/sciFATE_data/experiments/'
# Timepoint folder names used by replicates r1-r3 (tp1) and by r4 (tp2).
tp1 = ['D3','D4','D5','D6','D7','D8']
tp2 = ['05h','10h','15h','20h']

# os.path.join avoids the doubled '/' that plain concatenation produced
# (home already ends with a separator); the trailing '' keeps the final '/'.
r1 = load_data(
    path=os.path.join(home, 'E1.1', 'data', ''),
    timepoints=tp1
)

r2 = load_data(
    path=os.path.join(home, 'E2.1', 'data', ''),
    timepoints=tp1
)

r3 = load_data(
    path=os.path.join(home, 'E3.1', 'data', ''),
    timepoints=tp1
)

r4 = load_data(
    path=os.path.join(home, 'EX1', 'data', ''),
    timepoints=tp2
)
 

100%|██████████| 384/384 [01:05<00:00,  5.83it/s]


In [5]:
# Stack the four replicates and record, for every cell, which one it came from.
# The label list is built in the same order as the concatenation.
replicates = {'r1': r1, 'r2': r2, 'r3': r3, 'r4': r4}
adata = ann.concat(list(replicates.values()))
adata.obs['rep'] = [
    label
    for label, replicate in replicates.items()
    for _ in range(replicate.shape[0])
]

In [6]:
# Expose the labeled/unlabeled TC layers under shorter alias names.
for alias, source_layer in (
    ('new_estimated', 'labeled_TC_est'),
    ('new', 'labeled_TC'),
    ('old_estimated', 'unlabeled_TC_est'),
    ('old', 'unlabeled_TC'),
):
    adata.layers[alias] = adata.layers[source_layer].copy()


def _with_layers(source, keep):
    """
    Return a copy of ``source`` that has only the layers named in ``keep``,
    each cast to float32. ``source`` itself is left untouched.
    """
    subset = source.copy()
    # Snapshot the names first: layers are deleted while looping.
    for layer in list(subset.layers):
        if layer in keep:
            subset.layers[layer] = subset.layers[layer].astype('float32')
        else:
            del subset.layers[layer]
    return subset


# Counts, estimated counts, and splicing layers, each as its own object.
adata2 = _with_layers(adata, ['total', 'new', 'old'])
adata3 = _with_layers(adata, ['total', 'new_estimated', 'old_estimated'])
adata4 = _with_layers(adata, ['spliced', 'unspliced'])

In [7]:
# Write each layer subset to its own file; the last field names the layer
# that becomes X in the written object.
for dataset, out_path, x_layer in (
    (adata2, '../data/adata_unprocessed.h5ad', 'total'),
    (adata3, '../data/adata_unprocessed_estimate.h5ad', 'total'),
    (adata4, '../data/adata_unprocessed_splicing.h5ad', 'spliced'),
):
    save_file(dataset, out_path, X=x_layer)

In [None]:
# Scratch cell (never executed) — safe to delete.
