In [1]:
import velvet as vt

# general packages
import numpy as np
import pandas as pd
import torch
from scipy.sparse import issparse

# velocity packages
import scanpy as sc
import scvelo as scv
import anndata as ann

# plotting packages
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm, trange
from IPython.display import clear_output

# color palette object
from colors import colorpalette

Global seed set to 0
  new_rank_zero_deprecation(
  return new_rank_zero_deprecation(*args, **kwargs)


In [2]:
# script specific imports
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os

### 1. sci-FATE comparison pilot

In [3]:
# Pool identifier and sample-sheet file name for this experiment.
EXPERIMENT_ID = 'SCIFATE_PILOT_4'
EXCEL = 'PM20196.xlsx'

# Base directory; the sample sheet is read from its `metadata` sub-folder.
home = '/camp/lab/briscoej/working/Rory/transcriptomics/sciFATE_data/pilots/FATE_PILOT3'

# Read the sample sheet (column names come from the second spreadsheet row)
# and keep only the rows belonging to this experiment's pool.
xl = pd.read_excel(f'{home}/metadata/{EXCEL}', header=1)
xl = xl.loc[xl['Submitted Pool ID'] == EXPERIMENT_ID]

# First two columns of the sheet, ordered by sample name.
pcr = (
    xl[xl.columns[:2]]
    .reset_index(drop=True)
    .sort_values('Sample Name')
)

# One protocol label per row: 96 consecutive entries for each protocol, in this order.
treatments = [
    protocol
    for protocol in ['old', 'qia', 'zym', 'pub']
    for _ in range(96)
]

In [4]:
%%time
# Cells whose summed 'total'-layer counts do not exceed this value are dropped.
low_cutoff = 1000
ccs = ['NT', 'IO', 'DI', 'FT']
protocols = ['old', 'qia', 'zym', 'pub']
adatas = []
missing_paths = []  # count files that were listed for a sample but could not be opened

# Materialise the (sample, protocol label) pairs once so tqdm knows the total
# and the same pairs can be reused for every condition.
samples_and_treatments = list(zip(pcr['Sample limsid'].values, treatments))

for cc in ccs:
    # List each protocol folder once per condition instead of once per sample:
    # the listing does not depend on the sample being processed.
    folder_contents = {
        f'{cc}_{protocol}': set(os.listdir(f'{home}/data_130/{cc}_{protocol}'))
        for protocol in protocols
    }
    for sample, treatment in tqdm(samples_and_treatments, desc=cc):
        for folder, contents in folder_contents.items():
            if sample not in contents:
                continue
            path = f'{home}/data_130/{folder}/{sample}/count/adata.h5ad'
            try:
                adata = sc.read_h5ad(path)
            except FileNotFoundError:
                # Best effort: a sample directory without a count file is skipped,
                # but remembered so the omission is visible afterwards.
                missing_paths.append(path)
                continue
            adata.obs['protocol'] = treatment
            adata.obs['treatment'] = cc
            # Copy the filtered subset so the list holds standalone objects
            # rather than views that keep the unfiltered data alive.
            adatas.append(adata[adata.layers['total'].sum(1) > low_cutoff].copy())
    clear_output(wait=True)

if missing_paths:
    print(f'{len(missing_paths)} sample folder(s) had no count/adata.h5ad and were skipped.')

CPU times: user 54.2 s, sys: 7.3 s, total: 1min 1s
Wall time: 2min 12s


In [5]:
# Stack all loaded objects along axis 0 with an outer join, filling entries
# that a given object lacks with 0.
adata = ann.concat(adatas, axis=0, join='outer', fill_value=0)

In [6]:
# Save the concatenated object to disk as an .h5ad file.
adata.write_h5ad('../data/pilot_protocol_comparison.h5ad')

### 2. Initial sci-FATE protocol pilot

In [7]:
def aggregate_adatas(
    home, 
    pos_pattern='MAI',
    neg_pattern='tmp',
    method='count'):
    """Load the per-sample ``adata.h5ad`` files found under ``home``.

    A sample is any entry of ``home`` whose name contains ``pos_pattern`` and
    does not contain ``neg_pattern``. Its data is read from
    ``<home>/<sample>/<method>/adata.h5ad``.

    Each loaded object is adjusted before being collected:

    * observation names are prefixed with ``"<sample>_"`` and the sample name
      is stored in ``obs['sample']``;
    * the original variable index is saved in ``var['ID']`` and ``var`` is
      re-indexed by its ``gene_name`` column;
    * variables whose name is the empty string are dropped;
    * the variable index is cast to the pandas ``"string"`` dtype.

    Parameters
    ----------
    home : str
        Directory containing one sub-directory per sample. A trailing slash
        is accepted but not required.
    pos_pattern : str
        Substring a directory entry must contain to be treated as a sample.
    neg_pattern : str
        Substring that excludes a directory entry.
    method : str
        Name of the sub-directory of each sample that holds ``adata.h5ad``.

    Returns
    -------
    list
        The loaded objects, in sorted sample-name order. Samples whose file
        does not exist are skipped and reported with a printed message.
    """
    # `tqdm` and `clear_output` come from the notebook's top-level imports.
    # Sort the listing: os.listdir returns entries in arbitrary order, which
    # would make the order of the result differ between runs/filesystems.
    samples = sorted(
        entry for entry in os.listdir(home)
        if pos_pattern in entry and neg_pattern not in entry
    )
    adata_list = []
    missing = []
    for sample in tqdm(samples):
        # os.path.join yields the same path whether or not `home` ends in '/'.
        path = os.path.join(home, sample, method, 'adata.h5ad')
        try:
            adata = sc.read_h5ad(path)
        except FileNotFoundError:
            # Report after the loop; printing here would be wiped by the next
            # clear_output(wait=True).
            missing.append(sample)
            continue
        adata.obs.index = ['_'.join((sample, barcode)) for barcode in adata.obs.index]
        adata.obs['sample'] = sample
        adata.var['ID'] = adata.var.index
        adata.var = adata.var.set_index('gene_name')
        try:
            # Copy the subset so the index assignment below acts on a real
            # object instead of a view of the unfiltered data.
            adata = adata[:, [name != '' for name in adata.var_names]].copy()
        except KeyError:
            # NOTE(review): kept from the original as a best-effort guard; it is
            # not clear from this notebook which input triggers it.
            pass
        adata.var.index = adata.var.index.astype("string")
        adata_list.append(adata)
        clear_output(wait=True)
    for sample in missing:
        print(f"{sample} not found.")
    return adata_list

def fix_adata(ad):
    """Return a fresh AnnData built from ``ad``'s ``'total'`` layer.

    ``obs`` and ``var`` are rebuilt column by column from plain Python lists,
    a copy of ``ad.layers['total']`` becomes ``X``, observation and variable
    names are made unique, and string columns are converted to categoricals.
    The input object is not modified.

    Parameters
    ----------
    ad : AnnData
        Object providing ``obs``, ``var`` and a ``'total'`` layer.

    Returns
    -------
    AnnData
        The rebuilt object.
    """
    # NOTE: this installs a global filter; UserWarnings stay silenced after
    # the function returns.
    warnings.simplefilter(action='ignore', category=UserWarning)

    def _rebuild(frame):
        # Recreate `frame` from lists of its column values and index labels.
        columns = {name: list(frame[name]) for name in frame.columns}
        return pd.DataFrame(columns, index=list(frame.index.values))

    obs_frame = _rebuild(ad.obs)
    var_frame = _rebuild(ad.var)

    rebuilt = ann.AnnData(
        X=ad.layers['total'].copy(),
        obs=obs_frame,
        var=var_frame,
    )

    rebuilt.var_names_make_unique()
    rebuilt.obs_names_make_unique()
    rebuilt.strings_to_categoricals()
    return rebuilt

In [8]:
# Rebinds `home` to the SCI_PILOT1 sample directory. The trailing slash matters:
# aggregate_adatas builds file paths by appending the sample name directly to it.
home='/camp/lab/briscoej/working/Rory/transcriptomics/sciFATE_data/pilots/SCI_PILOT1/data/all/'

In [9]:
# NOTE: despite the singular name, `adata` now holds the list returned by
# aggregate_adatas; this rebinds the concatenated object from section 1.
adata = aggregate_adatas(home)

100%|██████████| 384/384 [00:46<00:00,  8.26it/s]


In [10]:
# Rebuild every loaded object with fix_adata; this rebinds `adatas` from section 1.
adatas = list(map(fix_adata, adata))

In [11]:
# Stack the rebuilt objects along axis 0 (outer join, gaps filled with 0) ...
total_adata = ann.concat(adatas, axis=0, join='outer', fill_value=0)
# ... then keep only observations whose summed X exceeds 500.
total_adata = total_adata[total_adata.X.sum(axis=1) > 500]

In [12]:
# Save the filtered, concatenated object to disk as an .h5ad file.
total_adata.write_h5ad('../data/pilot_original_test.h5ad')