### Download/save/read data and metadata

In [1]:
import web_files as wf

# Remote locations of the per-cell metadata table and the cell x clone matrix.
meta_http = "https://kleintools.hms.harvard.edu/paper_websites/state_fate2020/stateFate_inVitro_metadata.txt.gz"
clone_mat_http = "https://kleintools.hms.harvard.edu/paper_websites/state_fate2020/stateFate_inVitro_clone_matrix.mtx.gz"

# Read both files through the project-local `web_files` helpers.
# `.toarray()` turns the clone matrix into a dense array (presumably from a
# SciPy sparse matrix -- confirm against `web_files`); every later cell
# indexes `mat` as a plain 2-D array.
mat = wf.WebMatrix(clone_mat_http).read().toarray()
# The metadata file is parsed as tab-separated text.
meta_df = wf.WebFrame(meta_http).read(sep='\t')

### Annotate lineage-traced (LT) cells in `meta_df`

In [2]:
import numpy as np

def _annotate_metadata_with_LT(meta_df, mat):
    
    tmp = np.zeros(len(mat))
    LT_idx = np.where(mat.sum(1) > 0)[0]
    tmp[LT_idx] = 1
    meta_df["LT"] = tmp.astype(bool)

In [3]:
# Adds the boolean "LT" column to `meta_df` in place (the function returns
# nothing), then previews the first rows to check the new column.
_annotate_metadata_with_LT(meta_df, mat)
meta_df.head()

Unnamed: 0,Library,Cell barcode,Time point,Starting population,Cell type annotation,Well,SPRING-x,SPRING-y,LT
0,d6_2_2,GCGTGCAA-AGAAGTTA,6.0,Lin-Kit+Sca1-,Undifferentiated,2,411.496,-96.19,True
1,d6_2_2,AAGGGACC-CTCGATGC,6.0,Lin-Kit+Sca1-,Undifferentiated,2,-587.462,-306.925,True
2,d6_2_2,CGTACCGA-AGCGCCTT,6.0,Lin-Kit+Sca1-,Monocyte,2,1429.805,-429.3,True
3,d6_2_2,CTGAAGGG-AGGAGCTT,6.0,Lin-Kit+Sca1-,Neutrophil,2,1150.028,-2030.369,False
4,d6_2_2,CCGTAGCT-AGGCAGTT,6.0,Lin-Kit+Sca1-,Undifferentiated,2,-1169.594,362.01,True


### Annotate clonally barcoded cells with their corresponding lineage (one lineage per cell, max)

In [15]:
def _annotate_meta_df_with_clone_idx(meta_df, mat):
    
    X_clone = mat[mat.sum(1) > 0] # X_clone.shape: (49302, 5864)
    cell_idx = np.where(mat.sum(1) > 0)
    clone_idx = np.where(X_clone > 0)[1]
    tmp = np.full(len(meta_df), np.nan)
    tmp[cell_idx] = clone_idx
    meta_df["clone_idx"] = tmp

In [16]:
# Adds the "clone_idx" column to `meta_df` in place (NaN for cells without a
# clonal barcode), then previews the first rows.
_annotate_meta_df_with_clone_idx(meta_df, mat)
meta_df.head()

Unnamed: 0,Library,Cell barcode,Time point,Starting population,Cell type annotation,Well,SPRING-x,SPRING-y,LT,clone_idx
0,d6_2_2,GCGTGCAA-AGAAGTTA,6.0,Lin-Kit+Sca1-,Undifferentiated,2,411.496,-96.19,True,573.0
1,d6_2_2,AAGGGACC-CTCGATGC,6.0,Lin-Kit+Sca1-,Undifferentiated,2,-587.462,-306.925,True,1440.0
2,d6_2_2,CGTACCGA-AGCGCCTT,6.0,Lin-Kit+Sca1-,Monocyte,2,1429.805,-429.3,True,394.0
3,d6_2_2,CTGAAGGG-AGGAGCTT,6.0,Lin-Kit+Sca1-,Neutrophil,2,1150.028,-2030.369,False,
4,d6_2_2,CCGTAGCT-AGGCAGTT,6.0,Lin-Kit+Sca1-,Undifferentiated,2,-1169.594,362.01,True,1972.0


### Day 2 cells that have LT barcodes

### Grab clone index: `clone_idx`

* Each day 2 cell that has a clonal barcode will belong to a clonal lineage (numbered according to position in `X_clone` and annotated in `meta_df`).
* Regardless of how many d2 cells are observed to have a given lineage, we will count the number of cells observed in each of the 11 possible annotations (including `Undifferentiated`) at d4 and d6. These counts are summed, producing a `clone x fate` outcome matrix. We can then row-normalize so that each row sums to 1, giving the relative fraction of each fate produced (counted at d4 and d6).

In [17]:
import pandas as pd

def _count_downstream(df, key):
    return (
        df.loc[df["Time point"].isin([4, 6])]
        .groupby("Cell type annotation")
        .count()[key]
    )


def _d2_clone_progenitor_counts(meta_df):

    """
    Build a clone x fate table of downstream cell counts for d2 clones.

    Cells are grouped by ``"clone_idx"`` (rows with a missing clone index are
    skipped by the groupby). A clone qualifies when it has a cell at time
    point 2 and at least one other time point; for those clones the cells at
    d4/d6 are counted per ``"Cell type annotation"`` via
    ``_count_downstream``. Clones that do not qualify contribute an empty
    series and therefore end up as an all-zero row.

    Parameters
    ----------
    meta_df : pandas.DataFrame
        Per-cell metadata with the columns ``"clone_idx"``, ``"Time point"``
        and ``"Cell type annotation"``.

    Returns
    -------
    pandas.DataFrame
        One row per clone (index = clone index), one column per annotation
        seen among the counted cells, missing combinations filled with 0.
    """

    # Count on the grouping column itself: it is non-null for every row of a
    # group, so the tally is a true cell count. Picking a column by position
    # would tie the result to column order and would undercount whenever
    # that column has missing values.
    count_key = "clone_idx"

    d2_clone_progenitor_dict = {}

    for clone, clone_df in meta_df.groupby("clone_idx"):
        clone_time = clone_df["Time point"].unique()
        if (2 in clone_time) and (len(clone_time) > 1):
            d2_clone_progenitor_dict[clone] = _count_downstream(clone_df, count_key)
        else:
            # Placeholder so the clone still gets a (zero-filled) row.
            d2_clone_progenitor_dict[clone] = pd.Series(
                np.array([]), name=count_key
            )

    # Dict keys become columns; transpose to get clones as rows.
    return pd.DataFrame.from_dict(d2_clone_progenitor_dict).fillna(0).T

In [18]:
# Build the clone x fate count table (one row per clone index), report its
# shape, and preview the first rows.
counted_clones = _d2_clone_progenitor_counts(meta_df)
print(counted_clones.shape)
counted_clones.head()

(5864, 11)


Unnamed: 0,Baso,Ccr7_DC,Eos,Erythroid,Lymphoid,Mast,Meg,Monocyte,Neutrophil,Undifferentiated,pDC
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,5.0,0.0
3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Now for each d2 cell, you can map the values in `counted_clones` by the clonal lineage they belong to:

In [19]:
# Expose the clone index as an ordinary column so it can serve as the join key.
counted_clones_ = counted_clones.reset_index().rename(columns={"index": "clone_idx"})
# Left join: every cell keeps its row and picks up the fate counts of its clone.
meta_df_ = pd.merge(meta_df, counted_clones_, how="left", on="clone_idx")

# Keep day-2 cells, restricted to the fate-count columns; rows holding nulls
# (non-clonal cells) are dropped.
meta_clonal_d2 = meta_df_.loc[
    meta_df_["Time point"] == 2, counted_clones.columns
].dropna()
meta_clonal_d2

Unnamed: 0,Baso,Ccr7_DC,Eos,Erythroid,Lymphoid,Mast,Meg,Monocyte,Neutrophil,Undifferentiated,pDC
13176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,6.0,0.0
13210,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
13217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
69313,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
69317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,0.0
69322,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,10.0,0.0
69324,41.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
