<a href="https://colab.research.google.com/github/sbooeshaghi/azucar/blob/main/analysis/293T/merge_split.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# this takes 5 minutes
!git clone https://github.com/sbooeshaghi/azucar.git

Cloning into 'azucar'...
remote: Enumerating objects: 298, done.[K
remote: Counting objects: 100% (298/298), done.[K
remote: Compressing objects: 100% (198/198), done.[K
remote: Total 298 (delta 102), reused 287 (delta 94), pack-reused 0[K
Receiving objects: 100% (298/298), 671.11 MiB | 2.73 MiB/s, done.
Resolving deltas: 100% (102/102), done.
Checking out files: 100% (207/207), done.


In [3]:
!cat azucar/analysis/293T/obs7/out/x* > azucar/analysis/293T/obs7/out/matrix.mtx.gz

In [141]:
import glob
import os
from collections import defaultdict
from scipy.io import mmread, mmwrite
import json
import pandas as pd

In [30]:
base = "azucar/analysis/293T"
fns = glob.glob(os.path.join(base, "obs*"))
pairs = {
    "obs1": "obs2",
    "obs2": "obs1",
    "obs3": "obs4",
    "obs4": "obs3",
    "obs5": "obs6",
    "obs6": "obs5",
    "obs7": "obs8",
    "obs8": "obs7"
}

In [40]:
m = defaultdict()
for fn in fns:
  obs = fn.split('/')[-1]
  md = json.load(open(f"{fn}/metadata.json", "r"))
  md.update({"pair":pairs.get(obs, "None")})
  m[obs] = md
print(json.dumps(m, indent=4, default=str))

{
    "obs8": {
        "cell_origin": "293T",
        "organism": "homo_sapiens",
        "replicate": 3,
        "lane_10x": 1,
        "molecule_type": "tags",
        "pair": "obs7"
    },
    "obs3": {
        "cell_origin": "293T",
        "organism": "homo_sapiens",
        "replicate": 2,
        "lane_10x": 1,
        "molecule_type": "cdna",
        "pair": "obs4"
    },
    "obs4": {
        "cell_origin": "293T",
        "organism": "homo_sapiens",
        "replicate": 2,
        "lane_10x": 1,
        "molecule_type": "tags",
        "pair": "obs3"
    },
    "obs5": {
        "cell_origin": "293T",
        "organism": "homo_sapiens",
        "replicate": 2,
        "lane_10x": 2,
        "molecule_type": "cdna",
        "pair": "obs6"
    },
    "obs1": {
        "cell_origin": "293T",
        "organism": "homo_sapiens",
        "replicate": 1,
        "lane_10x": 1,
        "molecule_type": "cdna",
        "pair": "obs2"
    },
    "obs7": {
        "cell_origin": "293T"

In [41]:
!gunzip azucar/analysis/293T/obs*/out/*.gz
!gunzip azucar/analysis/293T/obs*/assign/*.gz

In [112]:
%%time

for obs, d in m.items():
  if d["molecule_type"] == "cdna":
    out = os.path.join(base, obs, "out/")
    mtx_fn = os.path.join(out, "matrix.mtx")
    bcs_fn = os.path.join(out, "barcodes.txt")
    genes_fn = os.path.join(out, "genes.txt")

    dbco_fn = os.path.join(base, d["pair"], "out/dbco.txt")
    dbco_bcs_fn = os.path.join(base, d["pair"], "out/barcodes.txt")

    assignments_fn = os.path.join(base, d["pair"], "assign/assignments.txt")
    groups_fn = os.path.join(base, d["pair"], "assign/groups.txt")
    
    # load matrix, barcodes, genes
    mtx = mmread(mtx_fn).tocsr()
    bcs = pd.read_csv(bcs_fn, header=None, names=["bcs"])
    genes = pd.read_csv(genes_fn, header=None, names=["genes"])

    # load assignments and map group names to ids
    assn = pd.read_csv(assignments_fn, sep="\t", header=None, names = ["bcs", "group_id", "entropy"], index_col=0)
    groups = pd.read_csv(groups_fn, header=None, names = ["groups"])
    assn["group"] = assn.group_id.map(groups["groups"])
    bcs["group"] = bcs["bcs"].map(assn["group"])
    bcs["entropy"] = bcs["bcs"].map(assn["entropy"])

    # load dbco counts for barcodes 
    dbco = pd.read_csv(dbco_fn, sep="\t", header=None, names = ["dbco"])
    dbco.index = pd.read_csv(dbco_bcs_fn, header = None, names = ["bcs"]).bcs.values

    # map the dbco barcodes to the actual barcodes to get the counts
    bcs["dbco"] = bcs["bcs"].map(dbco["dbco"])

    mask = bcs["group"].isna().values
    bcs["group"][mask] = "unassigned"
    bcs["lane_10x"] = d["lane_10x"]
    bcs["replicate"] = d["replicate"]
    m[obs]["mtx"] = mtx
    m[obs]["bcs"] = bcs
    m[obs]["genes"] = genes

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: user 1min 59s, sys: 2.29 s, total: 2min 2s
Wall time: 2min


In [113]:
m["obs1"].keys()

dict_keys(['cell_origin', 'organism', 'replicate', 'lane_10x', 'molecule_type', 'pair', 'mtx', 'bcs', 'genes'])

In [155]:
m["obs1"]["bcs"].columns

Index(['bcs', 'group', 'entropy', 'dbco', 'lane_10x', 'replicate'], dtype='object')

In [115]:
b = []
mx = []
g = []
for obs, d in m.items():
  if d["molecule_type"] == "cdna":
    b.append(d["bcs"])
    mx.append(d["mtx"])
    g.append(d["genes"])

In [117]:
from scipy.sparse import vstack

In [121]:
mtx = vstack([*mx])
bcs = pd.concat(b)

In [122]:
mtx

<35454x60664 sparse matrix of type '<class 'numpy.longlong'>'
	with 63399173 stored elements in Compressed Sparse Row format>

In [126]:
groups = sorted(bcs.group.unique())

In [137]:
d = {g: {"mtx": None, "bcs": None, "genes": None} for g in groups}
for g in groups:
  mask = bcs.eval(f"group == '{g}'").values
  d[g]["mtx"] = mtx[mask]
  d[g]["bcs"] = bcs[mask]
  d[g]["genes"] = m["obs1"]["genes"]

In [144]:
for k, v in d.items():
  mmwrite(f"{k}.matrix.mtx", v["mtx"])
  v["bcs"].to_csv(f"{k}.barcodes.txt", sep="\t", index=False, header=False)
  v["genes"].to_csv(f"{k}.genes.txt", sep="\t", index=False, header=False)


In [145]:
!mkdir no_sugar control tmg unassigned

In [146]:
!mv no_sugar.* no_sugar
!mv control.* control
!mv tmg.* tmg
!mv unassigned.* unassigned

In [147]:
!tar -cvf no_sugar.tar.gz no_sugar
!tar -cvf control.tar.gz control
!tar -cvf tmg.tar.gz tmg
!tar -cvf unassigned.tar.gz unassigned

no_sugar/
no_sugar/no_sugar.matrix.mtx
no_sugar/no_sugar.barcodes.txt
no_sugar/no_sugar.genes.txt
control/
control/control.mtx
control/control.genes.txt
control/control.barcodes.txt
control/control.matrix.mtx
tmg/
tmg/tmg.barcodes.txt
tmg/tmg.matrix.mtx
tmg/tmg.genes.txt
unassigned/
unassigned/unassigned.genes.txt
unassigned/unassigned.matrix.mtx
unassigned/unassigned.barcodes.txt


In [153]:
!ls -lht

total 780M
-rw-r--r-- 1 root root 205M Nov  3 23:27 control.tar.gz
drwxr-xr-x 2 root root 4.0K Nov  3 23:26 control
-rw-r--r-- 1 root root  86M Nov  3 23:26 unassigned.tar.gz
-rw-r--r-- 1 root root 188M Nov  3 23:26 tmg.tar.gz
-rw-r--r-- 1 root root 303M Nov  3 23:25 no_sugar.tar.gz
drwxr-xr-x 2 root root 4.0K Nov  3 23:25 unassigned
drwxr-xr-x 2 root root 4.0K Nov  3 23:25 tmg
drwxr-xr-x 2 root root 4.0K Nov  3 23:25 no_sugar
drwxr-xr-x 4 root root 4.0K Nov  3 21:38 azucar
drwxr-xr-x 1 root root 4.0K Nov  1 13:35 sample_data
