<a href="https://colab.research.google.com/github/sbooeshaghi/azucar/blob/main/analysis/293T/obs2/imports.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --quiet -U upsetplot scikit-learn git+http://github.com/dirguis/ipfn@master

  Building wheel for ipfn (setup.py) ... [?25l[?25hdone
  Building wheel for upsetplot (setup.py) ... [?25l[?25hdone


In [2]:
!git clone https://github.com/sbooeshaghi/azucar.git

Cloning into 'azucar'...
remote: Enumerating objects: 1536, done.[K
remote: Counting objects: 100% (294/294), done.[K
remote: Compressing objects: 100% (285/285), done.[K
remote: Total 1536 (delta 166), reused 43 (delta 9), pack-reused 1242[K
Receiving objects: 100% (1536/1536), 1.70 GiB | 19.38 MiB/s, done.
Resolving deltas: 100% (642/642), done.
Checking out files: 100% (292/292), done.


In [15]:
import os
from scipy.io import mmread
import pandas as pd
import numpy as np
from collections import defaultdict

In [2]:
sample = "293T"
observation = "obs2"

base_data = f"azucar/analysis/{sample}/{observation}/out"
base_mark = f"azucar/analysis/{sample}/{observation}/assign"

matrix_fn  = os.path.join(base_data, "matrix.mtx")
genes_fn   = os.path.join(base_data, "genes.txt")
barcodes_fn   = os.path.join(base_data, "barcodes.txt")

!gunzip $base_data/*.gz

gzip: azucar/analysis/293T/obs2/out/*.gz: No such file or directory


In [3]:
!rm *.mtx *.txt *.ec

In [16]:
from azucar.analysis.scripts.mx_sanitize import mx_sanitize
from azucar.analysis.scripts.mx_filter import mx_filter, knee, gmm
from azucar.analysis.scripts.mx_norm import mx_norm
from azucar.analysis.scripts.ec_index import ec_index
from azucar.analysis.scripts.mx_select import mx_select
from azucar.analysis.scripts.mx_extract import mx_extract
from azucar.analysis.scripts.mx_assign import mx_assign
from azucar.analysis.scripts.mx_diff import mx_diff

from azucar.analysis.scripts.utils import read_str_list, read_markers_str

In [5]:
markers_fn = os.path.join(base_mark, "markers.txt")

In [11]:
# drop barcodes and genes that sum to zero, update barcodes and genes file
mx_sanitize(matrix_fn, barcodes_fn, genes_fn, 
            "./san.matrix.mtx", 
            "./san.barcodes.txt", 
            "./san.genes.txt")

# knee plot gmm filter
mx_filter("./san.matrix.mtx",
          "./san.barcodes.txt",
          "./san.fil.matrix.mtx", 
          "./san.fil.barcodes.txt",
          comps=[2,2], select_axis=2)

dbco = mmread("./san.fil.matrix.mtx").toarray()[:,-1]

# normalize matrix (log1p -> ipf)
mx_norm("./san.fil.matrix.mtx", 
        "./san.log1p.matrix.mtx", how="log1p")

mx_norm("./san.log1p.matrix.mtx", 
        "./san.norm.matrix.mtx", how="ipf", target_sum=1_000_000)

dbco_norm = mmread("./san.norm.matrix.mtx").toarray()[:,-1]

# index the markers -> markers.ec marker_genes.txt groups.txt
ec_index(markers_fn, 
        "./markers.ec", 
        "./groups.txt", 
        "./marker_genes.txt")
# get the gene ids -> select.txt (selects in order of markers.ec)
mx_select(markers_fn, 
          "./san.genes.txt", 
          "./select.txt")
# extract elements from matrix that are of interest, rows / columns (with associated metadata)
mx_extract("san.norm.matrix.mtx", 
          "san.genes.txt", 
          "select.txt", 
          "san.norm.extr.matrix.mtx", 
          "san.extr.genes.txt", axis=1)

# perform assignments with markers and matrices
mx_assign("san.norm.extr.matrix.mtx", 
          "san.fil.barcodes.txt", 
          "san.extr.genes.txt", 
          "markers.ec", 
          "groups.txt",
          "assignments.txt")

Filtered to 1,756 cells with at least 99 UMIs.
Initialization 0
  Iteration 10	 time lapse 0.01670s	 ll change 0.00359
Initialization converged: True	 time lapse 0.02330s	 ll -20.31437


In [12]:
df = pd.read_csv("assignments.txt", sep="\t", index_col=0)
print("shape: ", df.shape)

# original counts (this is the same as make df above)
raw_mtx = mmread("./san.fil.matrix.mtx").toarray()
raw_genes = []
read_str_list("./san.genes.txt", raw_genes)
for idx, v in enumerate(raw_mtx.T):
    df[f"{raw_genes[idx]}"] = v # np.exp(v) - 1

df["dbco"] = dbco
df["dbco_ipf"] = dbco_norm

# for the np log
df.loc[df['ent'] == 0.0, 'ent'] = np.finfo(float).tiny

## find entropy cutoff
adj_ent = -np.log(df["ent"].values)

u, xx, v = knee(np.log1p(adj_ent.reshape(-1,1)), sum_axis=1)
(cutoff, ncells) = gmm(xx, v, comps=[3])
cutoff = np.exp(cutoff) - 1 # adjust (since we log1p it)
ent_cutoff = np.exp(-cutoff)

print(ncells, ent_cutoff)

shape:  (1756, 10)
657 1.1304355717204084e-05


In [13]:
df.query(f"ent < {ent_cutoff}").groupby("label").agg({
      "mtag1_ipf": "mean", 
      "mtag2_ipf": "mean",
      "mtag3_ipf": "mean",
      "mtag4_ipf": "mean",
      "dbco_ipf": "mean",
      "label": "count"
      }).astype(float)

Unnamed: 0_level_0,mtag1_ipf,mtag2_ipf,mtag3_ipf,mtag4_ipf,dbco_ipf,label
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
control,109.467885,72.054797,146.579868,161.109928,80.263848,429.0
no_sugar,184.628872,56.527984,137.764228,68.030894,122.524407,5.0
tmg,10.391879,3.991904,546.637494,6.438565,2.014368,224.0


In [24]:
# fix mx_diff to save file
mdf = mx_diff("san.norm.extr.matrix.mtx", 
              "san.barcodes.txt", 
              "san.extr.genes.txt", 
              df["label"].values)

def convert_mtx(*x):
  # x is a pandas series with multiple values
  # but because our matrix has unique group, tag pairs
  # there is only one element in the series
  return x[0].values[0]

es = mdf.groupby(["group_id", "name"])["es"].apply(convert_mtx).unstack()
pv = mdf.groupby(["group_id", "name"])["p_corr"].apply(convert_mtx).unstack()

xa, ya = np.where(es.values > 0)
check = defaultdict(list)
conditions = es.index.values
tags = es.columns.values
for i, j in zip(xa, ya):
  check[conditions[i]].append(tags[j])

markers = defaultdict(list)
read_markers_str(markers_fn, markers)

# number in true only, number in intersection, number in check only
for ct, gs in markers.items():
  tm = markers[ct]
  cm = check.get(ct, [])

  left  = np.setdiff1d  (tm, cm).shape[0]
  itx   = np.intersect1d(tm, cm).shape[0]
  right = np.setdiff1d  (cm, tm).shape[0]

  print(f"{left, itx, right} \t {ct} \t {tm} ∩ {cm}")

15-Feb-22 08:18:37 - 1 of 3 assignments: control
15-Feb-22 08:18:37 - 2 of 3 assignments: no_sugar
15-Feb-22 08:18:37 - 3 of 3 assignments: tmg


(1, 1, 2) 	 no_sugar 	 ['mtag1', 'mtag3'] ∩ ['mtag1', 'mtag2', 'mtag4']
(1, 1, 2) 	 control 	 ['mtag3', 'mtag4'] ∩ ['mtag1', 'mtag2', 'mtag4']
(1, 1, 0) 	 tmg 	 ['mtag2', 'mtag3'] ∩ ['mtag3']


In [19]:
es

name,mtag1,mtag2,mtag3,mtag4
group_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
control,0.80828,0.67507,-0.211728,1.430836
no_sugar,1.141854,1.253363,-0.402409,0.579225
tmg,,,1.351189,


In [20]:
pv

name,mtag1,mtag2,mtag3,mtag4
group_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
control,1.778085e-34,3.0938810000000005e-23,6.0158119999999995e-24,1.070246e-101
no_sugar,3.270117e-68,2.101931e-80,5.271617e-87,3.381311e-18
tmg,,,2.225074e-308,


In [21]:
mdf

Unnamed: 0,group_id,name,p_raw,p_corr,es
0,control,mtag1,0.0,0.0,0.80828
1,control,mtag3,0.0,0.0,-0.211728
2,control,mtag4,0.0,0.0,1.430836
3,control,mtag2,0.0,0.0,0.67507
4,no_sugar,mtag1,0.0,0.0,1.141854
5,no_sugar,mtag3,0.0,0.0,-0.402409
6,no_sugar,mtag4,0.0,0.0,0.579225
7,no_sugar,mtag2,0.0,0.0,1.253363
9,tmg,mtag3,0.0,0.0,1.351189
