<a href="https://colab.research.google.com/github/sbooeshaghi/azucar/blob/main/analysis/293T/obs2/imports.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --quiet -U upsetplot scikit-learn git+http://github.com/dirguis/ipfn@master

  Building wheel for ipfn (setup.py) ... [?25l[?25hdone
  Building wheel for upsetplot (setup.py) ... [?25l[?25hdone


In [2]:
!git clone https://github.com/sbooeshaghi/azucar.git

Cloning into 'azucar'...
remote: Enumerating objects: 1536, done.[K
remote: Counting objects: 100% (294/294), done.[K
remote: Compressing objects: 100% (285/285), done.[K
remote: Total 1536 (delta 166), reused 43 (delta 9), pack-reused 1242[K
Receiving objects: 100% (1536/1536), 1.70 GiB | 19.38 MiB/s, done.
Resolving deltas: 100% (642/642), done.
Checking out files: 100% (292/292), done.


In [1]:
import glob
import gzip
import os
import shutil

import numpy as np
import pandas as pd
from scipy.io import mmread

In [2]:
# Which sample / observation folder of the cloned repository to analyze.
sample = "293T"
observation = "obs2"

# Input count data and marker definitions live in sibling folders.
base_data = f"azucar/analysis/{sample}/{observation}/out"
base_mark = f"azucar/analysis/{sample}/{observation}/assign"

matrix_fn = os.path.join(base_data, "matrix.mtx")
genes_fn = os.path.join(base_data, "genes.txt")
barcodes_fn = os.path.join(base_data, "barcodes.txt")

# Decompress any still-gzipped inputs in place (foo.txt.gz -> foo.txt).
# Done in Python instead of `!gunzip` so the cell is a silent no-op when the
# files were already decompressed by an earlier run.
for gz_path in sorted(glob.glob(os.path.join(base_data, "*.gz"))):
    with gzip.open(gz_path, "rb") as src, open(gz_path[:-3], "wb") as dst:
        shutil.copyfileobj(src, dst)
    os.remove(gz_path)  # mirror gunzip: the compressed copy is not kept

gzip: azucar/analysis/293T/obs2/out/*.gz: No such file or directory


In [3]:
!rm *.mtx *.txt *.ec

In [4]:
from azucar.analysis.scripts.mx_sanitize import mx_sanitize
from azucar.analysis.scripts.mx_filter import mx_filter, knee, gmm
from azucar.analysis.scripts.mx_norm import mx_norm
from azucar.analysis.scripts.ec_index import ec_index
from azucar.analysis.scripts.mx_select import mx_select
from azucar.analysis.scripts.mx_extract import mx_extract
from azucar.analysis.scripts.mx_assign import mx_assign

from azucar.analysis.scripts.utils import read_str_list

In [5]:
# Marker definition file for the selected sample/observation; it is the first
# argument to the ec_index and mx_select steps further down.
markers_fn = os.path.join(
    base_mark,
    "markers.txt",
)

In [6]:
# Assignment pipeline: sanitize -> filter -> normalize -> index markers ->
# select/extract marker genes -> assign. The steps communicate through files in
# the current working directory, so they must run in this order. Paths written
# with and without a leading "./" refer to the same working-directory files.

# drop barcodes and genes that sum to zero, update barcodes and genes file
mx_sanitize(matrix_fn, barcodes_fn, genes_fn, 
            "./san.matrix.mtx", 
            "./san.barcodes.txt", 
            "./san.genes.txt")

# knee plot gmm filter
# NOTE(review): comps=[3,3] is forwarded to mx_filter unchanged; presumably the
# mixture component counts used by the GMM fit -- confirm against mx_filter.
mx_filter("./san.matrix.mtx",
          "./san.barcodes.txt",
          "./san.fil.matrix.mtx", 
          "./san.fil.barcodes.txt",
          comps=[3,3])

# Keep the last column of the filtered (un-normalized) matrix for later use.
# NOTE(review): assumes the dbco feature is the final column of the matrix --
# confirm against the order of san.genes.txt.
dbco = mmread("./san.fil.matrix.mtx").toarray()[:,-1]

# normalize matrix (log1p -> ipf)
mx_norm("./san.fil.matrix.mtx", 
        "./san.log1p.matrix.mtx", how="log1p")

mx_norm("./san.log1p.matrix.mtx", 
        "./san.norm.matrix.mtx", how="ipf", target_sum=1_000_000)

# Same last column, taken again after the log1p + ipf normalization.
dbco_norm = mmread("./san.norm.matrix.mtx").toarray()[:,-1]

# index the markers -> markers.ec marker_genes.txt groups.txt
ec_index(markers_fn, 
        "./markers.ec", 
        "./groups.txt", 
        "./marker_genes.txt")
# get the gene ids -> select.txt (selects in order of markers.ec)
mx_select(markers_fn, 
          "./san.genes.txt", 
          "./select.txt")
# extract elements from matrix that are of interest, rows / columns (with associated metadata)
# NOTE(review): axis=1 presumably means the selection applies to columns (genes) -- confirm in mx_extract.
mx_extract("san.norm.matrix.mtx", 
          "san.genes.txt", 
          "select.txt", 
          "san.norm.extr.matrix.mtx", 
          "san.extr.genes.txt", axis=1)

# perform assignments with markers and matrices
# The final argument, assignments.txt, is the file the next cell reads back with pandas.
mx_assign("san.norm.extr.matrix.mtx", 
          "san.fil.barcodes.txt", 
          "san.extr.genes.txt", 
          "markers.ec", 
          "groups.txt",
          "assignments.txt")

Filtered to 2,514 cells with at least 137 UMIs.
Initialization 0
  Iteration 10	 time lapse 0.02947s	 ll change 0.04853
  Iteration 20	 time lapse 0.04282s	 ll change 0.00012
Initialization converged: True	 time lapse 0.07588s	 ll -21.05952


In [7]:
# Load the tab-separated assignment table produced by the mx_assign step.
df = pd.read_csv("assignments.txt", sep="\t", index_col=0)
print("shape: ", df.shape)

# Append the filtered, un-normalized counts: one new column per gene name,
# taken column-by-column from the filtered matrix.
raw_mtx = mmread("./san.fil.matrix.mtx").toarray()
raw_genes = []
read_str_list("./san.genes.txt", raw_genes)
for col_idx in range(raw_mtx.shape[1]):
    df[f"{raw_genes[col_idx]}"] = raw_mtx[:, col_idx]

# Last-column signal captured before and after normalization in the pipeline cell.
df["dbco"] = dbco
df["dbco_ipf"] = dbco_norm

# Replace exact zeros with the smallest positive float so the log below stays finite.
is_zero_ent = df["ent"] == 0.0
df.loc[is_zero_ent, "ent"] = np.finfo(float).tiny

## find entropy cutoff
adj_ent = -np.log(df["ent"].values)

# The knee/gmm fit runs on log1p(-log(ent)), given as a single-column 2-D array.
log_adj_ent = np.log1p(adj_ent.reshape(-1, 1))
u, xx, v = knee(log_adj_ent, sum_axis=1)
cutoff, ncells = gmm(xx, v, comps=[3])

# Map the fitted cutoff back to the original entropy scale.
cutoff = np.exp(cutoff) - 1  # undo the log1p
ent_cutoff = np.exp(-cutoff)  # undo the negative log

print(ncells, ent_cutoff)

shape:  (2514, 10)
511 5.539211577464892e-17


In [8]:
# Per-label summary over the rows whose `ent` is below the fitted cutoff:
# mean normalized signal for each tag plus the number of rows per label.
below_cutoff = df.query(f"ent < {ent_cutoff}")
summary_spec = {
    "mtag1_ipf": "mean",
    "mtag2_ipf": "mean",
    "mtag3_ipf": "mean",
    "mtag4_ipf": "mean",
    "dbco_ipf": "mean",
    "label": "count",
}
below_cutoff.groupby("label").agg(summary_spec).astype(float)

Unnamed: 0_level_0,mtag1_ipf,mtag2_ipf,mtag3_ipf,mtag4_ipf,dbco_ipf,label
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
control,2.646261,2.563956,8.540264,155.899557,228.1233,168.0
no_sugar,363.628268,1.99313,22.878874,8.048774,1.220847,204.0
tmg,5.502093,343.780744,31.86518,13.402548,3.223795,140.0
