<a href="https://colab.research.google.com/github/sbooeshaghi/azucar/blob/main/analysis/293T/obs2/imports.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --quiet -U upsetplot scikit-learn git+http://github.com/dirguis/ipfn@master

  Building wheel for ipfn (setup.py) ... [?25l[?25hdone
  Building wheel for upsetplot (setup.py) ... [?25l[?25hdone


In [2]:
!git clone https://github.com/sbooeshaghi/azucar.git

Cloning into 'azucar'...
remote: Enumerating objects: 1536, done.[K
remote: Counting objects: 100% (294/294), done.[K
remote: Compressing objects: 100% (285/285), done.[K
remote: Total 1536 (delta 166), reused 43 (delta 9), pack-reused 1242[K
Receiving objects: 100% (1536/1536), 1.70 GiB | 19.38 MiB/s, done.
Resolving deltas: 100% (642/642), done.
Checking out files: 100% (292/292), done.


In [1]:
import os
from scipy.io import mmread
import pandas as pd
import numpy as np

In [2]:
sample = "293T"
observation = "obs4"

base_data = f"azucar/analysis/{sample}/{observation}/out"
base_mark = f"azucar/analysis/{sample}/{observation}/assign"

matrix_fn  = os.path.join(base_data, "matrix.mtx")
genes_fn   = os.path.join(base_data, "genes.txt")
barcodes_fn   = os.path.join(base_data, "barcodes.txt")

!gunzip $base_data/*.gz

gzip: azucar/analysis/293T/obs4/out/*.gz: No such file or directory


In [3]:
from azucar.analysis.scripts.mx_sanitize import mx_sanitize
from azucar.analysis.scripts.mx_filter import mx_filter, knee, gmm
from azucar.analysis.scripts.mx_norm import mx_norm
from azucar.analysis.scripts.ec_index import ec_index
from azucar.analysis.scripts.mx_select import mx_select
from azucar.analysis.scripts.mx_extract import mx_extract
from azucar.analysis.scripts.mx_assign import mx_assign

from azucar.analysis.scripts.utils import read_str_list

In [4]:
markers_fn = os.path.join(base_mark, "markers.txt")

In [7]:
# drop barcodes and genes that sum to zero, update barcodes and genes file
mx_sanitize(matrix_fn, barcodes_fn, genes_fn, 
            "./san.matrix.mtx", 
            "./san.barcodes.txt", 
            "./san.genes.txt")

In [9]:
# knee plot gmm filter
mx_filter("./san.matrix.mtx",
          "./san.barcodes.txt",
          "./san.fil.matrix.mtx", 
          "./san.fil.barcodes.txt",
          comps=[2,3])

dbco = mmread("./san.fil.matrix.mtx").toarray()[:,-1]

Filtered to 3,252 cells with at least 601 UMIs.


In [10]:
# normalize matrix (log1p -> ipf)
mx_norm("./san.fil.matrix.mtx", 
        "./san.log1p.matrix.mtx", how="log1p")

mx_norm("./san.log1p.matrix.mtx", 
        "./san.norm.matrix.mtx", how="ipf")

dbco_norm = mmread("./san.norm.matrix.mtx").toarray()[:,-1]

In [11]:
# index the markers -> markers.ec marker_genes.txt groups.txt
ec_index(markers_fn, 
        "./markers.ec", 
        "./groups.txt", 
        "./marker_genes.txt")
# get the gene ids -> select.txt (selects in order of markers.ec)
mx_select(markers_fn, 
          "./san.genes.txt", 
          "./select.txt")
# extract elements from matrix that are of interest, rows / columns (with associated metadata)
mx_extract("san.norm.matrix.mtx", 
          "san.genes.txt", 
          "select.txt", 
          "san.norm.extr.matrix.mtx", 
          "san.extr.genes.txt", axis=1)


In [14]:
# perform assignments with markers and matrices
mx_assign("san.norm.extr.matrix.mtx", 
              "san.fil.barcodes.txt", 
              "san.extr.genes.txt", 
              "markers.ec", 
              "groups.txt",
          "assignments.txt")

df = pd.read_csv("assignments.txt", sep="\t", index_col=0)
print("shape: ", df.shape)

# original counts (this is the same as make df above)
raw_mtx = mmread("./san.fil.matrix.mtx").toarray()
raw_genes = []
read_str_list("./san.genes.txt", raw_genes)
for idx, v in enumerate(raw_mtx.T):
    df[f"{raw_genes[idx]}"] = v # np.exp(v) - 1

df["dbco"] = dbco
df["dbco_ipf"] = dbco_norm

# for the np log
df.loc[df['ent'] == 0.0, 'ent'] = np.finfo(float).tiny

## find entropy cutoff
adj_ent = -np.log(df["ent"].values)

u, xx, v = knee(np.log1p(adj_ent.reshape(-1,1)), sum_axis=1)
(cutoff, ncells) = gmm(xx, v, comps=[3])
cutoff = np.exp(cutoff) - 1 # adjust (since we log1p it)
ent_cutoff = np.exp(-cutoff)

print(ncells, ent_cutoff)

Initialization 0
  Iteration 10	 time lapse 0.02420s	 ll change 0.00109
  Iteration 20	 time lapse 0.02302s	 ll change 0.00062
Initialization converged: True	 time lapse 0.05041s	 ll -5.89764
shape:  (3252, 10)
348 2.997065222700532e-06


In [15]:
df

Unnamed: 0,mtag1_ipf,mtag3_ipf,mtag4_ipf,mtag2_ipf,label_id,label,ent,mahalanobis_0,mahalanobis_1,mahalanobis_2,mtag1,mtag2,mtag3,mtag4,dbco,dbco_ipf
AAACCCAAGCAGCACA,6.823136,5.336539,5.986289,0.770610,1,control,0.003084,0.0,0.999656,0.000344,601,1,146,276,28,3.619065
AAACCCAGTGGCTAGA,6.415665,5.210282,5.855511,1.216895,1,control,0.004909,0.0,0.999419,0.000581,419,2,132,249,35,3.837290
AAACGAAAGCTAGCCC,6.119522,4.707293,6.118189,1.380925,1,control,0.000748,0.0,0.999929,0.000071,1698,4,299,1715,159,4.209716
AAACGCTCATGCGTGC,6.400675,5.018112,5.885096,1.437730,1,control,0.002479,0.0,0.999731,0.000269,623,3,152,374,43,3.794033
AAACGCTGTTCAAACC,6.029172,4.751779,5.470217,1.879515,1,control,0.004320,0.0,0.999497,0.000503,400,5,110,231,76,4.404967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGAGTCCAAGCCG,6.524326,5.016895,6.105028,0.708063,1,control,0.000611,0.0,0.999943,0.000057,780,1,164,513,68,4.181328
TTTGGAGTCTTTGCAT,5.889898,4.232289,6.177269,2.204110,1,control,0.002139,0.0,0.999772,0.000228,2684,16,285,3997,212,4.032085
TTTGGTTGTCGATTAC,6.349874,4.811356,5.640857,1.413667,1,control,0.001266,0.0,0.999873,0.000127,660,3,134,322,79,4.319891
TTTGTTGCAGCTATTG,6.387519,5.349282,5.836750,0.765316,1,control,0.005586,0.0,0.999327,0.000673,416,1,153,249,50,4.196773


In [16]:
df.groupby("label").agg({
      "mtag1_ipf": "mean", 
      "mtag2_ipf": "mean",
      "mtag3_ipf": "mean",
      "mtag4_ipf": "mean",
      "dbco_ipf": "mean",
      "label": "count"
      }).astype(float)

Unnamed: 0_level_0,mtag1_ipf,mtag2_ipf,mtag3_ipf,mtag4_ipf,dbco_ipf,label
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
control,6.214836,1.170144,5.17587,6.07531,3.899484,2225.0
no_sugar,12.411287,1.40029,4.474256,4.249806,0.0,1.0
tmg,4.915526,1.580394,6.413182,5.852649,3.773898,1026.0
