# Annotate single cells by reference transcriptomes

In [1]:
import os,sys
import datetime

### Import scanpy

In [2]:
# scanpy is imported through its `scanpy.api` entry point; the two logging calls
# record the package versions and the memory footprint of this session in the output.
import scanpy.api as sc
sc.logging.print_versions()  # prints the versions of scanpy and its main dependencies
sc.logging.print_memory_usage()  # prints the current memory usage of the process
sc.settings.verbosity = 2  # NOTE(review): exact meaning of level 2 not shown here — confirm against the scanpy docs

scanpy==1.3.4 anndata==0.6.13 numpy==1.15.4 scipy==1.1.0 pandas==0.23.4 scikit-learn==0.20.1 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 
Memory usage: current 0.17 GB, difference +0.17 GB


### Import my utility functions and import statements from GitHub

In [3]:
"""# This cell is run once to download my custom functions and import statements from github

!git clone --depth=1 https://github.com/rapolaszilionis/utility_functions
    
# github doesn't seem to have an option to download a specific version of the repo from the history.
# So I download my utility functions and save the download time by appending it to the directory name.
# These utility functions to be shared together with the notebook.

toappend = datetime.datetime.now().strftime('%y%m%d_%Hh%M')
newname = "utility_functions_%s"%toappend
print(newname)


# rename the py file with utility functions
os.rename("utility_functions",newname)"""

'# This cell is run once to download my custom functions and import statements from github\n\n!git clone --depth=1 https://github.com/rapolaszilionis/utility_functions\n    \n# github doesn\'t seem to have an option to download a specific version of the repo from the history.\n# So I download my utility functions and save the download time by appending it to the directory name.\n# These utility functions to be shared together with the notebook.\n\ntoappend = datetime.datetime.now().strftime(\'%y%m%d_%Hh%M\')\nnewname = "utility_functions_%s"%toappend\nprint(newname)\n\n\n# rename the py file with utility functions\nos.rename("utility_functions",newname)'

In [4]:
# add the utility function folder to the module search path (sys.path, not the shell PATH)
sys.path.append(os.path.abspath("utility_functions_190425_11h32"))

# NOTE(review): star import — later cells use `pd`, `np` and `time` without importing
# them explicitly, so they presumably come from here; confirm in rz_import_statements.
from rz_import_statements import *
import rz_functions as rz
import rz_fig_params # this adjusts mpl.rcParams, almost nothing to import
import rz_utility_spring as srz

python version: 3.6.7


# Load data and place into an AnnData object

In [5]:
# list the files available in the data_from_geo folder (shell escape, needs `ls`)
!ls data_from_geo

cell_info_8594x25.tsv         injury_barcodes.tsv
control_barcodes.tsv          injury_genes.tsv
control_genes.tsv             injury_loom.loom
control_loom.loom             injury_matrix.mtx
control_matrix.mtx            raw_counts_8594x27998.mtx
gene_names_alphabetically.txt


In [7]:
# Locations of the three input files, all inside the data_from_geo folder.
genepath = 'data_from_geo/gene_names_alphabetically.txt'   # gene names
countpath = 'data_from_geo/raw_counts_8594x27998.mtx'      # count matrix (Matrix Market)
obspath = 'data_from_geo/cell_info_8594x25.tsv'            # per-cell annotation table

In [8]:
# Read the tab-separated cell annotation table; its first column becomes the row index.
obs = pd.read_csv(obspath, index_col=0, sep='\t')
print(obs.shape)
obs.head()

(8594, 25)


Unnamed: 0,barcode,condition,total_counts,pass_quality_filters,inj_epithelial,ctr_epithelial,excluded_as_immune_or_mesench,class,population,phase,...,x_control,y_control,x_class3_exploded,y_class3_exploded,x_control_injured,y_control_injured,x_class1_CTR_cell_cyc_removed,y_class1_CTR_cell_cyc_removed,x_class1_INJ_cell_cyc_removed,y_class1_INJ_cell_cyc_removed
0,AAACCTGAGTGCTGCC-1,control,2787,True,False,True,False,class3,ctr_DEEx,G1,...,783.7055,-377.074709,1082.172898,-266.303304,709.78614,-325.505019,,,,
1,AAACCTGAGTGGGTTG-1,control,3325,True,False,True,False,class3,ctr_upper_IEE,G1,...,709.612642,-429.409941,854.230866,-454.025809,529.15516,-471.961493,,,,
2,AAACCTGCAAGTCTAC-1,control,1781,False,False,False,False,,,,...,,,,,,,,,,
3,AAACCTGCAATCTGCA-1,control,3468,True,False,False,True,,,,...,,,,,,,,,,
4,AAACCTGCACGGTGTC-1,control,1745,False,False,False,False,,,,...,,,,,,,,,,


In [9]:
# load counts from the Matrix Market file into an AnnData object and report its shape
# (the row count printed here equals the number of rows of the cell info table)
adata = sc.read_mtx(countpath)
print(adata.shape)

# add genes (annotation of variables)
# NOTE(review): assumes the names in the gene file are listed in the same order as the
# columns of the count matrix — confirm against how the files were exported
adata.var['genes'] = np.loadtxt(genepath,dtype=str)

# make sure var names are genes
adata.var_names = adata.var['genes'].values

# add obs (annotation of observations); this replaces adata.obs with the table loaded earlier
# NOTE(review): assumes the rows of `obs` are in the same order as the rows of the count matrix — confirm
adata.obs = obs

# make sure index is unique AND a string
adata.obs_names_make_unique()
adata.obs_names = adata.obs_names.astype(str)

(8594, 27998)


### Scale (normalize) data

In [10]:
# Rescale every cell to a total of 10,000 counts (in place on adata).
# The totals of the first five cells are printed before and after as a sanity check.
first_cells = slice(0, 5)
print(adata.X[first_cells, :].sum(axis=1))
print()
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
print(adata.X[first_cells, :].sum(axis=1))

[[2787.]
 [3325.]
 [1781.]
 [3468.]
 [1745.]]

[[10000.   ]
 [10000.   ]
 [10000.001]
 [10000.   ]
 [ 9999.999]]


## Get centroids (reference transcriptomes)

In [11]:
# show the docstring of the helper used in the next cell to compute the reference transcriptomes
help(rz.centroids)

Help on function centroids in module rz_functions:

centroids(label, adata, E=None, gene_list=None)
    Calculate average gene expression level per cell label (e.g. cluster).
    input:
        - label: name of column that stores the label of interest in adata.obs
        - adata: AnnData object OR a cell x feature pandas dataframe with label as one of the columns
        - E and gene_list: optional and only used when adata is not an AnnData object. In that case
        the cells x genes sparse expression matrix E and the gene_list must be specified
        
    returns:
        pandas dataframe, centroids x genes



In [12]:
# Reference transcriptomes: average expression per 'population' label,
# computed only from the cells flagged in the 'ctr_epithelial' column.
label = 'population'
cell_mask = adata.obs['ctr_epithelial'].astype(bool).values
centroids = rz.centroids(label=label, adata=adata[cell_mask])

In [13]:
# preview of the centroid table: one row per population label, one column per gene
centroids.head()

Unnamed: 0,0610007P14Rik,0610009B22Rik,0610009L18Rik,0610009O20Rik,0610010F05Rik,0610010K14Rik,0610011F06Rik,0610012D04Rik,0610012G03Rik,0610025J13Rik,...,mt-Co2,mt-Co3,mt-Cytb,mt-Nd1,mt-Nd2,mt-Nd3,mt-Nd4,mt-Nd4l,mt-Nd5,mt-Nd6
ctr_DEEx,1.391852,0.879937,0.324726,0.209586,0.086968,0.32996,0.954454,0.0,1.200782,0.0,...,14.079573,83.904465,56.074482,23.852005,3.19885,1.030915,18.962263,0.987729,1.261443,0.275973
ctr_upper_IEE,1.300044,0.695099,0.314438,0.287892,0.184926,0.397997,0.773451,0.0,0.998683,0.0,...,15.316485,84.531807,54.308296,24.424025,3.601944,1.055761,18.23905,0.895775,0.950987,0.232046
ctr_M_G1,1.303535,0.552314,0.065731,0.330279,0.12681,0.566542,0.606099,0.0,1.455639,0.0,...,12.348806,70.209267,47.538788,23.879992,2.751078,0.998057,15.154018,0.907756,0.761977,0.202841
ctr_OEE_IEE,1.737094,0.544913,0.267885,0.461709,0.1059,0.447238,0.927448,0.0,1.360467,0.0,...,13.681261,85.969254,55.733368,22.308928,4.568748,0.95702,19.031935,0.749691,1.071926,0.07611
ctr_G2_M,2.209283,0.774067,0.063674,0.473809,0.182266,0.590541,0.573025,0.0,1.316742,0.0,...,12.441298,70.31929,45.029736,23.863428,3.214058,0.953022,15.301864,0.760218,1.10323,0.170797


## Run Bayesian classifier using all genes

In [30]:
# select cells to classify (the cells flagged in the 'inj_epithelial' column)
toclassify = adata.obs['inj_epithelial'].astype(bool).values
E = adata[toclassify].X  # expression matrix of those cells (cells x genes)
gene_list = adata.var_names

# reference transcriptomes to classify by, transposed to genes x populations
categories = centroids.T

# Select genes. They have to be present in both datasets.
genes_to_use = gene_list #in this case simply all genes
# np.isin is the supported replacement for np.in1d (deprecated in NumPy 2.0).
# The second term enforces the requirement stated above: a gene is only kept if it
# also has a row in the reference table, so the later lookup by gene name cannot miss.
gene_mask = np.isin(gene_list,genes_to_use) & np.isin(gene_list,categories.index)

# pseudocount added to the reference transcriptomes before classification
pseudo = 1

In [48]:
# Classify the cells in chunks of `interval` cells so that only one chunk of the
# sparse expression matrix has to be converted to a dense table at a time.
start = time.time()
interval = 1000
n_cells = E.shape[0]

# Loop-invariant work, done once instead of once per chunk:
# transpose to genes x cells and keep only the selected genes ...
E_selected = E.T[gene_mask]
# ... and the names of those genes, in the same order as the rows of E_selected.
genes_selected = np.array(gene_list)[gene_mask]

bays = []
for chunk_start in range(0, n_cells, interval):
    chunk_end = min(chunk_start + interval, n_cells)  # the last chunk may be shorter
    chunk_dense = pd.DataFrame(E_selected[:, chunk_start:chunk_end].todense())
    chunk_dense.index = genes_selected
    # The reference table is rebuilt for every call (same genes, same order, plus the
    # pseudocount) so each call receives a fresh object, exactly as before.
    bays.append(rz.bayesian_classifier(chunk_dense, categories.loc[chunk_dense.index]+pseudo))
    print('%.2f min.'%((time.time()-start)/60.))
    print('cells from %d to %d done'%(chunk_start,chunk_end))

# Stitch the per-chunk results together side by side and renumber the columns
# 0..n_cells-1 so that column k corresponds to the k-th classified cell.
bay = pd.concat(bays,axis=1)
bay.columns = np.arange(bay.shape[1])

0.02 min.
cells from 0 to 1000 done
0.04 min.
cells from 1000 to 2000 done
0.06 min.
cells from 2000 to 3000 done
0.08 min.
cells from 3000 to 4000 done
0.09 min.
cells from 4000 to 4344 done


In [49]:
# log likelihoods (rows: reference populations, columns: classified cells).
# Can be used directly to find the maximum likelihood.
bay

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4334,4335,4336,4337,4338,4339,4340,4341,4342,4343
ctr_DEEx,-37827.351562,-37193.226562,-37724.3125,-38271.964844,-38053.949219,-38834.886719,-36969.839844,-38269.835938,-38657.800781,-37963.59375,...,-37493.234375,-37796.992188,-38086.953125,-38499.703125,-37928.050781,-37438.015625,-37693.460938,-38000.359375,-37698.652344,-37769.933594
ctr_upper_IEE,-37779.1875,-37301.761719,-37749.914062,-38292.160156,-37981.261719,-38974.894531,-37144.335938,-38224.960938,-38696.679688,-38067.8125,...,-37432.890625,-37804.59375,-38370.59375,-38586.027344,-38070.097656,-37576.816406,-37768.582031,-38050.085938,-37972.800781,-37887.433594
ctr_M_G1,-37633.441406,-37372.425781,-37515.351562,-38080.683594,-37812.203125,-38638.175781,-37195.03125,-38118.628906,-38419.722656,-37928.214844,...,-37408.769531,-37672.359375,-38292.304688,-38289.351562,-38093.101562,-37668.308594,-37772.933594,-37999.277344,-37982.15625,-37893.882812
ctr_OEE_IEE,-38003.457031,-37286.925781,-37948.328125,-38363.226562,-38153.125,-38986.945312,-37022.015625,-38382.597656,-38831.328125,-38218.882812,...,-37659.273438,-37983.074219,-38369.308594,-38710.765625,-37998.824219,-37353.78125,-37843.414062,-38128.695312,-37844.609375,-37867.457031
ctr_G2_M,-37814.449219,-37498.4375,-37720.675781,-37941.269531,-37914.28125,-38463.660156,-37277.367188,-38132.328125,-38529.675781,-38074.367188,...,-37656.128906,-37866.792969,-38396.371094,-38167.925781,-38156.140625,-37678.015625,-37918.8125,-38147.777344,-38059.5625,-37993.644531
ctr_OEE_2,-38277.347656,-37329.871094,-38160.78125,-38607.304688,-38447.25,-38797.902344,-36679.453125,-38618.253906,-38712.855469,-38347.917969,...,-37911.523438,-38158.085938,-38149.726562,-38825.769531,-37675.179688,-37156.734375,-37710.210938,-38273.039062,-37458.3125,-37666.75
ctr_OSR,-38089.160156,-37278.207031,-38011.726562,-38462.550781,-38229.3125,-38879.316406,-36921.203125,-38346.566406,-38695.714844,-38299.664062,...,-37783.640625,-38048.109375,-38352.976562,-38723.402344,-37816.804688,-37350.921875,-37750.824219,-38068.828125,-37815.625,-37756.257812
ctr_SI,-37922.355469,-37182.78125,-37939.300781,-38443.992188,-38170.445312,-38995.699219,-37008.585938,-38277.964844,-38743.40625,-38198.730469,...,-37566.816406,-37883.609375,-38368.410156,-38707.847656,-37883.15625,-37407.363281,-37587.226562,-37981.140625,-37871.667969,-37822.78125
ctr_S,-37689.898438,-37388.378906,-37678.914062,-38042.070312,-37892.90625,-38731.605469,-37208.65625,-38030.496094,-38600.886719,-37993.128906,...,-37540.445312,-37748.078125,-38382.832031,-38282.371094,-38123.441406,-37640.625,-37844.835938,-38028.097656,-38010.605469,-37947.890625
ctr_VEE,-38468.382812,-37540.101562,-38340.515625,-38810.101562,-38606.800781,-38949.019531,-36767.3125,-38805.835938,-38693.207031,-38509.378906,...,-38065.285156,-38296.089844,-38450.222656,-38989.757812,-37540.753906,-37356.140625,-37515.898438,-38469.417969,-37611.816406,-37702.785156


In [50]:
# For every cell (column), take the row label with the largest log likelihood.
recreated = bay.idxmax(axis=0).values
recreated

array(['ctr_M_G1', 'ctr_SI', 'ctr_M_G1', ..., 'ctr_SI', 'ctr_OEE_2',
       'ctr_OEE_2'], dtype=object)

## Compare to results provided on GEO

In [51]:
# 'population' labels stored in adata.obs for the classified cells, with 'inj_'
# replaced by 'ctr_' so that they are spelled like the reference population names.
geo_population = adata[toclassify].obs['population']
on_geo = geo_population.map(lambda name: name.replace('inj_','ctr_')).values

In [52]:
# Element-wise comparison of the recomputed labels with the labels taken from adata.obs.
mismatches = recreated!=on_geo
n_mismatching, n_compared = mismatches.sum(), len(mismatches)
print("%d out of %d cells have mismatching labels"%(n_mismatching,n_compared))

0 out of 4344 cells have mismatching labels
