In [1]:
"""
re-do of ABA ISH filtering to match ST protocol and test if it makes a difference from before

Shaina Lu
Zador & Gillis Labs
April 2020
"""

'\nre-do of ABA ISH filtering to match ST protocol and test if it makes a difference from before\n\nShaina Lu\nZador & Gillis Labs\nApril 2020\n'

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import h5py
import pandas as pd
import numpy as np
import seaborn as sns

# Read in raw ABA ISH Data

In [3]:
ALLEN_PATH = "/data/slu/allen_adult_mouse_ISH/allen_adultmouse_manthan_v3.h5"
ONTOLOGY_PATH = "/data/slu/allen_adult_mouse_ISH/ontologyABA.csv"

In [4]:
#open gene by voxel hdf5 file
f = h5py.File(ALLEN_PATH, 'r')
list(f)

['gene.id.info',
 'gene.id.info.cols',
 'vox.df',
 'vox.df.cols',
 'vox.meta',
 'vox.meta.cols']

In [5]:
#unpack each matrix within hdf5 file
metacols = list(map(lambda x: x.decode('UTF-8'), list(f['vox.meta.cols'])))
dfcols = list(map(lambda x: x.decode('UTF-8'),list(f['vox.df.cols'])))

meta = pd.DataFrame(np.char.decode(np.array(f['vox.meta']).T), columns=metacols)
voxdf = pd.DataFrame(np.array(f['vox.df']).T, columns=dfcols)
geneIDName = pd.DataFrame(np.char.decode(np.array(f['gene.id.info']).T), columns=["expt_id","gene_symbol","entrez_id","specimen_id","plane"])
f.close()

In [6]:
#note one more gene in geneIDName than in the cols of voxdf, 1 non-uniq in saggittal
print(geneIDName.shape)
print(voxdf.shape)
print(meta.shape)

(26079, 5)
(159326, 26078)
(159326, 4)


In [7]:
#read in ontology file
ontology = pd.read_csv(ONTOLOGY_PATH)
ontology = ontology.drop([ontology.columns[5], ontology.columns[6]], axis=1)
ontology = ontology.fillna(-1)  #make root's parent -1

__what's happening with the duplicate in geneIDName?__

In [8]:
geneIDName.nunique()

expt_id        26078
gene_symbol    19942
entrez_id      19428
specimen_id     5440
plane              2
dtype: int64

In [9]:
#there are two rows from the sagittal plane that are duplicated across all entries
dupfilt = geneIDName.duplicated(keep=False)
geneIDName.loc[dupfilt, :]

Unnamed: 0,expt_id,gene_symbol,entrez_id,specimen_id,plane
2744,78153149,Avp,11998,77632911,sagittal
2745,78153149,Avp,11998,77632911,sagittal


In [10]:
#remove one of these rows
geneIDName = geneIDName.drop(labels=2745, axis='index')

In [11]:
print(geneIDName.shape)
print(geneIDName.nunique())

(26078, 5)
expt_id        26078
gene_symbol    19942
entrez_id      19428
specimen_id     5440
plane              2
dtype: int64


# Filter I

__remove spots that do not map to the brain first here since there are many this will speed up subsequent propogation of ontology__

In [12]:
#filter out voxels that don't map to the brain
voxbrain = voxdf.loc[meta.ids != 'NA',:]
metabrain = meta.loc[meta.ids != 'NA',:]

In [13]:
print(voxbrain.shape)
print(metabrain.shape)

(62529, 26078)
(62529, 4)


# propagate ontology

In [14]:
#do this in a sloppy way that i know works
#propogate first then look-up
#goes up to the point where all labels are root
proponto = pd.DataFrame()
proponto[['id','1']] = ontology[['id', 'parent']]

curr = 1
while True:
    #get series representing next level up
    newcol = []
    for i in range(proponto.shape[0]):
        if proponto.iloc[i,curr] == -1:   #if already at parent of root, just give root again (-1)
            newval = -1
        else:
            newval = ontology.loc[ontology["id"] == int(proponto.iloc[i,curr]), "parent"].item()
        newcol.append(newval)

    if len(proponto[str(curr)].value_counts()) < 3:  #all are root after this point
        break
    proponto[str(curr+1)] = pd.Series(newcol)     #add series to df
    curr += 1

  from ipykernel import kernelapp as app


In [15]:
#at 10 levels up, all labels are root
#proponto.head()
proponto.loc[:,'10'].value_counts()

-1.0      1281
 997.0       6
Name: 10, dtype: int64

In [16]:
proponto = proponto.astype(int)  #typecast as integer
proponto = proponto.replace(to_replace=-1, value=997)  #replace root's parent (-1) with root (997)
proponto.head()

Unnamed: 0,id,1,2,3,4,5,6,7,8,9,10
0,577,369,322,453,315,695,688,567,8,997,997
1,657,345,322,453,315,695,688,567,8,997,997
2,1114,402,669,315,695,688,567,8,997,997,997
3,606,886,254,315,695,688,567,8,997,997,997
4,472,426,403,278,477,623,567,8,997,997,997


__now propagate full binary ontology for voxels__

In [17]:
fullontST = pd.DataFrame(index=metabrain.index, columns=ontology.id, dtype=np.int64)
fullontST.columns = fullontST.columns.astype(str)
fullontST = fullontST.fillna(0) #make all entries 0

In [18]:
for i in range(len(metabrain.ids.unique())): #for each unique brain area
    tempareas = proponto.loc[proponto.id == int(metabrain.ids.unique()[i]), :] #get row representing that area propagated
    #tempareas = tempareas.values.flatten()
    #print(tempareas)
    for val in tempareas.values.flatten():
        #set row,col entries to 1 
        fullontST.loc[metabrain.ids == metabrain.ids.unique()[i], str(val)] = 1

In [19]:
fullontST.head()

id,577,657,1114,606,472,1117,10714,632,484682479,312782586,...,356,153,684,312782554,484682520,1076,234,476,1051,619
9284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
fullontST.shape

(62529, 1287)

__how many of the voxels map to ventricular systems or fiber tracts?__
these areas were removed in ST data so remove them here too

In [21]:
#ventricular systesm - 2 voxels map here
print(fullontST.loc[:,"73"].sum())
#fiber tracts - 0 map here
print(fullontST.loc[:,"1009"].sum())

2.0
0.0


In [22]:
#which are to two voxels that map to ventricular systems and where else do they map?
vsvox = fullontST.loc[fullontST["73"]==1,"73"].index
fullontST.loc[vsvox,:].sum(axis=1)

79019     3.0
120053    4.0
dtype: float64

In [23]:
#find the non-zero cols
index1 = fullontST.loc[vsvox[0],:].to_numpy().nonzero()
index2 = fullontST.loc[vsvox[1],:].to_numpy().nonzero()

In [24]:
#these are all ventricular areas
print(fullontST.columns[index1])
print(fullontST.columns[index2])

Index(['73', '997', '129'], dtype='object', name='id')
Index(['145', '73', '997', '153'], dtype='object', name='id')


In [25]:
#remove the two ventricular voxels
fullontST.drop(labels=vsvox, axis=0, inplace=True)
voxbrain.drop(labels=vsvox, axis=0, inplace=True)
metabrain.drop(labels=vsvox, axis=0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [26]:
print(fullontST.shape)
print(voxbrain.shape)
print(metabrain.shape)

(62527, 1287)
(62527, 26078)
(62527, 4)


# Filter II

__filter genes, replace NA's (here -1) with 0 and remove genes that don't express in X% voxels__

In [27]:
#treats 0's as not expressing as well
voxbrain.replace(to_replace=-1, value=0, inplace=True)            #make missing values 0
nonzerocount = voxbrain.astype(bool).sum(axis=0)    #gets number of spots for each gene that have non-zero expression

voxbrain = voxbrain.loc[:, nonzerocount > (.001*voxbrain.shape[0])]  #if gene has expression in at least 0.1% of samples

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [28]:
voxbrain.head()
print(voxbrain.shape)

(62527, 26008)


NOTE: this is only two less genes than before in allenadultmouseISH/filter_one so shouldn't dramatically change anything

# Write to new hdf5 part I
__write filtered voxels, meta data, and propagated ontology to new hdf5 file__

In [34]:
OUT_PATH = "ABAISH_filt_v6.h5"

In [35]:
voxbrain.to_hdf(OUT_PATH, key="voxbrain", mode='w')
metabrain.to_hdf(OUT_PATH, key="metabrain")
fullontST.to_hdf(OUT_PATH, key="propontology")
geneIDName.to_hdf(OUT_PATH, key="geneIDName")

# Average duplicated genes 

In [29]:
#I want to average all genes that are repeated regardless of plane. 
#initialize dataframe with only non-repeated series (cols)
nodupfilt = geneIDName.duplicated(subset="gene_symbol", keep=False) #marks all duplicates as True
nodupgenes = geneIDName.loc[~nodupfilt,:] #14970
indexers = nodupgenes.loc[nodupgenes.expt_id.isin(list(voxbrain)),"expt_id"] #solves NAN cols
avgvoxbrain = voxbrain.loc[:,indexers]
#rename the columns using gene_symbols
temp = dict(zip(nodupgenes.expt_id.tolist(),nodupgenes.gene_symbol.tolist()))
avgvoxbrain.rename(columns=temp, inplace=True)

#get the remaining gene_symbols that are duplicated
dupgenes = geneIDName.loc[nodupfilt,:]
#get unique gene_symbols from duplicated
uniqfilt = dupgenes.duplicated(subset="gene_symbol", keep='first')
uniqgenes = dupgenes.loc[~uniqfilt,:]

In [31]:
#how many genes are duplicated?
dupgenes.gene_symbol.nunique()

4972

In [None]:
for i in range(uniqgenes.shape[0]):  #loop through unique duplicated genes
    #get all series ids for current uniq gene symbol
    currseries = dupgenes.loc[dupgenes.gene_symbol == uniqgenes.iloc[i,1], "expt_id"].values
    curravg = voxbrain.loc[:,currseries].mean(axis=1) #get voxels and average them
    curravg = curravg.rename(uniqgenes.iloc[i,1])     #rename and add to df
    avgvoxbrain = pd.concat([avgvoxbrain, curravg], axis=1)

In [39]:
avgvoxbrain.shape

(62527, 19934)

# Write to new hdf5 part II

__write averaged vox brain to hdf5__

In [40]:
OUT_PATH2 = "ABAISH_filt_v6_avgdup.h5"

In [41]:
#write to second out file
avgvoxbrain.to_hdf(OUT_PATH2, key="avgvoxbrain", mode='w')
metabrain.to_hdf(OUT_PATH2, key="metabrain")
fullontST.to_hdf(OUT_PATH2, key="propontology")
geneIDName.to_hdf(OUT_PATH2, key="geneIDName")