In [1]:
"""
First notebook to open, explore, and filter full ST dataset from Cantin

Shaina Lu
Zador & Gillis Labs
March 2020
"""

'\nFirst notebook to open, explore, and filter full ST dataset from Cantin\n\nShaina Lu\nZador & Gillis Labs\nMarch 2020\n'

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import h5py
import pandas as pd
import numpy as np
import seaborn as sns

# Read in Full ST Data

In [3]:
EXPRS_FILE = "/data/slu/cantin_ST/exprmat.tsv"
SPOTSMETA_FILE = "/data/slu/cantin_ST/spotstable.tsv"
EXPTMETA_FILE = "/data/slu/cantin_ST/slicestable.tsv"
CLUSTER_FILE = "/data/slu/cantin_ST/cluster-id-name.csv"
RES_FILE = "/data/slu/cantin_ST/res-29.tsv"

In [4]:
exprsdata = pd.read_csv(EXPRS_FILE, sep='\t',header=0, index_col=0, quoting=3)

In [5]:
exprsdata.head()

Unnamed: 0,Gnai3,Cdc45,Scml2,Apoh,Narf,Cav2,Klf6,Scmh1,Cox5a,Tbx2,...,Gm17056,Platr19,Olfr457,Defa2,Olfr628,Gm16982,Mir155hg,X6430562O15Rik,Olfr1386,RP24.502E20.6
02A_15.8x13.04,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
02A_18.75x13.07,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
02A_16.74x13.07,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
02A_17.81x13.08,1,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
02A_19.96x14.09,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
exprsdata.shape

(34103, 23371)

In [7]:
spotsmeta = pd.read_csv(SPOTSMETA_FILE, sep='\t',header=0, index_col=0, quoting=3)
exptmeta = pd.read_csv(EXPTMETA_FILE, sep='\t',header=0, index_col=0, quoting=3)

In [8]:
print(spotsmeta.shape)
spotsmeta.head()

(34103, 10)


Unnamed: 0,slice_index,ML,DV,AP,acronym,name,nuclei,radius,x,y
02A_15.8x13.04,02A,3.156438,-3.545032,2.245,MOp1,"Primary motor area, Layer 1",3,72.832245,3479.641,4936.516
02A_18.75x13.07,02A,3.012475,-2.6928,2.245,MOp1,"Primary motor area, Layer 1",1,76.475977,3491.171,4074.165
02A_16.74x13.07,02A,3.124975,-3.2928,2.245,MOp1,"Primary motor area, Layer 1",1,75.797361,3488.927,4661.036
02A_17.81x13.08,02A,3.064854,-2.974134,2.245,MOp1,"Primary motor area, Layer 1",5,73.206277,3493.646,4348.148
02A_19.96x14.09,02A,2.831225,-2.4053,2.245,MOp1,"Primary motor area, Layer 1",2,77.408797,3783.901,3720.984


In [9]:
print(exptmeta.shape)
exptmeta.head()

(75, 4)


Unnamed: 0,slice_old_id,AP,animal,rotation
01A,ID9_C1,2.945,A2,90
01B,ID9_C2,2.945,A2,90
02A,ID1_D1,2.245,A1,-90
02B,ID1_D2,2.245,A1,-90
03A,ID1_E1,1.945,A1,-90


In [10]:
#look that the remaining two files
clusteridname = pd.read_csv(CLUSTER_FILE, sep=';',header=0, index_col=0, quoting=3)
res29 = pd.read_csv(RES_FILE, sep='\t',header=None, index_col=0, quoting=3)

In [11]:
print(clusteridname.shape)
clusteridname.head()

(200, 3)


Unnamed: 0_level_0,"""cluster""","""count""","""kept"""
"""clusters.named""",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"""fiber tracts-01""","""1""",609,True
"""Isocortex-01""","""2""",528,True
"""fiber tracts-02""","""3""",459,True
"""Isocortex-02""","""4""",430,True
"""Isocortex-03""","""5""",424,True


In [12]:
print(res29.shape)
res29.head()

(34053, 1)


Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
02A_15.8x13.04,52
02A_18.75x13.07,21
02A_16.74x13.07,21
02A_17.81x13.08,21
02A_19.96x14.09,21


# propagate ontology

__read in ontology data__

In [13]:
ONTOLOGY_PATH = "/data/slu/allen_adult_mouse_ISH/ontologyABA.csv"

In [14]:
#read in ontology file
ontology = pd.read_csv(ONTOLOGY_PATH)
ontology = ontology.drop([ontology.columns[5], ontology.columns[6]], axis=1)
ontology = ontology.fillna(-1)  #make root's parent -1

In [15]:
ontology.head()

Unnamed: 0,id,acronym,name,parent,allencolor
0,577,SSp-ul4,"Primary somatosensory area, upper limb, layer 4",369.0,#188064
1,657,SSp-m2/3,"Primary somatosensory area, mouth, layer 2/3",345.0,#188064
2,1114,VISal4,"Anterolateral visual area, layer 4",402.0,#08858C
3,606,RSPv2,"Retrosplenial area, ventral part, layer 2",886.0,#1AA698
4,472,MEApd-a,"Medial amygdalar nucleus, posterodorsal part, ...",426.0,#80C0E2


__first, propagate raw ontology__

In [16]:
#do this in a sloppy way that i know works
#propogate first then look-up
#goes up to the point where all labels are root
proponto = pd.DataFrame()
proponto[['id','1']] = ontology[['id', 'parent']]

curr = 1
while True:
    #get series representing next level up
    newcol = []
    for i in range(proponto.shape[0]):
        if proponto.iloc[i,curr] == -1:   #if already at parent of root, just give root again (-1)
            newval = -1
        else:
            newval = ontology.loc[ontology["id"] == int(proponto.iloc[i,curr]), "parent"].item()
        newcol.append(newval)

    if len(proponto[str(curr)].value_counts()) < 3:  #all are root after this point
        break
    proponto[str(curr+1)] = pd.Series(newcol)     #add series to df
    curr += 1

  from ipykernel import kernelapp as app


In [17]:
#at 10 levels up, all labels are root
#proponto.head()
proponto.loc[:,'10'].value_counts()

-1.0      1281
 997.0       6
Name: 10, dtype: int64

In [18]:
proponto = proponto.astype(int)  #typecast as integer
proponto = proponto.replace(to_replace=-1, value=997)  #replace root's parent (-1) with root (997)
proponto.head()

Unnamed: 0,id,1,2,3,4,5,6,7,8,9,10
0,577,369,322,453,315,695,688,567,8,997,997
1,657,345,322,453,315,695,688,567,8,997,997
2,1114,402,669,315,695,688,567,8,997,997,997
3,606,886,254,315,695,688,567,8,997,997,997
4,472,426,403,278,477,623,567,8,997,997,997


__now propagate full binary ontology for the ST spots themselves__

In [19]:
#initiate full binary ontology dataframe
fullontST = pd.DataFrame(index=exprsdata.index, columns=ontology.id, dtype=np.int64)
fullontST.columns = fullontST.columns.astype(str)
fullontST = fullontST.fillna(0) #make all entries 0

In [20]:
for i in range(len(spotsmeta.acronym.unique())): #for each unique brain area
    currareaid = int(ontology.loc[ontology.acronym == spotsmeta.acronym.unique()[i], "id"].item())
    tempareas = proponto.loc[proponto.id == currareaid, :] #get row representing that area propagated

    for val in tempareas.values.flatten():
        #set row,col entries to 1 
        fullontST.loc[spotsmeta.acronym == spotsmeta.acronym.unique()[i], str(val)] = 1

  


In [21]:
fullontST.head()
fullontST.shape

(34103, 1287)

__remove spots that map to ventricular systems of fiber tracts__

In [22]:
#ventricular systesm - 251 spots map here
print(fullontST.loc[:,"73"].sum())
#fiber tracts - 3072 map here
print(fullontST.loc[:,"1009"].sum())

251.0
3072.0


In [23]:
#get the ventricular and fibertract spots
vsspots = fullontST.loc[fullontST["73"]==1, "73"].index
fibspots = fullontST.loc[fullontST["1009"]==1, "1009"].index

In [24]:
#remove the ventricular and fiber tract spots
fullontST.drop(labels=vsspots, axis=0, inplace=True)
exprsdata.drop(labels=vsspots, axis=0, inplace=True)
spotsmeta.drop(labels=vsspots, axis=0, inplace=True)

fullontST.drop(labels=fibspots, axis=0, inplace=True)
exprsdata.drop(labels=fibspots, axis=0, inplace=True)
spotsmeta.drop(labels=fibspots, axis=0, inplace=True)

In [25]:
print(fullontST.shape)
print(exprsdata.shape)
print(spotsmeta.shape)

(30780, 1287)
(30780, 23371)
(30780, 10)


# Filter

__remove spots that do not map to the brain__ 

In [26]:
#spotsmeta.acronym.unique()

It appears that all spots here do map to the brain 

__remove genes that only contain expression in less than X% of samples__

In [26]:
#count number of spots for each gene that has a non-zero exprs value
nonzerocount = exprsdata.astype(bool).sum(axis=0)

In [27]:
#filter for genes having expression in at least 0.1% of samples
exprsdata_filt = exprsdata.loc[:, nonzerocount > (0.001*exprsdata.shape[0])]

In [28]:
exprsdata_filt.shape

(30780, 16557)

# add id col to meta data

lots of my code from before uses the brain id column, so instead of re-doing this 

In [29]:
temp = dict(zip(ontology.loc[:,"acronym"], ontology.loc[:,"id"]))
ids = []
for val in spotsmeta.acronym.values:
    ids.append(temp[val])

In [30]:
spotsmeta["id"] = ids

In [31]:
spotsmeta.head()
spotsmeta.shape

(30780, 11)

# Write to new hdf5

only write the exprsdata and spotsmeta dataframes since the other three files are unchanged from cantin \
also write propagated ontology

In [32]:
OUT_PATH = "cantin_ST_filt_v2.h5"

In [33]:
exprsdata_filt.to_hdf(OUT_PATH, key="STspots", mode='w')
spotsmeta.to_hdf(OUT_PATH, key="STspotsmeta")
fullontST.to_hdf(OUT_PATH, key="STpropont")