In [1]:
"""
test reading output file from 1_datafilter.ipynb
set-up to run allbyall for new ST

Shaina Lu
Zador & Gillis Lab
April 2020
"""

'\ntest reading output file from 1_datafilter.ipynb\nset-up to run allbyall for new ST\n\nShaina Lu\nZador & Gillis Lab\nApril 2020\n'

In [2]:
from __future__ import division
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy as sp
from scipy import stats
from sklearn import metrics
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split #stratify train/test split
import random

In [3]:
def set_style():
    """Apply the notebook's matplotlib look: seaborn base styles plus rcParams overrides.

    The seaborn style sheets bundled with matplotlib were renamed from
    'seaborn-<name>' to 'seaborn-v0_8-<name>' in matplotlib 3.6, so the name
    that is actually installed is looked up instead of hard-coding the legacy
    one. Older matplotlib versions keep using the legacy names.
    """
    installed = set(plt.style.available)
    styles = []
    for legacy in ('seaborn-white', 'seaborn-notebook'):
        modern = legacy.replace('seaborn-', 'seaborn-v0_8-', 1)
        #prefer the current name when present, otherwise fall back to the legacy one
        styles.append(modern if modern in installed else legacy)
    plt.style.use(styles)

    plt.rcParams.update({
        'figure.figsize': [6,4],
        'axes.spines.top': False,       #remove top line
        'axes.spines.right': False,     #remove right line
        'axes.linewidth': 2.0,          #set weight of axes
        'axes.titlesize': 20,           #set font size of title
        'axes.labelsize': 18,           #set font size of x,y labels
        'axes.labelpad': 14,            #space between labels and axes
        'xtick.labelsize': 14,          #set x label size
        'ytick.labelsize': 14,          #set y label size
        'legend.fontsize': 16,          #set legend font size
    })

# Read in Data and pre-processing

In [12]:
#infiles
# HDF5 store read by read_data(), which loads the keys 'STspotsmeta', 'STspots'
# and 'STpropont' from it. The notebook header names 1_datafilter.ipynb as the
# notebook whose output file is being read here.
# NOTE(review): both paths are absolute and machine-specific; consider deriving
# them from a configurable data directory so the notebook runs elsewhere.
ST_CANTIN_FILT_PATH = "/home/slu/spatial/data/cantin_ST_filt_v1.h5"
# Ontology table (CSV) read by read_data(); getleaves() uses its `parent` column.
ONTOLOGY_PATH = "/data/slu/allen_adult_mouse_ISH/ontologyABA.csv"

In [5]:
def read_data():
    """Load the three ST tables from the filtered HDF5 store and the ontology CSV.

    Returns:
        (STspotsmeta, STspots, STpropont, ontology) where the first three are
        read from ST_CANTIN_FILT_PATH under the keys of the same name, and
        ontology is the CSV at ONTOLOGY_PATH with its columns at positions 5
        and 6 dropped and every missing value replaced by -1.
    """
    #the three HDF5 keys are read in the order they are returned
    hdf_keys = ('STspotsmeta', 'STspots', 'STpropont')
    STspotsmeta, STspots, STpropont = [
        pd.read_hdf(ST_CANTIN_FILT_PATH, key=table_key, mode='r')
        for table_key in hdf_keys
    ]

    raw_ontology = pd.read_csv(ONTOLOGY_PATH)
    unwanted = [raw_ontology.columns[5], raw_ontology.columns[6]]
    #filling NaN with -1 makes root's parent -1
    ontology = raw_ontology.drop(unwanted, axis=1).fillna(-1)

    return STspotsmeta, STspots, STpropont, ontology

Data read in from the filtered ST_cantin h5 file looks good, based on `head()` and `shape`.

__copied and pasted pre-processing functions__

In [6]:
def zscore(voxbrain):
    """Standardise every column of voxbrain (rows: voxels, cols: genes).

    A StandardScaler is fitted on voxbrain itself and then applied to it, so
    the statistics always come from the frame that is passed in.

    Returns:
        DataFrame of the scaled values carrying voxbrain's index and columns.
    """
    standardizer = StandardScaler(copy=True, with_mean=True, with_std=True)
    scaled_values = standardizer.fit(voxbrain).transform(voxbrain, copy=True)

    #wrap the scaled array back into a frame labelled like the input
    return pd.DataFrame(scaled_values, index=voxbrain.index, columns=voxbrain.columns)

def splitdata(data, testratio):
    """Split data row-wise into reproducible train/test folds.

    Args:
        data: pandas object supporting len() and .iloc (DataFrame or Series).
        testratio: fraction of rows assigned to the test fold; the test size is
            int(len(data) * testratio), i.e. rounded down.

    Returns:
        (train, test) selected from data with .iloc.

    The shuffle uses a private RandomState seeded with 42, so every call splits
    an input of a given length the same way (the ML algorithm never sees the
    whole dataset across runs). This yields the same permutation as seeding the
    global generator with 42, but no longer resets NumPy's global random state
    as a side effect of calling this function.
    """
    rng = np.random.RandomState(42)
    shuffindices = rng.permutation(len(data))
    testsize = int(len(data) * testratio)
    testindices = shuffindices[:testsize]
    trainindices = shuffindices[testsize:]
    return data.iloc[trainindices], data.iloc[testindices]

def filterproponto(sampleonto):
    """Pre-process the propagated ontology for ST data.

    Keeps only the columns (brain areas) whose column sum is greater than 5;
    areas with 5 or fewer are dropped because that is not enough for a
    train/test split to have non-zero areas.
    """
    samples_per_area = sampleonto.sum(axis=0)
    has_enough_samples = samples_per_area > 5
    return sampleonto.loc[:, has_enough_samples]

def getleaves(propontvox, ontology):
    """Return the areas of propontvox that are leaves of the ontology.

    Leaves are brain areas in the ontology that never show up in the parent
    column.

    Args:
        propontvox: table whose column labels are area ids (each label must be
            convertible with int()).
        ontology: table with a `parent` column of area ids; values are
            converted from float to int because ids are ints.

    Returns:
        list of the column labels of propontvox, in their original order and
        with their original type, whose id is not any area's parent.
    """
    #a set makes each membership test O(1) instead of scanning the whole parent list
    parent_ids = {int(parent) for parent in ontology.parent}

    #remove parents from all areas
    return [area for area in list(propontvox) if int(area) not in parent_ids]

def analytical_auroc(featurevector, binarylabels):
    """analytical calculation of auroc from ranks

       inputs: feature (ranks of the predictions, one per sample),
               binary label (1 = positive, 0 = negative; must provide .sum())
       returns: auroc = sumranks/(neg*pos) - (pos+1)/(2*neg), where sumranks is
                the sum of the ranks belonging to the positive samples.
                NaN when the labels contain only one class, since the AUROC is
                undefined without both positives and negatives.

       Features and labels are paired by position. The rank sum does not depend
       on the order of the samples, so no sorting is needed.
    """
    poslabels = binarylabels.sum()
    neglabels = len(binarylabels) - poslabels

    #single-class labels would divide by zero below; report the undefined score explicitly
    if poslabels == 0 or neglabels == 0:
        return np.nan

    #get the sum of the ranks in feature vector corresponding to 1's in binary vector
    ranks = np.asarray(featurevector, dtype=float)
    is_positive = np.asarray(binarylabels) == 1
    sumranks = ranks[is_positive].sum()

    auroc = (sumranks / (neglabels * poslabels)) - ((poslabels + 1) / (2 * neglabels))

    return auroc

# LASSO functions

In [7]:
def applyLASSO(Xtrain, Xtest, ytrain, ytest):
    """Fit a LASSO regression on the train fold and score both folds by AUROC.

    The model (alpha=0.05, max_iter=10000) is fitted on (Xtrain, ytrain). For
    each fold the predictions are converted to ranks and passed, together with
    that fold's labels, to analytical_auroc.

    Returns:
        (auroc_train, auroc_test)
    """
    model = Lasso(alpha=0.05, max_iter=10000)
    model.fit(Xtrain, ytrain)

    #score the train fold first, then the test fold
    fold_aurocs = []
    for features, labels in ((Xtrain, ytrain), (Xtest, ytest)):
        ranked_predictions = sp.stats.mstats.rankdata(model.predict(features))
        fold_aurocs.append(analytical_auroc(ranked_predictions, labels))

    auroc_train, auroc_test = fold_aurocs
    return auroc_train, auroc_test

In [8]:
def getallbyall(data, propont, max_rows=1):
    """Pairwise area-vs-area predictability (AUROC) matrices using LASSO.

    For every pair of areas (i, j) with j > i (upper triangular), the samples
    belonging to either area are selected, split 50/50 into stratified
    train/test folds, z-scored per fold and passed to applyLASSO with
    membership in area i as the binary label.

    Args:
        data: samples x features table, row-aligned with propont.
        propont: samples x areas table; a sample is selected for a pair when
            the sum of its two area columns is non-zero.
        max_rows: number of leading areas (matrix rows) to process. The default
            of 1 keeps the behaviour this function always had, where the loop
            stopped after the first area, which makes for a quick smoke test.
            Pass None to compute the full upper triangle.

    Returns:
        (allbyall_train, allbyall_test): areas x areas float DataFrames. Entry
        [i, j] holds the train / test AUROC for that pair; every entry that was
        not computed stays 0.0.
    """
    areas = list(propont)
    n_areas = len(areas)

    #initialize float zeros dataframes so AUROC values can be stored without
    #an int -> float dtype change on assignment
    allbyall_test = pd.DataFrame(0.0, index=areas, columns=areas)
    allbyall_train = pd.DataFrame(0.0, index=areas, columns=areas)

    #for each column, brain area
    for i in range(n_areas):
        if max_rows is not None and i >= max_rows:
            break
        print("col %d" %i)
        area1 = areas[i]
        #for each row in each column
        for j in range(i+1, n_areas): #upper triangular!
            area2 = areas[j]
            #samples that belong to either of the two areas (computed once per pair)
            in_pair = (propont[area1] + propont[area2]) != 0
            #get binary label vectors
            ylabels = propont.loc[in_pair, area1]
            #subset train and test sets for only samples in the two areas
            Xcurr = data.loc[in_pair, :]
            #split train test for X data and y labels
            #the split is seeded (random_state=42) so all runs split the same way
            Xtrain, Xtest, ytrain, ytest = train_test_split(Xcurr, ylabels, test_size=0.5,
                                                            random_state=42, shuffle=True,
                                                            stratify=ylabels)
            #z-score current X data
            #NOTE(review): each fold is standardised with its own statistics because
            #zscore fits a new scaler on whatever it is given; confirm this is intended
            #rather than reusing the train-fold statistics for the test fold
            zXtrain = zscore(Xtrain)
            zXtest = zscore(Xtest)

            currauroc_train, currauroc_test = applyLASSO(zXtrain, zXtest, ytrain, ytest)
            allbyall_train.iloc[i,j] = currauroc_train
            allbyall_test.iloc[i,j] = currauroc_test

    return allbyall_train, allbyall_test

# Main

In [13]:
#pre-processing
# Load the three ST tables from the filtered HDF5 store plus the ontology table
# (paths come from ST_CANTIN_FILT_PATH and ONTOLOGY_PATH).
STspotsmeta, STspots, STpropont, ontology = read_data()

In [14]:
# Keep only the areas (columns) whose column sum is greater than 5.
# Note: this rebinds STpropont, so re-running this cell filters the already
# filtered table rather than the one returned by read_data().
STpropont = filterproponto(STpropont)

In [15]:
STspots = STspots.astype('float64') #convert int to float for z-scoring
#get leaf areas
# leaves holds the column labels of STpropont whose id never appears as a parent in the ontology
leaves = getleaves(STpropont, ontology)
# Rows are selected with a boolean mask built from STspotsmeta.id, so STspotsmeta must be
# index-aligned with STpropont and STspots.
# NOTE(review): isin(leaves) compares STspotsmeta.id against the STpropont column labels;
# this assumes both use the same type (e.g. int ids vs int labels) — TODO confirm.
leafSTpropont = STpropont.loc[STspotsmeta.id.isin(leaves),leaves] #subset prop onto for leaf areas
leafSTspots = STspots.loc[STspotsmeta.id.isin(leaves),:] #subset data for samples from leaves

In [16]:
#predictability matrix using LASSO
# NOTE: as called here, getallbyall only fills row 0 of each matrix (first area vs.
# every other area) — the recorded output shows just "col 0"; all other entries stay zero.
allbyall_train, allbyall_test = getallbyall(leafSTspots, leafSTpropont)

col 0


In [29]:
# number of leaf areas returned by getleaves
len(leaves)

461

In [18]:
# element type of the first entry of STspots
# NOTE(review): the recorded int64 output predates the float64 conversion cell — execution
# counts in this notebook are out of order; re-run top to bottom before relying on it.
type(STspots.iloc[0,0])

numpy.int64

In [47]:
# preview the first rows of the per-spot metadata table
STspotsmeta.head()

Unnamed: 0,slice_index,ML,DV,AP,acronym,name,nuclei,radius,x,y,id
02A_15.8x13.04,02A,3.156438,-3.545032,2.245,MOp1,"Primary motor area, Layer 1",3,72.832245,3479.641,4936.516,320
02A_18.75x13.07,02A,3.012475,-2.6928,2.245,MOp1,"Primary motor area, Layer 1",1,76.475977,3491.171,4074.165,320
02A_16.74x13.07,02A,3.124975,-3.2928,2.245,MOp1,"Primary motor area, Layer 1",1,75.797361,3488.927,4661.036,320
02A_17.81x13.08,02A,3.064854,-2.974134,2.245,MOp1,"Primary motor area, Layer 1",5,73.206277,3493.646,4348.148,320
02A_19.96x14.09,02A,2.831225,-2.4053,2.245,MOp1,"Primary motor area, Layer 1",2,77.408797,3783.901,3720.984,320
