In [30]:
import numpy as np
import pandas as pd
import os
import glob
import networkx
from nltk import edit_distance
from itertools import combinations

In [6]:
# Paths of every file inside any directory whose name ends in "_2".
satcs = glob.glob('*_2/*')

# Load one headerless, tab-separated unpacked SATC bin; columns are therefore
# labelled 0..4 by position.
satc_ = pd.read_csv(
    'tabula_sapiens_10x_unpacked_2/TSP_10x_gap_0_unpublished_SPLIT_TSP21_SPLIT_BoneMarrowbin88.satc_unpacked',
    sep='\t',
    header=None,
)

# Keep rows whose column 4 exceeds 1 (column 4 is renamed to 'count' in a later cell).
satc_ = satc_.loc[satc_[4] > 1]

# Restrict to a single value of column 2 (renamed to 'anchor' in a later cell).
satc = satc_.loc[satc_[2] == 'AGAAGAACTAATGTTAGTATAAGTAAC']

# Display the filtered shape and the number of rows per column-2 value.
(satc_.shape, satc_[2].value_counts().reset_index())

((220244, 5),
                             index     2
 0     AGAAGAACTAATGTTAGTATAAGTAAC  4578
 1     GAGCCTGGTGATAGCTGGTTGTCCAAG  4219
 2     CGCATAAGCCTGCGTCAGATTAAAACA  4179
 3     CTCCTCACACCCAATTGGACCAATCTA  3829
 4     ATTGGACCAATCTATCACCCTATAGAA  3807
 ...                           ...   ...
 1321  TGGAATTCTTTGTCTTTGACTTTTGAC     2
 1322  AGCGTGGCCGTTGGCTGCCTCGCACAG     2
 1323  CCGACGGCACCTACGGCTCAACATTTT     2
 1324  GCTCATATGCGAGCGCTAATTCTGTGG     2
 1325  CTCAGGGAGTGCATCCGCCCCAACCCT     2
 
 [1326 rows x 2 columns])

In [7]:
# Give the positional columns readable names, then keep only those four
# (positional column 0 is dropped by the selection).
satc = satc.rename(
    columns={1: 'sample', 2: 'anchor', 3: 'target', 4: 'count'}
).loc[:, ['sample', 'anchor', 'target', 'count']]

### Define configurations for a target-centered graph. 


graphName : (anchor, class)

nodeType : "target" 

nodeFeatures : [ "seqComp" , "sampleFraction" ]

edgeFeatures : [ "targetHamming" , "targetLevenshtein" , "corrSampleFractions" , 
                 "corrBoolSampleFractions" , "naiveMSA" ]

connectedness: [ "full" , "sampleCorrelated" , "toleratedHamming_"+{float} , "toleratedLevenshtein_"+{float} ] 



### Define configurations for a sample-centered graph. 

graphName : (anchor, class)

nodeType : "sample" 

nodeFeatures : [ "maskedSeqComp" , "sampleFraction" ]

edgeFeatures : [ "fullTargetHamming" , "fullTargetLevenshtein" ,"aggTargetHamming" , "aggTargetLevenshtein" ,
                 "corrSampleFractions" ,  "corrBoolSampleFractions" , "naiveMSA" ]

connectedness: [ "full" , "sampleCorrelated" , "toleratedMeanHamming_"+{float} , "toleratedMeanLevenshtein_"+{float} ] 
    

In [11]:
def _one_hot_sequence(sequence):
    """Return a (4, len(sequence)) float matrix one-hot encoding A, C, G, T (in that row order)."""
    bases = np.array(list(sequence), dtype=str)
    # Characters other than A/C/G/T leave an all-zero column.
    return np.stack([bases == base for base in 'ACGT']).astype(float)


def _ordered_count_fractions(subset, keyColumn, ordering):
    """Return the counts of `subset`, aligned to `ordering` on `keyColumn`, normalised to sum to 1."""
    # Summing per key first keeps the output aligned with `ordering` even if a
    # key occurs on several rows; keys absent from `subset` get a count of 0.
    counts = subset.groupby(keyColumn)['count'].sum().reindex(ordering, fill_value=0)
    values = counts.to_numpy(dtype=float)
    total = values.sum()
    # Leave an all-zero vector untouched rather than producing NaNs from 0/0.
    return values / total if total > 0 else values


def node_featurization(data, nodeType, nodeFeatures, ordering):
    """Compute the requested per-node features.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'sample', 'target' and 'count'.
    nodeType : str
        'target' or 'sample'; decides what a node is for 'sampleFraction'.
    nodeFeatures : collection of str
        Any of 'seqComp', 'sampleFraction', 'maskedSeqComp'.
    ordering : sequence
        Fixed axis order of the per-node vectors: sample names when
        nodeType == 'target', target sequences when nodeType == 'sample'
        (and for 'maskedSeqComp').

    Returns
    -------
    dict
        Maps each requested feature name to a dict of per-node arrays:
        'seqComp'        -> {target: (4, len(target)) one-hot matrix}
        'sampleFraction' -> {node: count fractions aligned to `ordering`}
        'maskedSeqComp'  -> {sample: (len(ordering), 4, len(ordering[0])) array
                             holding the one-hot matrix of every target of
                             `ordering` seen in that sample, zeros elsewhere}
    """
    nodeFeatureDict = dict()

    wantsSeqComp = 'seqComp' in nodeFeatures
    wantsMasked = 'maskedSeqComp' in nodeFeatures

    # The one-hot matrices are shared by 'seqComp' and 'maskedSeqComp', so
    # build them once whenever either feature is requested.
    oneHotByTarget = None
    if wantsSeqComp or wantsMasked:
        oneHotByTarget = {
            target: _one_hot_sequence(target) for target in data['target'].unique()
        }

    if wantsSeqComp:
        nodeFeatureDict['seqComp'] = oneHotByTarget

    if 'sampleFraction' in nodeFeatures:

        nodeFeatureDict['sampleFraction'] = dict()

        if nodeType == 'sample':
            # One vector per sample, laid out over the targets in `ordering`.
            for sample in data['sample'].unique():
                rows = data[data['sample'] == sample]
                nodeFeatureDict['sampleFraction'][sample] = _ordered_count_fractions(
                    rows, 'target', ordering
                )

        if nodeType == 'target':
            # One vector per target, laid out over the samples in `ordering`;
            # the alignment key here is the 'sample' column.
            for target in data['target'].unique():
                rows = data[data['target'] == target]
                nodeFeatureDict['sampleFraction'][target] = _ordered_count_fractions(
                    rows, 'sample', ordering
                )

    if wantsMasked:

        nodeFeatureDict['maskedSeqComp'] = dict()
        targetLength = len(ordering[0]) if len(ordering) else 0

        for sample in data['sample'].unique():

            sampleTargets = set(data.loc[data['sample'] == sample, 'target'])
            masked = np.zeros(shape=(len(ordering), 4, targetLength))

            for index, target in enumerate(ordering):
                # Targets the sample never produced keep their all-zero slice.
                if target in sampleTargets:
                    masked[index] = oneHotByTarget[target]

            nodeFeatureDict['maskedSeqComp'][sample] = masked

    return nodeFeatureDict

In [67]:
def pairwise_similarities(values, function):
    """Evaluate `function` on every ordered pair of `values`.

    Returns a square float array in which entry [i][j] is
    function(values[i], values[j]); both (i, j) and (j, i) are evaluated.
    """
    size = len(values)
    positions = range(size)
    scores = np.zeros(shape=(size, size))

    for row in positions:
        # Fill one full row at a time, left to right.
        scores[row] = [function(values[row], values[col]) for col in positions]

    return scores


def hamming_distance(seq1, seq2):
    """Count the positions at which `seq1` and `seq2` differ.

    Positions are paired with zip, so only the first min(len(seq1), len(seq2))
    positions are compared.
    """
    mismatches = 0

    for left, right in zip(seq1, seq2):
        if left != right:
            mismatches += 1

    return mismatches


def correlation_matrix(vectorDict, keys, isBoolMasked):
    """Dot-product similarity between the vectors stored under `keys`.

    Parameters
    ----------
    vectorDict : dict
        Maps each key to a 1-D numeric vector; all vectors share one length.
    keys : sequence
        Keys to compare; row/column i of the result belongs to keys[i].
    isBoolMasked : bool
        If True, each vector is replaced by its (value > 0) indicator before
        the dot products are taken.

    Returns
    -------
    numpy.ndarray
        When isBoolMasked is True: the (len(keys), len(keys)) matrix alone.
    (numpy.ndarray, list of tuple)
        When isBoolMasked is False: the matrix plus one (keys[i], keys[j])
        tuple, i < j, for every pair of distinct keys with a positive product.
    """
    size = len(keys)

    if size == 0:
        correlationMatrix = np.zeros(shape=(0, 0))
        return correlationMatrix if isBoolMasked else (correlationMatrix, [])

    stacked = np.array([vectorDict[key] for key in keys], dtype=float)

    if isBoolMasked:
        # Rows and columns are addressed by position in `keys`, never by the
        # key values themselves.
        stacked = (stacked > 0).astype(float)
        return stacked @ stacked.T

    correlationMatrix = stacked @ stacked.T

    # The matrix is symmetric, so scanning the upper triangle yields every
    # unordered pair exactly once without a membership check.
    edgeList = []
    for i in range(size):
        for j in range(i + 1, size):
            if correlationMatrix[i][j] > 0 and keys[i] != keys[j]:
                edgeList.append((keys[i], keys[j]))

    return correlationMatrix, edgeList

In [68]:
def construct_graph(data, graphName, nodeType, nodeFeatures, edgeFeatures, connectedness, ordering):
    """Assemble node features, an edge list and edge features for one graph.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'sample', 'target' and 'count'.
    graphName : object
        Accepted for the caller's bookkeeping; not used in this function.
    nodeType : str
        'target' or 'sample'; the nodes are the unique values of that column.
    nodeFeatures : collection of str
        Forwarded to node_featurization.
    edgeFeatures : collection of str
        Names of the edge features to compute.
    connectedness : collection of str
        'full' connects every pair of nodes; 'sampleCorrelated' keeps only the
        pairs returned by correlation_matrix and takes precedence over 'full'.
        Names containing 'Hamming' / 'Levenshtein' trigger the corresponding
        distance matrix, but no edge filtering on those distances is done here.
    ordering : sequence
        Axis order passed to node_featurization; for nodeType == 'sample' it is
        also the list of sequences the distance matrices are computed over.

    Returns
    -------
    dict
        Keys 'nodeFeatures', 'edges' and 'edgeFeatures'.

    Raises
    ------
    ValueError
        If nodeType is neither 'target' nor 'sample'.
    """
    if nodeType not in ('target', 'sample'):
        raise ValueError("nodeType must be 'target' or 'sample', got %r" % (nodeType,))

    output_dict = dict()
    output_dict['nodeFeatures'] = node_featurization(data, nodeType, nodeFeatures, ordering)

    # The node column carries the same name as the node type.
    node_value_arr = data[nodeType].unique()

    # Flatten the option lists once so that prefixed/suffixed names such as
    # "toleratedHamming_0.2" are matched by substring.
    edgeFeatureNames = ''.join(edgeFeatures)
    connectednessNames = ''.join(connectedness)

    ####
    #### If needed, precompute all pairwise Hamming and/or Levenshtein distances.

    # Distances are taken between the targets themselves for a target graph and
    # between the targets listed in `ordering` for a sample graph.
    sequences = node_value_arr if nodeType == 'target' else ordering

    hamDistanceMatrix = 0
    # Each side of the `or` needs its own membership test; a bare joined string
    # would be truthy whenever it is non-empty.
    if 'Hamming' in edgeFeatureNames or 'Hamming' in connectednessNames:
        hamDistanceMatrix = pairwise_similarities(sequences, hamming_distance)

    levDistanceMatrix = 0
    if 'Levenshtein' in edgeFeatureNames or 'Levenshtein' in connectednessNames:
        levDistanceMatrix = pairwise_similarities(sequences, edit_distance)

    ####
    #### Define edge sets prior to constructing edge features.

    edges = []
    if 'full' in connectedness:
        edges = list(combinations(node_value_arr, 2))

    needsCorrelation = 'sampleCorrelated' in connectedness or 'corr' in edgeFeatureNames
    needsBoolCorrelation = 'Bool' in edgeFeatureNames

    sampleFractions = None
    if needsCorrelation or needsBoolCorrelation:
        # Node features are keyed by feature name. Compute the fractions on the
        # fly when they were not requested as a node feature.
        sampleFractions = output_dict['nodeFeatures'].get('sampleFraction')
        if sampleFractions is None:
            sampleFractions = node_featurization(
                data, nodeType, ['sampleFraction'], ordering
            )['sampleFraction']

    correlationMatrix = 0
    if needsCorrelation:
        correlationMatrix, correlatedEdges = correlation_matrix(sampleFractions, node_value_arr, False)
        # Only replace the edge set when correlation-based connectivity was
        # requested, not merely a correlation edge feature.
        if 'sampleCorrelated' in connectedness:
            edges = correlatedEdges

    boolCorrelationMatrix = 0
    if needsBoolCorrelation:
        boolCorrelationMatrix = correlation_matrix(sampleFractions, node_value_arr, True)

    output_dict['edges'] = edges

    # The `edgeFeatures` parameter shadows the module-level function of the
    # same name, so fetch the function explicitly from the module namespace.
    edgeFeaturizer = globals()['edgeFeatures']
    output_dict['edgeFeatures'] = edgeFeaturizer(
        data, nodeType, edges, edgeFeatures, node_value_arr, ordering,
        hamDistanceMatrix, levDistanceMatrix, correlationMatrix, boolCorrelationMatrix,
    )

    return output_dict

In [76]:
def edgeFeatures(data, nodeType, edgeFeatures, nodeFeatures, nodeOrdering, ordering, hamDistanceMatrix, levDistanceMatrix, correlationMatrix, boolCorrelationMatrix):
    """Look up the requested features for every edge.

    NOTE(review): the positional contract follows the call in construct_graph,
    which passes the edge list third and the edge-feature names fourth. The
    parameter names are kept unchanged for compatibility, so inside this
    function `edgeFeatures` holds the edges and `nodeFeatures` holds the
    feature names.

    Parameters
    ----------
    data, nodeType, ordering
        Accepted for signature compatibility; not used here.
    edgeFeatures : iterable of tuple
        Edges as (node, node) pairs.
    nodeFeatures : collection of str
        Edge feature names to emit: 'corrBoolSampleFractions',
        'corrSampleFractions', 'targetHamming', 'targetLevenshtein'.
    nodeOrdering : sequence
        Nodes in the row/column order of the matrices.
    hamDistanceMatrix, levDistanceMatrix, correlationMatrix, boolCorrelationMatrix
        Square matrices indexed by position in `nodeOrdering`; a matrix is only
        read when its feature is requested, so placeholders are fine otherwise.

    Returns
    -------
    dict
        {edge: {featureName: value}} for every edge.
    """
    edges = edgeFeatures
    featureNames = nodeFeatures

    # Resolve each node to a plain integer position once; the first occurrence
    # wins if a node is listed more than once.
    nodeIndex = dict()
    for position, node in enumerate(nodeOrdering):
        nodeIndex.setdefault(node, position)

    # Each feature is stored under its own name and read from its own matrix.
    featureSources = (
        ('corrBoolSampleFractions', boolCorrelationMatrix),
        ('corrSampleFractions', correlationMatrix),
        ('targetHamming', hamDistanceMatrix),
        ('targetLevenshtein', levDistanceMatrix),
    )
    requested = [(name, matrix) for name, matrix in featureSources if name in featureNames]

    edgeFeatureDict = dict()

    for edge in edges:

        i, j = nodeIndex[edge[0]], nodeIndex[edge[1]]
        edgeFeatureDict[edge] = {name: matrix[i][j] for name, matrix in requested}

    return edgeFeatureDict