# Load Escore Data

In [7]:
import cPickle 
# Load the pickled E-score dictionary, keyed by (Study, ID).
# The file is opened in binary mode, as pickle data should be, and the `with`
# block closes the handle as soon as loading finishes instead of leaving it
# open for the lifetime of the kernel.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open('PBM.Escores.cpickle', 'rb') as escore_file:
    EScoreDict = cPickle.load(escore_file) # (Study, ID)

In [8]:
# Build a lookup of replicate overlap values from a tab-delimited file:
# the first two columns form the key, the third column is parsed as a float.
ReplicateOverlapsDict = {}
with open('PBM.ReplicateOverlaps.txt', 'r') as infile:
    for line in infile:
        line = line.strip().split('\t')
        key = (line[0], line[1])
        ReplicateOverlapsDict[key] = float(line[2])

# Load DB Info 

In [9]:
import pandas as pd

motifs = pd.read_csv('../DBFiles/motifs.tab', delimiter='\t', index_col = 0, skiprows =[1])
motifs = motifs[motifs['Motif_Type'] == 'PBM']

#Add Study
with open('../DBFiles/motif_sources.tab', 'r') as infile:
    for line in infile:
        line = line.strip().split('\t')
        motifs.loc[motifs['MSource_ID'] == line[0], 'Study'] = line[1]

#Check if it's in the EscoreDict
MissingInfo = []
for MID, info in motifs.iterrows():
    if (info['Study'], info['DBID']) in EScoreDict.keys():
        motifs.loc[MID, 'EscoreData'] = True
    else:
        print 'Missing:', MID, info['Study'], info['DBID']
        MissingInfo.append([MID,info['Study'], info['DBID']])
        motifs.loc[MID, 'EscoreData'] = False

Missing: M00411_1.97d Zta CEBPB_5hmC
Missing: M00412_1.97d Zta CEBPB_5mCG
Missing: M00413_1.97d Zta CEBPB_5mC
Missing: M00414_1.97d Zta CEBPB_C
Missing: M01033_1.97d FOXP1_2011 pEX0123
Missing: M01034_1.97d FOXP1_2011 pEX0124
Missing: M01036_1.97d FOXP1_2011 pEX0125
Missing: M01037_1.97d FOXP1_2011 pEX0126
Missing: M02321_1.97d Li2017 PHF1
Missing: M02322_1.97d Li2017 MTF2
Missing: M02329_1.97d Zta Zta_5hmC
Missing: M02330_1.97d Zta Zta_5mCG
Missing: M02331_1.97d Zta Zta_5mC
Missing: M02332_1.97d Zta Zta_C


In [10]:
#Map Domains 2 TF fams
# Both tables are tab-delimited, indexed by their first column, and have the
# second line of the file skipped.
domains = pd.read_csv('../DBFiles/domains.tab', delimiter='\t', index_col = 0, skiprows =[1])
tf_families = pd.read_csv('../DBFiles/tf_families.tab', delimiter='\t', index_col = 0, skiprows =[1])

# Key: sorted tuple of domain IDs for a family's DBDs. Value: the family ID.
DomainIDs2Fam = {}
for FID, info in tf_families.iterrows():
    DBDs = info['DBDs'].split(',')
    # For each comma-separated DBD name, take the index label of the first
    # row in `domains` whose Pfam_Name matches it.
    DomainIDs = sorted([domains.loc[domains['Pfam_Name'] == DBD].index[0] for DBD in DBDs])
    DomainIDs2Fam[tuple(DomainIDs)] = FID

In [11]:
#Map Motifs 2 TF Fams
motif_features = pd.read_csv('../DBFiles/motif_features.tab', delimiter='\t', index_col = 0)
motif_features = motif_features[motif_features['Motif_ID'].isin(motifs.index)]
tfs =  pd.read_csv('../DBFiles/tfs.tab', delimiter='\t', index_col = 0)
tfs = tfs.loc[set(motifs['TF_ID']),]

for MID, info in motifs.iterrows():
    DBDs = list(set(motif_features.loc[motif_features['Motif_ID'] == MID, 'Domain_ID']))
    DBDs.sort()
    
    FamID = DomainIDs2Fam.get(tuple(DBDs))
    TFFam = tfs.loc[info['TF_ID'], 'Family_ID']
    
    if FamID != None:
        # Check for TF agreement
        FamAgreement = (TFFam == FamID)
        motifs.loc[MID, 'Family_ID'] = FamID
        if FamAgreement == False:
            print 'TFFam Disagreement:',info['TF_ID'], MID, FamID, TFFam
    else:
        print 'Sub: TFFamily:', MID
        motifs.loc[MID, 'Family_ID'] = TFFam

Sub: TFFamily: M00001_1.97d
TFFam Disagreement: T010402_1.97d M00112_1.97d F009_1.97d F010_1.97d
TFFam Disagreement: T010402_1.97d M00113_1.97d F050_1.97d F010_1.97d
TFFam Disagreement: T010403_1.97d M00114_1.97d F009_1.97d F010_1.97d
TFFam Disagreement: T075847_1.97d M00504_1.97d F223_1.97d F116_1.97d
TFFam Disagreement: T076197_1.97d M00505_1.97d F223_1.97d F116_1.97d
TFFam Disagreement: T076449_1.97d M00506_1.97d F251_1.97d F119_1.97d
TFFam Disagreement: T167145_1.97d M00883_1.97d F223_1.97d F164_1.97d
TFFam Disagreement: T170153_1.97d M00935_1.97d F201_1.97d F173_1.97d
TFFam Disagreement: T237329_1.97d M01677_1.97d F281_1.97d F230_1.97d
TFFam Disagreement: T237329_1.97d M01678_1.97d F281_1.97d F230_1.97d
TFFam Disagreement: T238363_1.97d M01710_1.97d F223_1.97d F231_1.97d
TFFam Disagreement: T238369_1.97d M01716_1.97d F289_1.97d F231_1.97d
TFFam Disagreement: T238703_1.97d M01724_1.97d F223_1.97d F231_1.97d
TFFam Disagreement: T316254_1.97d M02106_1.97d F231_1.97d F289_1.97d
TFFam 

# Calculate Overlaps

In [12]:
import os
import itertools
import numpy as np

# For every family with more than one motif that has E-score data, write:
#   * one line to PBM.FamilyOverlapThresholds.txt holding the family ID, name,
#     DBDs and the 25th-percentile replicate overlap (the string 'None' when
#     three or fewer replicate values are available);
#   * ByFamily/<Family_ID>.EscoreOverlaps.txt with one row per motif pair.
if not os.path.isdir('ByFamily'):
    os.mkdir('ByFamily')

# `with` blocks guarantee that both output files are flushed and closed even
# if processing a family raises part-way through.
with open('PBM.FamilyOverlapThresholds.txt', 'w') as FamilyOverlapThresholds:
    for Fam, data in motifs.groupby('Family_ID'):
        data = data[data['EscoreData'] == True]
        if data.shape[0] > 1:

            with open('ByFamily/' + Fam + '.EscoreOverlaps.txt', 'w') as FamFile:
                h = ['MID_x', 'Study_x', 'DBID_x', 'MID_y', 'Study_y', 'DBID_y', 'OverlapN', 'maxN', 'EScoreOverlap', 'EClass']
                FamFile.write('\t'.join(h) + '\n')
                IDs = data.index

                # Look each motif's row up once; the pair loop below would
                # otherwise repeat the same lookups for every pair.
                MotifInfo = dict((MID, motifs.loc[MID]) for MID in IDs)

                #Assemble the ReplicateOverlap Stats
                ReplicateOverlaps = []
                for x in IDs:
                    x_info = MotifInfo[x]
                    ReplicateOverlap = ReplicateOverlapsDict.get((x_info['Study'], x_info['DBID']))
                    if ReplicateOverlap is not None:
                        ReplicateOverlaps.append(ReplicateOverlap)
                # A cutoff is only derived when more than 3 replicate values exist.
                if len(ReplicateOverlaps) > 3:
                    p25_cutoff = np.percentile(ReplicateOverlaps, 25)
                else:
                    p25_cutoff = None
                #print Fam, tf_families.loc[Fam, 'Family_Name'], p25_cutoff
                FamilyOverlapThresholds.write('\t'.join(map(str, [Fam, tf_families.loc[Fam, 'Family_Name'], tf_families.loc[Fam, 'DBDs'], p25_cutoff])) + '\n')

                #Calculate the Overlap Data for each pair
                for x, y in itertools.combinations(IDs, 2):
                    x_info = MotifInfo[x]
                    x_maxN, x_Escores = EScoreDict[(x_info['Study'], x_info['DBID'])]
                    y_info = MotifInfo[y]
                    y_maxN, y_Escores = EScoreDict[(y_info['Study'], y_info['DBID'])]

                    # Defaults, written as 'nan' when nothing can be computed.
                    OverlapN = np.nan
                    EScoreOverlap = np.nan
                    EClass = np.nan

                    maxN = max([x_maxN, y_maxN])
                    if maxN == 0:
                        maxN = np.nan
                    else:
                        # Overlap = number of entries shared by the first maxN
                        # items of each list, as a fraction of maxN.
                        K_Overlap = set(x_Escores[:maxN])
                        K_Overlap = K_Overlap.intersection(y_Escores[:maxN])
                        OverlapN = len(K_Overlap)
                        EScoreOverlap = 1.0*OverlapN/maxN

                        # EClass is 1 when the overlap reaches the family's
                        # cutoff, 0 otherwise; it stays nan without a cutoff.
                        if p25_cutoff is not None:
                            EClass = int(EScoreOverlap >= p25_cutoff)

                    oline = [x, x_info['Study'], x_info['DBID'], y, y_info['Study'], y_info['DBID'], OverlapN, maxN, EScoreOverlap, EClass]
                    FamFile.write('\t'.join(map(str, oline)) + '\n')