# Load Escore Data
## The first section only needs to be run once (it parses the raw E-score file and caches the result)

In [1]:
# One-time preprocessing, kept commented out: parses the raw E-score table into
# EScoreDict and caches it with cPickle so the next section can load the cache.
# NOTE(review): if this is ever re-run, open the cache file with 'wb' --
# cPickle.HIGHEST_PROTOCOL is a binary pickle format.
#import glob

#EScoreDict = {} # RNCMPT_ID : [# Escores > 0.4, [Sorted List of E-scores]]

#with open('/Users/slambert/EvoCisBP/RNA/ClusterCopy/EscoreData/AB_Escores.40Thresh.txt', 'r') as infile:
#    for line in infile:
#        line = line.strip().split('\t')
#        EScoreDict[line[0]] = [int(line[2]), line[3:]]
        
#import cPickle 
#with open('RNCMPT.Escores.cpickle','w') as handle:
#    cPickle.dump(EScoreDict, handle, cPickle.HIGHEST_PROTOCOL)

## This section loads the parsed data

In [2]:
import cPickle

# Load the cached E-score table.
# EScoreDict maps RNCMPT_ID : [# Escores > 0.4, [Sorted List of E-scores]]
#
# The cache is opened in binary mode ('rb'): it was dumped with
# cPickle.HIGHEST_PROTOCOL, which is a binary format, and text mode can corrupt
# the stream on platforms that translate newlines.
# NOTE(review): unpickling can execute arbitrary code -- only load a cache file
# that was generated locally by the parsing step.
with open('RNCMPT.Escores.cpickle', 'rb') as handle:
    EScoreDict = cPickle.load(handle)

# Load DB Info

In [3]:
import pandas as pd

# Motif table restricted to RNAcompete motifs. skiprows=[1] drops the second
# line of the file (the line directly below the header row).
# .copy() makes the filtered table an independent frame, so the column
# assignments below never write into a slice of the unfiltered table
# (which triggers SettingWithCopyWarning and may silently not stick).
motifs = pd.read_csv('../DBFiles/motifs.tab', delimiter='\t', index_col = 0, skiprows =[1])
motifs = motifs[motifs['Motif_Type'] == 'RNAcompete'].copy()

#Add Study
# Column 0 of motif_sources.tab is matched against MSource_ID; column 1 is
# stored as the motif's Study.
with open('../DBFiles/motif_sources.tab', 'r') as infile:
    for line in infile:
        fields = line.strip().split('\t')
        if len(fields) < 2:
            # Blank or truncated line (e.g. a trailing newline at end of file):
            # there is no study name to assign, so skip it instead of crashing.
            continue
        motifs.loc[motifs['MSource_ID'] == fields[0], 'Study'] = fields[1]

#Map Domains 2 TF fams
domains = pd.read_csv('../DBFiles/domains.tab', delimiter='\t', index_col = 0, skiprows =[1])
tf_families = pd.read_csv('../DBFiles/tf_families.tab', delimiter='\t', index_col = 0, skiprows =[1])

# DomainIDs2Fam: sorted tuple of domain IDs -> family ID. The key is built from
# the comma-separated Pfam names in the family's 'DBDs' field, each translated
# to the index label of the first matching row in the domains table.
DomainIDs2Fam = {}
for FID, info in tf_families.iterrows():
    DomainIDs = sorted(
        domains.loc[domains['Pfam_Name'] == DBD].index[0]
        for DBD in info['DBDs'].split(',')
    )
    DomainIDs2Fam[tuple(DomainIDs)] = FID

In [4]:
#Map Motifs 2 TF Fams
motif_features = pd.read_csv('../DBFiles/motif_features.tab', delimiter='\t', index_col = 0)
motif_features = motif_features[motif_features['Motif_ID'].isin(motifs.index)]
tfs =  pd.read_csv('../DBFiles/tfs.tab', delimiter='\t', index_col = 0)
tfs = tfs.loc[set(motifs['TF_ID']),]

for MID, info in motifs.iterrows():
    DBDs = list(set(motif_features.loc[motif_features['Motif_ID'] == MID, 'Domain_ID']))
    DBDs.sort()
    
    FamID = DomainIDs2Fam.get(tuple(DBDs))
    TFFam = tfs.loc[info['TF_ID'], 'Family_ID']
    
    if FamID != None:
        # Check for TF agreement
        FamAgreement = (TFFam == FamID)
        motifs.loc[MID, 'Family_ID'] = FamID
        if FamAgreement == False:
            print 'TFFam Disagreement:',info['TF_ID'], MID, FamID, TFFam
    else:
        print 'Sub: TFFamily:', MID
        motifs.loc[MID, 'Family_ID'] = TFFam

TFFam Disagreement: T080390_1.95d M032_1.95d F47_1.95d F61_1.95d
TFFam Disagreement: T046073_1.95d M392_1.95d F43_1.95d F47_1.95d


In [5]:
#Check if it's in the EscoreDict
MissingInfo = []
for MID, info in motifs.iterrows():
    if (info['DBID']) in EScoreDict.keys():
        motifs.loc[MID, 'EscoreData'] = True
    else:
        print 'Missing:', MID, info['Study'], info['DBID']
        MissingInfo.append([MID,info['Study'], info['DBID']])
        motifs.loc[MID, 'EscoreData'] = False

# Calculate E-score Overlaps

In [6]:
import os
import itertools
import numpy as np

def escore_overlap(x_entry, y_entry):
    """Compare the leading entries of two EScoreDict records.

    Each argument is one EScoreDict value, laid out as
    [# Escores > 0.4, [Sorted List of E-scores]].

    Returns a tuple (OverlapN, maxN, EScoreOverlap):
      maxN          -- the larger of the two counts
      OverlapN      -- number of distinct items shared by the first maxN
                       entries of both lists
      EScoreOverlap -- OverlapN / maxN as a float
    All three values are NaN when maxN is 0 (nothing to compare).
    """
    x_maxN, x_Escores = x_entry
    y_maxN, y_Escores = y_entry

    maxN = max(x_maxN, y_maxN)
    if maxN == 0:
        return np.nan, np.nan, np.nan

    OverlapN = len(set(x_Escores[:maxN]).intersection(y_Escores[:maxN]))
    return OverlapN, maxN, 1.0*OverlapN/maxN


if not os.path.isdir('ByFamily'):
    os.mkdir('ByFamily')

# One tab-separated file per family, one row per unordered pair of motifs that
# both have E-score data. Families with fewer than two such motifs are skipped.
for Fam, data in motifs.groupby('Family_ID'):
    data = data[data['EscoreData'] == True]
    if data.shape[0] > 1:
        h = ['MID_x', 'Study_x', 'DBID_x', 'MID_y', 'Study_y', 'DBID_y', 'OverlapN', 'maxN', 'EScoreOverlap']
        IDs = data.index

        # 'with' closes the file even if a lookup below raises.
        with open('ByFamily/' + Fam + '.tmpEscoreOverlaps.txt', 'w') as FamFile:
            FamFile.write('\t'.join(h) + '\n')

            #Calculate the Overlap Data for each pair
            for x, y in itertools.combinations(IDs, 2):
                x_info = motifs.loc[x]
                y_info = motifs.loc[y]
                OverlapN, maxN, EScoreOverlap = escore_overlap(EScoreDict[x_info['DBID']],
                                                               EScoreDict[y_info['DBID']])

                oline = [x, x_info['Study'], x_info['DBID'], y, y_info['Study'], y_info['DBID'], OverlapN, maxN, EScoreOverlap]
                FamFile.write('\t'.join(map(str, oline)) + '\n')

# Replicate Overlap

In [7]:
FamReps = {}

ReplicateOverlaps = open('RNCMPT.ReplicateOverlaps.txt', 'w')
h = ['Family_ID', 'Family_Name', 'TF_ID', 'TF_Name', 'TF_Species', 'MID_x', 'DBID_x', 'MID_y', 'DBID_y', 'OverlapN', 'maxN', 'EScoreOverlap']
ReplicateOverlaps.write('\t'.join(map(str, h)) + '\n')

for TF_ID, data in motifs.groupby('TF_ID'):
    if data.shape[0] > 1:
        print TF_ID, tfs.loc[TF_ID,'TF_Name'], tfs.loc[TF_ID,'TF_Species'], data.shape[0]
        IDs = data.index
        for x, y in itertools.combinations(IDs, 2):
            x_info = motifs.loc[x]
            x_maxN, x_Escores = EScoreDict[(x_info['DBID'])]
            y_info = motifs.loc[y]
            y_maxN, y_Escore = EScoreDict[(y_info['DBID'])]
            
            if x_info['Family_ID'] == y_info['Family_ID']:
                OverlapN = np.nan
                EScoreOverlap = np.nan

                maxN = max([x_maxN, y_maxN])
                if maxN == 0:
                    maxN = np.nan
                else:
                    K_Overlap = set(x_Escores[:maxN])
                    K_Overlap = K_Overlap.intersection(y_Escore[:maxN])
                    OverlapN = len(K_Overlap)
                    EScoreOverlap = 1.0*OverlapN/maxN

                oline = [x_info['Family_ID'], tf_families.loc[x_info['Family_ID'] ,'Family_Name'], TF_ID, tfs.loc[TF_ID,'TF_Name'], tfs.loc[TF_ID,'TF_Species'], x, x_info['DBID'], y, y_info['DBID'], OverlapN, maxN, EScoreOverlap]
                ReplicateOverlaps.write('\t'.join(map(str, oline)) + '\n')

ReplicateOverlaps.close()

T015776_1.95d LIN28A Homo_sapiens 2
T016826_1.95d RBM4 Homo_sapiens 2
T017024_1.95d lark Drosophila_melanogaster 3
T033730_1.95d YBX1 Homo_sapiens 2
T057092_1.95d pum Drosophila_melanogaster 6
T060412_1.95d RBM5 Homo_sapiens 2
T080380_1.95d PTBP1 Homo_sapiens 2
T080386_1.95d ELAVL1 Homo_sapiens 5
T080430_1.95d HNRNPL Homo_sapiens 2
T080440_1.95d SRSF9 Homo_sapiens 2
T080447_1.95d TIA1 Homo_sapiens 2
T080477_1.95d MSI1 Homo_sapiens 2
T080480_1.95d SRSF1 Homo_sapiens 6
T080493_1.95d RBMS3 Homo_sapiens 2
T080566_1.95d SRSF10 Homo_sapiens 4
T089894_1.95d aret Drosophila_melanogaster 2
T089895_1.95d Hrb98DE Drosophila_melanogaster 3
T089903_1.95d Hrb27C Drosophila_melanogaster 2
T089916_1.95d msi Drosophila_melanogaster 2
T090006_1.95d shep Drosophila_melanogaster 3
T115273_1.95d sup-12 Caenorhabditis_elegans 2
T147494_1.95d VTS1 Saccharomyces_cerevisiae 2


# Calculate Replicate Thresholds 

In [8]:
# Read the tab-separated replicate-overlap table back in and preview the
# first rows.
ReplicateOverlaps = pd.read_csv('RNCMPT.ReplicateOverlaps.txt', delimiter='\t')
ReplicateOverlaps.head(5)

Unnamed: 0,Family_ID,Family_Name,TF_ID,TF_Name,TF_Species,MID_x,DBID_x,MID_y,DBID_y,OverlapN,maxN,EScoreOverlap
0,F29_1.95d,"CCHC ZF,CSD",T015776_1.95d,LIN28A,Homo_sapiens,M035_1.95d,RNCMPT00036,M153_1.95d,RNCMPT00162,,,
1,F34_1.95d,"CCHC ZF,RRM",T016826_1.95d,RBM4,Homo_sapiens,M050_1.95d,RNCMPT00052,M109_1.95d,RNCMPT00113,35.0,42.0,0.833333
2,F34_1.95d,"CCHC ZF,RRM",T017024_1.95d,lark,Drosophila_melanogaster,M034_1.95d,RNCMPT00035,M094_1.95d,RNCMPT00097,5.0,10.0,0.5
3,F34_1.95d,"CCHC ZF,RRM",T017024_1.95d,lark,Drosophila_melanogaster,M034_1.95d,RNCMPT00035,M119_1.95d,RNCMPT00124,0.0,2.0,0.0
4,F34_1.95d,"CCHC ZF,RRM",T017024_1.95d,lark,Drosophila_melanogaster,M094_1.95d,RNCMPT00097,M119_1.95d,RNCMPT00124,8.0,10.0,0.8


In [9]:
#Divide by RRM and other
ID_RRM = list(ReplicateOverlaps.loc[ReplicateOverlaps['Family_Name'] == 'RRM', 'Family_ID'])[0]
Overlaps_RRM = ReplicateOverlaps.loc[ReplicateOverlaps['Family_ID'] == ID_RRM,'EScoreOverlap']
Overlaps_RRM = Overlaps_RRM[Overlaps_RRM.isnull() == False]
Overlaps_RRM_p25 = np.percentile(Overlaps_RRM, 25)
print 'EScoreOverlap - RRM (25th Percentile):', Overlaps_RRM_p25

Overlaps_Other = ReplicateOverlaps.loc[ReplicateOverlaps['Family_ID'] != ID_RRM,'EScoreOverlap']
Overlaps_Other = Overlaps_Other[Overlaps_Other.isnull() == False]
Overlaps_Other_p25 = np.percentile(Overlaps_Other, 25)
print 'EScoreOverlap - !RRM (25th Percentile):', Overlaps_Other_p25

Overlaps_All = ReplicateOverlaps['EScoreOverlap']
Overlaps_All = Overlaps_All[Overlaps_All.isnull() == False]
Overlaps_All_p25 = np.percentile(Overlaps_All, 25)
print 'EScoreOverlap - All Reps (25th Percentile):', Overlaps_All_p25

EScoreOverlap - RRM (25th Percentile): 0.2
EScoreOverlap - !RRM (25th Percentile): 0.285714285714
EScoreOverlap - All Reps (25th Percentile): 0.209375


## Calculate E-class

In [10]:
import glob
import os

# Suffix (including the separating dot) that the per-family overlap step
# appends to the family ID when naming its files.
TMP_OVERLAP_SUFFIX = '.tmpEscoreOverlaps.txt'

def family_id_from_path(path):
    """Return the family ID encoded in a per-family overlap file name.

    'ByFamily/F47_1.95d.tmpEscoreOverlaps.txt' -> 'F47_1.95d'

    The whole suffix is removed, dot included. Stripping only
    'tmpEscoreOverlaps.txt' leaves a trailing '.', so the ID never equals
    ID_RRM and the RRM-specific threshold is never applied.
    """
    name = os.path.basename(path)
    if name.endswith(TMP_OVERLAP_SUFFIX):
        name = name[:-len(TMP_OVERLAP_SUFFIX)]
    return name

def classify_overlaps(overlaps, threshold):
    """Return one class per overlap: 1 if >= threshold, 0 if below, NaN if null."""
    return [np.nan if pd.isnull(x) else int(x >= threshold) for x in overlaps]


for OverlapFile in glob.glob('ByFamily/*tmpEscoreOverlaps.txt'):
    FamID = family_id_from_path(OverlapFile)
    OverlapData = pd.read_csv(OverlapFile, sep= '\t')

    EScoreOverlaps = OverlapData['EScoreOverlap']

    #Try EClass
    # Family-aware threshold: the RRM replicate percentile for the RRM family,
    # the non-RRM replicate percentile for every other family.
    if FamID == ID_RRM:
        FamThreshold = Overlaps_RRM_p25
    else:
        FamThreshold = Overlaps_Other_p25
    OverlapData['EClass'] = classify_overlaps(EScoreOverlaps, FamThreshold)

    #Try EClass (All)
    # Single threshold taken from all replicate pairs.
    OverlapData['EClass_All'] = classify_overlaps(EScoreOverlaps, Overlaps_All_p25)

    # Written next to the input, with the 'tmp' marker dropped from the name.
    OverlapData.to_csv(OverlapFile.replace('tmpEscoreOverlaps.txt', 'EscoreOverlaps.txt'), sep = '\t', index=False)