# Specify Parameters/Requirements/Functions

In [1]:
import ast
import glob
import gzip
import itertools
import json
import os

import pandas as pd

from similarityregression import AlignmentTools as alntools
from similarityregression import PairwiseAlignment as pwsaln

### File Locations

In [2]:
# Molecule type being processed; it selects both the input database folder and
# the top-level output folder.
WorkingOn = 'DNA'

# Input locations inside the database folder for the selected molecule type.
loc_CurrentDB = '../CisBP/' + WorkingOn + '/'
loc_EScoreOverlaps = loc_CurrentDB + 'Escores/ByFamily/'
# loc_CurrentDB already ends with '/', so no leading slash here (avoids '//').
loc_DBFiles = loc_CurrentDB + 'DBFiles/'
loc_DBDAlignments = loc_CurrentDB + 'DomainAlignments/'

#Construct Alignments
loc_ConstructAlignments = WorkingOn + '/ConstructAlignments/Unaligned/'

#Models
loc_ModelsByFamily = WorkingOn + '/ByFamily/'

# Create every output folder up front. os.makedirs also creates missing parent
# folders (e.g. the top-level WorkingOn folder), which os.mkdir would refuse
# to do and fail with OSError on a fresh checkout.
for loc_OutputDir in [loc_ConstructAlignments,
                      loc_ConstructAlignments.replace('Unaligned', 'Aligned'),
                      loc_ModelsByFamily]:
    if not os.path.isdir(loc_OutputDir):
        os.makedirs(loc_OutputDir)

### Read DBFiles

In [3]:
# All four DBFiles tables are read the same way: tab-delimited, indexed by the
# first column, and with the line right after the header (row 1) skipped.
_dbfile_format = dict(sep='\t', skiprows=[1], index_col=0)

motifs = pd.read_csv(loc_DBFiles + 'motifs.tab', **_dbfile_format)
motif_features = pd.read_csv(loc_DBFiles + 'motif_features.tab', **_dbfile_format)
domains = pd.read_csv(loc_DBFiles + 'domains.tab', **_dbfile_format)
tf_families = pd.read_csv(loc_DBFiles + 'tf_families.tab', **_dbfile_format)

### Blacklists & Replacements

In [4]:
# Families that are skipped when building construct alignments, mapped to the
# reason. Every family whose 'DBDs' entry is 'UNKNOWN' is included; the dict is
# simply empty when the table has no such family (the previous `.index[0]`
# raised IndexError in that case and ignored any family beyond the first).
blacklist_BadFamilies = {
    ID_TFFam: 'UNKNOWN'
    for ID_TFFam in tf_families.index[tf_families['DBDs'] == 'UNKNOWN']
}

#Filter motifs with missing data
blacklist_motifs = set(['M00001_1.97d'])
# errors='ignore' keeps this cell re-runnable: on a second run the motif is
# already gone and drop() would otherwise raise KeyError.
motifs = motifs.drop(list(blacklist_motifs), axis=0, errors='ignore')

#Fill the alignment locations
# Maps each Pfam domain name to the alignment file used for it: the
# '.muscle.fa' file when it exists on disk, otherwise the '.hmmaln' file.
dict_DBDAlignments = {}
for Pfam_Name in domains['Pfam_Name'].unique():
    dalnloc = loc_DBDAlignments + Pfam_Name + '.muscle.fa'
    if not os.path.isfile(dalnloc):
        dalnloc = loc_DBDAlignments + Pfam_Name + '.hmmaln'
    dict_DBDAlignments[Pfam_Name] = dalnloc

# Studies whose pairs are left out of the training data. This must stay a set
# (it is used through .intersection); e.g. use set(['Lam11', 'Barrera2016'])
# to remove synthetic/mutated constructs.
Blacklist_Studies = set()


# 1) Create unaligned construct sequence files
Based on the constructs that have E-score overlaps (i.e., constructs from PBM or RNAcompete experiments).

In [5]:
# Group the motif feature sequences by motif once, so the table is not
# re-scanned for every motif of every family. Row order within a motif is kept.
FeatSeqs_ByMotif = {}
for MID, unaln in zip(motif_features['Motif_ID'], motif_features['MotifFeature_Sequence']):
    FeatSeqs_ByMotif.setdefault(MID, []).append(unaln)

# For every family with an E-score overlap file, write one line per motif:
#   <Motif_ID> TAB <comma-separated aligned feature sequences>
# Each aligned sequence spans all of the family's DBDs, with gaps filling the
# DBDs that the feature does not belong to.
for EScoreOverlapFile in glob.glob(loc_EScoreOverlaps + '*'):
    ID_TFFam = os.path.basename(EScoreOverlapFile).replace('.EscoreOverlaps.txt', '')

    # Skip blacklisted families before doing any file parsing for them.
    if ID_TFFam in blacklist_BadFamilies:
        continue

    EScoreOverlaps = pd.read_csv(EScoreOverlapFile, sep = '\t')

    tf_family = tf_families.loc[ID_TFFam]
    DBDs = tf_family['DBDs'].split(',')

    # Per DBD: {feature sequence ID -> aligned sequence} and the aligned length.
    AlnDict_ByPfam = {}
    PfamAlnLens = {}

    for DBD in DBDs:
        DBD_alns = {}
        loc_alnFile = dict_DBDAlignments[DBD]
        if 'hmmaln' in loc_alnFile:
            #Parse PFams for match positions
            alnmnt, matchpos, _, _ = alntools.ParseStockholmWithMatches(loc_alnFile)
            for record in alnmnt:
                aln = str(record.seq)
                # Keep only the columns listed in matchpos.
                aln_matchpos = ''.join([aln[i] for i in matchpos])
                DBD_alns[record.id] = aln_matchpos.upper().replace('.', '-')
        else:
            for unaln, aln in alntools.FastaIter(loc_alnFile):
                DBD_alns[unaln] = aln.upper().replace('.', '-')
        AlnDict_ByPfam[DBD] = DBD_alns
        # Measure the length on a *stored* sequence. The previous len(aln) used
        # whatever the loops above left behind: in the hmmaln branch that is the
        # full alignment row rather than the match-position string, and for an
        # empty alignment file it was undefined or stale from the previous DBD.
        PfamAlnLens[DBD] = len(next(iter(DBD_alns.values()))) if DBD_alns else 0

    # Place each aligned sequence in its own DBD's slot and pad every other
    # DBD's slot with gaps of that DBD's aligned length.
    JointSeqDict = {}
    for currentDBD, currentDBD_dict in AlnDict_ByPfam.items():
        for unaln, aln in currentDBD_dict.items():
            JointSeqDict[unaln] = ''.join(
                [aln if DBD == currentDBD else '-' * PfamAlnLens[DBD] for DBD in DBDs])

    MIDs = set(EScoreOverlaps['MID_x']) | set(EScoreOverlaps['MID_y'])
    with open(loc_ConstructAlignments + ID_TFFam + '.txt', 'w') as outf:
        for MID in MIDs:
            if MID not in FeatSeqs_ByMotif:
                # Motif has E-score overlaps but no rows in motif_features.
                print('! MISSING DBD/RBD Domain Info: %s' % MID)
                continue
            alnseqs = [JointSeqDict[unaln] for unaln in FeatSeqs_ByMotif[MID]]
            outf.write('\t'.join([MID, ','.join(alnseqs)]) + '\n')

! MISSING DBD/RBD Domain Info: M00001_1.97d


# 2) Align constructs with pairwise DBD/RBD alignment 

In [None]:
# For each family's unaligned construct file, align every pair of constructs
# and write one line per pair to a gzipped file in the 'Aligned' folder:
#   <str((x, y))> TAB <JSON of the alignment result plus E-score annotations>
for loc_ConstructAlignment in glob.glob(loc_ConstructAlignments + '*'):
    print(loc_ConstructAlignment)

    # Same file name, but under .../Aligned/ ('.gz' is appended when opening).
    loc_AlnJSON = loc_ConstructAlignment.replace('Unaligned', 'Aligned')

    # E-score table of this family, indexed by its first and fourth columns;
    # the lookups below address it with the (x, y) construct pair.
    Family_ID = loc_ConstructAlignment.split('/')[-1].replace('.txt','')
    EScoreOverlapFile = loc_EScoreOverlaps + Family_ID + '.EscoreOverlaps.txt'
    EScoreOverlaps = pd.read_csv(EScoreOverlapFile, sep = '\t', index_col=[0,3])

    # Read construct sequences: ID -> list of comma-separated sequences.
    UnalnDict = {}
    with open(loc_ConstructAlignment, 'r') as infile:
        for line in infile:
            ID, seqfield = line.strip().split('\t')
            UnalnDict[ID] = seqfield.split(',')
    IDs = sorted(UnalnDict)

    # All unordered pairs, generated from the sorted IDs.
    with gzip.open(loc_AlnJSON + '.gz', 'w') as outfile:
        for x, y in itertools.combinations(IDs, 2):
            pair = (x, y)
            o = pwsaln.AlignDBDArrays((x, UnalnDict[x]), (y, UnalnDict[y]), ByPosNorm = 'L')
            o['EScoreOverlap'] = EScoreOverlaps.loc[pair, 'EScoreOverlap']
            o['EClass'] = EScoreOverlaps.loc[pair, 'EClass']
            o['Study'] = list(EScoreOverlaps.loc[pair, ['Study_x', 'Study_y']])

            outfile.write(str(pair) + '\t' + json.dumps(o) + '\n')

DNA/ConstructAlignments/Unaligned/F002_1.97d.txt
DNA/ConstructAlignments/Unaligned/F007_1.97d.txt
DNA/ConstructAlignments/Unaligned/F009_1.97d.txt
DNA/ConstructAlignments/Unaligned/F024_1.97d.txt
DNA/ConstructAlignments/Unaligned/F026_1.97d.txt
DNA/ConstructAlignments/Unaligned/F028_1.97d.txt
DNA/ConstructAlignments/Unaligned/F039_1.97d.txt
DNA/ConstructAlignments/Unaligned/F050_1.97d.txt
DNA/ConstructAlignments/Unaligned/F082_1.97d.txt
DNA/ConstructAlignments/Unaligned/F088_1.97d.txt
DNA/ConstructAlignments/Unaligned/F091_1.97d.txt


# 3) Create training dataframes

In [None]:
# For each family's aligned-pairs file, write the training tables into
# <loc_ModelsByFamily>/<FID>/TrainingData/:
#   Y_Sims_PctID.csv.gz    one row per kept pair with its similarity columns
#   X_*.csv.gz             one row per kept pair with the by-position values
#   CVTestIndicies_i0.txt  per construct: 0-based row numbers of its pairs
# A pair is kept unless its EScoreOverlap is null or one of its studies is in
# Blacklist_Studies. Row numbers always refer to kept pairs only, so they line
# up with the rows of the CSV files.

# Columns copied from each alignment record into the Y table, in order.
Y_Cols = ['EScoreOverlap', 'EClass', 'PctID_L', 'PctID_S', 'ArrayLenDifference', 'MultiAlnFlag']
# (output file name, key of the by-position list in the alignment record)
X_Specs = [('X_PctID.csv.gz', 'ByPos.PctID'),
           ('X_AvgB62.csv.gz', 'ByPos.AvgB62'),
           ('X_PctID_Smooth3.csv.gz', 'ByPos.PctID.Smooth3'),
           ('X_AvgB62_Smooth3.csv.gz', 'ByPos.AvgB62.Smooth3')]

for loc_AlnJSON in glob.glob(loc_ConstructAlignments.replace('Unaligned', 'Aligned') + '*'):
    # The aligned files in this notebook are written as '<name>.txt.gz'. Strip
    # the whole suffix; stripping only '.txt' left '.gz' inside FID and in the
    # output folder name.
    # NOTE(review): results from earlier runs live under '<FID>.gz/' folders.
    FID = os.path.basename(loc_AlnJSON).replace('.txt.gz', '').replace('.txt', '')
    print('Parsing for R: %s' % FID)

    #Open Outputs (and make folders if necessary)
    loc_OutputFiles = loc_ModelsByFamily + FID + '/TrainingData/'
    if not os.path.isdir(loc_OutputFiles):
        os.makedirs(loc_OutputFiles)

    IDs = []        # kept pairs, in the order their rows are written
    OutFiles = []   # every opened output handle, closed in the finally below
    try:
        Y_Sims_PctID = gzip.open(loc_OutputFiles + 'Y_Sims_PctID.csv.gz', 'w')
        OutFiles.append(Y_Sims_PctID)
        X_Files = []
        for fname, dictID in X_Specs:
            filehandle = gzip.open(loc_OutputFiles + fname, 'w')
            OutFiles.append(filehandle)
            X_Files.append((filehandle, dictID))

        #Loop through alignment info
        with gzip.open(loc_AlnJSON, 'r') as infile:
            for line in infile:
                ID, aln = line.strip().split('\t')

                # The ID field is the str() of a tuple. literal_eval parses
                # such literals without executing arbitrary code from the
                # file, which eval() would do.
                ID = ast.literal_eval(ID)
                aln = json.loads(aln)

                #Skip pairs with null E-scores or from blacklisted studies
                if pd.isnull(aln['EScoreOverlap']) or Blacklist_Studies.intersection(aln['Study']):
                    continue

                # Headers go out right before the first *kept* row. Keying
                # them on the first line read lost the headers whenever that
                # line was skipped.
                if not IDs:
                    Y_Sims_PctID.write(','.join(['MID_x', 'MID_y'] + Y_Cols) + '\n')
                    h = ['MID_x', 'MID_y'] + ['p' + str(p + 1) for p in range(len(aln['ByPos.PctID']))]
                    for filehandle, dictID in X_Files:
                        filehandle.write(','.join(h) + '\n')
                IDs.append(ID)

                #1) Parse the Y-info
                oline = list(ID) + [aln[col] for col in Y_Cols]
                Y_Sims_PctID.write(','.join(map(str, oline)) + '\n')

                #2) Parse the X matrices
                for filehandle, dictID in X_Files:
                    oline = list(ID) + [str(v) for v in aln[dictID]]
                    filehandle.write(','.join(oline) + '\n')
    finally:
        #Close Files (also when parsing fails part-way)
        for filehandle in OutFiles:
            filehandle.close()

    #Calculate Testing folds
    # One pass over the kept pairs: construct ID -> row numbers of its pairs.
    TestIdx_ByID = {}
    for i, ID in enumerate(IDs):
        for uID in set(ID):
            TestIdx_ByID.setdefault(uID, []).append(i)
    uIDs = set(TestIdx_ByID)

    # Sorted so the file content is reproducible between runs.
    with open(loc_OutputFiles + 'CVTestIndicies_i0.txt', 'w') as outf:
        for uID in sorted(TestIdx_ByID):
            oline = [uID] + TestIdx_ByID[uID]
            outf.write('\t'.join(map(str, oline)) + '\n')