# =================== Import ===================

In [2]:
import os.path


# =================== Variables ===================

In [3]:
def _abs_path(folder, filename):
    '''Return the absolute path of ``filename`` located inside ``folder``.'''
    return os.path.abspath(os.path.join(folder, filename))


# Folders used by the pipeline stages (relative to the current working directory)
directory1 = '1.Parsing_Data_Files/1.1.Data_Files/'
directory2 = '1.Parsing_Data_Files/1.2.Sorted_Files/'
directory3 = '1.Parsing_Data_Files/1.3.CD_HIT_Files/'
directory4 = '1.Parsing_Data_Files/1.4.Cluster_Files/'
directory5 = '1.Parsing_Data_Files/1.5.Final_Files/'

# Raw files from UniProt and PROSITE
pos_in_prosite = _abs_path(directory1, 'Positives_PROSITE.fasta')
pos_in_uniprot = _abs_path(directory1, 'Positives_UniProt.fasta')
neg_in = _abs_path(directory1, 'Negatives_UniProt.fasta')

# Separated files (headers rewritten to the common format)
pos_out_prosite = _abs_path(directory2, 'Positives_Modified_PROSITE.fasta')
pos_out_uniprot = _abs_path(directory2, 'Positives_Modified_UniProt.fasta')
neg_out = _abs_path(directory2, 'Negatives_Modified_UniProt.fasta')

# CD-HIT files
cdhit_prosite = _abs_path(directory3, 'CD_HIT_PROSITE.txt')
cdhit_uniprot_pos = _abs_path(directory3, 'CD_HIT_UniProt.txt')
cdhit_uniprot_neg = _abs_path(directory3, 'CD_HIT_UniProt_Neg.txt')

# Cluster files (one header per line, suffixed with its cluster number)
cluster_prosite = _abs_path(directory4, 'Cluster_PROSITE.fasta')
cluster_uniprot = _abs_path(directory4, 'Cluster_UniProt.fasta')
cluster_uniprot_neg = _abs_path(directory4, 'Cluster_UniProt_Neg.fasta')

# Final files
final_prosite_pos = _abs_path(directory5, 'Final_PROSITE_pos.fasta')
final_uniprot_pos = _abs_path(directory5, 'Final_UniProt_Pos.fasta')
final_uniprot_neg = _abs_path(directory5, 'Final_UniProt_Neg.fasta')
final_file = _abs_path(directory5, 'Final_File.fasta')

# =================== Functions ===================

In [4]:
def _rewrite_sp_headers(src_path, dst_path, label):
    '''Copy ``src_path`` to ``dst_path``, relabelling every ``>sp|`` header line.

    A header ``>sp|<field>|<rest>`` becomes ``>sp|<field>|<label>``; every
    other line is written back unchanged.
    '''
    # Read the whole input before the output is opened (and truncated).
    with open(src_path) as src:
        lines = src.read().splitlines()
    with open(dst_path, 'w') as dst:
        for line in lines:
            if line.startswith('>sp|'):
                # The first '|' sits at index 3, so searching from index 4
                # finds the second one; everything after it is dropped.
                line = line[:line.index('|', 4)] + '|' + label
            print(line, file=dst)


def ModifyPosAndNeg(pos_in_prosite,pos_in_uniprot,neg_in):
    '''Rewrite the headers of the three raw FASTA files into one common format.

    Each line starting with ``>sp|`` is cut just before its second ``|`` and
    suffixed with ``|Positive`` (both positive files) or ``|Negative`` (the
    negative file). All other lines are copied as they are.

    Parameters
    ----------
    pos_in_prosite : path of the positive PROSITE FASTA file to read.
    pos_in_uniprot : path of the positive UniProt FASTA file to read.
    neg_in : path of the negative UniProt FASTA file to read.

    Returns
    -------
    tuple
        ``(pos_out_prosite, pos_out_uniprot, neg_out)``: the module-level
        output paths that were written.

    Raises
    ------
    ValueError
        If a ``>sp|`` header contains no second ``|``.
    OSError
        If an input cannot be read or an output cannot be written.
    '''
    # The output locations are the module-level path variables.
    _rewrite_sp_headers(pos_in_prosite, pos_out_prosite, 'Positive')
    _rewrite_sp_headers(pos_in_uniprot, pos_out_uniprot, 'Positive')
    _rewrite_sp_headers(neg_in, neg_out, 'Negative')
    return pos_out_prosite, pos_out_uniprot, neg_out

In [5]:
def _write_cluster_headers(cdhit_path, out_path):
    '''Write one ``<header>_<cluster>`` line per ``>sp|`` entry of a CD-HIT listing.'''
    with open(cdhit_path) as src:
        lines = src.read().splitlines()
    with open(out_path, 'w') as dst:
        suffix = None  # '_<cluster number>' of the cluster currently being read
        for line in lines:
            if line.startswith('>Cluster'):
                # '>Cluster ' is 9 characters long; the rest is the number.
                suffix = '_' + line[9:]
                continue
            start = line.find('>sp|')
            if start == -1:
                # Blank line or an entry without a '>sp|' header: nothing to label.
                continue
            if suffix is None:
                raise ValueError(
                    'entry found before any ">Cluster" line in ' + str(cdhit_path))
            # The header ends at the first '.' that follows it; if there is
            # none, keep the rest of the line instead of losing a character.
            end = line.find('.', start)
            header = line[start:end] if end != -1 else line[start:].rstrip()
            print(header + suffix, file=dst)


def ClusterPosAndNeg(cdhit_prosite,cdhit_uniprot_pos, cdhit_uniprot_neg):
    '''Attach the CD-HIT cluster number to every clustered protein header.

    Each input is read line by line. A line starting with ``>Cluster`` sets
    the current cluster number; every following line that contains ``>sp|``
    yields one output line made of the text from ``>sp|`` up to the next
    ``.`` plus ``_<cluster number>``. Every member line is labelled, not only
    those ending in ``*`` (CD-HIT's marker for the cluster representative).
    Lines without ``>sp|`` are skipped.

    Parameters
    ----------
    cdhit_prosite : path of the CD-HIT cluster listing for PROSITE positives.
    cdhit_uniprot_pos : path of the CD-HIT cluster listing for UniProt positives.
    cdhit_uniprot_neg : path of the CD-HIT cluster listing for UniProt negatives.

    Returns
    -------
    tuple
        ``(cluster_prosite, cluster_uniprot, cluster_uniprot_neg)``: the
        module-level output paths that were written.

    Raises
    ------
    ValueError
        If a ``>sp|`` entry appears before the first ``>Cluster`` line.
    OSError
        If an input cannot be read or an output cannot be written.
    '''
    # The output locations are the module-level path variables.
    _write_cluster_headers(cdhit_prosite, cluster_prosite)
    _write_cluster_headers(cdhit_uniprot_pos, cluster_uniprot)
    _write_cluster_headers(cdhit_uniprot_neg, cluster_uniprot_neg)
    return cluster_prosite, cluster_uniprot, cluster_uniprot_neg

In [6]:
def _merge_cluster_headers(fasta_path, cluster_path, out_path, key_len=18):
    '''Write the records of ``fasta_path`` whose header has a cluster-labelled twin.

    A header and a cluster line match when their first ``key_len`` characters
    are equal. A matched record is written with the cluster line as its
    header, followed by its sequence lines; unmatched records are dropped.
    '''
    with open(fasta_path) as src:
        fasta_lines = src.read().splitlines()
    with open(cluster_path) as src:
        cluster_lines = src.read().splitlines()

    # Index the cluster lines once; setdefault keeps the FIRST line for each
    # prefix, which is the line a top-to-bottom scan would have found.
    first_by_prefix = {}
    for cluster_line in cluster_lines:
        first_by_prefix.setdefault(cluster_line[:key_len], cluster_line)

    with open(out_path, 'w') as dst:
        keep = False  # whether the record currently being read was matched
        for line in fasta_lines:
            if line.startswith('>sp|'):
                labelled = first_by_prefix.get(line[:key_len])
                keep = labelled is not None
                if keep:
                    print(labelled, file=dst)
            elif keep:
                print(line, file=dst)


def FinalFiles(pos_out_prosite,pos_out_uniprot,neg_out,cluster_prosite,cluster_uniprot,cluster_uniprot_neg):
    '''Combine the cluster-labelled headers with their sequences into the final files.

    For each of the three FASTA inputs, every ``>sp|`` header is looked up in
    the matching cluster file by its first 18 characters. Matched records are
    written with the cluster-labelled header; records without a match are
    left out. The final PROSITE-positive and UniProt-negative files are then
    concatenated, in that order, into ``final_file``; the UniProt-positive
    file is written but is not part of the concatenation.

    NOTE(review): the 18-character comparison width is taken over unchanged;
    presumably it is meant to span '>sp|', the accession and the start of the
    label -- confirm it suits every accession length in the data.

    Parameters
    ----------
    pos_out_prosite, pos_out_uniprot, neg_out :
        paths of the FASTA files with rewritten headers.
    cluster_prosite, cluster_uniprot, cluster_uniprot_neg :
        paths of the files holding one cluster-labelled header per line.

    Returns
    -------
    tuple
        ``(final_uniprot_pos, final_uniprot_neg, final_prosite_pos,
        final_file)``: the module-level output paths that were written.

    Raises
    ------
    OSError
        If an input cannot be read or an output cannot be written.
    '''
    # The output locations are the module-level path variables.
    _merge_cluster_headers(pos_out_prosite, cluster_prosite, final_prosite_pos)
    _merge_cluster_headers(pos_out_uniprot, cluster_uniprot, final_uniprot_pos)
    _merge_cluster_headers(neg_out, cluster_uniprot_neg, final_uniprot_neg)

    # Concatenate files PROSITE(pos) + UniProt(neg)
    with open(final_file, 'w') as outfile:
        for fname in (final_prosite_pos, final_uniprot_neg):
            with open(fname) as infile:
                outfile.write(infile.read())

    return final_uniprot_pos, final_uniprot_neg, final_prosite_pos, final_file

# =================== Main ===================

In [7]:
# Stage 1: rewrite the headers of the raw FASTA files into the common format.
(pos_out_prosite,
 pos_out_uniprot,
 neg_out) = ModifyPosAndNeg(pos_in_prosite, pos_in_uniprot, neg_in)

# Stage 2: label every clustered header with its cluster number.
# NOTE(review): the CD-HIT listings read here are not produced anywhere in the
# visible code -- presumably CD-HIT is run externally beforehand; confirm.
(cluster_prosite,
 cluster_uniprot,
 cluster_uniprot_neg) = ClusterPosAndNeg(cdhit_prosite, cdhit_uniprot_pos, cdhit_uniprot_neg)

# Stage 3: join the labelled headers with their sequences and build the final file.
(final_uniprot_pos,
 final_uniprot_neg,
 final_prosite_pos,
 final_file) = FinalFiles(
    pos_out_prosite, pos_out_uniprot, neg_out,
    cluster_prosite, cluster_uniprot, cluster_uniprot_neg,
)