# =================== Import ===================

In [1]:
import os.path

# =================== Variables ===================

In [2]:
# Raw input FASTA files from UniProt and PROSITE.
# NOTE: os.path.abspath resolves each relative path against the current working
# directory at the moment this cell runs, so run the notebook from the project root.
pos_in_prosite = os.path.abspath('1.1.Data_Files/Positives_PROSITE.fasta') 
pos_in_uniprot = os.path.abspath('1.1.Data_Files/Positives_UniProt.fasta') 
neg_in = os.path.abspath('1.1.Data_Files/Negatives_UniProt.fasta') 

# Separated files: header-normalised copy of each raw file (written by ModifyPosAndNeg)
pos_out_prosite = os.path.abspath("1.2.Sorted_Files/Positives_Modified_PROSITE.fasta") 
pos_out_uniprot = os.path.abspath("1.2.Sorted_Files/Positives_Modified_UniProt.fasta") 
neg_out = os.path.abspath("1.2.Sorted_Files/Negatives_Modified_UniProt.fasta") 

# Concatenated files: positives + negatives (written by ConcatClusterPosAndNeg)
conc_prosite = os.path.abspath("1.2.Sorted_Files/Conc_PROSITE.fasta")
conc_uniprot = os.path.abspath("1.2.Sorted_Files/Conc_UniProt.fasta")

# CD-HIT files: cluster listings read by ConcatClusterPosAndNeg; they must already exist
cdhit_prosite = os.path.abspath("1.3.CD_HIT_Files/CD_HIT_PROSITE.txt")
cdhit_uniprot = os.path.abspath("1.3.CD_HIT_Files/CD_HIT_UniProt.txt")

# Cluster files: one "<header>_<cluster number>" line per clustered protein
# (written by ConcatClusterPosAndNeg, read by FinalFiles)
cluster_prosite = os.path.abspath("1.4.Cluster_Files/Cluster_PROSITE.fasta")
cluster_uniprot = os.path.abspath("1.4.Cluster_Files/Cluster_UniProt.fasta")

# Final files: concatenated FASTA with cluster-annotated headers (written by FinalFiles)
final_prosite = os.path.abspath("1.5.Final_Files/Final_PROSITE.fasta")
final_uniprot = os.path.abspath("1.5.Final_Files/Final_UniProt.fasta")

# =================== Functions ===================

In [3]:
def _relabel_fasta_headers(fasta_in, fasta_out, label):
    '''Copy fasta_in to fasta_out, rewriting every '>sp|' header to '>sp|<accession>|<label>'.'''
    # Read the whole input before opening the output, so the input handle is
    # closed early and an in-place rewrite (same path twice) keeps working.
    with open(fasta_in) as source:
        lines = source.read().splitlines()
    with open(fasta_out, 'w') as target:
        for line in lines:
            if not line.startswith('>sp|'):
                # Sequence lines (and any other line) are copied unchanged.
                print(line, file=target)
                continue
            # The first '|' is the one in '>sp|'; the header is cut at the second.
            second_bar = line.find('|', line.index('|') + 1)
            if second_bar == -1:
                raise ValueError(
                    "Malformed header in {0!r}: expected '>sp|ACCESSION|...', got {1!r}".format(
                        fasta_in, line))
            print(line[:second_bar] + '|' + label, file=target)


def ModifyPosAndNeg(pos_in_prosite,pos_in_uniprot,neg_in):
    '''Normalise the headers of the three raw FASTA files to a common format.

    Every header starting with '>sp|' is reduced to '>sp|<accession>' and tagged
    with '|Positive' (both positive files) or '|Negative' (negative file).
    All other lines are copied unchanged.

    Parameters:
        pos_in_prosite: path of the raw positive FASTA file from PROSITE.
        pos_in_uniprot: path of the raw positive FASTA file from UniProt.
        neg_in: path of the raw negative FASTA file from UniProt.

    The output paths are NOT parameters: they are read from the module-level
    variables pos_out_prosite, pos_out_uniprot and neg_out.

    Returns:
        The tuple (pos_out_prosite, pos_out_uniprot, neg_out) of written paths.

    Raises:
        ValueError: if a '>sp|' header contains no second '|'.
    '''
    _relabel_fasta_headers(pos_in_prosite, pos_out_prosite, 'Positive')
    _relabel_fasta_headers(pos_in_uniprot, pos_out_uniprot, 'Positive')
    _relabel_fasta_headers(neg_in, neg_out, 'Negative')
    return pos_out_prosite, pos_out_uniprot, neg_out



def _concat_files(paths, out_path):
    '''Write the contents of every file in paths, in order, into out_path.'''
    with open(out_path, 'w') as target:
        for path in paths:
            with open(path) as source:
                target.write(source.read())


def _write_cluster_headers(cdhit_path, out_path):
    '''Turn a CD-HIT cluster listing into one '<header>_<cluster number>' line per member.'''
    with open(cdhit_path) as source:
        lines = source.read().splitlines()
    with open(out_path, 'w') as target:
        suffix = None
        for line in lines:
            if line.startswith('>Cluster'):
                # '>Cluster 12' -> '_12' (everything after the 9-character prefix).
                suffix = '_' + line[9:]
                continue
            start = line.find('>sp|')
            if start == -1:
                # Blank or non-'>sp|' lines carry no header; skip them instead of
                # emitting a bare cluster suffix.
                continue
            if suffix is None:
                raise ValueError(
                    "Cluster member found before any '>Cluster' line in {0!r}: {1!r}".format(
                        cdhit_path, line))
            # The header ends at the first '.' after it (the '...' in the listing);
            # without one, keep the rest of the line instead of dropping its last character.
            end = line.find('.', start)
            if end == -1:
                end = len(line)
            print(line[start:end] + suffix, file=target)


def ConcatClusterPosAndNeg(pos_out_prosite,pos_out_uniprot,neg_out,cdhit_prosite,cdhit_uniprot):
    '''Concatenate each positive FASTA file with the negative one and build the cluster files.

    For each of the two data sets (PROSITE, UniProt):
      1. the positive file followed by the negative file is written to the
         concatenated file;
      2. the CD-HIT listing is converted to a cluster file containing one
         '<header>_<cluster number>' line per '>sp|' member.

    Parameters:
        pos_out_prosite: path of the header-normalised positive PROSITE file.
        pos_out_uniprot: path of the header-normalised positive UniProt file.
        neg_out: path of the header-normalised negative file.
        cdhit_prosite: path of the CD-HIT cluster listing for the PROSITE set.
        cdhit_uniprot: path of the CD-HIT cluster listing for the UniProt set.

    The output paths are read from the module-level variables conc_prosite,
    conc_uniprot, cluster_prosite and cluster_uniprot.

    Returns:
        The tuple (conc_prosite, conc_uniprot, cluster_prosite, cluster_uniprot).

    Raises:
        ValueError: if a '>sp|' member line appears before the first '>Cluster' line.
    '''
    # PROSITE: pos + neg, then cluster numbers
    _concat_files([pos_out_prosite, neg_out], conc_prosite)
    _write_cluster_headers(cdhit_prosite, cluster_prosite)
    # UniProt: pos + neg, then cluster numbers
    _concat_files([pos_out_uniprot, neg_out], conc_uniprot)
    _write_cluster_headers(cdhit_uniprot, cluster_uniprot)
    return conc_prosite, conc_uniprot, cluster_prosite, cluster_uniprot



def _attach_cluster_numbers(conc_path, cluster_path, final_path, prefix_len=18):
    '''Write conc_path to final_path, swapping each '>sp|' header for its cluster-annotated form.'''
    with open(conc_path) as source:
        conc_lines = source.read().splitlines()
    with open(cluster_path) as source:
        cluster_lines = source.read().splitlines()
    # Index the cluster lines by their first prefix_len characters. setdefault
    # keeps the FIRST line for a prefix, as a top-to-bottom scan would.
    by_prefix = {}
    for cluster_line in cluster_lines:
        by_prefix.setdefault(cluster_line[:prefix_len], cluster_line)
    with open(final_path, 'w') as target:
        for line in conc_lines:
            if line.startswith('>sp|'):
                # A header without a cluster entry is kept as-is; dropping it would
                # glue its sequence onto the previous record.
                print(by_prefix.get(line[:prefix_len], line), file=target)
            else:
                print(line, file=target)


def FinalFiles(conc_prosite,cluster_prosite,conc_uniprot,cluster_uniprot):
    '''Create the final FASTA files, whose headers carry the CD-HIT cluster number.

    Each '>sp|' header of a concatenated file is replaced by the first line of
    the matching cluster file that shares its first 18 characters. Sequence
    lines are copied unchanged. A header with no matching cluster line is
    written unchanged so that its sequence stays a separate record.

    Parameters:
        conc_prosite: path of the concatenated PROSITE FASTA file.
        cluster_prosite: path of the PROSITE cluster file.
        conc_uniprot: path of the concatenated UniProt FASTA file.
        cluster_uniprot: path of the UniProt cluster file.

    The output paths are read from the module-level variables final_prosite
    and final_uniprot.

    Returns:
        The tuple (final_prosite, final_uniprot) of written paths.
    '''
    # Final: pos(PROSITE) + neg
    _attach_cluster_numbers(conc_prosite, cluster_prosite, final_prosite)
    # Final: pos(UniProt) + neg
    _attach_cluster_numbers(conc_uniprot, cluster_uniprot, final_uniprot)
    return final_prosite, final_uniprot



def Similarity(final_prosite,final_uniprot):
    '''Collect the headers that the final PROSITE and final UniProt files have in common.

    Two '>sp|' headers count as the same protein when their first 18 characters
    are equal. A PROSITE header is appended once for every UniProt header it
    matches.

    Parameters:
        final_prosite: path of the final PROSITE FASTA file.
        final_uniprot: path of the final UniProt FASTA file.

    Returns:
        A list holding, per match, the PROSITE header up to (not including) its
        first '_'; the whole header when it contains no '_'.
    '''
    with open(final_prosite) as source:
        prosite_lines = source.read().splitlines()
    with open(final_uniprot) as source:
        uniprot_lines = source.read().splitlines()

    # Count UniProt headers per 18-character prefix once, instead of rescanning
    # the whole UniProt file for every PROSITE header.
    uniprot_hits = {}
    for line_uni in uniprot_lines:
        if line_uni.startswith('>sp|'):
            key = line_uni[0:18]
            uniprot_hits[key] = uniprot_hits.get(key, 0) + 1

    sameID = []
    for line_pro in prosite_lines:
        if line_pro.startswith('>sp|'):
            hits = uniprot_hits.get(line_pro[0:18], 0)
            if hits:
                # partition keeps the full header when there is no '_' suffix,
                # where slicing to find('_') == -1 would cut the last character.
                sameID.extend([line_pro.partition('_')[0]] * hits)
    return sameID
# Count the number of proteins in the PROSITE and the UniProt data sets
def Similarity_prosite(final_prosite,final_uniprot):
    '''Return every '>sp|' header line of the final PROSITE file, in file order.

    Parameters:
        final_prosite: path of the final PROSITE FASTA file.
        final_uniprot: unused; kept so existing calls keep working.

    Returns:
        A list of the header lines (without line endings).
    '''
    # The context manager closes the file even if reading fails.
    with open(final_prosite) as source:
        return [line_pro for line_pro in source.read().splitlines()
                if line_pro.startswith('>sp|')]
def Similarity_uniprot(final_prosite,final_uniprot):
    '''Return every '>sp|' header line of the final UniProt file, in file order.

    Parameters:
        final_prosite: unused; kept so existing calls keep working.
        final_uniprot: path of the final UniProt FASTA file.

    Returns:
        A list of the header lines (without line endings).
    '''
    # The context manager closes the file even if reading fails.
    with open(final_uniprot) as source:
        return [line_uni for line_uni in source.read().splitlines()
                if line_uni.startswith('>sp|')]

# =================== Main ===================

In [4]:
# Step 1: give the headers of all three raw FASTA files the same format.
pos_out_prosite, pos_out_uniprot, neg_out = ModifyPosAndNeg(
    pos_in_prosite, pos_in_uniprot, neg_in
)

# Step 2: join positives with negatives and derive the cluster files from CD-HIT.
conc_prosite, conc_uniprot, cluster_prosite, cluster_uniprot = ConcatClusterPosAndNeg(
    pos_out_prosite, pos_out_uniprot, neg_out, cdhit_prosite, cdhit_uniprot
)

# Step 3: write the final files with cluster-annotated headers.
final_prosite, final_uniprot = FinalFiles(
    conc_prosite, cluster_prosite, conc_uniprot, cluster_uniprot
)

# Step 4: headers shared by both final files, then all headers of each file.
sameID = Similarity(final_prosite, final_uniprot)
sameID_prosite = Similarity_prosite(final_prosite, final_uniprot)
sameID_uniprot = Similarity_uniprot(final_prosite, final_uniprot)

# Similarity summary:

In [5]:

# One value per line:
#   1. number of header matches between the PROSITE and UniProt final files
#   2. PROSITE headers minus matches (headers found only via PROSITE, assuming
#      every shared header matches exactly once)
#   3. UniProt headers minus matches (headers found only via UniProt, same assumption)
print(
    len(sameID),
    len(sameID_prosite) - len(sameID),
    len(sameID_uniprot) - len(sameID),
    sep='\n',
)

2523
338
39
