In [22]:
import bmrb
import os
from tqdm import tqdm

In [23]:
# Location of the local BMRB entries; the loading step scans this directory
# for names of the form `bmr<ID>`.
bmrb_dir = "./bmrb/bmrb_entries/"

In [24]:
# IDs of all local entries: directory listing names of the form `bmr<ID>` that
# contain no '.'; the `bmr` prefix is stripped to obtain the ID.
idsAll = [d[3:] for d in os.listdir(bmrb_dir) if d.startswith('bmr') and '.' not in d]

bmrb_entries = {}  # entry ID -> successfully constructed bmrb.BmrbEntry
failed = []        # entry IDs for which bmrb.BmrbEntry raised

for id_ in tqdm(idsAll):
    try:
        entry = bmrb.BmrbEntry(id_, bmrb_dir)
    except Exception:
        # Best effort: remember the ID and keep going. `Exception` (rather than
        # a bare `except:`) lets KeyboardInterrupt/SystemExit propagate, so the
        # loop can still be interrupted from the notebook.
        failed.append(id_)
        continue
    bmrb_entries[id_] = entry

 32%|████████████████████████████▋                                                            | 10/31 [00:00<00:00, 23.54it/s]ERROR:root:BMRB entry 26510 contains no chemical shift information
 52%|█████████████████████████████████████████████▉                                           | 16/31 [00:00<00:00, 15.26it/s]ERROR:root:BMRB entry 15188 contains no chemical shift information
 81%|███████████████████████████████████████████████████████████████████████▊                 | 25/31 [00:01<00:00, 18.13it/s]ERROR:root:BMRB entry 27140 contains no chemical shift information
ERROR:root:BMRB entry 50482 contains no chemical shift information
ERROR:root:BMRB entry 19361 contains no chemical shift information
100%|█████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:01<00:00, 19.76it/s]


In [25]:
# shift_data[entry ID][(stID, condID, assemID, assem_entityID, entityID)] ->
#   (seq, seq_id, ion, pH, temperature, bbshifts, bbshifts_arr, bbshifts_mask, experiment subtype)
shift_data = {}
# Sequence -> ID of the first entry it was seen in. Repeated sequences are only
# reported and counted in `double`; they are NOT removed from shift_data.
unique_seq = {}
double = 0

for id_, entry in tqdm(bmrb_entries.items()):

    # Keep only entries whose method mentions NMR ...
    if 'nmr' not in entry.exp_method.lower():
        continue

    # ... whose subtype is empty, mentions "solution" (any case) or contains the
    # literal "STRUCTURES" ...
    subtype = entry.exp_method_subtype
    subtype_lower = subtype.lower()
    if not ('solution' in subtype_lower or 'STRUCTURES' in subtype or not subtype):
        continue

    # ... and whose subtype does not mention "state" (any case).
    if 'state' in subtype_lower:
        continue

    peptide_shifts = entry.get_peptide_shifts()

    for key, shifts in peptide_shifts.items():
        stID, condID, assemID, assem_entityID, entityID = key

        entity = entry.entities[entityID]
        seq = entity.seq
        seq_id = entity.polymer_strand_ID

        # Only keep assemblies reporting exactly one component (or none given).
        # NOTE(review): this always inspects assembly '1', not `assemID` -- confirm
        # that this is intended.
        n_components = entry.assemblies['1'].n_components
        if n_components not in ('1', ''):
            continue

        # An empty sequence cannot be used.
        if not seq:
            continue

        # Experimental conditions of this shift set.
        conditions = entry.conditions[condID]
        ion = conditions.get_ionic_strength()
        pH = conditions.get_pH()
        temperature = conditions.get_temperature()

        # Keep pH 6-8 only.
        # NOTE(review): assumes get_pH() always returns a number -- TODO confirm.
        if pH < 6. or pH > 8.:
            continue

        # Backbone shifts; a falsy result means nothing usable was found.
        ret = bmrb.get_valid_bbshifts(shifts, seq)
        if not ret:
            continue
        bbshifts, bbshifts_arr, bbshifts_mask = ret

        # Report sequences that were already seen (possibly in another entry).
        if seq in unique_seq:
            print(str(unique_seq[seq])+" _ "+str(id_))
            double += 1
        else:
            unique_seq[seq] = id_

        # Record the peptide under its entry, creating the per-entry dict on first use.
        shift_data.setdefault(id_, {})[key] = (
            seq, seq_id, ion, pH, temperature,
            bbshifts, bbshifts_arr, bbshifts_mask,
            subtype,
        )

# Summary: total number of stored peptide keys across all retained entries.
n_peptides = sum(len(peptides) for peptides in shift_data.values())
print(f"{n_peptides} valid peptide entries with shift data in {len(shift_data)} of {len(idsAll)} BMRB entries")
print("Same peptide entries: "+str(double))


100%|████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:00<00:00, 654.27it/s]

12 valid peptide entries with shift data in 12 of 31 BMRB entries
Same peptide entries: 0





In [26]:
# Export the filtered data to four files:
#   file1: one tab-separated detail row per entry (sequence and conditions)
#   file2: one tab-separated row per residue that has both an H and an N shift
#   file3: list of exported entry IDs
#   file4: FASTA file with the exported sequences
# The `with` statement guarantees that all four files are flushed and closed
# even if an exception is raised while writing.
# NOTE(review): the header column is spelled "BRMB_ID" (not "BMRB_ID"); it is
# left unchanged here because downstream readers may depend on it.
with open('./bmrb_detail_monomer_pH68.txt', 'w') as file1, \
     open('./bmrb_hn_monomer_pH68.txt', 'w') as file2, \
     open('./bmrb_IDs_monomer_pH68.txt', 'w') as file3, \
     open('./bmrb_monomer_pH68.fasta', 'w') as file4:

    file1.write("\t".join(["BRMB_ID", "Sequence", "Sequence_ID", "Ion", "pH", "Temperature", "Experiment"]) + "\n")
    file2.write("\t".join(["BRMB_ID", "Residue_ID", "Residue", "H", "N"]) + "\n")
    file3.write("BRMB_ID" + "\n")

    for ID, peptides in shift_data.items():

        # Only the first stored peptide of each entry is exported.
        # NOTE(review): entries with several peptide keys lose the others -- confirm intended.
        entry = next(iter(peptides.values()))

        seq = entry[0]
        seq_id = entry[1]
        if seq_id == '':
            seq_id = '.'  # placeholder for a missing strand ID
        ion = entry[2]
        pH = entry[3]
        temperature = entry[4]
        hn = entry[5]
        experiment = entry[8]

        found_one_hn = False
        for key, d in hn.items():
            key = int(key)  # keys are used as 0-based positions into `seq`
            residue = seq[key]
            # Residues with exactly two shift records are written as H/N pairs.
            if len(d) == 2:
                found_one_hn = True
                # Residue numbers are written 1-based.
                file2.write("\t".join([ID, str(key+1), residue, str(d['H'][0]), str(d['N'][0])]) + "\n")

        # Entries without a single H/N pair are left out of the summary files.
        if found_one_hn:
            file1.write("\t".join([ID, str(seq), str(seq_id), str(ion), str(pH), str(temperature), experiment]) + "\n")
            file3.write(ID + "\n")
            file4.write(">" + ID + "\n" + str(seq) + "\n")

In [None]:
# TODO: decide how to handle sequences containing 'X' residues (mask those residues with -100?)