In [2]:
from tqdm import tqdm
from itertools import groupby


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Switch the kernel's working directory: the .SDF/.MSPEC/output file names
# opened in later cells are relative paths and resolve against this location.
%cd /home/runzhong/ms_collaborators/nist2023_gcms/raw

/mnt/home/runzhong/ms_collaborators/nist2023_gcms/raw


In [31]:
# Translation table from the field labels that appear before the ':' in a
# .MSPEC record to the tag names written into the SDF data items (`> <TAG>`).
# Built from ordered (label, tag) pairs; lookups fail with KeyError for any
# label that is not listed here.
key_mapping = dict([
    ('Name', 'NAME'),
    ('Notes', 'NOTES'),
    ('Precursor_type', 'PRECURSOR TYPE'),
    ('Spectrum_type', 'SPECTRUM TYPE'),
    ('PrecursorMZ', 'PRECURSOR M/Z'),
    ('Instrument_type', 'INSTRUMENT TYPE'),
    ('Instrument', 'INSTRUMENT'),
    ('Sample_inlet', 'SAMPLE INLET'),
    ('Ionization', 'IONIZATION'),
    ('Collision_gas', 'COLLISION GAS'),
    ('Collision_energy', 'COLLISION ENERGY'),
    ('Ion_mode', 'ION MODE'),
    ('InChIKey', 'INCHIKEY'),
    ('Synon', 'SYNONYMS'),
    ('Formula', 'FORMULA'),
    ('MW', 'MW'),
    ('ExactMass', 'EXACT MASS'),
    ('CAS#', 'CASNO'),
    ('Related_CAS#', 'RELATED CASNO'),
    ('NIST#', 'NISTNO'),
    ('DB#', 'ID'),
    ('Comments', 'COMMENT'),
    ('Num Peaks', 'NUM PEAKS'),
    ('In-source_voltage', 'IN-SOURCE VOLTAGE'),
    ('msN_pathway', 'MSN PATHWAY'),
    ('Peptide_sequence', 'PEPTIDE SEQUENCE'),
    ('Peptide_mods', 'PEPTIDE MODS'),
    ('Retention_index', 'RETENTION INDEX'),
    ('COMPOUND_REP', 'COMPOUND REP'),
    ('Salt', 'SALT'),
    ('Salt/mix_CAS#', 'SALT/MIX CASNO'),
    ('Known_impurity', 'KNOWN IMPURITY'),
])

def normalize_text(input_text):
    """Return ``input_text`` with symbol spellings unified and whitespace squeezed.

    Every token listed below is replaced by a literal ``'?'`` so that two
    names differing only in how such a symbol is spelled compare equal after
    normalisation. The non-ASCII tokens are presumably mis-decoded forms of
    the same symbols -- TODO confirm against the raw files.

    Args:
        input_text: The string to normalise.

    Returns:
        The normalised string: tokens replaced by ``'?'``, runs of whitespace
        collapsed to a single space, no leading or trailing whitespace.
    """
    # Applied strictly in this order: the two-character sequences have to be
    # consumed before the bare '¨' catch-all at the end of the tuple.
    symbol_tokens = (
        r'¨’', '.beta.',
        r'¨€', '.alpha.',
        r'?', '.gamma.', '.delta.', '.epsilon.', r'.+/-.', '.mu.', '.pi.', '.sigma.',
        r'¨Ί', '.omega.',
        r'¨Ή', '.eta.',
        r'¨',
    )
    normalized = input_text
    for token in symbol_tokens:
        normalized = normalized.replace(token, '?')
    # split() without arguments splits on whitespace runs and ignores leading
    # and trailing whitespace, so re-joining leaves exactly one space per gap.
    words = normalized.split()
    return ' '.join(words)

In [56]:
# LC-MS/MS
# sdf_files = [f'nist2023_part{x}_100k.SDF' for x in range(1, 12)] + [f'nist2023_hr#2_part{x}_100k.SDF' for x in range(1, 10)] + ['nist2023_ap.SDF']
# spec_files = [f'nist2023_part{x}_100k.MSPEC' for x in range(1, 12)] + [f'nist2023_hr#2_part{x}_100k.MSPEC' for x in range(1, 10)] + ['nist2023_ap.MSPEC']
# gcms = False

# GC-MS
sdf_files = [f'eims_100k_part{x}.SDF' for x in range(1, 5)]
spec_files = [f'eims_100k_part{x}.MSPEC' for x in range(1, 5)]
gcms = True


def _spec_to_sdf_fields(spec_info, gcms):
    """Turn the lines of one .MSPEC entry into SDF data-item strings.

    Lines containing ':' are treated as ``label: value`` metadata and
    translated through ``key_mapping``; runs of lines without ':' are treated
    as the peak list and stored under 'MASS SPECTRAL PEAKS'.

    Args:
        spec_info: List of text lines belonging to a single entry. May be
            empty, in which case an empty list is returned.
        gcms: When true, a peak line may hold several ';'-separated peaks and
            the first two whitespace-separated tokens of each are kept. When
            false, each peak line holds one peak and its first two tokens are
            kept.

    Returns:
        A list with one string per tag, formatted as ``> <TAG>``, the value
        lines, and a terminating blank line.

    Raises:
        KeyError: If a metadata label is not present in ``key_mapping``.
    """
    meta_dict = {}
    for has_colon, info_lines in groupby(spec_info, key=lambda x: ':' in x):
        if has_colon:
            for line in info_lines:
                # Several '#'-style IDs can share one line, separated by ';'.
                if '#' in line and len(line.split(':')) > 2:
                    entries = line.strip().split(';')
                else:
                    entries = [line.strip()]
                for entry in entries:
                    key, val = entry.split(':', 1)
                    key = key_mapping[key.strip()]
                    # Repeated labels accumulate as extra value lines.
                    meta_dict.setdefault(key, []).append(f'{val.strip()}\n')
        else:
            key = 'MASS SPECTRAL PEAKS'
            if gcms:
                ms_list = []
                for line in info_lines:
                    for peak in line.split(';'):
                        tokens = peak.split()
                        # Skips the empty fragment after a trailing ';'.
                        if len(tokens) >= 2:
                            ms_list.append(f'{tokens[0]} {tokens[1]}\n')
                meta_dict[key] = ms_list
            else:
                meta_dict[key] = [f'{line.split()[0]} {line.split()[1]}\n' for line in info_lines]

    return [f'> <{k}>\n' + ''.join(v) + '\n' for k, v in meta_dict.items()]


new_lines = []

for sdf_file, spec_file in zip(sdf_files, spec_files):
    print('Processing', sdf_file, '&', spec_file)
    with open(sdf_file, 'r', encoding='iso-8859-7') as sdf_fp, \
        open(spec_file, 'r', encoding='iso-8859-7') as spec_fp:

        # Entries in the .MSPEC file are separated by blank lines. The iterator
        # is shared across all SDF records so both files advance in step.
        spec_iter = enumerate(groupby(spec_fp, key=lambda x: "\n" == x))

        # Records in the SDF file are separated by lines containing '$$'.
        for i, (is_delimiter, sdf_lines) in tqdm(enumerate(groupby(sdf_fp, key=lambda x: "$$" in x))):
            if is_delimiter:
                continue
            sdf_lines = list(sdf_lines)
            sdf_name = normalize_text(sdf_lines[0].strip())
            if 'M  END' not in sdf_lines[-1]:
                sdf_lines.append('M  END\n')

            # Look for the corresponding entry in the .MSPEC file. Reset per
            # record so a miss can never reuse the previous record's spectrum.
            spec_info = []
            for j, (is_blank, spec_lines) in spec_iter:
                if is_blank:
                    continue
                spec_lines = list(spec_lines)
                spec_name = normalize_text(spec_lines[0].split(':', 1)[-1].strip())
                if sdf_name in spec_name or spec_name in sdf_name: # matched
                    spec_info = spec_lines
                    break
                # Non-matching entries are reported and discarded.
                print('Warning: Unmatched entries.', 'sdf_name:', sdf_name, 'spec_name:', spec_name, 'sdf_index', i, 'spec_index', j)
            else:
                # The .MSPEC file ran out before a match was found. The loop
                # ends here instead of retrying an exhausted iterator forever.
                print('Warning: No spectrum found.', 'sdf_name:', sdf_name, 'sdf_index', i)

            # Always close the record with '$$$$', even when no spectrum was
            # found, so the output stays a well-formed sequence of records.
            append_lines = _spec_to_sdf_fields(spec_info, gcms)
            append_lines.append('$$$$\n')

            new_lines += sdf_lines + append_lines

Processing eims_100k_part1.SDF & eims_100k_part1.MSPEC


42026it [00:08, 4443.63it/s]



195367it [00:45, 8503.96it/s]



199992it [00:46, 4314.83it/s]


Processing eims_100k_part2.SDF & eims_100k_part2.MSPEC


73106it [00:14, 5061.54it/s]



199996it [00:39, 5074.30it/s]


Processing eims_100k_part3.SDF & eims_100k_part3.MSPEC


199998it [00:44, 4465.96it/s]


Processing eims_100k_part4.SDF & eims_100k_part4.MSPEC


94200it [00:25, 3656.52it/s]


In [37]:
# Tally, in one pass over the assembled output, how many lines contain each
# record marker:
#   cnt1 -> 'M  END', cnt2 -> '<NAME>', cnt3 -> '<MASS SPECTRAL PEAKS>', cnt4 -> '$$$$'
cnt1 = cnt2 = cnt3 = cnt4 = 0
for line in new_lines:
    # A membership test yields a bool, which adds to the counter as 0 or 1.
    cnt1 += 'M  END' in line
    cnt2 += '<NAME>' in line
    cnt3 += '<MASS SPECTRAL PEAKS>' in line
    cnt4 += '$$$$' in line
(cnt1, cnt2, cnt3, cnt4)

(347093, 347093, 347093, 347093)

In [57]:
# Write the assembled records to a single SDF file. The encoding is pinned so
# the bytes on disk do not depend on the locale of the machine running the
# notebook (the inputs were decoded explicitly as ISO-8859-7 when read).
with open('gcms_nist23.SDF', 'w', encoding='utf-8') as file:
    for s in tqdm(new_lines):
        file.write(s)  # no separator added: the strings carry their own line endings

100%|██████████| 20573981/20573981 [00:27<00:00, 741580.57it/s] 
