# Concatenate low mass range and high mass range datasets

In [None]:
from pathlib import Path
from concatenate_imzml_files import concat_imzml_files
from concatenate_imzml_files import transform_imzml
from definitions import ROOT_DIR
import numpy as np
from metaspace.sm_annotation_utils import SMInstance
import pandas as pd

In [None]:
# IPython help syntax: shows the docstring/signature of transform_imzml (not valid plain Python)
transform_imzml?

Provide inputs
- Folder where imzmls for all matrices are located

In [None]:
# Working folder for this step, and the sub-folder the cells below read imzML
# files from and write their outputs to.
p = Path(ROOT_DIR).joinpath("1_stitch_and_upload_datasets")
p_out = p.joinpath('Missing')
print(p_out)

In [None]:
# Log into metaspace
# getpass is only imported in a later cell of this notebook; import it here so
# this cell also works on a fresh kernel (Restart & Run All).
import getpass

sm = SMInstance(host='https://metaspace2020.eu')

if not sm.logged_in():
    # Using getpass here prevents the API key from being accidentally saved with this notebook.
    api_key = getpass.getpass(prompt='API key: ', stream=None)
    sm.login(api_key=api_key)

In [None]:
# Table listing the datasets to grab, together with their metadata columns
datasets = pd.read_csv(p.joinpath('missing.csv'))

In [None]:
# Print every dataset id listed in the table.
for dataset_id in datasets['ds_id']:
    print(dataset_id)
    # The download itself is currently disabled. Uncomment the two lines below
    # to fetch each dataset from METASPACE into p_out:
    #ds = sm.dataset(id = dataset_id)
    #ds.download_to_dir(p_out,dataset_id)

- Iterate through csv file and merge all labelled pairs

In [None]:
# Merge each labelled pair of aligned imzML files: every row whose 'Mergewith'
# value is not "0" is concatenated with the dataset named in that column.

dsets = datasets.loc[datasets['Mergewith'] != "0", 'ds_id']
for first in dsets:
    ds = datasets[datasets['ds_id'] == first]
    # .item() raises unless exactly one row matches this ds_id
    slide, second, polarity, rotation, group, done = (
        ds[column].item()
        for column in ('Slide', 'Mergewith', 'Polarity', 'Rotation', 'Group', 'Done')
    )

    input_paths = [p_out / (ds_name + ".imzml") for ds_name in (first, second)]
    output_path = p_out / f'{group}_{slide}_{polarity}.imzml'
    # One (x, y) tuple per imzML file; change it to offset that file's coordinates
    offsets = [(0, 0)] * len(input_paths)
    # One (min_mz, max_mz) tuple per file; replace 'auto' with a non-string
    # number to explicitly specify the m/z range for that file
    mz_ranges = [('auto', 'auto')] * len(input_paths)
    if done != 0:
        # Only rows whose 'Done' value equals 0 are processed
        continue
    concat_imzml_files(input_paths, offsets, mz_ranges, output_path, rotation)

In [None]:
# Transform (rotate) the datasets that have no merge partner ('Mergewith' == "0").
# NOTE: `dsets` as assigned here is what the later upload loop iterates over.

dsets = datasets.loc[datasets['Mergewith'] == "0", 'ds_id']
for first in dsets:
    ds = datasets[datasets['ds_id'] == first]
    # .item() raises unless exactly one row matches this ds_id
    slide, polarity, rotation, group, done, mrange = (
        ds[column].item()
        for column in ('Slide', 'Polarity', 'Rotation', 'Group', 'Done', 'Mass Range')
    )

    input_path = p_out / (first + ".imzml")
    output_path = p_out / f'{group}_{slide}_{polarity}.imzml'
    if done != 0 or mrange != "Full":
        # Only rows with 'Done' == 0 and a "Full" mass range are transformed
        continue
    transform_imzml(input_path, output_path, rotation)

- Concatenate each pair of files

# Upload stitched datasets to METASPACE

In [None]:
from pathlib import Path
import json, getpass
from metaspace import SMInstance
import pandas as pd
from definitions import ROOT_DIR

- To avoid processing with HMDB, upload on staging

In [None]:
# NOTE(review): the note above this cell says to upload on staging, but the host
# configured here is https://metaspace2020.eu - confirm which one is intended.
sm = SMInstance(host='https://metaspace2020.eu')

if not sm.logged_in():
    # Prompting via getpass keeps the API key from being accidentally saved with this notebook.
    sm.login(api_key=getpass.getpass(prompt='API key: ', stream=None))

Provide inputs, for each pair:

For each matrix: the full matrix name, the additional neutral gains specific to this matrix (the matrix molecule itself), and the solvent.

In [None]:
# Lookup table: short matrix name -> [full matrix name, matrix-specific neutral gains, solvent]
matrix_fullname = {
    short_name: [full_name, neutral_gains, solvent]
    for short_name, full_name, neutral_gains, solvent in (
        ('DHB', '2,5-dihydroxybenzoic acid', ['+C7H6O4'], 'ACN (70% v/v, aq.)'),
        ('DAN', '1,5-diaminonaphthalene', ['+C10H10N2', '+C10H8N2'], 'ACN (70% v/v, aq.)'),
        ('norharmane', 'norharmane', ['+C11H8N2'], 'CHCl3:MeOH (1:1 v/v)'),
        ('9AA', '9-aminoacridine', ['+C13H10N2'], 'MeOH (70% v/v, aq.)'),
        ('CHCA', 'alpha-cyano-4-hydroxycinnamic acid', ['+C10H7NO3'], 'ACN (50% v/v, aq.)'),
        ('ClCCA', '4-chloro-alpha-cyanocinnamic acid', ['+C10H6ClNO2'], 'ACN (50% v/v, aq.)'),
        ('NEDC', 'N-(1-naphthyl)ethylenediamine dihydrochloride', ['+C12H14N2', '+HCl'], 'ACN (70% v/v, aq.)'),
        ('PNDIT2', 'PNDI-T2', [], 'Toluene'),
        ('MAPS', 'Maleic anhydride proton sponge', ['+C18H18N2O3'], 'Toluene'),
        ('DHAP', '2,5-dihydroxyacetophenone', ['+C8H8O3'], 'ACN (70% v/v, aq.)'),
        ('pNA', '4-Nitroaniline', ['+C6H6N2O2'], 'MeOH (85% v/v, aq.)'),
        ('CMBT', '5-Chloro-2-mercaptobenzothiazole', ['+C7H4ClNS2'], 'ACN (90% v/v, aq.)'),
        ('None', 'none', [], 'none'),
    )
}

- Define neutral losses in the function below

Might be of interest for interpreting neutral losses http://www.colby.edu/chemistry/PChem/StableLoss.html

In [None]:
def submit_dataset(dataset_name, matrix, solvent, polarity, adducts, rp, mref, source, analyzer, px_x, px_y, extra_neutral_losses = None, databases = None, is_public = False):
    """Build the dataset metadata and submit one dataset through `sm.submit_dataset`.

    The files to upload are NOT parameters: they are read from the notebook-level
    variables `imzml_fn` and `ibd_fn`, which must be set before calling. The
    client used is the notebook-level `sm`.

    Parameters
    ----------
    dataset_name : name given to the submitted dataset.
    matrix, solvent : stored under 'Sample_Preparation' in the metadata.
    polarity, source, analyzer : stored under 'MS_Analysis' in the metadata.
    adducts : passed through as the `adducts` argument of the submission.
    rp, mref : resolving power and the m/z it refers to
        ('Detector_Resolving_Power' in the metadata).
    px_x, px_y : pixel size along x and y ('Pixel_Size' in the metadata).
    extra_neutral_losses : optional iterable appended to the fixed neutral-loss
        list below; defaults to no extra entries.
    databases : optional list of (name, version) tuples; defaults to
        [('Spotting_project_compounds-v9', 'feb2021')].
    is_public : forwarded to the submission call.

    Returns
    -------
    Whatever `sm.submit_dataset` returns (kept here as `ds_id`).
    """
    # None sentinels instead of mutable default arguments, so no list object is
    # shared between calls.
    if extra_neutral_losses is None:
        extra_neutral_losses = []
    if databases is None:
        databases = [('Spotting_project_compounds-v9', 'feb2021')]

    metadata = {
        'Data_Type': 'Imaging MS',  # shouldn't be changed
        'Sample_Information': {
            'Organism': 'None',
            'Organism_Part': 'None',
            'Condition': 'None',
            'Sample_Growth_Conditions': 'None'  # this is an extra field
        },
        'Sample_Preparation': {
            'Sample_Stabilisation': 'None',
            'Tissue_Modification': 'None',
            'MALDI_Matrix': matrix,
            'MALDI_Matrix_Application': 'N/A',
            'Solvent': solvent
        },
        'MS_Analysis': {
            'Polarity': polarity,
            'Ionisation_Source': source,
            'Analyzer': analyzer,
            'Detector_Resolving_Power': {
                'mz': mref,
                'Resolving_Power': rp
            },
            'Pixel_Size': {
                'Xaxis': px_x,
                'Yaxis': px_y
            }
        }
    }

    # Neutral losses applied to every submission; caller-supplied extras are appended.
    neutral_losses = ['-H2O', '-H2', '+H2', #redox
                      '-CO2', '-CH2O3', '-CH2O2', # CO2+H2O, formic acid?
                      '-HPO3', '-H3PO4', # phosphate
                      '-NH3',# '-C2H5NO2',  # glycine
                     ] + list(extra_neutral_losses)

    # imzml_fn / ibd_fn are notebook-level globals set by the calling cell.
    ds_id = sm.submit_dataset(
        imzml_fn, ibd_fn, dataset_name,
        json.dumps(metadata), is_public, databases,
        project_ids=['62d1990a-a4ff-11eb-96db-abcc9848804b'],
        adducts=adducts,
        neutral_losses=neutral_losses,
        ppm=10
    )

    return ds_id

In [None]:
# Upload the stitched/transformed file of every dataset in `dsets` and record the result.
# `dsets` comes from an earlier cell of this notebook.
# One record per uploaded dataset; this list is what gets written to CSV afterwards.
id_list = []

for ds_id in dsets:

    ds = datasets[datasets['ds_id'] == ds_id]
    slide = ds['Slide'].item()
    pol_short = ds['Polarity'].item()
    mat_short = ds['Matrix'].item()
    rp = ds['RP'].item()
    mref = ds['Mref'].item()
    group = ds['Group'].item()
    source = ds['Source'].item()
    analyzer = ds['Analyzer'].item()
    px_x = ds['Px_x'].item()
    px_y = ds['Px_y'].item()

    # Datasets of the "Dreisewerd" group are not uploaded by this loop.
    if(group != "Dreisewerd"):

        matrix = matrix_fullname[mat_short][0]
        extra_neutral_losses = matrix_fullname[mat_short][1]
        # submit_dataset reads imzml_fn / ibd_fn from the notebook namespace,
        # so they must be set before the call.
        imzml_fn = p_out / f'{group}_{slide}_{pol_short}.imzml'
        ibd_fn = p_out / f'{group}_{slide}_{pol_short}.ibd'
        dataset_name = f'{group}_{slide}_{pol_short}'
        # The solvent stored in matrix_fullname is not used here.
        solvent = 'N/A'
        if pol_short == "neg":
            polarity = 'Negative'
            adducts = ['[M]-', '-H', '+Cl']
        else:
            polarity = 'Positive'
            adducts = ['[M]+', '+H', '+Na', '+K']

        print("Hello, "+f'{group}_{slide}_{pol_short}.imzml: '+matrix+f', {px_x}:{px_y}')
        new_ds_id = submit_dataset(dataset_name, matrix, solvent, polarity, adducts, rp, mref, source, analyzer, px_x, px_y, extra_neutral_losses)
        # Previously the returned value was discarded and id_list stayed empty,
        # so the CSV written from it contained no rows.
        id_list.append({
            'ds_name': dataset_name,
            'source_ds_id': ds_id,
            'ds_id': new_ds_id,
        })
    

    


In [None]:
# Turn the collected upload records into a table and write it next to the input CSV.
id_list = pd.DataFrame(id_list)
id_list.to_csv(p.joinpath('uploaded_interlab_datasets.csv'))

- Upload datasets

In [None]:
# Upload the stitched pos/neg datasets found in per-matrix sub-folders of `p`
# and collect one table row per upload in `df`.
# NOTE: this rebinds the notebook-level `p`.
p = Path(ROOT_DIR) / "1_stitch_and_upload_datasets"

# Parallel lists, one entry per uploaded dataset; combined into `df` below.
matrices = []
modes = []
ds_names = []
ids = []
losses = []

for mpath in p.iterdir():
    # Currently restricted to the 'pNA' folder; the commented-out line below
    # would process every sub-folder instead.
    if mpath == p /'pNA':
#     if mpath.is_dir():
        for pol in ['pos', 'neg']:
            # Takes the first match of each pattern; [0] raises IndexError if nothing matches.
            imzml_fn = list(mpath.rglob(f"*{pol}*mz70-1510*.imzML"))[0]
            ibd_fn = list(mpath.rglob(f"*{pol}*mz70-1510*.ibd"))[0]
            dataset_name = imzml_fn.name
            # The folder name is used as the key into matrix_fullname.
            matrix = matrix_fullname[mpath.name][0]
            extra_neutral_losses = matrix_fullname[mpath.name][1]
            solvent = matrix_fullname[mpath.name][2]
            
            if pol == 'pos':
                polarity = 'Positive'
                adducts = ['[M]+', '+H', '+Na', '+K']
            else: 
                polarity = 'Negative'
                adducts = ['[M]-', '-H', '+Cl']
            
            # FIXME: this call does not match submit_dataset as defined in this notebook,
            # which requires rp, mref, source, analyzer, px_x and px_y after `adducts`.
            # As written, extra_neutral_losses is bound to `rp` and the call raises
            # TypeError for the missing positional arguments.
            ds_id = submit_dataset(dataset_name, matrix, solvent, polarity, adducts, extra_neutral_losses)

            matrices.append(matrix)
            modes.append(polarity)
            ds_names.append(dataset_name)
            ids.append(ds_id)
            losses.append(extra_neutral_losses)
            
# One row per uploaded dataset; written to CSV by a later cell.
df = pd.DataFrame({
    'matrix':matrices,
    'polarity':modes,
    'ds_name':ds_names,
    'ds_id':ids,
    'extra_neutral_losses':losses
})

# Function for cloning datasets that are too big to upload through API

In [None]:
def clone_dataset(dataset, name, extra_losses):
    """Re-submit an existing METASPACE dataset under a new name with overridden settings.

    No files are uploaded: the submission points at the existing dataset's
    storage location (`input_path=ds.s3dir`) and reuses its metadata. Uses the
    notebook-level client `sm`.

    Parameters
    ----------
    dataset : id of the existing dataset to clone.
    name : name for the new dataset.
    extra_losses : iterable of neutral losses appended to the fixed list below.

    Returns
    -------
    Whatever `sm.submit_dataset` returns for the new dataset. It is also
    printed, as before; returning it lets the caller record it.
    """
    ds = sm.dataset(id=dataset)
    ds_config = ds.config

    # Overrides applied to the clone
    ppm = 10
    neutral_losses = ['-H2O', '-H2', '+H2', #redox
                      '-CO2', '-CH2O3', '-CH2O2', # CO2+H2O, formic acid?
                      '-HPO3', '-H3PO4', # phosphate
                      '-NH3',# '-C2H5NO2',  # glycine
                     ] + list(extra_losses)

    # Check polarity (sign of the configured charge) to decide which adducts to use
    if ds_config['isotope_generation']['charge'] > 0:
        adducts = ['[M]+', '+H', '+Na', '+K']
    else:
        adducts = ['[M]-', '-H', '+Cl']

    # Bind the metadata once so the object patched below is exactly the object
    # that gets submitted, regardless of whether `ds.metadata` is cached.
    metadata = ds.metadata

    # Add Pixel_Size if it's missing (only a problem with very old datasets)
    if 'Pixel_Size' not in metadata['MS_Analysis']:
        metadata['MS_Analysis']['Pixel_Size'] = {
            'Xaxis': 100,
            'Yaxis': 100,
        }

    # To keep the original databases instead, use:
    #databases = [(db.name, db.version) for db in ds.database_details]
    databases = [('Spotting_project_compounds-v9', 'feb2021')]

    new_dataset = sm.submit_dataset(
        imzml_fn=None,
        ibd_fn=None,
        name=name,
        metadata=metadata,
        is_public=False,
        databases=databases,
        project_ids=['62d1990a-a4ff-11eb-96db-abcc9848804b'],  # Add a project ID here if desired, otherwise delete this line
        adducts=adducts,
        neutral_losses=neutral_losses,
        ppm=ppm,
        input_path=ds.s3dir,
    )
    print(new_dataset)
    return new_dataset

In [None]:
# Clone the datasets selected by the [0:4] slice of the id column.
dsids = datasets['ds_id'][0:4]
for dsid in dsids:
    dset = datasets[datasets['ds_id'] == dsid]
    # New name: Group_Slide_Polarity_Mass Range of the matching row
    name = "_".join(
        dset[column].item()
        for column in ('Group', 'Slide', 'Polarity', 'Mass Range')
    )
    # Index 1 of a matrix_fullname entry holds the matrix-specific neutral gains
    extra_losses = matrix_fullname[dset['Matrix'].item()][1]
    print(dset)
    clone_dataset(dsid, name, extra_losses)

- Save information about uploaded dataset including dataset id on staging

In [None]:
# Write the per-matrix upload table `df` (built in the upload cell) to CSV
df.to_csv(p.joinpath('uploaded_datasets.csv'))