In [None]:
# This notebook clones and resubmits the datasets of the 20 lab study participants.

In [1]:
import json, time
from metaspace.sm_annotation_utils import SMInstance

In [41]:
def get_raw_dataset(sm, ds_id):
    """Fetch a dataset's editable fields and decode its JSON config and metadata.

    Args:
        sm: Object exposing a GraphQL client as ``sm._gqclient`` (an ``SMInstance``
            in this notebook).
        ds_id: ID of the dataset to fetch.

    Returns:
        A ``(ds, config, metadata)`` tuple:
        ``ds`` is the raw ``dataset`` dict from the query response (with
        ``description`` decoded once more if it was double-encoded),
        ``config`` is the decoded ``configJson`` and ``metadata`` is the decoded
        ``metadataJson`` with its pixel size patched as described below.

    Raises:
        ValueError: If the query returns no dataset for ``ds_id``.
    """
    gql = sm._gqclient
    result = gql.query(
        """
        query editDatasetQuery($id: String!) {
          dataset(id: $id) {
            id
            name
            metadataJson
            configJson
            isPublic
            inputPath
            group { id }
            submitter { id }
            principalInvestigator { name email }
            molDBs
            adducts
            neutralLosses
            databases { id }
            description
          }
        }
        """,
        {'id': ds_id}
    )

    ds = result['dataset']
    if ds is None:
        # Fail with a clear message instead of "'NoneType' object is not subscriptable"
        raise ValueError(f'No dataset returned for id {ds_id!r}')

    config = json.loads(ds['configJson'])
    metadata = json.loads(ds['metadataJson'])

    # If pixel size is missing from metadata (entirely, or for a single axis), add some
    # values so that metadata passes validation. Values below 0.1 are replaced with 100.
    ms_analysis = metadata['MS_Analysis']
    pixel_size = ms_analysis.get('Pixel_Size')
    if not isinstance(pixel_size, dict):
        pixel_size = ms_analysis['Pixel_Size'] = {}
    for axis in ('Xaxis', 'Yaxis'):
        value = pixel_size.get(axis)
        if value is None:
            pixel_size[axis] = 1
        elif value < 0.1:
            pixel_size[axis] = 100

    # Fix double-encoding of tiptap field: a description that is a JSON string wrapping
    # another JSON document is unwrapped by one level.
    if (ds['description'] or '').startswith('"{\\"'):
        ds['description'] = json.loads(ds['description'])

    return ds, config, metadata


def clone_dataset(sm, ds, metadata, name, database_ids, adducts, neutral_losses, project_ids):
    """Submit a private copy of ``ds`` via the ``createDataset`` mutation.

    Args:
        sm: Object exposing a GraphQL client as ``sm._gqclient``.
        ds: Source dataset dict; its ``inputPath`` and ``description`` are reused.
        metadata: Metadata dict, serialized to JSON for the new dataset.
        name: Name of the new dataset.
        database_ids: Database IDs sent as ``databaseIds``.
        adducts: Adducts sent as ``adducts``.
        neutral_losses: Neutral losses sent as ``neutralLosses``.
        project_ids: Project IDs sent as ``projectIds``.

    Returns:
        The ``datasetId`` value from the JSON document returned by the mutation.
    """
    client = sm._gqclient

    create_mutation = """
        mutation ($input: DatasetCreateInput!) {
          createDataset(input: $input)
        }
        """

    # The clone is always private and is submitted under the current user's own
    # submitter ID and primary group, as reported by the client.
    payload = {
        'name': name,
        'inputPath': ds['inputPath'],
        'metadataJson': json.dumps(metadata),
        'databaseIds': database_ids,
        'adducts': adducts,
        'neutralLosses': neutral_losses,
        'submitterId': client.get_submitter_id(),
        'groupId': client.get_primary_group_id(),
        'isPublic': False,
        'description': ds['description'],
        'projectIds': project_ids,
    }
    response = client.query(create_mutation, {'input': payload})

    # Sleep 1 second, because METASPACE will raise an error if more than 1 dataset per second is submitted
    time.sleep(1)

    # The mutation result is itself a JSON string; the new dataset ID is inside it
    created = json.loads(response['createDataset'])
    return created['datasetId']

In [3]:
# Client object passed as `sm` to get_raw_dataset / clone_dataset, which use its
# GraphQL client (`sm._gqclient`).
# NOTE(review): created with no arguments, so host and credentials presumably come
# from the client's own configuration — confirm before running.
sm = SMInstance()

In [19]:
# NOTE(review): pandas is imported here rather than in the import cell at the top.
import pandas as pd
# Table of datasets; later cells read its "Dataset ID", "Participant lab", "Technology",
# "Polarity" and "m/z range" columns and add a "Clone ID" column.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
df = pd.read_csv(r"C:\Users\Veronica\Documents\LAB\projects\spotting\QC\Datasets.csv")

In [22]:
# Boolean mask over df: rows whose participant lab is one of the labs to resubmit
data_of_interest = df["Participant lab"].isin(['Janfeldt/Pinto'])

# IDs of the selected datasets
datasets = df.loc[data_of_interest, "Dataset ID"]
# One title per selected dataset: the four columns below joined with underscores
titles = df.loc[
    data_of_interest,
    ["Participant lab", "Technology", "Polarity", "m/z range"],
].agg('_'.join, axis=1)

In [42]:
# IDs of the existing datasets to clone
ds_ids = [
    '2021-02-15_21h23m26s',
]

# IDs of the newly created clones, in the same order as ds_ids
new_ds_ids = []

for ds_id in ds_ids:

    # Get the existing dataset data
    ds, config, metadata = get_raw_dataset(sm, ds_id)
    # Not used below; kept so it is available when adjusting parameters per polarity
    is_positive_mode = metadata['MS_Analysis']['Polarity'] == 'Positive'

    # Adjust other parameters
    database_ids = [db['id'] for db in ds['databases']]
    # Append to database_ids to add more databases, e.g.
    if 304 not in database_ids:
        database_ids.append(304)

    # Append to adducts if you want to add adducts that aren't available through the UI
    adducts = ds['adducts']
    # The query response uses GraphQL field names, so ds never has a 'neutral_losses'
    # key. Use 'neutralLosses' when the query returned it; otherwise fall back to the
    # decoded config.
    # NOTE(review): assumes the config stores them under
    # isotope_generation.neutral_losses — confirm against the METASPACE config schema.
    neutral_losses = ds.get('neutralLosses')
    if neutral_losses is None:
        neutral_losses = config.get('isotope_generation', {}).get('neutral_losses') or []

    new_ds_id = clone_dataset(
        sm=sm,
        ds=ds,
        metadata=metadata,
        name=ds['name'], # Rename or add a suffix if desired
        database_ids=database_ids,
        adducts=adducts,
        neutral_losses=neutral_losses,
        # If uploading in bulk, it's a good idea to put things in projects. Create a project through
        # the web and copy the ID from the url here:
        project_ids=['62d1990a-a4ff-11eb-96db-abcc9848804b'],
    )
    new_ds_ids.append(new_ds_id)

    print(f'{ds_id} cloned to {new_ds_id}')
    # Record the clone next to its source dataset in the table loaded from the CSV
    df.loc[df['Dataset ID'] == ds_id, 'Clone ID'] = new_ds_id

# Save the table, including the new 'Clone ID' column, to a separate file
df.to_csv(r"C:\Users\Veronica\Documents\LAB\projects\spotting\QC\Datasets_modified.csv", index=False)

2021-03-22_13h37m54s cloned to 2021-04-24_23h00m42s
2021-03-22_13h39m15s cloned to 2021-04-24_23h00m44s
2021-03-22_13h40m42s cloned to 2021-04-24_23h00m46s
2021-03-22_13h49m18s cloned to 2021-04-24_23h00m48s
2021-03-22_13h50m40s cloned to 2021-04-24_23h00m50s
2021-03-22_13h51m44s cloned to 2021-04-24_23h00m52s
2021-03-22_13h53m18s cloned to 2021-04-24_23h00m55s
2021-03-22_13h59m01s cloned to 2021-04-24_23h00m57s
2021-03-22_14h59m52s cloned to 2021-04-24_23h00m59s
2021-03-22_15h14m29s cloned to 2021-04-24_23h01m01s
2021-03-29_04h08m18s cloned to 2021-04-24_23h01m03s
2021-03-29_03h23m18s cloned to 2021-04-24_23h01m06s
2021-03-29_02h41m53s cloned to 2021-04-24_23h01m08s
2021-03-29_02h08m19s cloned to 2021-04-24_23h01m10s
2021-03-29_00h13m35s cloned to 2021-04-24_23h01m12s
2021-03-28_23h24m30s cloned to 2021-04-24_23h01m14s
2021-03-28_22h13m58s cloned to 2021-04-24_23h01m16s
2021-03-28_21h33m31s cloned to 2021-04-24_23h01m18s
2021-04-05_03h27m09s cloned to 2021-04-24_23h01m20s
2021-04-05_0