In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from metaspace import SMInstance

from outer_spacem.io import download_all_annotation_images_to_zarr

In [2]:
%load_ext autoreload
%aimport outer_spacem

%autoreload 1

# Pixel analysis part 1
# Download datasets from Metaspace

## Set paths

### Input
`metadata_path`: metadata table (I prefer downloaded from Metaspace)  
Needs to have the following columns:  
`datasetId` - Metaspace id  
`datasetName`	- name of the dataset  

All other columns are optional, generally useful to have "condition" column or something similar used for comparison in the next steps of analysis  

Script will add to the metadata table additional column "data_file" and save updated metadata to the analysis directory at `metadata_save_path`

### Output
`data_dir`: general root dir for storing data, including downloaded Metaspace datasets and plots  
`metaspace_data_dir`: directory, where ion images and metadata will be stored  
`metadata_save_path`: metadata table with new columns for file paths 
`plots_path`: directory for plots  (ion images, cell masks, plots from scanpy analysis)

In [6]:
username = "alberto.bailoni@embl.de"
api_key="O2rVI8WeHXRL"

ds = SMInstance(api_key=api_key, email=username).dataset(id="2021-10-27_00h20m47s")
# print(ds.metadata)
print(ds._info)

{'id': '2021-10-27_00h20m47s', 'name': '2021-28-09_Gastrosome_Slide6Drugs_Well8_150x150_a29ss25_DHBpos', 'uploadDT': '2021-10-26T22:20:47.944Z', 'submitter': {'id': '5727e89e-e1dd-11e8-9d75-b7197ec46d47', 'name': 'Mohammed Shahraz'}, 'group': {'id': '5727e852-e1dd-11e8-9d75-5fefa7059e48', 'name': 'European Molecular Biology Laboratory', 'shortName': '♡EMBL♡'}, 'principalInvestigator': None, 'projects': [{'id': 'a54d9c64-2f40-11eb-96db-9f8940a662c3', 'name': '2021-11-25_microglia_Gastrosome_Peri', 'publicationStatus': 'UNPUBLISHED'}], 'polarity': 'POSITIVE', 'ionisationSource': 'MALDI', 'analyzer': {'type': 'Orbitrap', 'resolvingPower': 98994.94936611666}, 'organism': 'Homo sapiens (human) | Mus musculus (mouse)', 'organismPart': 'Cells', 'condition': 'Wildtype ', 'growthConditions': 'N/A', 'maldiMatrix': 'DHB', 'configJson': '{"image_generation":{"n_levels":30,"ppm":3,"min_px":1},"fdr":{"decoy_sample_size":20},"analysis_version":1,"isotope_generation":{"n_peaks":4,"charge":1,"isocalc_s

In [8]:
import pandas
pandas.DataFrame(ds._info)

ValueError: All arrays must be of the same length

In [3]:
data_dir = Path(r"/Users/alberto-mac/EMBL_ATeam/projects/gastrosome")
metaspace_data_dir = data_dir / "pixel_analysis" / "data"
metaspace_data_dir.mkdir(parents=True, exist_ok=True)

metadata_path = Path(r"/Users/alberto-mac/EMBL_ATeam/projects/gastrosome/metaspace_datasets.csv")
metadata_save_path = data_dir / "pixel_analysis" / "metaspace_datasets_paths.csv"

plots_path = data_dir / "pixel_analysis"/ "plots" 
plots_path.mkdir(parents=True, exist_ok=True)

## Check and subset metadata

Metadata file downloaded from Metaspace will have the two line header, then column names and then rows corresponding to the datasets. I prefer to keep the header just in case and read in with pandas starting with the third line

In [4]:
! head -n 5 $metadata_path

# Generated at 12/16/2021 2:43:06 PM. For help see https://bit.ly/3Bzs6Z4
# URL: https://metaspace2020.eu/datasets?q=20211123_AP_DANneg_100x100_a32ss25
"datasetId","datasetName","group","submitter","PI","organism","organismPart","condition","growthConditions","ionisationSource","maldiMatrix","analyzer","resPower400","polarity","uploadDateTime","FDR@10%","database","opticalImage"
"2021-11-25_12h06m17s","20211123_AP_DANneg_100x100_a32ss25__rf60_CP350_3.10kV_mz100-500_Well6","♡EMBL♡","Arne Mueller","Theodore Alexandrov","N/A","pancreatic cancer cell line - K8484","Ctrl","Control, 3 day incubation","MALDI","1,5-diaminonaphthalene (DAN)","Orbitrap","98995","negative","2021-11-25T10:06:17.161000","34","SwissLipids - 2018-02-02","https://metaspace2020.eu/fs/raw_optical_images/42b0fe08b9a8c100e86839497a1963d8"
"2021-11-25_12h05m22s","20211123_AP_DANneg_100x100_a32ss25__rf60_CP350_3.10kV_mz100-500_Well5","♡EMBL♡","Arne Mueller","Theodore Alexandrov","N/A","pancreatic cancer cell line - K8484","

In [5]:
metadata = pd.read_csv(metadata_path, skiprows=2)
metadata.head()

Unnamed: 0,datasetId,datasetName,group,submitter,PI,organism,organismPart,condition,growthConditions,ionisationSource,maldiMatrix,analyzer,resPower400,polarity,uploadDateTime,FDR@10%,database,opticalImage
0,2021-11-25_12h06m17s,20211123_AP_DANneg_100x100_a32ss25__rf60_CP350...,♡EMBL♡,Arne Mueller,Theodore Alexandrov,,pancreatic cancer cell line - K8484,Ctrl,"Control, 3 day incubation",MALDI,"1,5-diaminonaphthalene (DAN)",Orbitrap,98995,negative,2021-11-25T10:06:17.161000,34,SwissLipids - 2018-02-02,https://metaspace2020.eu/fs/raw_optical_images...
1,2021-11-25_12h05m22s,20211123_AP_DANneg_100x100_a32ss25__rf60_CP350...,♡EMBL♡,Arne Mueller,Theodore Alexandrov,,pancreatic cancer cell line - K8484,TGF-ß 5ng per mL,3 days TGF-ß,MALDI,"1,5-diaminonaphthalene (DAN)",Orbitrap,98995,negative,2021-11-25T10:05:22.863000,19,SwissLipids - 2018-02-02,https://metaspace2020.eu/fs/raw_optical_images...
2,2021-11-25_12h03m15s,20211123_AP_DANneg_100x100_a32ss25__rf60_CP350...,♡EMBL♡,Arne Mueller,Theodore Alexandrov,,pancreatic cancer cell line - K8484,Ctrl,"Control, 3 day incubation",MALDI,"1,5-diaminonaphthalene (DAN)",Orbitrap,98995,negative,2021-11-25T10:03:15.241000,33,SwissLipids - 2018-02-02,https://metaspace2020.eu/fs/raw_optical_images...
3,2021-11-25_12h02m02s,20211123_AP_DANneg_100x100_a32ss25__rf60_CP350...,♡EMBL♡,Arne Mueller,Theodore Alexandrov,,pancreatic cancer cell line - K8484,TGF-ß 5ng per mL,3 days TGF-ß,MALDI,"1,5-diaminonaphthalene (DAN)",Orbitrap,98995,negative,2021-11-25T10:02:02.109000,36,SwissLipids - 2018-02-02,https://metaspace2020.eu/fs/raw_optical_images...
4,2021-11-25_12h04m46s,20211123_AP_DANneg_100x100_a32ss25__rf60_CP350...,♡EMBL♡,Arne Mueller,Theodore Alexandrov,,pancreatic cancer cell line - K8484,Ctrl,"Control, 3 day incubation",MALDI,"1,5-diaminonaphthalene (DAN)",Orbitrap,98995,negative,2021-11-25T10:04:46.322000,41,SwissLipids - 2018-02-02,https://metaspace2020.eu/fs/raw_optical_images...


Frequent operation: splitting dataset name to get some information into a separate column.  

As an example, let's get the well number and subset the metadata table to analyze only wells 1-4.

A good and more stable way to do it is to use regular expressions, but, at least for me, getting it right, usually requires so much time that I give up and use split function...

In [6]:
def get_well(ds_name):
    well = ds_name.split("_")[-1]
    well = int(well[4:])
    return well

metadata["well"] = metadata["datasetName"].apply(get_well)
# subset_metadata = metadata[metadata["well"] <= 4]

Another example: getting date from Metaspace ID and filtering by date

In [7]:
def get_date(ds_id):
    date = ds_id.split("_")[0]
    return date

metadata["date"] = metadata["datasetId"].apply(get_date)

# subset_metadata = metadata[metadata["date"] == "2021-12-18"]

Doing analysis with all samples

In [8]:
subset_metadata = metadata

## Set up Metaspace download parameters

In [9]:


# Some examples of databases
# database = ("AB_tcells_ht_nedc_union_DB", "v1")
# database = ('CoreMetabolome', 'v3')
database = ("VS_custom_database_long", "Sep2020")
if type(database) is tuple:
    database_str = "_".join(database)
else:
    database_str = str(database)
    
# Maximum false discovery rate. Valid values are 0.05, 0.1, 0.2, 0.5 corresponding to 5%, 10%, 20% and 50%
fdr = 0.5

## Set paths of the datasets and store final metadata table

In [10]:
subset_metadata.loc[:, "ion_metadata_path"] = [str(metaspace_data_dir / f"{ds_name}_{database_str}_{str(fdr)}_ion_metadata.csv") for ds_name in subset_metadata.datasetName]
subset_metadata.loc[:, "ion_images_path"] = [str(metaspace_data_dir / f"{ds_name}_{database_str}_{str(fdr)}_ion_images.zarr") for ds_name in subset_metadata.datasetName]
subset_metadata.loc[:, "tic_path"] = [str(metaspace_data_dir / f"{ds_name}_{database_str}_{str(fdr)}_tic.zarr") for ds_name in subset_metadata.datasetName]
subset_metadata.loc[:, "adata_path"] = [str(metaspace_data_dir / f"{ds_name}_{database_str}_{str(fdr)}_adata.h5ad") for ds_name in subset_metadata.datasetName]
subset_metadata.to_csv(metadata_save_path, index=False)

## Download

In [11]:
sm = SMInstance(api_key=api_key, email=username)

for id, ds_meta in subset_metadata.iterrows():
    dataset_id = ds_meta["datasetId"]
    ion_images_path = ds_meta["ion_images_path"]
    ion_metadata_path = ds_meta["ion_metadata_path"]
    tic_path = ds_meta["tic_path"]

    ion_metadata, ion_images_array, tic = download_all_annotation_images_to_zarr(
        sm=sm,
        ion_images_path=ion_images_path,
        tic_path=tic_path,
        dataset_id=dataset_id,
        database=database,
    )
    ion_metadata.to_csv(ion_metadata_path)

Dataset name:  20211123_AP_DANneg_100x100_a32ss25__rf60_CP350_3.10kV_mz100-500_Well6
Dataset ID:  2021-11-25_12h06m17s
Available databases: HMDB, CoreMetabolome, VS_custom_database_long, SwissLipids
Downloading annotations for database  ('VS_custom_database_long', 'Sep2020')


100%|████████████████████████████████████████| 499/499 [00:03<00:00, 147.84it/s]


Ion image shape:  (100, 100)
Dataset name:  20211123_AP_DANneg_100x100_a32ss25__rf60_CP350_3.10kV_mz100-500_Well5
Dataset ID:  2021-11-25_12h05m22s
Available databases: HMDB, CoreMetabolome, VS_custom_database_long, SwissLipids
Downloading annotations for database  ('VS_custom_database_long', 'Sep2020')


100%|████████████████████████████████████████| 501/501 [00:03<00:00, 146.81it/s]


Ion image shape:  (100, 100)
Dataset name:  20211123_AP_DANneg_100x100_a32ss25__rf60_CP350_3.10kV_mz100-500_Well2
Dataset ID:  2021-11-25_12h03m15s
Available databases: HMDB, CoreMetabolome, VS_custom_database_long, SwissLipids
Downloading annotations for database  ('VS_custom_database_long', 'Sep2020')


100%|████████████████████████████████████████| 501/501 [00:03<00:00, 153.12it/s]


Ion image shape:  (100, 100)
Dataset name:  20211123_AP_DANneg_100x100_a32ss25__rf60_CP350_3.10kV_mz100-500_Well1
Dataset ID:  2021-11-25_12h02m02s
Available databases: HMDB, CoreMetabolome, VS_custom_database_long, SwissLipids
Downloading annotations for database  ('VS_custom_database_long', 'Sep2020')


100%|████████████████████████████████████████| 500/500 [00:03<00:00, 156.31it/s]


Ion image shape:  (100, 100)
Dataset name:  20211123_AP_DANneg_100x100_a32ss25__rf60_CP350_3.10kV_mz100-500_Well4
Dataset ID:  2021-11-25_12h04m46s
Available databases: HMDB, CoreMetabolome, VS_custom_database_long, SwissLipids
Downloading annotations for database  ('VS_custom_database_long', 'Sep2020')


100%|████████████████████████████████████████| 503/503 [00:02<00:00, 168.39it/s]


Ion image shape:  (100, 100)
Dataset name:  20211123_AP_DANneg_100x100_a32ss25__rf60_CP350_3.10kV_mz100-500_Well8
Dataset ID:  2021-11-25_12h07m56s
Available databases: HMDB, CoreMetabolome, VS_custom_database_long, SwissLipids
Downloading annotations for database  ('VS_custom_database_long', 'Sep2020')


100%|████████████████████████████████████████| 497/497 [00:03<00:00, 155.88it/s]


Ion image shape:  (100, 100)
Dataset name:  20211123_AP_DANneg_100x100_a32ss25__rf60_CP350_3.10kV_mz100-500_Well7
Dataset ID:  2021-11-25_12h07m11s
Available databases: HMDB, CoreMetabolome, VS_custom_database_long, SwissLipids
Downloading annotations for database  ('VS_custom_database_long', 'Sep2020')


100%|████████████████████████████████████████| 501/501 [00:02<00:00, 167.69it/s]


Ion image shape:  (100, 100)
Dataset name:  20211123_AP_DANneg_100x100_a32ss25__rf60_CP350_3.10kV_mz100-500_Well3
Dataset ID:  2021-11-25_12h04m00s
Available databases: HMDB, CoreMetabolome, VS_custom_database_long, SwissLipids
Downloading annotations for database  ('VS_custom_database_long', 'Sep2020')


100%|████████████████████████████████████████| 502/502 [00:03<00:00, 143.49it/s]


Ion image shape:  (100, 100)
