# Apply classification model

In [190]:
%reload_ext autoreload
%autoreload 2
%matplotlib notebook
# %matplotlib inline

In [191]:
import sys

from catboost import CatBoostClassifier, Pool
from scipy.ndimage import binary_dilation
from sklearn import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from pathlib import Path
import numpy as np
import pandas as pd
from definitions import ROOT_DIR
import matplotlib.pyplot as plt
import seaborn as sns
from metaspace.sm_annotation_utils import SMInstance
from metaspace.image_processing import clip_hotspots

import getpass
from metaspace import SMInstance
from datetime import datetime

from matplotlib.colors import Normalize, LogNorm

In [192]:
# Suppress warnings, because many models spam them during feature selection
# as some subsets of features just don't have enough information to make
# a good model.
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning)

## Utility functions

In [193]:
def colorize_image_with_mask(image, mask):
    """Plotting function for combining a colorized ion image with a spot mask"""
    
    image = clip_hotspots(image)
    image /= np.max(image)
    
    on_spot_colorized = plt.cm.cividis(image)
    off_spot_colorized = plt.cm.magma(image)
    return np.where(mask[:,:,np.newaxis], on_spot_colorized, off_spot_colorized)
    
def save_image_with_mask(image, mask, fname):
    plt.imsave(fname, colorize_image_with_mask(image, mask))

In [194]:
def crop_zeros(img):
    """Crop an image, removing all empty outer rows/columns"""
    cols = np.flatnonzero(np.count_nonzero(img, axis=0) != 0)
    rows = np.flatnonzero(np.count_nonzero(img, axis=1) != 0)
    top = rows[0]
    bottom = rows[-1] + 1
    left = cols[0]
    right = cols[-1] + 1

    return img[top:bottom, left:right]

In [195]:
def get_mispredictions(model, X, y):
    """
    Find which values would be mispredicted, returning two lists:
        * indexes of items that would be falsely predicted as positives
        * indexes of items that would be falsely predicted as negatives
        
    cross_val_predict uses a shuffled 5-fold test-train split so that each chunk of 
    20% of the input data gets its own model that was trained on the other 80%, 
    ensuring that the items being predicted aren't included in the training data.
    """
    preds = cross_val_predict(model, X, y)
    mispreds = preds != y
    fpos_idxs = np.flatnonzero(mispreds & ~y)
    fneg_idxs = np.flatnonzero(mispreds & y)
        
    return fpos_idxs, fneg_idxs

## Paths

In [196]:
p_root_dir = Path(ROOT_DIR)

p_data = p_root_dir / "5_data_analysis"
p_analysis = p_root_dir  / "4_model_evaluation"
p_grids = p_root_dir / r"2_grid_calibration/grid_masks/20labs_all"
p_wellmap = p_root_dir / "5_data_analysis/wellmap.csv"

# Paths for evaluation
p_eval = p_analysis / "model_application"
p_pickles = p_eval / "pickles"
p_metrics = p_eval / "metrics"
p_apply = p_analysis / "model_application_best_replicates"
p_images = p_eval / "images.hdf5"
p_model = p_analysis / 'model_evaluation' / 'model.json'
p_datasets = p_root_dir / "5_data_analysis/Datasets_modified.csv"
p_metadata = p_data / "Datasets_01_June_2022.csv"
p_chem_class = p_data / "custom_classification_v2.csv"

timestamp = datetime.now().strftime("%d-%b-%Y") 
p_predictions = p_eval / f"all_predictions_{timestamp}.csv"
p_predictions_curated = p_apply / f"all_predictions_curated_{timestamp}.csv"

# False positives/negatives - preview output from model prediction for molecules with known labels
# Note that all files in these directories are cleared before a prediction run
p_eval_fpos = p_eval / 'false_positives'
p_eval_fneg = p_eval / 'false_negatives'
p_eval_tpos = p_eval / 'true_positives'
p_eval_tneg = p_eval / 'true_negatives'
# Unknown positives/negatives - preview output from model prediction for molecules with no label
# Note that all files in these directories are cleared before a prediction run
p_eval_upos = p_eval / 'unknown_positives'
p_eval_uneg = p_eval / 'unknown_negatives'
# Manually labeled positives/negatives - Move preview files from any of the above directories into 
# these directories to add to the labelled data. Make sure to re-run the appropriate steps 
# in "Input data" to detect the changes
p_eval_lpos = p_eval / 'manual_label_positives'
p_eval_lneg = p_eval / 'manual_label_negatives'
# Manually labeled positives/negatives - Move preview files from any of the above directories into 
# these directories to add to the labelled data. Make sure to re-run the appropriate steps 
# in "Input data" to detect the changes
p_apply_lpos = p_apply / 'manual_label_positives'
p_apply_lneg = p_apply / 'manual_label_negatives'
# Directories for three-state positive/unsure/negative classification
p_tri_pos = p_eval / 'three-state' / 'positive'
p_tri_unk = p_eval / 'three-state' / 'unsure'
p_tri_neg = p_eval / 'three-state' / 'negative'

# METASPACE
database = ('Spotting_project_compounds-v9', 'feb2021')
fdr = 0.5

print(timestamp)

15-Aug-2022


In [8]:
# Log into metaspace
sm = SMInstance(host='https://metaspace2020.eu')

if not sm.logged_in():
    # Using getpass here prevents the API key from being accidentally saved with this notebook.
    api_key = getpass.getpass(prompt='API key: ', stream=None)
    sm.login(api_key=api_key)

API key: ········


## Input data

In [197]:
# Get dataset IDs based on grid files 
datasets = pd.read_csv(p_datasets)
dataset_stems = [x.stem[-20:] for x in p_grids.glob("*.npy")]
dataset_paths = [x for x in p_grids.glob("*.npy")]
dataset_names = [x.stem for x in p_grids.glob("*.npy")]
dataset_ids = datasets['Clone ID']
dataset_new_ids = datasets['20 Labs ID']

In [198]:
#Check if we need to download additional data (assumes that correctly named pickle files are correct!)
to_download = []
pickles = [x.stem[-20:] for x in p_pickles.glob("*.pkl")]
for i, ds_id in enumerate(dataset_new_ids):
    if ds_id not in pickles:
        print(ds_id)
        to_download.append(ds_id)

2022-02-18_23h32m31s
2022-02-18_23h32m33s
2022-02-18_23h32m35s
2022-02-18_23h32m37s
2022-03-14_22h35m48s


In [33]:
# Images from METASPACE
# Ignore any warnings about connection pools in this step

p_eval.mkdir(parents=True, exist_ok=True)
#pickles = [x.stem[-20:] for x in p_eval.glob("*.pkl")]

for i, ds_id in enumerate(dataset_new_ids):
    if ds_id in to_download:
        images = []
        print(f'Downloading images for {ds_id} ({i}/{len(dataset_ids)-1})')
        dataset = sm.dataset(id=ds_id)
        ds_tic_image = dataset.tic_image()
        for img in dataset.all_annotation_images(
            fdr=fdr, 
            database=database, 
            only_first_isotope=True, 
            scale_intensity=True, 
            hotspot_clipping=False
        ):
            # Exclude annotations with no first-isotopic-image
            if img[0] is not None:
                images.append({
                    'dataset_id': ds_id,
                    'formula': img.formula,
                    'adduct': img.adduct,
                    'neutral_loss': img.neutral_loss or '',
                    'image': img[0],
                    'tic_norm_image': np.nan_to_num(img[0] / ds_tic_image),  # nan_to_num replaces nan values with 0.0. This line will probably complain about division by zero but it can be ignored as it's fixed by the nan_to_num
                })
        images_df = pd.DataFrame(images)
        images_df.to_pickle(p_pickles / f"images_{ds_id}.pkl")
        print(f'Images for {ds_id} saved')

In [199]:
# Wellmap and grids
wellmap = pd.read_csv(p_wellmap)
grids = {
    ds_stem: np.load(ds_p) 
    for ds_stem, ds_p in zip(dataset_stems, dataset_paths)
}

In [200]:
#Sanity check - do we have all the data that we have grids for?
sorted(set(grids)) == sorted(set(dataset_stems))

True

## Calculate metrics (or load pre-calculated)

In [201]:
# Calculate metrics
def calc_far_bg(mask, bg):
    """Gets mask for background pixels that are at least 4 radii away from the spot"""
    # 3 iterations = (1+3=)4x the spot radius
    expanded_spot = binary_dilation(mask, crop_zeros(mask), iterations=3)
    return bg & ~expanded_spot

def occ(px):
    """Calculates non-zero % of the given array"""
    return np.count_nonzero(px) / px.size

def calculate_metrics(merged_df, grids, dataset_ids, dataset_new_ids, path):
    
    lasterror = ""
    progress = 0
    metrics = []
    
    for row in merged_df.itertuples():
        
        progress = progress+1
        if progress % 1000 == 0:
            print(progress)
        
        if row.dataset_id in list(dataset_new_ids):
            oid = list(dataset_ids[dataset_new_ids==row.dataset_id].values)[0]
            grid = grids[oid]
        else:
            grid= grids[row.dataset_id]

        mask = grid == row.well
        bg = grid == 0
        
        #Catch missing wells
        try:
            far_bg = calc_far_bg(mask, bg)
        except:
            error = f"Missing well: {row.dataset_id} #{row.well}"
            if error != lasterror:
                print(error)
                lasterror = error
            continue

        in_mask = row.image[mask]   
        in_bg = row.image[bg]
        in_far_bg = row.image[far_bg]   
        in_other_spots = row.image[~bg & ~mask]

        # tic image
        in_mask_tic_norm = row.tic_norm_image[mask]
        in_bg_tic_norm = row.tic_norm_image[bg]
        in_far_bg_tic_norm = row.tic_norm_image[far_bg]
        in_other_spots_tic_norm = row.tic_norm_image[~bg & ~mask]

        # Calculate threshold (0.01 * 99th percentile) 
        # (note the image is already hotspot-removed, so the max is the 99th percentile)
        threshold = np.max(row.image) * 0.01
        metrics.append({
            'row_id': row[0],   # with .itertuples(), item[0] is the index
            'dataset_id' : row.dataset_id,
            'name_short' : row.name_short,
            'formula' : row.formula,
            'adduct' : row.adduct,
            'neutral_loss' : row.neutral_loss,
            'well' : row.well,
            # Original metrics
            # NOTE: The constant in the denominator of `on_off_ratio` was changed to
            # 0.001 as it seemed to produce slightly better results
            'occupancy_ratio': (occ(in_mask) * 100) / (occ(in_bg) * 100 + 1),
            'on_off_ratio': (np.mean(in_mask)) / (np.mean(in_bg) + 0.001),

            # Single-spot occupancy %
            'spot_occupancy': occ(in_mask),
            'spot_occupancy_thresholded': occ(in_mask > threshold),
            # Other occupancy metrics
            'image_occupancy': occ(row.image),
            'other_spots_occupancy': occ(in_other_spots),
            'bg_occupancy': occ(in_bg),
            'far_bg_occupancy': occ(in_bg),
            'occupancy_vs_far_bg_ratio' : (occ(in_mask) * 100) / (occ(in_far_bg) * 100 + 1),

            # How many spots have a non-zero pixel
            'in_n_spots': len(np.unique(grid[(grid != 0) & (row.image > threshold)])),

            # Intensity ratios
            'spot_intensity' : np.mean(in_mask),
            'spot_intensity_bgr_corrected' : np.mean(in_mask) - np.mean(in_far_bg),
            'spot_intensity_sum' : np.sum(in_mask),
            'spot_intensity_std' : np.std(in_mask),
            'other_spot_intensity': np.mean(in_other_spots),
            'bg_intensity' : np.mean(in_bg),
            'far_bg_intensity' : np.mean(in_far_bg),
            'intensity_vs_far_bg_ratio': np.mean(in_mask) / (np.mean(in_far_bg) + 0.001),
            'intensity_vs_other_spots_ratio': np.mean(in_mask) / (np.mean(in_other_spots) + 0.001),
           
            # Intensity ratios for TIC normalised
            'spot_intensity_tic_norm': np.mean(in_mask_tic_norm),
            'spot_intensity_bgr_corrected_tic_norm' : np.mean(in_mask_tic_norm) - np.mean(in_far_bg_tic_norm),
            'spot_intensity_sum_tic_norm' : np.sum(in_mask_tic_norm),
            'spot_intensity_std_tic_norm' : np.std(in_mask_tic_norm),
            'other_spot_intensity_tic_norm': np.mean(in_other_spots_tic_norm),
            'bg_intensity_tic' : np.mean(in_bg_tic_norm),
            'far_bg_intensity_tic' : np.mean(in_far_bg_tic_norm),
            'intensity_vs_far_bg_ratio_tic': np.mean(in_mask_tic_norm) / (np.mean(in_far_bg_tic_norm) + 0.001),
            'intensity_vs_other_spots_ratio_tic': np.mean(in_mask_tic_norm) / (np.mean(in_other_spots_tic_norm) + 0.001),
        })

    metrics_df = pd.DataFrame(metrics).set_index('row_id')
    metrics_df.to_csv(path)

In [203]:
# Load pre-saved individual images_df and generate metrics
start = 0 #Start and end points allow running a subset of pickles (for example when adding more data to the project)
count = 0
end = 200
for fpath in p_eval.rglob("*.pkl"):
    if fpath.stem[-20:] in pickles and count >= start and count <= end:
        print(f"Loading {fpath.name}")
        try:
            f = pd.read_pickle(fpath)
        except:
            print(f"Failed to load {fpath.name}")
            continue
        merged_df = f.merge(wellmap[['well', 'formula', 'name_short']], on=['formula']).reset_index()
        merged_df['row_id'] = [f'{row.dataset_id}_{row.formula}_{row.adduct}_{row.neutral_loss}_{row.well}' for row in merged_df.itertuples()]
        merged_df = merged_df.set_index('row_id')
        print(merged_df['dataset_id'].unique())
        calculate_metrics(merged_df, grids, dataset_ids, dataset_new_ids, p_metrics / f"Metrics_{timestamp}_{count}.csv")
            
    count = count+1

Loading images_2021-07-29_18h23m02s.pkl
['2021-07-29_18h23m02s']
1000
2000
3000
Loading images_2021-07-29_23h46m45s.pkl
['2021-07-29_23h46m45s']
1000
2000
3000
Loading images_2021-07-30_00h51m20s.pkl
['2021-07-30_00h51m20s']
1000
2000
3000
4000
Loading images_2021-07-30_02h04m13s.pkl
['2021-07-30_02h04m13s']
1000
2000
3000
4000
Loading images_2021-07-30_02h22m33s.pkl
['2021-07-30_02h22m33s']
1000
2000
3000
4000
Loading images_2021-07-30_02h32m16s.pkl
['2021-07-30_02h32m16s']
1000
2000
3000
Loading images_2021-07-30_02h53m49s.pkl
['2021-07-30_02h53m49s']
1000
2000
3000
4000
Loading images_2021-07-30_03h21m10s.pkl
['2021-07-30_03h21m10s']
1000
2000
3000
4000
Loading images_2021-07-30_04h08m37s.pkl
['2021-07-30_04h08m37s']
1000
2000
3000
4000
5000
Loading images_2021-07-30_04h20m26s.pkl
['2021-07-30_04h20m26s']
Missing well: 2021-07-30_04h20m26s #2
1000
2000
3000
4000
5000
Loading images_2021-07-30_04h27m16s.pkl
['2021-07-30_04h27m16s']
1000
2000
3000
4000
Loading images_2021-07-30_04h35m

Missing well: 2022-02-18_23h32m46s #57
Missing well: 2022-02-18_23h32m46s #100
Missing well: 2022-02-18_23h32m46s #57
Missing well: 2022-02-18_23h32m46s #100
Missing well: 2022-02-18_23h32m46s #57
Missing well: 2022-02-18_23h32m46s #100
Missing well: 2022-02-18_23h32m46s #57
Missing well: 2022-02-18_23h32m46s #100
Missing well: 2022-02-18_23h32m46s #57
Missing well: 2022-02-18_23h32m46s #100
Missing well: 2022-02-18_23h32m46s #57
Missing well: 2022-02-18_23h32m46s #100
Missing well: 2022-02-18_23h32m46s #57
Missing well: 2022-02-18_23h32m46s #100
Missing well: 2022-02-18_23h32m46s #57
Missing well: 2022-02-18_23h32m46s #100
Missing well: 2022-02-18_23h32m46s #57
Missing well: 2022-02-18_23h32m46s #100
Missing well: 2022-02-18_23h32m46s #57
Missing well: 2022-02-18_23h32m46s #100
Missing well: 2022-02-18_23h32m46s #57
Missing well: 2022-02-18_23h32m46s #100
Missing well: 2022-02-18_23h32m46s #57
Missing well: 2022-02-18_23h32m46s #100
Missing well: 2022-02-18_23h32m46s #57
Missing well:

Missing well: 2022-02-18_23h32m46s #33
Missing well: 2022-02-18_23h32m46s #40
Missing well: 2022-02-18_23h32m46s #33
Missing well: 2022-02-18_23h32m46s #40
Missing well: 2022-02-18_23h32m46s #33
Missing well: 2022-02-18_23h32m46s #40
Missing well: 2022-02-18_23h32m46s #33
Missing well: 2022-02-18_23h32m46s #40
Missing well: 2022-02-18_23h32m46s #33
Missing well: 2022-02-18_23h32m46s #40
Missing well: 2022-02-18_23h32m46s #33
Missing well: 2022-02-18_23h32m46s #40
Missing well: 2022-02-18_23h32m46s #33
Missing well: 2022-02-18_23h32m46s #40
Missing well: 2022-02-18_23h32m46s #33
Missing well: 2022-02-18_23h32m46s #40
Missing well: 2022-02-18_23h32m46s #33
Missing well: 2022-02-18_23h32m46s #40
Missing well: 2022-02-18_23h32m46s #33
Missing well: 2022-02-18_23h32m46s #40
Missing well: 2022-02-18_23h32m46s #33
Missing well: 2022-02-18_23h32m46s #40
Missing well: 2022-02-18_23h32m46s #33
Missing well: 2022-02-18_23h32m46s #40
Missing well: 2022-02-18_23h32m46s #33
Missing well: 2022-02-18_

In [204]:
# Load multiple metrics files and join
metrics_list = []
for fpath in p_metrics.rglob("*.csv"):
    #if fpath.stem[-20:] in list(dataset_new_ids):
    #print(f"Loading {fpath.name}")
    try:
        f = pd.read_csv(fpath, index_col=0)
        metrics_list.append(f)
    except:
        print(f"Failed to load {fpath.name}")
metrics_df = pd.concat(metrics_list)#.reset_index()
# metrics_df = metrics_df.set_index('row_id')
# metrics_df = metrics_df.drop(columns=['index'])
metrics_df['score'] = np.nan

## Examine results for a specific model

In [205]:
model = CatBoostClassifier(verbose=False)
features = ['spot_intensity_tic_norm', 'spot_occupancy', 'occupancy_vs_far_bg_ratio', 'intensity_vs_far_bg_ratio', 'intensity_vs_other_spots_ratio']

model.load_model(p_model, format='json')

# Make predictions for all data
predictions_df = pd.DataFrame({
    'pred_val': model.predict_proba(metrics_df[features].values)[:, 1]
}, index=metrics_df.index)

### Both options: Assign labels to predictions

In [206]:
 # Make combined DF
output_df = metrics_df.join(predictions_df)
output_df = output_df[output_df['pred_val'] > -1] #Drops nan
#output_df = metrics_df.join(predictions_df)

# Add two-state and three-state classes
output_df['pred_twostate'] = np.where(output_df.pred_val < 0.5, 0, 1)
unsure_range = [0.2, 0.8] # Lowest & highest values to include in the "unsure" class
# This assigns 0 = negative, 1 = unsure, 2 = positive
output_df['pred_threestate'] = np.digitize(output_df.pred_val, unsure_range)

### Write predictions CSV files

In [209]:
#Manually merging results from partially overlapping images. 
#Simple way by reassigning all ids to the primary image, then removing any duplicates (from the overlap)

csv_df = output_df
csv_df = csv_df.replace('2022-02-18_23h32m33s','2022-02-18_23h32m31s')
csv_df = csv_df.replace('2022-02-18_23h32m37s','2022-02-18_23h32m35s')
csv_df = csv_df.replace('2022-02-18_23h32m46s','2022-02-18_23h32m44s')

csv_df = csv_df.drop_duplicates(subset=['dataset_id','well','formula','adduct','neutral_loss'])

#Trim overlapping annotations in unmerged dataset pairs, keep highest pred_val

datasets = pd.read_csv(p_metadata)
datasets_info = datasets.groupby('Dataset ID').first()[['Polarity', 'Participant lab', 'Slide code', 'All', 'EMBL', 'Interlab', 'Technology', 'Matrix short']] # 'Participant lab', 'Technology'
datasets_info['sample_name'] = datasets_info['Slide code'] + ': ' + datasets_info['Technology'] + ': ' + datasets_info['Matrix short']
df = pd.merge(csv_df, datasets_info[['sample_name', 'Polarity']], left_on='dataset_id', right_on='Dataset ID', how='left')
df.sort_values(by='pred_val', ascending=False)
df = df.drop_duplicates(subset=['sample_name','well','formula','adduct','neutral_loss', 'Polarity'], keep='first')
df = df.drop(columns=['sample_name', 'Polarity']).sort_index()

In [210]:
csv_df = df

In [142]:
set(df['pred_threestate'])

{0, 1, 2}

In [212]:
len(df.neutral_loss)

443000

In [211]:
#csv_df = output_df.drop(columns=['image', 'filename']) # Skip unwanted columns
csv_df.to_csv(p_predictions)

for dataset_id, results_df in csv_df.groupby('dataset_id'):
    output_path = p_eval / f'{dataset_id}_predictions.csv'
    results_df.to_csv(output_path)

### Write image files into false positives, false negatives, etc.

In [147]:
# Clean output directories
for output_path in [
#     p_eval_fpos, p_eval_fneg, p_eval_tpos, p_eval_tneg, 
    p_eval_upos, p_eval_uneg, 
    p_tri_pos, p_tri_unk, p_tri_neg
]:
    output_path.mkdir(parents=True, exist_ok=True)
    for f in output_path.glob('*.png'):
        f.unlink()  # Delete existing files

# Write images with two-state classification
for row in output_df.itertuples():
    
    if row.dataset_id in list(dataset_new_ids):
        oid = list(dataset_ids[dataset_new_ids==row.dataset_id].values)[0]
        grid = grids[oid]
    else:
        grid = grids[row.dataset_id]
    
    try:
        mask = grid == row.well
    except:
        print("Error")
    # Figure out which directory to use
#     if row.score == 0:
#         twostate_path = [p_eval_tneg, p_eval_fpos][row.pred_twostate]
#     elif row.score == 1:
#         twostate_path = [p_eval_fneg, p_eval_tpos][row.pred_twostate]
#     else:
    twostate_path = [p_eval_uneg, p_eval_upos][row.pred_twostate]
    
    save_image_with_mask(row.image, mask, twostate_path / row.filename)
    
# Write images with three-state classification
for row in output_df.itertuples():
    
    if row.dataset_id in list(dataset_new_ids):
        oid = list(dataset_ids[dataset_new_ids==row.dataset_id].values)[0]
        grid = grids[oid]
    else:
        grid = grids[row.dataset_id]
    
    try:
        mask = grid == row.well
        threestate_path = [p_tri_neg, p_tri_unk, p_tri_pos][row.pred_threestate]
        save_image_with_mask(row.image, mask, threestate_path / row.filename)
    except:
        print("Error")


AttributeError: 'Pandas' object has no attribute 'image'

In [180]:
csv_df

Unnamed: 0_level_0,dataset_id,name_short,formula,adduct,neutral_loss,well,occupancy_ratio,on_off_ratio,spot_occupancy,spot_occupancy_thresholded,...,spot_intensity_std_tic_norm,other_spot_intensity_tic_norm,bg_intensity_tic,far_bg_intensity_tic,intensity_vs_far_bg_ratio_tic,intensity_vs_other_spots_ratio_tic,score,pred_val,pred_twostate,pred_threestate
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-07-29_18h23m02s_C42H81NO8_[M]-_+C8H8O3_37,2021-07-29_18h23m02s,GlcCer d18:1/18:0,C42H81NO8,[M]-,+C8H8O3,37,27.683616,3.473016e+07,0.276836,0.237288,...,0.005269,3.342277e-08,0.000000e+00,0.000000e+00,1.968299,1.968233,,0.998725,1,2
2021-07-29_18h23m02s_C42H81NO8_[M]-__37,2021-07-29_18h23m02s,GlcCer d18:1/18:0,C42H81NO8,[M]-,,37,12.896532,9.776323e+02,0.338983,0.288136,...,0.000707,3.735256e-07,1.014762e-06,1.038963e-06,0.282808,0.282996,,0.997522,1,2
2021-07-29_18h23m02s_C42H81NO8_[M]-_-H2_37,2021-07-29_18h23m02s,GlcCer d18:1/18:0,C42H81NO8,[M]-,-H2,37,1.818681,3.103635e+01,0.508475,0.384181,...,0.000167,7.464335e-06,9.086224e-06,9.278818e-06,0.073879,0.074012,,0.759375,1,1
2021-07-29_18h23m02s_C42H81NO8_-H_+H2_37,2021-07-29_18h23m02s,GlcCer d18:1/18:0,C42H81NO8,-H,+H2,37,1.637234,3.772151e+01,0.372881,0.271186,...,0.000170,5.479433e-06,6.899984e-06,7.058225e-06,0.068767,0.068875,,0.521369,1,1
2021-07-29_18h23m02s_C42H81NO8_+Cl_+C8H8O3_37,2021-07-29_18h23m02s,GlcCer d18:1/18:0,C42H81NO8,+Cl,+C8H8O3,37,0.663699,8.734966e-01,0.248588,0.248588,...,0.000010,1.427082e-05,1.554668e-05,1.586990e-05,0.004272,0.004278,,0.009389,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-29_18h23m02s_C3H4O3_-H_-H2_102,2021-07-29_18h23m02s,Pyruvic acid,C3H4O3,-H,-H2,102,0.000000,0.000000e+00,0.000000,0.000000,...,0.000000,1.440544e-08,1.386637e-08,1.422462e-08,0.000000,0.000000,,0.006168,0,0
2021-07-29_18h23m02s_C3H4O3_+Cl_-H2_102,2021-07-29_18h23m02s,Pyruvic acid,C3H4O3,+Cl,-H2,102,0.507274,4.549708e+00,0.005650,0.005650,...,0.000002,2.226675e-08,1.698576e-08,1.656464e-08,0.000182,0.000182,,0.003462,0,0
2021-07-29_18h23m02s_C3H4O3_+Cl__102,2021-07-29_18h23m02s,Pyruvic acid,C3H4O3,+Cl,,102,0.000000,0.000000e+00,0.000000,0.000000,...,0.000000,3.610870e-08,2.223975e-08,2.264994e-08,0.000000,0.000000,,0.006168,0,0
2021-07-29_18h23m02s_C3H4O3_+Cl_-CO2_102,2021-07-29_18h23m02s,Pyruvic acid,C3H4O3,+Cl,-CO2,102,0.000000,0.000000e+00,0.000000,0.000000,...,0.000000,9.106587e-09,1.128143e-08,1.157289e-08,0.000000,0.000000,,0.006168,0,0


In [65]:
csv_df.Polarity

row_id
2021-07-29_18h23m02s_C10H12N2O3_+Cl_+C8H8O3_155    negative
2021-07-29_18h23m02s_C10H12N2O3_+Cl_+H2_155        negative
2021-07-29_18h23m02s_C10H12N2O3_+Cl_-CH2O3_155     negative
2021-07-29_18h23m02s_C10H12N2O3_+Cl_-CO2_155       negative
2021-07-29_18h23m02s_C10H12N2O3_+Cl_-H2O_155       negative
                                                     ...   
2022-06-01_18h22m03s_CH4NO5P_[M]+_-CH2O3_94        positive
2022-06-01_18h22m03s_CH4NO5P_[M]+_-CO2_94          positive
2022-06-01_18h22m03s_CH4NO5P_[M]+_-H2O_94          positive
2022-06-01_18h22m03s_CH4NO5P_[M]+_-H2_94           positive
2022-06-01_18h22m03s_CH4NO5P_[M]+_-NH3_94          positive
Name: Polarity, Length: 390715, dtype: object

In [148]:
output_df

Unnamed: 0_level_0,dataset_id,name_short,formula,adduct,neutral_loss,well,occupancy_ratio,on_off_ratio,spot_occupancy,spot_occupancy_thresholded,...,intensity_vs_other_spots_ratio,spot_intensity_tic_norm,spot_intensity_bgr_corrected_tic_norm,spot_intensity_sum_tic_norm,spot_intensity_std_tic_norm,other_spot_intensity_tic_norm,score,pred_val,pred_twostate,pred_threestate
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-07-29_18h23m02s_C42H81NO8_[M]-_+C8H8O3_37,2021-07-29_18h23m02s,GlcCer d18:1/18:0,C42H81NO8,[M]-,+C8H8O3,37,27.683616,3.473016e+07,0.276836,0.237288,...,1.968233,1.968299e-03,1.968299e-03,0.348389,0.005269,3.342277e-08,,0.675284,1,1
2021-07-29_18h23m02s_C42H81NO8_[M]-__37,2021-07-29_18h23m02s,GlcCer d18:1/18:0,C42H81NO8,[M]-,,37,12.896532,9.776323e+02,0.338983,0.288136,...,0.282996,2.831021e-04,2.820631e-04,0.050109,0.000707,3.735256e-07,,0.576847,1,1
2021-07-29_18h23m02s_C42H81NO8_[M]-_-H2_37,2021-07-29_18h23m02s,GlcCer d18:1/18:0,C42H81NO8,[M]-,-H2,37,1.818681,3.103635e+01,0.508475,0.384181,...,0.074012,7.456457e-05,6.528575e-05,0.013198,0.000167,7.464335e-06,,0.007378,0,0
2021-07-29_18h23m02s_C42H81NO8_-H_+H2_37,2021-07-29_18h23m02s,GlcCer d18:1/18:0,C42H81NO8,-H,+H2,37,1.637234,3.772151e+01,0.372881,0.271186,...,0.068875,6.925205e-05,6.219382e-05,0.012258,0.000170,5.479433e-06,,0.005964,0,0
2021-07-29_18h23m02s_C42H81NO8_+Cl_+C8H8O3_37,2021-07-29_18h23m02s,GlcCer d18:1/18:0,C42H81NO8,+Cl,+C8H8O3,37,0.663699,8.734966e-01,0.248588,0.248588,...,0.004278,4.339548e-06,-1.153035e-05,0.000768,0.000010,1.427082e-05,,0.009554,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-06-01_18h22m03s_C2H5NO2_[M]+__93,2022-06-01_18h22m03s,Glycine,C2H5NO2,[M]+,,93,1.314244,1.410388e+01,0.014493,0.014493,...,0.000295,2.946150e-07,2.737837e-07,0.000020,0.000002,2.012838e-08,,0.002403,0,0
2022-06-01_18h22m03s_C2H5NO2_+Na__93,2022-06-01_18h22m03s,Glycine,C2H5NO2,+Na,,93,5.060521,2.637430e+02,0.057971,0.057971,...,0.009213,9.213558e-06,9.183582e-06,0.000636,0.000057,1.484226e-08,,0.087077,0,0
2022-06-01_18h22m03s_C2H5NO2_+Na_+H2_93,2022-06-01_18h22m03s,Glycine,C2H5NO2,+Na,+H2,93,0.000000,0.000000e+00,0.000000,0.000000,...,0.000000,0.000000e+00,-1.484649e-08,0.000000,0.000000,6.689161e-09,,0.006168,0,0
2022-06-01_18h22m03s_C2H5NO2_+H__93,2022-06-01_18h22m03s,Glycine,C2H5NO2,+H,,93,5.447302,3.522930e+02,0.057971,0.057971,...,0.005462,5.461903e-06,5.448562e-06,0.000377,0.000029,1.840173e-08,,0.094427,0,0
