# Concatenate individual interlaboratory Catboost prediction result files

In [1]:
from pathlib import Path
import pandas as pd

In [2]:
p_root_dir = Path.cwd().parents[0]
p_datasets = p_root_dir / r"5_data/metadata/Datasets_18Apr2023.csv"
p_data = p_root_dir / r"4_apply_classifier/individual_predictions"
p_out = p_root_dir / r"4_apply_classifier/concatenated_predictions"

In [3]:
# Load metadata file and select only 10ppm metadata
datasets = pd.read_csv(p_datasets, index_col=0)
metadata = datasets[datasets.All]

# Create an empty DataFrame to hold concatenated data
concatenated_df = pd.DataFrame()

# Loop through all csv files in directory
for p_prediction in p_data.glob("*.csv"):
    
    # Load csv file
    prediction_df = pd.read_csv(p_prediction, index_col=0)
    
    # Extract dataset ID
    dataset_id = prediction_df.dataset_id.unique()[0]
    
    # Append csv_df to concatenated_df
    concatenated_df = pd.concat([concatenated_df, prediction_df])
        
# Merge metadata_df with concatenated_df on dataset_id
final_df = pd.merge(metadata, concatenated_df, right_on="dataset_id", left_on="Dataset ID")

In [4]:
# Remove one adduct/neutral_loss combination that doesn"t make sense (-H + HCL= +CL)
final_df.score = final_df.score.fillna("")
final_df.neutral_loss = final_df.neutral_loss.fillna("")
final_df = final_df[~((final_df.neutral_loss == "+HCl")&(final_df.adduct == "-H"))]

In [5]:
# A handful Inerlaboratory survey participants recorded unmergable imzML files (e.g. 1 for lower mass range and one for higher mass range)
# For such cases, we change "dataset_id" column to represent both IDs
metadata["ID for the app"] = metadata.groupby("Sample name")["Dataset ID"].transform(lambda x: "|".join(set(x)))
final_df = final_df.merge(metadata[["Dataset ID", "ID for the app"]].drop_duplicates(), on="Dataset ID", how="left")
final_df.dataset_id = final_df["ID for the app"]

# Next, we make sure that the results in such cases are merged together so that there are no duplicate ions per "Sample name" 
# In particular, where two rows represent the same ion, we only keep the results for ions with best prediction score ("pred_val")
rows_wo_duplicates = final_df.groupby(["Sample name","name_short", "adduct", "neutral_loss"])["pred_val"].idxmax()
print(f"{len(final_df) - len(rows_wo_duplicates)} duplicated ion entries are removed in this step")
final_df = final_df.loc[rows_wo_duplicates]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata["ID for the app"] = metadata.groupby("Sample name")["Dataset ID"].transform(lambda x: "|".join(set(x)))


70879 duplicated ion entries are removed in this step


In [6]:
# Save data files containing metadata for plotting and for the app
cols_to_drop = ['m/z range', 'Pixel size', 'Original Dataset ID (3ppm)', 'Dataset ID', 'Original link',
       'Dataset link', 'EMBL', 'Interlab', 'All', 'ID for the app']

# Save input for Interlab data tab and plotting
interlab_df = final_df[final_df.Interlab].drop(columns = cols_to_drop)
interlab_df.to_csv(p_out / "Interlab_data_19Apr2023.csv", index=False)

# Save input for All data tab
final_df = final_df.drop(columns = cols_to_drop)
final_df.to_csv(p_out / "All_data_19Apr2023.csv", index=False)

In [8]:
# Save the meadata file that includes modified ids (just in case)
metadata = metadata.drop(columns = ['Dataset name', 'Project', 'm/z range', 'Original Dataset ID (3ppm)', 'Dataset ID', 'Original link',
       'Dataset link', 'EMBL', 'Pixel size']).rename(columns={"ID for the app":"Dataset ID"}).drop_duplicates()
metadata.to_csv(p_out / "Interlaboratory-All_metadata_19Apr2023.csv", index=False)