# ML model:

### One-Hot Encoded Models

### 400 files

### with keywords

In [26]:
# === MODELL SPEICHERN ===
prefix = "onehot_400_wkey"
clf_file = f"prometainfer/models/{prefix}_model.pkl"
p_file = f"prometainfer/models/{prefix}_preprocessor.pkl"
enc_file = f"prometainfer/models/{prefix}_label_encoders.pkl"
train_features = f"prometainfer/models/{prefix}_train_features.csv"
keyword_files = "../data/ML_test/keyword_files/keyword_parsing_results.csv"

In [27]:
import os
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer  # For handling NaNs
import joblib

In [28]:
import os
import re
import pandas as pd
from pathlib import Path

# === 1. EXTRACT FEATURES FROM OpenMS FILEINFO TXT FILES ===
def extract_features_from_txt(file_path):
    """Extracts features from an OpenMS FileInfo text file."""
    features = {
        "instrument_model": "Not available",
        "organism": "Not available",
        "tissue": "Not available",
        "disease": "Not available",
        "software": "Not available",
        "activation_method": "",
        "experiment_type": "Not available",
        "fraction_identifier": "Not available",
        "quantification_method": "Not available",
        "cleavage_agent": "Not available",
    }

    with open(file_path, "r") as f:
        content = f.readlines()
    
    if not content:
        print("empty file")

    for line in content:
        line = line.strip()

        # Extract retention time range
        if "retention time:" in line:
            match = re.findall(r"([\d\.]+)", line)
            if len(match) == 2:
                features["rt_min"] = float(match[0])
                features["rt_max"] = float(match[1])
        
        # Extract mass-to-charge (m/z) range
        elif "mass-to-charge:" in line:
            match = re.findall(r"([\d\.]+)", line)
            if len(match) == 2:
                features["mz_min"] = float(match[0])
                features["mz_max"] = float(match[1])
        
        # Extract intensity range
        elif "intensity:" in line:
            match = re.findall(r"([\d\.e\+]+)", line)  # Handle scientific notation
            if len(match) == 2:
                features["intensity_min"] = float(match[0])
                features["intensity_max"] = float(match[1])
        
        # Extract precursor charge distribution
        elif "charge" in line and "x" in line:
            match = re.search(r"charge (\d+): (\d+)x", line)
            if match:
                charge, count = int(match.group(1)), int(match.group(2))
                features[f"precursor_charge_{charge}"] = count
        
        # Extract instrument model
        elif "Instrument:" in line:
            if features["instrument_model"] == "Not available":
                if(line.split("Instrument: ")[-1].strip() != "Instrument:"):
                    features["instrument_model"] = line.split("Instrument: ")[-1].strip()
        
        # Extract total number of peaks
        elif "Total number of peaks:" in line:
            match = re.search(r"(\d+)", line)
            if match:
                features["total_peaks"] = int(match.group(0))
        
        # Extract number of spectra
        elif "Number of spectra:" in line:
            match = re.search(r"(\d+)", line)
            if match:
                features["num_spectra"] = int(match.group(0))
        
        # Extract organism, tissue, and disease (if available)
        elif "organism:" in line.lower():
            if line.split(": ")[-1].strip() != 'organism:':
                features["organism"] = line.split(": ")[-1].strip() 
        elif "tissue:" in line.lower():
            features["tissue"] = line.split(": ")[-1].strip()
        elif "disease:" in line.lower():
            features["disease"] = line.split(": ")[-1].strip()
        
        # Extract software information
        elif "software name:" in line.lower():
            if(line.split(": ")[-1].strip() != 'software name:'):
                features["software"] = line.split(": ")[-1].strip()

        elif(features["activation_method"] == 'Not available'):
            features["activation_method"] = line.split(':')[0].split("MS-Level 2 & ")[-1].strip()
        
        # Extract activation method
        elif "activation methods" in line.lower():
            features["activation_method"] = 'Not available'
        
        # Extract experiment type (e.g., bottom-up, top-down)
        elif "experiment type:" in line.lower():
            features["experiment_type"] = line.split(": ")[-1].strip()
        
        # Extract fraction identifier
        elif "fraction identifier:" in line.lower():
            features["fraction_identifier"] = line.split(": ")[-1].strip()
        
        # Extract quantification method
        elif "quantification:" in line.lower():
            features["quantification_method"] = line.split(": ")[-1].strip()
        
        # Extract cleavage agent details
        elif "cleavage agent:" in line.lower():
            features["cleavage_agent"] = line.split(": ")[-1].strip()
    if(features["activation_method"] == ""):
        features["activation_method"] = "Not available"
    return features

# === 2. PROCESS ALL FILEINFO TXT FILES ===
txt_folders = ["../data/ML_test/fileinfo_files"]
data = []

for folder in txt_folders:
    folder_path = Path(folder)
    for txt_file in folder_path.glob("*"):  # Iterate over all files
        features = extract_features_from_txt(txt_file)
        features["Filename"] = str(txt_file).split("/")[-1]+".mzML"#.split('_fileinfo')[0].split('txts/')[-1]  # Convert filename for matching
        data.append(features)

# Convert to DataFrame
features_df = pd.DataFrame(data)
features_df

Unnamed: 0,instrument_model,organism,tissue,disease,software,activation_method,experiment_type,fraction_identifier,quantification_method,cleavage_agent,...,precursor_charge_41,precursor_charge_42,precursor_charge_43,precursor_charge_44,precursor_charge_45,precursor_charge_46,precursor_charge_47,precursor_charge_48,precursor_charge_49,precursor_charge_50
0,Not available,Not available,Not available,Not available,Proteome Discoverer,HCID (High-energy collision-induced dissociation),Not available,Not available,Not available,Not available,...,,,,,,,,,,
1,Not available,Not available,Not available,Not available,Proteome Discoverer,HCID (High-energy collision-induced dissociation),Not available,Not available,Not available,Not available,...,,,,,,,,,,
2,Q Exactive,Not available,Not available,Not available,ProteoWizard software,HCID (High-energy collision-induced dissociation),Not available,Not available,Not available,Not available,...,,,,,,,,,,
3,Orbitrap Fusion,Not available,Not available,Not available,ProteoWizard software,HCID (High-energy collision-induced dissociation),Not available,Not available,Not available,Not available,...,,,,,,,,,,
4,Not available,Not available,Not available,Not available,ProteoWizard software,HCID (High-energy collision-induced dissociation),Not available,Not available,Not available,Not available,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,Not available,Not available,Not available,Not available,Proteome Discoverer,HCID (High-energy collision-induced dissociation),Not available,Not available,Not available,Not available,...,,,,,,,,,,
396,Not available,Not available,Not available,Not available,Proteome Discoverer,HCID (High-energy collision-induced dissociation),Not available,Not available,Not available,Not available,...,,,,,,,,,,
397,TripleTOF 6600,Not available,Not available,Not available,ProteoWizard software,HCID (High-energy collision-induced dissociation),Not available,Not available,Not available,Not available,...,,,,,,,,,,
398,Not available,Not available,Not available,Not available,Proteome Discoverer,HCID (High-energy collision-induced dissociation),Not available,Not available,Not available,Not available,...,,,,,,,,,,


In [29]:
# load metadata and merge with features
metadata = pd.read_csv("../data/metadata/prot_ids_and_pride_meta.csv")
df = metadata.merge(features_df, on="Filename", how="inner")

# load parsed keywords
#keywords_file = "../data/ML_test/keyword_files/keyword_parsing_results.csv"
#keywords_df = pd.read_csv(keywords_file)
#df = keywords_df.merge(df, on="Filename", how="inner")

# preprocessing
label_columns = ["Organism", "Organism part", "Diseases", "Modification", "Instrument","Software",
                 "Experiment Type","Quantification"]
df[label_columns] = df[label_columns].fillna("Not available")
df[label_columns]

Unnamed: 0,Organism,Organism part,Diseases,Modification,Instrument,Software,Experiment Type,Quantification
0,Mus musculus (mouse),"Cortex of kidney, Kidney",Renal cell carcinoma,"monohydroxylated residue, acetylated residue, ...",Orbitrap Fusion Lumos,"SEQUEST, Proteome Discoverer, Percolator",Shotgun proteomics,precursor ion
1,"Trypanosoma brucei brucei, Leishmania major st...",Not available,Not available,No PTMs are included in the dataset,Orbitrap Fusion Lumos,Not available,Shotgun proteomics,Not available
2,Mus musculus (mouse),Cell culture,Not available,No PTMs are included in the dataset,LTQ Orbitrap Velos,Mascot,Gel-based experiment,Not available
3,Homo sapiens (human),"Epithelial cell, Early embryonic cell",Not available,methylthiolated residue,Q Exactive,Not available,Affinity purification coupled with mass spectr...,Not available
4,Escherichia coli,Not available,Not available,No PTMs are included in the dataset,Q Exactive,Not available,RNA mass spectrometry,Not available
...,...,...,...,...,...,...,...,...
389,Homo sapiens (human),"Substantia nigra, Brain, Dopaminergic neuron",Not available,"monohydroxylated residue, deamidated residue, ...",Orbitrap Fusion Lumos,"Andromeda, MaxQuant",Shotgun proteomics,MS1 intensity based label-free quantification ...
390,Homo sapiens (human),"Cell culture, Brain",Human immunodeficiency virus infectious disease,"monohydroxylated residue, TMT6plex-126 reporte...",Q Exactive,Not available,Shotgun proteomics,TMT
391,"Homo sapiens (human), Mus musculus (mouse)",Not available,Not available,"Gln->pyro-Glu, Carbamyl, TMT6plex, Deamidated,...",Q Exactive,"2.2.0.388 2.2.0.388, Mascot Server 2.4.1, Masc...",Shotgun proteomics,TMT
392,Homo sapiens (human),Brain,Alzheimer's disease,"monohydroxylated residue, phosphorylated resid...",LTQ Orbitrap Velos,Mascot,Shotgun proteomics,Spectrum counting


In [30]:
### Eucaryote/ Bacteria/ Virus prediction -- Generate labels

domain_mapping = {
    "Homo sapiens (human)": "Eukaryota",
    "Mus musculus (mouse)": "Eukaryota",
    "Arabidopsis thaliana (mouse-ear cress)": "Eukaryota",
    "Escherichia coli": "Bacteria",
    "Rattus norvegicus (rat)": "Eukaryota",
    "Saccharomyces cerevisiae (baker's yeast)": "Eukaryota",
    "Plasmodium berghei anka": "Eukaryota",
    "Bos taurus (bovine)": "Eukaryota",
    "Brassica oleracea var. botrytis (cauliflower)": "Eukaryota",
    "Danio rerio (zebrafish) (brachydanio rerio)": "Eukaryota",
    "Gallus gallus (chicken)": "Eukaryota",
    "Apis mellifera (honeybee)": "Eukaryota",
    "Naja melanoleuca": "Eukaryota",
    "Deinococcus radiodurans r1": "Bacteria",
    "Kalanchoe fedtschenkoi": "Eukaryota",
    "Toxoplasma gondii rh": "Eukaryota",
    "Phaeodactylum tricornutum (strain ccap 1055/1)": "Eukaryota",
    "Mycobacterium avium 104": "Bacteria",
    "Elysia crispata": "Eukaryota",
    "Phormidium sp. oscr": "Bacteria",
    "Norovirus": "Virus",
    "Caenorhabditis elegans": "Eukaryota",
    "Saccostrea glomerata": "Eukaryota",
    "Botryotinia fuckeliana (noble rot fungus) (botrytis cinerea)": "Eukaryota",
    "Candida glabrata (yeast) (torulopsis glabrata)": "Eukaryota",
    "Giardia lamblia atcc 50803": "Eukaryota",
    "Triticum aestivum (wheat)": "Eukaryota",
    "Streptomyces coelicolor": "Bacteria",
    "Coffea canephora": "Eukaryota",
    "Hordeum vulgare (barley)": "Eukaryota",
    "Pisum sativum (garden pea)": "Eukaryota",
    "Brugia malayi": "Eukaryota",
    "Plasmodium vivax": "Eukaryota",
    "Prochlorococcus marinus subsp. pastoris str. ccmp1986": "Bacteria",
    "Cupriavidus necator (strain atcc 17699 / h16 / dsm 428 / stanier 337) (ralstonia eutropha)": "Bacteria",
    "Nicotiana tabacum (common tobacco)": "Eukaryota",
    "Chlamydomonas reinhardtii": "Eukaryota",
    "Sus scrofa domesticus (domestic pig)": "Eukaryota",
    "Pinus pinaster (maritime pine)": "Eukaryota",
    "Emiliania huxleyi": "Eukaryota",
    "Plasmodium berghei": "Eukaryota",
    "[candida] glabrata cbs 138": "Eukaryota",
    "Trichuris suis": "Eukaryota",
    "Methanothrix soehngenii gp6": "Archaea",
    "Clostridium cellulolyticum (strain atcc 35319 / dsm 5812 / jcm 6584 / h10)": "Bacteria",
    "Methanospirillum hungatei jf-1": "Archaea",
    "Desulfovibrio vulgaris str. hildenborough": "Bacteria",
    "Haloferax volcanii (halobacterium volcanii)": "Archaea",
    "Trypanosoma brucei brucei, Leishmania major strain friedlin": "Eukaryota",
    "Toxoplasma gondii": "Eukaryota",
    "Synechococcus sp. (strain wh8102)": "Bacteria",
    "Cryptococcus neoformans var. grubii vni" : "Eukaryota",
    "Rupicapra pyrenaica, Ovis aries musimon, Capra pyrenaica" : "Eukaryota",
    "Streptococcus pyogenes stab901":"Bacteria",
    "Bemisia sp. unknown_th_13_12_s.lycopersicum": "Eukaryota"
}

# Funktion zur Ermittlung der Domain
def get_domain(organism):
    domains = set()
    for key, value in domain_mapping.items():
        if key in organism:
            domains.add(value)
    return ", ".join(domains) if domains else "Unknown"


df["Domain"] = df["Organism"].apply(get_domain)

df[["Organism", "Domain"]].to_csv("../data/metadata/domains.csv")

label_columns = ["Domain","Organism", "Organism part", "Diseases", "Modification", 
                 "Experiment Type", "Instrument", "Quantification", "Software"]

Get unique count values for label columns to determine baseline (random prediction).

In [31]:
for col in df[label_columns].columns:
    print(f"{col}: {df[col].nunique()} unique values")

Domain: 7 unique values
Organism: 71 unique values
Organism part: 108 unique values
Diseases: 56 unique values
Modification: 88 unique values
Experiment Type: 27 unique values
Instrument: 47 unique values
Quantification: 25 unique values
Software: 52 unique values


### One-Hot-Encoding Organism

##### Insert keywords

In [32]:
# === MODELL SPEICHERN ===
prefix = "org_onehot_400_wkey"
clf_file = f"prometainfer/models/{prefix}_model.pkl"
p_file = f"prometainfer/models/{prefix}_preprocessor.pkl"
enc_file = f"prometainfer/models/{prefix}_label_encoders.pkl"
train_features = f"prometainfer/models/{prefix}_train_features.csv"
keyword_files = "../data/ML_test/keyword_files/keyword_parsing_results.csv"

### add keyword features
keywords_df = pd.read_csv(keyword_files)
organism_df = keywords_df.merge(df, on="Filename", how="inner")

organism_df.to_csv(train_features, index = False)

organism_df = organism_df.drop(columns=["Filename", "PRIDE ID", "Unnamed: 0", "Submission Date",
                     "Publication Date", "Country", "Keywords"] + ['Domain','Organism part',
                                                                   'Diseases','Modification','Experiment Type',
                                                                   'Instrument','Quantification','Software'])

### One-hot encoding
organism_df["Organism"] = organism_df["Organism"].str.split(",")
organism_df = organism_df.explode("Organism")
organism_df = pd.get_dummies(organism_df, columns=["Organism"])
# Aggregate back to original rows (if needed)
#organism_df = organism_df.groupby(organism_df.index).max().reset_index(drop=True)
organism_labels = [col for col in organism_df.columns if col.startswith("Organism")]

print(len(organism_labels))

86


In [33]:
# Features & Labels splitten
X = organism_df.drop(columns=organism_labels)
y = organism_df[organism_labels]

# Identify categorical and numeric columns
categorical_cols = X.select_dtypes(include=["object", "category"]).columns
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns

# Create a ColumnTransformer to preprocess categorical and numeric columns
preprocessor = ColumnTransformer(
    transformers=[
        # Impute NaNs in numeric columns with the mean, then scale
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),  # Impute NaNs with mean
            ("scaler", StandardScaler())  # Scale numeric columns
        ]), numeric_cols),
        
        # Impute NaNs in categorical columns with the most frequent value, then one-hot encode
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),  # Impute NaNs with mode
            ("onehot", OneHotEncoder(handle_unknown="ignore"))  # One-hot encode categorical columns
        ]), categorical_cols),
    ]
)

# Apply preprocessing to X
X_preprocessed = preprocessor.fit_transform(X)

# === 5. MACHINE LEARNING MODELL TRAINIEREN ===
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

print(X_train.shape)

rf = RandomForestClassifier(n_estimators=100, random_state=42)

rf.fit(X_train, y_train)

# Vorhersagen
y_pred = rf.predict(X_test)

# === 6. PERFORMANCE AUSWERTEN ===
for i, col in enumerate(organism_labels):
    print(f"Classification Report for {col}:")
    print(classification_report(y_test.iloc[:, i], y_pred[:, i], zero_division=0))

# === 7. MODELL SPEICHERN ===
joblib.dump(rf, clf_file)
joblib.dump(preprocessor, p_file)
#joblib.dump(label_encoders, enc_file)

(382, 5645)
Classification Report for Organism_ Arabidopsis thaliana (mouse-ear cress):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        96

    accuracy                           1.00        96
   macro avg       1.00      1.00      1.00        96
weighted avg       1.00      1.00      1.00        96

Classification Report for Organism_ Bacillus cereus:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        96

    accuracy                           1.00        96
   macro avg       1.00      1.00      1.00        96
weighted avg       1.00      1.00      1.00        96

Classification Report for Organism_ Bacillus subtilis:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        95
           1       0.00      0.00      0.00         1

    accuracy                           0.99        96
   macro avg       0.49      0.50      0

['prometainfer/models/org_onehot_400_wkey_preprocessor.pkl']

In [None]:
numeric_feature_names = numeric_cols.tolist()
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']

try:
    categorical_feature_names = onehot_encoder.get_feature_names_out(categorical_cols)
except AttributeError:
    categorical_feature_names = []
    for i, col in enumerate(categorical_cols):
        categories = onehot_encoder.categories_[i]
        categorical_feature_names.extend([f"{col}_{cat}" for cat in categories])

all_feature_names = numeric_feature_names + categorical_feature_names
print(len(all_feature_names),"Finale Feature-Namen:", all_feature_names)

5645 Finale Feature-Namen: ['iTRAQ', 'SILAC', 'SWATH', 'phospho', 'acetyl', 'methyl', 'glygly', 'oxidation', 'carbamidomethyl', '+15.99', '+79.97', '+42.01', '+14.02', '+114.04', '+57.02', 'human', 'mouse', 'rat', 'yeast', 'bacteria', 'virus', 'fly', 'liver', 'kidney', 'brain', 'heart', 'lung', 'plasma', 'serum', 'cancer', 'RAT_counthits', 'RAT_avgevalhits', 'HUMAN_counthits', 'HUMAN_avgevalhits', 'CANLF_counthits', 'CANLF_avgevalhits', 'MOUSE_counthits', 'MOUSE_avgevalhits', 'CHICK_counthits', 'CHICK_avgevalhits', 'HORSE_counthits', 'HORSE_avgevalhits', 'BOVIN_counthits', 'BOVIN_avgevalhits', 'RABIT_counthits', 'RABIT_avgevalhits', 'PANTR_counthits', 'PANTR_avgevalhits', 'DESRO_counthits', 'DESRO_avgevalhits', 'FELCA_counthits', 'FELCA_avgevalhits', 'PONAB_counthits', 'PONAB_avgevalhits', 'MACMU_counthits', 'MACMU_avgevalhits', 'MACFA_counthits', 'MACFA_avgevalhits', 'XENLA_counthits', 'XENLA_avgevalhits', 'PIG_counthits', 'PIG_avgevalhits', 'MACNE_counthits', 'MACNE_avgevalhits', 'CO

In [121]:
# Vorhersagen zurück in DataFrame umwandeln
y_pred_df = pd.DataFrame(y_pred, columns=organism_labels, index=y_test.index)

# Echte Labels auch als DataFrame
y_test_df = pd.DataFrame(y_test, columns=organism_labels, index=y_test.index)

def one_hot_to_labels(df, labels):
    return df.apply(lambda row: ", ".join([labels[i] for i in range(len(row)) if row[i] == 1]), axis=1)

# Erstelle eine Spalte mit den vorhergesagten Organismen
y_pred_labels = one_hot_to_labels(y_pred_df, organism_labels)

# Erstelle eine Spalte mit den tatsächlichen Organismen
y_test_labels = one_hot_to_labels(y_test_df, organism_labels)

# Berechnung der Accuracy (Anteil korrekt vorhergesagter vollständiger Organismus-Sets)
overall_accuracy = (y_pred_labels == y_test_labels).mean()

print(f"Overall Accuracy: {overall_accuracy:.4f}")


Overall Accuracy: 0.3021


In [122]:
mask = [pred is not None and pred != "" for pred in y_pred_labels]

# Alternativ, wenn es numpy Arrays sind:
# mask = ~np.array([pred is None or pred == "" for pred in y_pred_labels])

# Nur gültige Einträge auswählen
valid_y_pred = np.array(y_pred_labels)[mask]
valid_y_test = np.array(y_test_labels)[mask]

# Accuracy berechnen
overall_accuracy = (valid_y_pred == valid_y_test).mean()
print(overall_accuracy)
print(sum(mask), len(mask))

0.725
40 96


In [123]:
((96-40) * 0.4 + 40 * 0.725)/96

0.5354166666666668

### One-Hot-Encoding for Instrument

In [None]:
# === save model ===
prefix = "ins_onehot_400_wkey"
#clf_file = f"prometainfer/models/{prefix}_model.pkl"
#p_file = f"prometainfer/models/{prefix}_preprocessor.pkl"
#enc_file = f"prometainfer/models/{prefix}_label_encoders.pkl"
train_features = f"prometainfer/models/{prefix}_train_features.csv"
#keyword_files = "../data/ML_test/keyword_files/keyword_parsing_results.csv"

clf_file = 'prometainfer/models/ins_onehot_400_wkey_model.pkl'
p_file = 'prometainfer/models/ins_onehot_400_wkey_preprocessor.pkl'
### add keyword features
keywords_df = pd.read_csv(keyword_files)

instrument_df = keywords_df.merge(df, on="Filename", how="inner")

instrument_df.to_csv(train_features, index = False)

instrument_df = instrument_df.drop(columns=["Filename", "PRIDE ID", "Unnamed: 0", "Submission Date",
                     "Publication Date", "Country", "Keywords"] + ['Domain','Organism','Organism part',
                                                                   'Diseases','Modification','Experiment Type',
                                                                   'Quantification','Software'])

### One-hot encoding
instrument_df["Instrument"] = instrument_df["Instrument"].str.split(",")
instrument_df = instrument_df.explode("Instrument")
instrument_df = pd.get_dummies(instrument_df, columns=["Instrument"])
# Aggregate back to original rows (if needed)
#organism_df = organism_df.groupby(organism_df.index).max().reset_index(drop=True)
instrument_labels = [col for col in instrument_df.columns if col.startswith("Instrument")]

print(len(instrument_labels))
# split features and labels
X = instrument_df.drop(columns=instrument_labels)
y = instrument_df[instrument_labels]

# Identify categorical and numeric columns
categorical_cols = X.select_dtypes(include=["object", "category"]).columns
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns

# Create a ColumnTransformer to preprocess categorical and numeric columns
preprocessor = ColumnTransformer(
    transformers=[
        # Impute NaNs in numeric columns with the mean, then scale
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),  # Impute NaNs with mean
            ("scaler", StandardScaler())  # Scale numeric columns
        ]), numeric_cols),
        
        # Impute NaNs in categorical columns with the most frequent value, then one-hot encode
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),  # Impute NaNs with mode
            ("onehot", OneHotEncoder(handle_unknown="ignore"))  # One-hot encode categorical columns
        ]), categorical_cols),
    ]
)

# Apply preprocessing to X
X_preprocessed = preprocessor.fit_transform(X)

# === train ML Model ===
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

print(X_train.shape)

rf = RandomForestClassifier(n_estimators=100, random_state=42)

rf.fit(X_train, y_train)

# predict
y_pred = rf.predict(X_test)

# === evaluate performance===
for i, col in enumerate(instrument_labels):
    print(f"Classification Report for {col}:")
    print(classification_report(y_test.iloc[:, i], y_pred[:, i], zero_division=0))

# ===save model ===
joblib.dump(rf, clf_file)
joblib.dump(preprocessor, p_file)
#joblib.dump(label_encoders, enc_file)


y_pred_df = pd.DataFrame(y_pred, columns=instrument_labels, index=y_test.index)


y_test_df = pd.DataFrame(y_test, columns=instrument_labels, index=y_test.index)

def one_hot_to_labels(df, labels):
    return df.apply(lambda row: ", ".join([labels[i] for i in range(len(row)) if row[i] == 1]), axis=1)


y_pred_labels = one_hot_to_labels(y_pred_df, instrument_labels)


y_test_labels = one_hot_to_labels(y_test_df, instrument_labels)

# calculate accuracy over all org labels
overall_accuracy = (y_pred_labels == y_test_labels).mean()

print(f"Overall Accuracy: {overall_accuracy:.4f}")

mask = [pred is not None and pred != "" for pred in y_pred_labels]


# only select non-empty values
valid_y_pred = np.array(y_pred_labels)[mask]
valid_y_test = np.array(y_test_labels)[mask]

# Accuracy
overall_accuracy = (valid_y_pred == valid_y_test).mean()
print(overall_accuracy)
print(sum(mask), len(mask))



46
(339, 5645)
Classification Report for Instrument_ 6330 Ion Trap LC/MS:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        85
           1       0.00      0.00      0.00         0

    accuracy                           0.98        85
   macro avg       0.50      0.49      0.49        85
weighted avg       1.00      0.98      0.99        85

Classification Report for Instrument_ LTQ:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        85

    accuracy                           1.00        85
   macro avg       1.00      1.00      1.00        85
weighted avg       1.00      1.00      1.00        85

Classification Report for Instrument_ LTQ Orbitrap:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        81
           1       0.00      0.00      0.00         4

    accuracy                           0.95        85
   macro avg 

In [125]:
((85-27)*0.39 + 27* 0.555555)/ 85 

0.4425880588235294

### Same for One-Hot-Encoding for Modification

In [None]:
clf_file = 'prometainfer/models/mod_onehot_400_wkey_model.pkl'
p_file = 'prometainfer/models/mod_onehot_400_wkey_preprocessor.pkl'

### add keyword features
keywords_df = pd.read_csv(keyword_files)
mod_df = keywords_df.merge(df, on="Filename", how="inner")

#mod_df.to_csv(train_features, index = False)

mod_df = mod_df.drop(columns=["Filename", "PRIDE ID", "Unnamed: 0", "Submission Date",
                     "Publication Date", "Country", "Keywords"] + ['Domain','Organism','Organism part',
                                                                   'Diseases','Instrument','Experiment Type',
                                                                   'Quantification','Software'])

### One-hot encoding
mod_df["Modification"] = mod_df["Modification"].str.split(",")
mod_df = mod_df.explode("Modification")
mod_df = pd.get_dummies(mod_df, columns=["Modification"])

# Aggregate back to original rows (if needed)
mod_labels = [col for col in mod_df.columns if col.startswith("Modification")]

print(len(mod_labels))

X = mod_df.drop(columns=mod_labels)
y = mod_df[mod_labels]

# Identify categorical and numeric columns
categorical_cols = X.select_dtypes(include=["object", "category"]).columns
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns

# Create a ColumnTransformer to preprocess categorical and numeric columns
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler())
        ]), numeric_cols),
        
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]), categorical_cols),
    ]
)

# Apply preprocessing to X
X_preprocessed = preprocessor.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

print(X_train.shape)

rf = RandomForestClassifier(n_estimators=100, random_state=42)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

for i, col in enumerate(mod_labels):
    print(f"Classification Report for {col}:")
    print(classification_report(y_test.iloc[:, i], y_pred[:, i], zero_division=0))

joblib.dump(rf, clf_file)
joblib.dump(preprocessor, p_file)

y_pred_df = pd.DataFrame(y_pred, columns=mod_labels, index=y_test.index)

y_test_df = pd.DataFrame(y_test, columns=mod_labels, index=y_test.index)

def one_hot_to_labels(df, labels):
    return df.apply(lambda row: ", ".join([labels[i] for i in range(len(row)) if row[i] == 1]), axis=1)

y_pred_labels = one_hot_to_labels(y_pred_df, mod_labels)

y_test_labels = one_hot_to_labels(y_test_df, mod_labels)

overall_accuracy = (y_pred_labels == y_test_labels).mean()
print(f"Overall Accuracy: {overall_accuracy:.4f}")

mask = [pred is not None and pred != "" for pred in y_pred_labels]

valid_y_pred = np.array(y_pred_labels)[mask]
valid_y_test = np.array(y_test_labels)[mask]

overall_accuracy = (valid_y_pred == valid_y_test).mean()
print(overall_accuracy)
print(sum(mask), len(mask))


83


(723, 5645)
Classification Report for Modification_ (R)-5-oxo-1:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       181

    accuracy                           1.00       181
   macro avg       1.00      1.00      1.00       181
weighted avg       1.00      1.00      1.00       181

Classification Report for Modification_ 6x(13)C:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       181

    accuracy                           1.00       181
   macro avg       1.00      1.00      1.00       181
weighted avg       1.00      1.00      1.00       181

Classification Report for Modification_ 6x(13)C labeled residue:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       181

    accuracy                           1.00       181
   macro avg       1.00      1.00      1.00       181
weighted avg       1.00      1.00      1.00       181

Cl

### One-Hot-Encoding for Experiment Type

In [None]:
prefix = "exp_onehot_400_wkey"
#clf_file = f"prometainfer/models/{prefix}_model.pkl"
#p_file = f"prometainfer/models/{prefix}_preprocessor.pkl"
#enc_file = f"prometainfer/models/{prefix}_label_encoders.pkl"
train_features = f"prometainfer/models/{prefix}_train_features.csv"
#keyword_files = "../data/ML_test/keyword_files/keyword_parsing_results.csv"


clf_file = 'prometainfer/models/exp_onehot_400_wkey_model.pkl'
p_file = 'prometainfer/models/exp_onehot_400_wkey_preprocessor.pkl'

### add keyword features
keywords_df = pd.read_csv(keyword_files)
exp_df = keywords_df.merge(df, on="Filename", how="inner")

exp_df.to_csv(train_features, index = False)

exp_df = exp_df.drop(columns=["Filename", "PRIDE ID", "Unnamed: 0", "Submission Date",
                     "Publication Date", "Country", "Keywords"] + ['Domain','Organism','Organism part',
                                                                   'Diseases','Instrument','Modification',
                                                                   'Quantification','Software'])

### One-hot encoding
exp_df["Experiment Type"] = exp_df["Experiment Type"].str.split(",")
exp_df = exp_df.explode("Experiment Type")
exp_df = pd.get_dummies(exp_df, columns=["Experiment Type"])

# Aggregate back to original rows (if needed)
exp_labels = [col for col in exp_df.columns if col.startswith("Experiment Type")]

print(len(exp_labels))

# Features & Labels splitten
X = exp_df.drop(columns=exp_labels)
y = exp_df[exp_labels]


# Identify categorical and numeric columns
categorical_cols = X.select_dtypes(include=["object", "category"]).columns
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns

# Create a ColumnTransformer to preprocess categorical and numeric columns
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler())
        ]), numeric_cols),
        
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]), categorical_cols),
    ]
)

# Apply preprocessing to X
X_preprocessed = preprocessor.fit_transform(X)

# === 5. MACHINE LEARNING MODELL TRAINIEREN ===
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

print(X_train.shape)

rf = RandomForestClassifier(n_estimators=100, random_state=42)

rf.fit(X_train, y_train)

# Vorhersagen
y_pred = rf.predict(X_test)

# === 6. PERFORMANCE AUSWERTEN ===
for i, col in enumerate(exp_labels):
    print(f"Classification Report for {col}:")
    print(classification_report(y_test.iloc[:, i], y_pred[:, i], zero_division=0))

# === 7. MODELL SPEICHERN ===
joblib.dump(rf, clf_file)
joblib.dump(preprocessor, p_file)

# Vorhersagen zurück in DataFrame umwandeln
y_pred_df = pd.DataFrame(y_pred, columns=exp_labels, index=y_test.index)

y_test_df = pd.DataFrame(y_test, columns=exp_labels, index=y_test.index)

def one_hot_to_labels(df, labels):
    return df.apply(lambda row: ", ".join([labels[i] for i in range(len(row)) if row[i] == 1]), axis=1)

# Erstelle eine Spalte mit den vorhergesagten Experimentstypen
y_pred_labels = one_hot_to_labels(y_pred_df, exp_labels)

# Erstelle eine Spalte mit den tatsächlichen Experimentstypen
y_test_labels = one_hot_to_labels(y_test_df, exp_labels)

# Berechnung der Accuracy
overall_accuracy = (y_pred_labels == y_test_labels).mean()
print(f"Overall Accuracy: {overall_accuracy:.4f}")

mask = [pred is not None and pred != "" for pred in y_pred_labels]

# Nur gültige Einträge auswählen
valid_y_pred = np.array(y_pred_labels)[mask]
valid_y_test = np.array(y_test_labels)[mask]

# Accuracy berechnen
overall_accuracy = (valid_y_pred == valid_y_test).mean()
print(overall_accuracy)
print(sum(mask), len(mask))


23
(364, 5645)
Classification Report for Experiment Type_ Affinity purification coupled with mass spectrometry proteomics:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98        90
           1       0.00      0.00      0.00         1

    accuracy                           0.97        91
   macro avg       0.49      0.49      0.49        91
weighted avg       0.98      0.97      0.97        91

Classification Report for Experiment Type_ All-ion fragmentation:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99        91
           1       0.00      0.00      0.00         0

    accuracy                           0.99        91
   macro avg       0.50      0.49      0.50        91
weighted avg       1.00      0.99      0.99        91

Classification Report for Experiment Type_ Bottom-up proteomics:
              precision    recall  f1-score   support

           0       1.00      1.00    

In [128]:
(0.575 * 23 + 0.64705 * 68)/ 91

0.6288395604395605