In [8]:
!pip install ipywidgets

import pandas as pd
import ipywidgets as widgets
from IPython.display import display
import requests
from bs4 import BeautifulSoup
import os
import zipfile
from tqdm import tqdm
from pathlib import Path

import json

from collections import defaultdict
import toml

from proteobench.modules.quant.quant_lfq_ion_DDA import DDAQuantIonModule



In [9]:
# Examples to choose from:
#
# "https://api.github.com/repos/Proteobench/Results_quant_ion_DDA/contents/",
# "https://api.github.com/repos/Proteobench/Results_quant_peptidoform_DIA/contents/",
# "https://api.github.com/repos/Proteobench/Results_subcellprofile_DOMLFQ_protein_DIA_EXPL/contents/",
# "https://api.github.com/repos/Proteobench/Results_quant_peptidoform_DDA/contents/",
# "https://api.github.com/repos/Proteobench/Results_quant_ion_DIA_diaPASEF/contents/",
# "https://api.github.com/repos/Proteobench/Results_quant_ion_DIA/contents/",
# "https://api.github.com/repos/Proteobench/Results_quant_ion_DIA_singlecell/contents/",

repo_url = "https://api.github.com/repos/Proteobench/Results_quant_ion_DDA/contents/"
    
file_names = []

response = requests.get(repo_url)

if response.status_code == 200:
    repo_contents = response.json()

    for item in repo_contents:
        file_name = item.get("name")
        if file_name:
            file_names.append(file_name)    
else:
    print(f"Failed to retrieve the repository contents. Status code: {response.status_code}")
    print(repo_url)
        
file_names = [f for f in file_names if f.endswith(".json")]
json_files_content = []

for f in file_names:
    url = f"https://raw.githubusercontent.com/Proteobot/Results_quant_ion_DDA/refs/heads/main/{f}"
    response = requests.get(url)
    response.raise_for_status()  # Raise an error if the request fails

    data = response.json()
    
    json_files_content.append(data)
    
df = pd.json_normalize(json_files_content)

df.head(5)

Unnamed: 0,id,old_new,software_name,software_version,search_engine,search_engine_version,ident_fdr_psm,ident_fdr_peptide,ident_fdr_protein,enable_match_between_runs,...,results.2.CV_q75,results.2.CV_q95,results.1.median_abs_epsilon,results.1.mean_abs_epsilon,results.1.variance_epsilon,results.1.nr_prec,results.1.CV_median,results.1.CV_q90,results.1.CV_q75,results.1.CV_q95
0,MaxQuant_20241216_122433,old,MaxQuant,2.5.1.0,Andromeda,,,0.01,0.01,False,...,0.291818,0.512293,0.199852,0.274547,0.160229,50302,0.202619,0.416519,0.291818,0.512293
1,MaxQuant_20241216_124040,old,MaxQuant,2.3.1.0,Andromeda,,,0.01,0.01,True,...,0.312472,0.569765,0.207123,0.304261,0.222469,50339,0.215507,0.46077,0.312472,0.569765
2,i2MassChroQ_20241216_124142,old,i2MassChroQ,1.0.16,X! Tandem,X! Tandem Alanine (2017.2.1.4),0.008998,0.011963,0.009873,False,...,0.269583,0.61368,0.211233,0.329852,0.255909,77949,0.130385,0.477048,0.269583,0.61368
3,AlphaPept_20241217_071436,old,AlphaPept,0.5.0,AlphaPept,0.5.0,,0.01,0.01,True,...,0.279738,0.600532,0.224729,0.350489,0.334572,59868,0.164423,0.465654,0.279738,0.600532
4,MaxQuant_20241216_122819,old,MaxQuant,1.5.8.2,Andromeda,,,0.01,0.01,True,...,0.317542,0.597572,0.209998,0.324032,0.276706,49679,0.217079,0.475874,0.317542,0.597572


In [10]:
# Create a SelectMultiple widget with names as options
row_selector = widgets.SelectMultiple(
    options=[(f"{row['id']} (hash: {row['intermediate_hash']}, submission comments: {row['submission_comments']})", idx) for idx, row in df.iterrows()],
    description='Select Rows:',
    rows=10,  # Number of visible rows in the widget
    layout=widgets.Layout(width='50%')  # Adjust layout as needed
)

# Button to confirm selection
button = widgets.Button(description='Filter Rows')

# Output widget to display the filtered DataFrame
output = widgets.Output()

# Callback for filtering rows
def on_button_click(b):
    with output:
        output.clear_output()
        selected_indices = list(row_selector.value)
        global filtered_df  # Store filtered DataFrame globally
        filtered_df = df.iloc[selected_indices]
        print("Filtered DataFrame:")
        display(filtered_df)

# Attach callback
button.on_click(on_button_click)

# Display the widgets
display(row_selector, button, output)



SelectMultiple(description='Select Rows:', layout=Layout(width='50%'), options=(('MaxQuant_20241216_122433 (ha…

Button(description='Filter Rows', style=ButtonStyle())

Output()

In [5]:
filtered_df["intermediate_hash"]

20    1bfa914c771321b285a9ca40d4aa538cb9fdc42e
21    e8e80290fb48ff02de5ee54eb6b0114ff661bace
Name: intermediate_hash, dtype: object

In [6]:
# Step 1: Extract the hash list from the DataFrame
hash_list = filtered_df["intermediate_hash"].tolist()

# Step 2: Fetch folder names from the webpage
base_url = "https://proteobench.cubimed.rub.de/datasets/"
response = requests.get(base_url)
response.raise_for_status()  # Check for errors

soup = BeautifulSoup(response.text, "html.parser")
folder_links = [link['href'].strip("/") for link in soup.find_all("a") if link['href'].endswith("/")]

# Step 3: Filter folder links based on the hash list
matching_folders = [folder for folder in folder_links if folder in hash_list]

# Step 4: Download and extract zip files from matching folders
for folder in matching_folders:
    folder_url = f"{base_url}{folder}/"
    print(f"Processing folder: {folder_url}")

    # Fetch the folder page
    folder_response = requests.get(folder_url)
    folder_response.raise_for_status()

    folder_soup = BeautifulSoup(folder_response.text, "html.parser")
    zip_files = [link['href'] for link in folder_soup.find_all("a") if link['href'].endswith(".zip")]

    # Process each .zip file
    for zip_file in zip_files:
        zip_url = f"{folder_url}{zip_file}"
        print(f"Downloading: {zip_url}")

        # Download with a progress bar
        zip_response = requests.get(zip_url, stream=True)
        zip_response.raise_for_status()

        zip_filename = os.path.basename(zip_file)
        total_size = int(zip_response.headers.get('content-length', 0))
        block_size = 1024  # 1 KB

        # Save the zip file
        with open(zip_filename, "wb") as f, tqdm(
            desc=f"Downloading {zip_filename}",
            total=total_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
        ) as progress:
            for data in zip_response.iter_content(block_size):
                f.write(data)
                progress.update(len(data))

        # Extract the zip file
        extract_dir = f"extracted_files/{folder}"
        os.makedirs(extract_dir, exist_ok=True)
        with zipfile.ZipFile(zip_filename, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
            print(f"Extracted contents to: {extract_dir}")

        # Cleanup downloaded .zip file
        os.remove(zip_filename)


Processing folder: https://proteobench.cubimed.rub.de/datasets/1bfa914c771321b285a9ca40d4aa538cb9fdc42e/
Downloading: https://proteobench.cubimed.rub.de/datasets/1bfa914c771321b285a9ca40d4aa538cb9fdc42e/1bfa914c771321b285a9ca40d4aa538cb9fdc42e_data.zip


Downloading 1bfa914c771321b285a9ca40d4aa538cb9fdc42e_data.zip: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 66.4M/66.4M [00:15<00:00, 4.50MB/s]


Extracted contents to: extracted_files/1bfa914c771321b285a9ca40d4aa538cb9fdc42e
Processing folder: https://proteobench.cubimed.rub.de/datasets/e8e80290fb48ff02de5ee54eb6b0114ff661bace/
Downloading: https://proteobench.cubimed.rub.de/datasets/e8e80290fb48ff02de5ee54eb6b0114ff661bace/e8e80290fb48ff02de5ee54eb6b0114ff661bace_data.zip


Downloading e8e80290fb48ff02de5ee54eb6b0114ff661bace_data.zip: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 46.5M/46.5M [00:08<00:00, 5.71MB/s]


Extracted contents to: extracted_files/e8e80290fb48ff02de5ee54eb6b0114ff661bace


In [28]:
token = toml.load("../webinterface/.streamlit/secrets.toml")["gh"]["token"]

# TODO change to the correct module
module_obj = DDAQuantIonModule(token=token)
results_df = module_obj.obtain_all_data_points(all_datapoints=None)

results_df.head(5)

Unnamed: 0,id,old_new,software_name,software_version,search_engine,search_engine_version,ident_fdr_psm,ident_fdr_peptide,ident_fdr_protein,enable_match_between_runs,...,color,hover_text,scatter_size,scan_window,quantification_method_DIANN,second_pass,protein_inference,predictors_library,quantification_method,mean_abs_epsilon
0,MaxQuant_20241216_100704,old,MaxQuant,1.5.2.8,Andromeda,,,0.01,0.01,False,...,#377eb8,ProteoBench ID: MaxQuant_20241216_100704<br>So...,20,,,,,,,0.26549
1,ProlineStudio_20241216_103006,old,ProlineStudio,2.3.0-SNAPSHOT_2024-09-11T06:45:20Z_jenkins,Mascot,2.8.3,0.01,,,True,...,#5f0f40,ProteoBench ID: ProlineStudio_20241216_103006<...,20,,,,,,,0.319847
2,i2MassChroQ_20241216_103323,old,i2MassChroQ,1.0.16,X! Tandem,X! Tandem Alanine (2017.2.1.4),0.008998,0.011963,0.009873,True,...,#984ea3,ProteoBench ID: i2MassChroQ_20241216_103323<br...,20,,,,,,,0.36988
3,MaxQuant_20241216_130203,old,MaxQuant,1.5.3.30,Andromeda,,,0.01,0.01,True,...,#377eb8,ProteoBench ID: MaxQuant_20241216_130203<br>So...,20,,,,,,,0.322391
4,MaxQuant_20241216_120735,old,MaxQuant,1.5.3.30,Andromeda,,,0.01,0.01,False,...,#377eb8,ProteoBench ID: MaxQuant_20241216_120735<br>So...,20,,,,,,,0.259993


In [29]:
extra_path = Path("extracted_files")

# submission_files = [
#    {
#        "input_file" : "../test/data/dda_quant/MaxQuant_evidence_sample.txt",
#        "param_file" : "../test/params/mqpar_MQ1.6.3.3_MBR.xml",
#        "input_type" : "MaxQuant",
#        "default_cutoff_min_prec" : 3,
#        "user_comments" : "Put comments here."
#    }
#]

submission_files = []

for idx,row in filtered_df.iterrows():
    base_path = extra_path / row["intermediate_hash"]
    comments = "\n".join(open(base_path / "comment.txt").readlines())
    input_file = base_path / "input_file.txt"
    parameter_file = base_path / "param_0.txt"
    
    submission_files.append({
        "input_file" : input_file,
        "param_file" : parameter_file,
        "input_type" : row["software_name"],
        "default_cutoff_min_prec" : 3,
        "user_comments" : comments
    })

In [30]:
for submission_settings in submission_files:
    param_file = submission_settings["param_file"]
    input_file = submission_settings["input_file"]
    input_type = submission_settings["input_type"]
    default_cutoff_min_prec = submission_settings["default_cutoff_min_prec"]
    user_comments = submission_settings["user_comments"]
    
    user_config = defaultdict(lambda: "")

    results_intermediates, results_df_new, parsed_input = module_obj.benchmarking(
        input_file,
        input_type,
        user_config,
        results_df,
        default_cutoff_min_prec=default_cutoff_min_prec,
    )

    results_df_new.tail(5)
    
    param_obj = module_obj.load_params_file(
        [param_file], input_type
    )
    print(param_obj)

    pr_url = module_obj.clone_pr(
        results_df_new,
        param_obj,
        remote_git="",
        submission_comments=user_comments,
    )
    
    print(f"Submitted: {submission_settings}")
    print("------------------------")

Not all columns required for making the ion are available.
Load locally: extracted_files\1bfa914c771321b285a9ca40d4aa538cb9fdc42e\param_0.txt
ProteoBenchParameters(software_name='AlphaPept', software_version='0.5.0', search_engine='AlphaPept', search_engine_version='0.5.0', ident_fdr_psm=None, ident_fdr_peptide=0.01, ident_fdr_protein=0.01, enable_match_between_runs=True, precursor_mass_tolerance='[-20 ppm, 20 ppm]', fragment_mass_tolerance='[-50 ppm, 50 ppm]', enzyme='Trypsin', allowed_miscleavages=2, min_peptide_length=7, max_peptide_length=27, fixed_mods='cC', variable_mods='oxM', max_mods=3, min_precursor_charge=1, max_precursor_charge=6, scan_window=None, quantification_method=None, second_pass=None, protein_inference=None, predictors_library=None)


Following Github server redirection from /repos/Proteobot/Results_Module2_quant_DDA to /repositories/594032348
INFO:github.Requester:Following Github server redirection from /repos/Proteobot/Results_Module2_quant_DDA to /repositories/594032348
Following Github server redirection from /repos/Proteobot/Results_quant_ion_DDA/branches/master to /repos/Proteobot/Results_quant_ion_DDA/branches/main
INFO:github.Requester:Following Github server redirection from /repos/Proteobot/Results_quant_ion_DDA/branches/master to /repos/Proteobot/Results_quant_ion_DDA/branches/main


Submitted: {'input_file': WindowsPath('extracted_files/1bfa914c771321b285a9ca40d4aa538cb9fdc42e/input_file.txt'), 'param_file': WindowsPath('extracted_files/1bfa914c771321b285a9ca40d4aa538cb9fdc42e/param_0.txt'), 'input_type': 'AlphaPept', 'default_cutoff_min_prec': 3, 'user_comments': 'Fixed mod of N-acetyl (N-term)'}
------------------------
Not all columns required for making the ion are available.
Load locally: extracted_files\e8e80290fb48ff02de5ee54eb6b0114ff661bace\param_0.txt
ProteoBenchParameters(software_name='AlphaPept', software_version='0.5.0', search_engine='AlphaPept', search_engine_version='0.5.0', ident_fdr_psm=None, ident_fdr_peptide=0.01, ident_fdr_protein=0.01, enable_match_between_runs=True, precursor_mass_tolerance='[-10 ppm, 10 ppm]', fragment_mass_tolerance='[-20 ppm, 20 ppm]', enzyme='Trypsin', allowed_miscleavages=1, min_peptide_length=7, max_peptide_length=27, fixed_mods='cC', variable_mods='oxM', max_mods=3, min_precursor_charge=1, max_precursor_charge=6, sca

Following Github server redirection from /repos/Proteobot/Results_Module2_quant_DDA to /repositories/594032348
INFO:github.Requester:Following Github server redirection from /repos/Proteobot/Results_Module2_quant_DDA to /repositories/594032348
Following Github server redirection from /repos/Proteobot/Results_quant_ion_DDA/branches/master to /repos/Proteobot/Results_quant_ion_DDA/branches/main
INFO:github.Requester:Following Github server redirection from /repos/Proteobot/Results_quant_ion_DDA/branches/master to /repos/Proteobot/Results_quant_ion_DDA/branches/main
ERROR:root:Error in PR: 422 {"message": "Validation Failed", "errors": [{"resource": "PullRequest", "code": "custom", "message": "A pull request already exists for Proteobot:AlphaPept_20241217_084044."}], "documentation_url": "https://docs.github.com/rest/pulls/pulls#create-a-pull-request", "status": "422"}


Submitted: {'input_file': WindowsPath('extracted_files/e8e80290fb48ff02de5ee54eb6b0114ff661bace/input_file.txt'), 'param_file': WindowsPath('extracted_files/e8e80290fb48ff02de5ee54eb6b0114ff661bace/param_0.txt'), 'input_type': 'AlphaPept', 'default_cutoff_min_prec': 3, 'user_comments': ''}
------------------------
