In [18]:
!pip install ipywidgets

import pandas as pd
import ipywidgets as widgets
from IPython.display import display
import requests
from bs4 import BeautifulSoup
import os
import zipfile
from tqdm import tqdm



In [16]:
# Step 1: Fetch the JSON data from the URL
url = "https://raw.githubusercontent.com/Proteobot/Results_quant_ion_DDA/refs/heads/main/results.json"
response = requests.get(url)
response.raise_for_status()  # Raise an error if the request fails

# Step 2: Load the JSON into Python
data = response.json()

# Step 3: Normalize the JSON structure into a DataFrame
df = pd.json_normalize(data)

df.head(5)

Unnamed: 0,id,old_new,software_name,software_version,search_engine,search_engine_version,ident_fdr_psm,ident_fdr_peptide,ident_fdr_protein,enable_match_between_runs,...,results.2.CV_q75,results.2.CV_q95,results.1.median_abs_epsilon,results.1.mean_abs_epsilon,results.1.variance_epsilon,results.1.nr_prec,results.1.CV_median,results.1.CV_q90,results.1.CV_q75,results.1.CV_q95
0,MaxQuant_20241216_100704,old,MaxQuant,1.5.2.8,Andromeda,,,0.01,0.01,False,...,0.295455,0.524623,0.202356,0.278361,0.162885,51193,0.204522,0.426579,0.295455,0.524623
1,ProlineStudio_20241216_103006,old,ProlineStudio,2.3.0-SNAPSHOT_2024-09-11T06:45:20Z_jenkins,Mascot,2.8.3,0.01,,,True,...,0.329396,0.616819,0.211673,0.327418,0.267728,59609,0.22633,0.49495,0.329396,0.616819
2,i2MassChroQ_20241216_103323,old,i2MassChroQ,1.0.16,X! Tandem,X! Tandem Alanine (2017.2.1.4),0.008998,0.011963,0.009873,True,...,0.286459,0.650784,0.223987,0.375265,0.337984,82533,0.1329,0.502209,0.286459,0.650784
3,MaxQuant_20241216_130203,old,MaxQuant,1.5.3.30,Andromeda,,,0.01,0.01,True,...,0.321331,0.605836,0.213234,0.333518,0.291254,51338,0.21863,0.481902,0.321331,0.605836
4,MaxQuant_20241216_120735,old,MaxQuant,1.5.3.30,Andromeda,,,0.01,0.01,False,...,0.291836,0.514675,0.199873,0.272681,0.155889,51345,0.202474,0.417133,0.291836,0.514675


In [14]:
# Create a SelectMultiple widget with names as options
row_selector = widgets.SelectMultiple(
    options=[(f"{row['id']} (hash: {row['intermediate_hash']}, submission comments: {row['submission_comments']})", idx) for idx, row in df.iterrows()],
    description='Select Rows:',
    rows=10,  # Number of visible rows in the widget
    layout=widgets.Layout(width='50%')  # Adjust layout as needed
)

# Button to confirm selection
button = widgets.Button(description='Filter Rows')

# Output widget to display the filtered DataFrame
output = widgets.Output()

# Callback for filtering rows
def on_button_click(b):
    with output:
        output.clear_output()
        selected_indices = list(row_selector.value)
        global filtered_df  # Store filtered DataFrame globally
        filtered_df = df.iloc[selected_indices]
        print("Filtered DataFrame:")
        display(filtered_df)

# Attach callback
button.on_click(on_button_click)

# Display the widgets
display(row_selector, button, output)



SelectMultiple(description='Select Rows:', layout=Layout(width='50%'), options=(('MaxQuant_20241216_100704 (ha…

Button(description='Filter Rows', style=ButtonStyle())

Output()

In [19]:
filtered_df["intermediate_hash"]

14    8cbc0bce20eee581ad10326e02a09dbc316c30e1
15    36b7b01b380f641722b3b34633bb53d72348eb80
16    0280a06fabdbe84746419d0810deae56e7ab2406
17    47db7ef37a0fb5fec79f3bedbfb4f67835774f10
Name: intermediate_hash, dtype: object

In [20]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import zipfile
from tqdm import tqdm

# Step 1: Extract the hash list from the DataFrame
hash_list = filtered_df["intermediate_hash"].tolist()

# Step 2: Fetch folder names from the webpage
base_url = "https://proteobench.cubimed.rub.de/datasets/"
response = requests.get(base_url)
response.raise_for_status()  # Check for errors

soup = BeautifulSoup(response.text, "html.parser")
folder_links = [link['href'].strip("/") for link in soup.find_all("a") if link['href'].endswith("/")]

# Step 3: Filter folder links based on the hash list
matching_folders = [folder for folder in folder_links if folder in hash_list]

# Step 4: Download and extract zip files from matching folders
for folder in matching_folders:
    folder_url = f"{base_url}{folder}/"
    print(f"Processing folder: {folder_url}")

    # Fetch the folder page
    folder_response = requests.get(folder_url)
    folder_response.raise_for_status()

    folder_soup = BeautifulSoup(folder_response.text, "html.parser")
    zip_files = [link['href'] for link in folder_soup.find_all("a") if link['href'].endswith(".zip")]

    # Process each .zip file
    for zip_file in zip_files:
        zip_url = f"{folder_url}{zip_file}"
        print(f"Downloading: {zip_url}")

        # Download with a progress bar
        zip_response = requests.get(zip_url, stream=True)
        zip_response.raise_for_status()

        zip_filename = os.path.basename(zip_file)
        total_size = int(zip_response.headers.get('content-length', 0))
        block_size = 1024  # 1 KB

        # Save the zip file
        with open(zip_filename, "wb") as f, tqdm(
            desc=f"Downloading {zip_filename}",
            total=total_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
        ) as progress:
            for data in zip_response.iter_content(block_size):
                f.write(data)
                progress.update(len(data))

        # Extract the zip file
        extract_dir = f"extracted_files/{folder}"
        os.makedirs(extract_dir, exist_ok=True)
        with zipfile.ZipFile(zip_filename, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
            print(f"Extracted contents to: {extract_dir}")

        # Cleanup downloaded .zip file
        os.remove(zip_filename)


Processing folder: https://proteobench.cubimed.rub.de/datasets/0280a06fabdbe84746419d0810deae56e7ab2406/
Downloading: https://proteobench.cubimed.rub.de/datasets/0280a06fabdbe84746419d0810deae56e7ab2406/0280a06fabdbe84746419d0810deae56e7ab2406_data.zip


Downloading 0280a06fabdbe84746419d0810deae56e7ab2406_data.zip: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 33.6M/33.6M [00:24<00:00, 1.42MB/s]


Extracted contents to: extracted_files/0280a06fabdbe84746419d0810deae56e7ab2406
Processing folder: https://proteobench.cubimed.rub.de/datasets/36b7b01b380f641722b3b34633bb53d72348eb80/
Downloading: https://proteobench.cubimed.rub.de/datasets/36b7b01b380f641722b3b34633bb53d72348eb80/36b7b01b380f641722b3b34633bb53d72348eb80_data.zip


Downloading 36b7b01b380f641722b3b34633bb53d72348eb80_data.zip: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 25.4M/25.4M [00:14<00:00, 1.83MB/s]


Extracted contents to: extracted_files/36b7b01b380f641722b3b34633bb53d72348eb80
Processing folder: https://proteobench.cubimed.rub.de/datasets/8cbc0bce20eee581ad10326e02a09dbc316c30e1/
Downloading: https://proteobench.cubimed.rub.de/datasets/8cbc0bce20eee581ad10326e02a09dbc316c30e1/8cbc0bce20eee581ad10326e02a09dbc316c30e1_data.zip


Downloading 8cbc0bce20eee581ad10326e02a09dbc316c30e1_data.zip: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 33.2M/33.2M [00:18<00:00, 1.94MB/s]


Extracted contents to: extracted_files/8cbc0bce20eee581ad10326e02a09dbc316c30e1
