In [2]:
%load_ext autoreload
%autoreload 2

import sqlite3
import duckdb
import numpy as np
import pandas as pd
import pyopenms as poms

pd.set_option('display.float_format', '{:.15f}'.format)

from pyprophet.io.dispatcher import  ReaderDispatcher
from pyprophet.io.scoring.osw import OSWReader
from pyprophet.io.scoring.parquet import ParquetReader
from pyprophet.io.scoring.split_parquet import SplitParquetReader
from pyprophet.io.scoring.tsv import TSVReader  # legacy, limited support
from pyprophet._config import RunnerIOConfig, RunnerConfig, IPFIOConfig

In [11]:
conn_duck = duckdb.connect(':memory:')

query = """
WITH normalized_peptides AS (
                SELECT 
                    ID AS PEPTIDE_ID,
                    REPLACE(
                        REPLACE(
                            REPLACE(
                                REPLACE(MODIFIED_SEQUENCE, '(UniMod:1)', '(Acetyl)'),
                            '(UniMod:35)', '(Oxidation)'),
                        '(UniMod:21)', '(Phospho)'),
                    '(UniMod:4)', '(Carbamidomethyl)') AS NORMALIZED_SEQUENCE
                FROM sqlite_scan('./data/test_data.osw', 'PEPTIDE')
            ),
            ipf_groups AS (
                SELECT 
                    NORMALIZED_SEQUENCE,
                    MIN(PEPTIDE_ID) AS IPF_PEPTIDE_ID
                FROM normalized_peptides
                GROUP BY NORMALIZED_SEQUENCE
            ),
            peptide_ipf_map AS (
                SELECT 
                    np.PEPTIDE_ID,
                    g.IPF_PEPTIDE_ID
                FROM normalized_peptides np
                JOIN ipf_groups g USING (NORMALIZED_SEQUENCE)
            ) 

SELECT * FROM peptide_ipf_map"""

ipf_peptide_map = conn_duck.execute(query).df()
ipf_peptide_map

Unnamed: 0,PEPTIDE_ID,IPF_PEPTIDE_ID
0,0,0
1,1,1
2,2,2
3,3,3
4,5,5
...,...,...
1365,1288,1288
1366,479,479
1367,1086,1086
1368,764,764


In [3]:
conn = sqlite3.connect("./data/test_data.osw")
peptide_table = pd.read_sql_query("SELECT * FROM PEPTIDE", conn)

peptide_table

Unnamed: 0,ID,UNMODIFIED_SEQUENCE,MODIFIED_SEQUENCE,DECOY
0,0,AACNNQGQQIAEHR,AACNNQGQQIAEHR(Label:13C(6)15N(4)),0
1,1,AAEDFTLLVK,AAEDFTLLVK(Label:13C(6)15N(2)),0
2,2,AAEDFTLLVK,AAEDFTLLVK(UniMod:259),0
3,3,AAEPFLDALLAR,AAEPFLDALLAR(UniMod:267),0
4,4,AAGASAQVLGQEGK,AAGASAQVLGQEGK(Label:13C(6)15N(2)),0
...,...,...,...,...
1365,1365,YVQDGTYTAK,YVQDGTYTAK(Label:13C(6)15N(2)),0
1366,1366,YVQDGTYTAK,YVQDGTYTAK(UniMod:259),0
1367,1367,YVWSYNSDAPR,YVWSYNSDAPR(UniMod:267),0
1368,1368,YYDYTLSINGK,YYDYTLSINGK(Label:13C(6)15N(2)),0


In [1]:
def unimod_to_codename(seq):
    """
    Convert a sequence with unimod modifications to a codename.
    This is a placeholder function; actual implementation may vary.
    """
    seq_poms = poms.AASequence.fromString(seq)
    codename = seq_poms.toString()
    return codename

In [18]:
# apply the conversion to the MODIFIED_SEQUENCE column and create a new column
peptide_table['codename'] = peptide_table['MODIFIED_SEQUENCE'].apply(unimod_to_codename)

# filter for MODIFIED_SEQUENCE wiht UniMod in sequence
unimod_peptide_table = peptide_table[peptide_table['MODIFIED_SEQUENCE'].str.contains('UniMod')]

# Perform oppisite operation to get the original sequence
codename_peptide_table = peptide_table[ ~peptide_table['MODIFIED_SEQUENCE'].str.contains('UniMod')]

# Merge the two DataFrames on the 'codename' column
merged_peptide_table = pd.merge(unimod_peptide_table[['codename', 'ID']], codename_peptide_table[['codename', 'ID']], on='codename', suffixes=('_unimod', '_codename'), how='outer')

# Fill NaN values in the 'ID_codename' column with the 'ID_unimod' values
merged_peptide_table['ID_codename'] = merged_peptide_table['ID_codename'].fillna(merged_peptide_table['ID_unimod'])
# Fill NaN values in the 'ID_unimod' column with the 'ID_codename' values
merged_peptide_table['ID_unimod'] = merged_peptide_table['ID_unimod'].fillna(merged_peptide_table['ID_codename'])

# Convert both 'ID_unimod' and 'ID_codename' columns to integer type
merged_peptide_table['ID_unimod'] = merged_peptide_table['ID_unimod'].astype(int)
merged_peptide_table['ID_codename'] = merged_peptide_table['ID_codename'].astype(int)

merged_peptide_table


Unnamed: 0,codename,ID_unimod,ID_codename
0,AACNNQGQQIAEHR(Label:13C(6)15N(4)),0,0
1,AAEDFTLLVK(Label:13C(6)15N(2)),2,1
2,AAEPFLDALLAR(Label:13C(6)15N(4)),3,3
3,AAGASAQVLGQEGK(Label:13C(6)15N(2)),5,4
4,AAGNEDDLER(Label:13C(6)15N(4)),6,6
...,...,...,...
1023,YVILQLLSGETR(Label:13C(6)15N(4)),1362,1361
1024,YVPIHTIDDGYSVIK(Label:13C(6)15N(2)),1364,1363
1025,YVQDGTYTAK(Label:13C(6)15N(2)),1366,1365
1026,YVWSYNSDAPR(Label:13C(6)15N(4)),1367,1367


In [15]:
merged_peptide_table[ merged_peptide_table["ID_unimod"]==125]

Unnamed: 0,codename,ID_unimod,ID_codename
92,ASTLTIVTDTGPDR(Label:13C(6)15N(4)),125.0,


# Scoring

In [2]:
def create_reader_config(level, infile, outfile):
    """
    Common config generator to avoid repetition
    """
    return RunnerIOConfig(
        infile=infile,
        outfile=outfile,
        subsample_ratio=1,
        context="score_learn",
        level=level,
        runner=RunnerConfig(
        ),
    )

## MS1MS2

In [3]:
osw_config = create_reader_config(
    "ms1ms2", "data/test_data.osw", "data/test_data.osw"
)
parquet_config = create_reader_config(
    "ms1ms2", "data/test_data.parquet", "data/test_data.parquet"
)
split_parquet_config = create_reader_config(
    "ms1ms2",
    "data/test_data.oswpq",
    "data/test_data.oswpq",
)

osw_reader = ReaderDispatcher.get_reader(osw_config)
parquet_reader = ReaderDispatcher.get_reader(parquet_config)
split_parquet_reader = ReaderDispatcher.get_reader(split_parquet_config)

cols = ['run_id', 'feature_id', 'decoy', 'main_var_xcorr_shape']
osw_data = osw_reader.read().sort_values(by=cols).reset_index(drop=True)
parquet_data = parquet_reader.read().sort_values(by=cols).reset_index(drop=True)
split_parquet_data = split_parquet_reader.read().sort_values(by=cols).reset_index(drop=True)

[32m2025-06-02 00:34:55.429[0m | [1mINFO    [0m | [36mpyprophet.io.util[0m:[36mprint_parquet_tree[0m:[36m311[0m - [1mDetected 1 split_parquet run files[0m
[32m2025-06-02 00:34:55.430[0m | [1mINFO    [0m | [36mpyprophet.io.util[0m:[36mprint_parquet_tree[0m:[36m312[0m - [1mInput Parquet Structure:[0m


└── 📁 data/test_data.oswpq
    ├── 📁 test_data.oswpq
    │   ├── 📄 precursors_features.parquet
    │   └── 📄 transition_features.parquet


In [4]:
osw_data

Unnamed: 0,feature_id,area_intensity,apex_intensity,var_bseries_score,var_dotprod_score,var_intensity_score,var_isotope_correlation_score,var_isotope_overlap_score,var_library_corr,var_library_dotprod,...,exp_rt,precursor_charge,decoy,transition_count,group_id,var_ms1_massdev_score,var_ms1_isotope_correlation_score,var_ms1_isotope_overlap_score,var_ms1_xcorr_coelution,var_ms1_xcorr_shape
0,-9211032279639747263,8969.0,1401.0,1.0,0.600952,0.040370,0.855669,0.092987,-0.417037,0.788891,...,1793.450,2,0,4,-8670811102654834151_62,6.843700,0.934914,0.507910,6.736079,0.493892
1,-9209834744278112856,5275.0,1672.0,1.0,0.675908,0.107070,0.920075,0.148815,-0.796921,0.674237,...,457.673,2,1,4,-8670811102654834151_470,20.641658,-0.423074,0.571429,2.816497,0.700567
2,-9204568338203974043,32824.0,5721.0,4.0,0.802029,0.191046,0.854679,0.000000,0.545997,0.905890,...,1509.490,2,1,4,-8670811102654834151_664,10.202549,-0.081574,1.077040,4.160247,0.577194
3,-9202066408251325127,7158.0,947.0,5.0,0.816020,0.016773,0.255529,0.277871,0.982370,0.997704,...,3133.200,2,0,4,-8670811102654834151_265,6.245661,-0.639745,2.426517,6.207427,0.413703
4,-9194114845888269381,9826.0,2161.0,4.0,0.632115,0.025071,0.181583,0.853145,0.834004,0.989984,...,2219.430,2,0,4,-8670811102654834151_217,11.232396,0.401923,1.510283,3.632993,0.501500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3405,9204571350293509371,23649.0,3193.0,7.0,0.658018,0.048516,0.805307,0.065161,-0.661946,0.845153,...,4244.440,2,1,4,-8670811102654834151_582,16.097756,0.948893,4.148277,4.642969,0.623494
3406,9204761823691693843,66548.0,4605.0,2.0,0.688721,0.113038,-0.021843,0.000000,-0.073519,0.773383,...,1715.920,2,1,4,-8670811102654834151_363,1.847235,-0.698961,1.384799,10.242641,0.474553
3407,9218597765662578237,55289.0,11128.0,6.0,0.789868,0.271694,0.990321,0.000000,0.834778,0.974048,...,2889.390,2,0,4,-8670811102654834151_134,1.192150,0.986200,0.104489,0.000000,0.958779
3408,9219889606523665288,18025.0,2431.0,4.0,0.617810,0.031256,0.599238,0.904466,-0.349098,0.916917,...,1732.890,2,0,4,-8670811102654834151_273,1.595447,-0.503802,0.274504,4.112908,0.684720


In [5]:
parquet_data

Unnamed: 0,group_id,feature_id,protein_id,run_id,precursor_id,precursor_charge,exp_rt,decoy,area_intensity,apex_intensity,...,var_sonar_log_sn,var_sonar_log_diff,var_sonar_log_trend,var_sonar_rsq,transition_count,var_ms1_massdev_score,var_ms1_isotope_correlation_score,var_ms1_isotope_overlap_score,var_ms1_xcorr_coelution,var_ms1_xcorr_shape
0,-8670811102654834151_62,-9211032279639747263,[2],-8670811102654834151,62,2,1793.450,False,8969.0,1401.0,...,,,,,4,6.843699,0.934914,0.507910,6.736079,0.493892
1,-8670811102654834151_470,-9209834744278112856,[21],-8670811102654834151,470,2,457.673,True,5275.0,1672.0,...,,,,,4,20.641659,-0.423074,0.571429,2.816497,0.700567
2,-8670811102654834151_664,-9204568338203974043,[31],-8670811102654834151,664,2,1509.490,True,32824.0,5721.0,...,,,,,4,10.202549,-0.081574,1.077040,4.160247,0.577194
3,-8670811102654834151_265,-9202066408251325127,[12],-8670811102654834151,265,2,3133.200,False,7158.0,947.0,...,,,,,4,6.245661,-0.639745,2.426517,6.207427,0.413703
4,-8670811102654834151_217,-9194114845888269381,[10],-8670811102654834151,217,2,2219.430,False,9826.0,2161.0,...,,,,,4,11.232396,0.401923,1.510283,3.632993,0.501500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3405,-8670811102654834151_582,9204571350293509371,[27],-8670811102654834151,582,2,4244.440,True,23649.0,3193.0,...,,,,,4,16.097755,0.948893,4.148277,4.642970,0.623494
3406,-8670811102654834151_363,9204761823691693843,[17],-8670811102654834151,363,2,1715.920,True,66548.0,4605.0,...,,,,,4,1.847235,-0.698961,1.384799,10.242640,0.474553
3407,-8670811102654834151_134,9218597765662578237,[6],-8670811102654834151,134,2,2889.390,False,55289.0,11128.0,...,,,,,4,1.192150,0.986200,0.104489,0.000000,0.958779
3408,-8670811102654834151_273,9219889606523665288,[12],-8670811102654834151,273,2,1732.890,False,18025.0,2431.0,...,,,,,4,1.595447,-0.503802,0.274504,4.112908,0.684720


In [6]:
split_parquet_data

Unnamed: 0,group_id,feature_id,protein_id,run_id,precursor_id,precursor_charge,exp_rt,decoy,area_intensity,apex_intensity,...,var_sonar_log_sn,var_sonar_log_diff,var_sonar_log_trend,var_sonar_rsq,transition_count,var_ms1_massdev_score,var_ms1_isotope_correlation_score,var_ms1_isotope_overlap_score,var_ms1_xcorr_coelution,var_ms1_xcorr_shape
0,-8670811102654834151_62,-9211032279639747263,[2],-8670811102654834151,62,2,1793.450,0,8969.0,1401.0,...,,,,,4,6.843700,0.934914,0.507910,6.736079,0.493892
1,-8670811102654834151_470,-9209834744278112856,[21],-8670811102654834151,470,2,457.673,1,5275.0,1672.0,...,,,,,4,20.641658,-0.423074,0.571429,2.816497,0.700567
2,-8670811102654834151_664,-9204568338203974043,[31],-8670811102654834151,664,2,1509.490,1,32824.0,5721.0,...,,,,,4,10.202549,-0.081574,1.077040,4.160247,0.577194
3,-8670811102654834151_265,-9202066408251325127,[12],-8670811102654834151,265,2,3133.200,0,7158.0,947.0,...,,,,,4,6.245661,-0.639745,2.426517,6.207427,0.413703
4,-8670811102654834151_217,-9194114845888269381,[10],-8670811102654834151,217,2,2219.430,0,9826.0,2161.0,...,,,,,4,11.232396,0.401923,1.510283,3.632993,0.501500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3405,-8670811102654834151_582,9204571350293509371,[27],-8670811102654834151,582,2,4244.440,1,23649.0,3193.0,...,,,,,4,16.097756,0.948893,4.148277,4.642969,0.623494
3406,-8670811102654834151_363,9204761823691693843,[17],-8670811102654834151,363,2,1715.920,1,66548.0,4605.0,...,,,,,4,1.847235,-0.698961,1.384799,10.242641,0.474553
3407,-8670811102654834151_134,9218597765662578237,[6],-8670811102654834151,134,2,2889.390,0,55289.0,11128.0,...,,,,,4,1.192150,0.986200,0.104489,0.000000,0.958779
3408,-8670811102654834151_273,9219889606523665288,[12],-8670811102654834151,273,2,1732.890,0,18025.0,2431.0,...,,,,,4,1.595447,-0.503802,0.274504,4.112908,0.684720


## Transition

In [3]:
osw_config = create_reader_config(
    "transition", "data/test_data.osw", "data/test_data.osw"
)
parquet_config = create_reader_config(
    "transition", "data/test_data.parquet", "data/test_data.parquet"
)
split_parquet_config = create_reader_config(
    "transition",
    "data/test_data.oswpq",
    "data/test_data.oswpq",
)

osw_reader = ReaderDispatcher.get_reader(osw_config)
parquet_reader = ReaderDispatcher.get_reader(parquet_config)
split_parquet_reader = ReaderDispatcher.get_reader(split_parquet_config)

cols = ['run_id', 'feature_id', 'transition_id', 'decoy', 'main_var_xcorr_shape']
osw_data = osw_reader.read().sort_values(by=cols).reset_index(drop=True)
parquet_data = parquet_reader.read().sort_values(by=cols).reset_index(drop=True)
split_parquet_data = split_parquet_reader.read().sort_values(by=cols).reset_index(drop=True)

[32m2025-06-02 00:56:04.534[0m | [1mINFO    [0m | [36mpyprophet.io._base[0m:[36m_collapse_ipf_peptide_ids[0m:[36m1065[0m - [1mCollapsed 13 of 6520 rows due to multiple transitions features mapping to different peptidoforms.[0m
[32m2025-06-02 00:56:04.612[0m | [1mINFO    [0m | [36mpyprophet.io.util[0m:[36mprint_parquet_tree[0m:[36m311[0m - [1mDetected 1 split_parquet run files[0m
[32m2025-06-02 00:56:04.615[0m | [1mINFO    [0m | [36mpyprophet.io.util[0m:[36mprint_parquet_tree[0m:[36m312[0m - [1mInput Parquet Structure:[0m


└── 📁 data/test_data.oswpq
    ├── 📁 test_data.oswpq
    │   ├── 📄 precursors_features.parquet
    │   └── 📄 transition_features.parquet


[32m2025-06-02 00:56:04.762[0m | [1mINFO    [0m | [36mpyprophet.io._base[0m:[36m_collapse_ipf_peptide_ids[0m:[36m1369[0m - [1mCollapsed 13 of 6520 rows due to multiple transitions features mapping to different peptidoforms.[0m


In [4]:
osw_data

Unnamed: 0,feature_id,transition_id,area_intensity,apex_intensity,var_log_intensity,var_xcorr_coelution,main_var_xcorr_shape,var_log_sn_score,var_massdev_score,var_isotope_correlation_score,var_isotope_overlap_score,decoy,run_id,precursor_id,exp_rt,precursor_charge,product_charge,group_id
0,-9078977811506172301,3269,65086.0,8571.0,11.08350,0.0,0.988488,2.28037,1.339620,0.986416,0.380002,0,-8670811102654834151,69,2163.91,2,1,-8670811102654834151_-9078977811506172301_69_3269
1,-9078977811506172301,3274,28118.0,3659.0,10.24420,0.0,0.775015,2.96661,15.881300,0.982047,0.064225,0,-8670811102654834151,69,2163.91,2,1,-8670811102654834151_-9078977811506172301_69_3274
2,-9078977811506172301,3275,18643.0,2935.0,9.83323,0.0,0.987159,2.74691,3.497610,0.936990,0.082794,0,-8670811102654834151,69,2163.91,2,1,-8670811102654834151_-9078977811506172301_69_3275
3,-9078977811506172301,3276,10931.0,1571.0,9.29936,0.0,0.976596,2.54362,5.398260,0.982335,0.084023,0,-8670811102654834151,69,2163.91,2,1,-8670811102654834151_-9078977811506172301_69_3276
4,-9078977811506172301,3277,20099.0,3158.0,9.90843,0.0,0.980119,2.72363,2.800890,0.997938,0.090880,0,-8670811102654834151,69,2163.91,2,1,-8670811102654834151_-9078977811506172301_69_3277
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6502,9218597765662578237,5082,20865.0,2567.0,9.94583,1.0,0.803354,1.33504,1.619140,0.392923,0.301909,0,-8670811102654834151,134,2889.39,2,2,-8670811102654834151_9218597765662578237_134_5082
6503,9218597765662578237,5083,14438.0,3050.0,9.57762,0.0,0.992086,3.27596,1.975210,0.988239,0.079672,0,-8670811102654834151,134,2889.39,2,1,-8670811102654834151_9218597765662578237_134_5083
6504,9218597765662578237,5085,15499.0,3234.0,9.64853,0.0,0.991741,4.85533,0.332315,0.997599,0.088435,0,-8670811102654834151,134,2889.39,2,1,-8670811102654834151_9218597765662578237_134_5085
6505,9218597765662578237,14329,1649.0,262.0,7.40792,0.5,0.690893,0.91223,5.561800,0.828575,0.309160,1,-8670811102654834151,134,2889.39,2,1,-8670811102654834151_9218597765662578237_134_1...


In [9]:
grouped = osw_data.groupby('group_id').size().reset_index(name='count')
filtered_groups = grouped[grouped['count'] > 1]
filtered_groups

Unnamed: 0,group_id,count


In [2]:
# parquet_data[~parquet_data['run_id'].isna()]
parquet_data

NameError: name 'parquet_data' is not defined

In [7]:
parquet_data[ parquet_data['group_id']=='-8670811102654834151_7058704691706873957_267_8638' ]

Unnamed: 0,group_id,feature_id,ipf_peptide_id,decoy,run_id,transition_id,var_log_intensity,var_xcorr_coelution,main_var_xcorr_shape,var_log_sn_score,var_massdev_score,var_isotope_correlation_score,var_isotope_overlap_score,precursor_charge,transition_charge
5739,-8670811102654834151_7058704691706873957_267_8638,7058704691706873957,"[853, 854]",0,,8638,10.2522,0.0,0.907322,1.67417,2.10102,0.997484,0.207736,2,1


In [6]:
split_parquet_data

Unnamed: 0,group_id,feature_id,ipf_peptide_id,decoy,run_id,transition_id,area_intensity,var_log_intensity,var_xcorr_coelution,main_var_xcorr_shape,var_log_sn_score,var_massdev_score,var_isotope_correlation_score,var_isotope_overlap_score,precursor_charge,transition_charge
0,-8670811102654834151_-9078977811506172301_69_3269,-9078977811506172301,[305],0,-8670811102654834151,3269,65086.0,11.08350,0.0,0.988488,2.28037,1.339620,0.986416,0.380002,2,1
1,-8670811102654834151_-9078977811506172301_69_3274,-9078977811506172301,[305],0,-8670811102654834151,3274,28118.0,10.24420,0.0,0.775015,2.96661,15.881300,0.982047,0.064225,2,1
2,-8670811102654834151_-9078977811506172301_69_3275,-9078977811506172301,[305],0,-8670811102654834151,3275,18643.0,9.83323,0.0,0.987159,2.74691,3.497610,0.936990,0.082794,2,1
3,-8670811102654834151_-9078977811506172301_69_3276,-9078977811506172301,[305],0,-8670811102654834151,3276,10931.0,9.29936,0.0,0.976596,2.54362,5.398260,0.982335,0.084023,2,1
4,-8670811102654834151_-9078977811506172301_69_3277,-9078977811506172301,[305],0,-8670811102654834151,3277,20099.0,9.90843,0.0,0.980119,2.72363,2.800890,0.997938,0.090880,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6502,-8670811102654834151_9218597765662578237_134_5082,9218597765662578237,[513],0,-8670811102654834151,5082,20865.0,9.94583,1.0,0.803354,1.33504,1.619140,0.392923,0.301909,2,2
6503,-8670811102654834151_9218597765662578237_134_5083,9218597765662578237,[513],0,-8670811102654834151,5083,14438.0,9.57762,0.0,0.992086,3.27596,1.975210,0.988239,0.079672,2,1
6504,-8670811102654834151_9218597765662578237_134_5085,9218597765662578237,[513],0,-8670811102654834151,5085,15499.0,9.64853,0.0,0.991741,4.85533,0.332315,0.997599,0.088435,2,1
6505,-8670811102654834151_9218597765662578237_134_1...,9218597765662578237,[476],1,-8670811102654834151,14329,1649.0,7.40792,0.5,0.690893,0.91223,5.561800,0.828575,0.309160,2,1


# IPF

SELECT 
SCORE_IPF_PRE_BATCHING.FEATURE_ID,
SCORE_IPF_PRE_BATCHING.PEPTIDE_ID,
SCORE_IPF_PRE_BATCHING.PRECURSOR_PEAKGROUP_PEP,
SCORE_IPF.PRECURSOR_PEAKGROUP_PEP AS PRECURSOR_PEAKGROUP_PEP_2,
SCORE_IPF_PRE_BATCHING.QVALUE,
SCORE_IPF.QVALUE AS QVALUE_2,
SCORE_IPF_PRE_BATCHING.PEP,
SCORE_IPF.PEP AS PEP_2
FROM SCORE_IPF_PRE_BATCHING
FULL JOIN SCORE_IPF ON SCORE_IPF.FEATURE_ID = SCORE_IPF_PRE_BATCHING.FEATURE_ID AND SCORE_IPF.PEPTIDE_ID = SCORE_IPF_PRE_BATCHING.PEPTIDE_ID
LIMIT 1000;

In [3]:
def create_reader_config(level, infile, outfile):
    """
    Common config generator to avoid repetition
    """
    return IPFIOConfig(
        infile=infile, outfile=outfile, subsample_ratio=1, context="ipf", level=level, ipf_ms1_scoring = False, ipf_ms2_scoring = False,
    )
    


## Peakgroup-Precursor

In [7]:
osw_config = create_reader_config(
    "peakgroup_precursor", "data/test_data.osw", "data/test_data.osw"
)
parquet_config = create_reader_config(
    "peakgroup_precursor", "data/test_data.parquet", "data/test_data.parquet"
)
split_parquet_config = create_reader_config(
    "peakgroup_precursor",
    "data/test_data_scored.oswpq",
    "data/test_data_scored.oswpq",
)

osw_reader = ReaderDispatcher.get_reader(osw_config)
parquet_reader = ReaderDispatcher.get_reader(parquet_config)
split_parquet_reader = ReaderDispatcher.get_reader(split_parquet_config)

# osw_data = osw_reader.read("peakgroup_precursor").sort_values(by=["feature_id", "ms2_peakgroup_pep", "ms1_precursor_pep", "ms2_precursor_pep"]).reset_index(drop=True)
# parquet_data = parquet_reader.read("peakgroup_precursor").sort_values(by=["feature_id", "ms2_peakgroup_pep", "ms1_precursor_pep", "ms2_precursor_pep"]).reset_index(drop=True)
split_parquet_data = split_parquet_reader.read("peakgroup_precursor").sort_values(by=["feature_id", "ms2_peakgroup_pep", "ms1_precursor_pep", "ms2_precursor_pep"]).reset_index(drop=True)

[32m2025-06-21 21:57:17.514[0m | [1mINFO    [0m | [36mpyprophet.io.util[0m:[36mprint_parquet_tree[0m:[36m359[0m - [1mDetected 1 split_parquet run files[0m
[32m2025-06-21 21:57:17.516[0m | [1mINFO    [0m | [36mpyprophet.io.util[0m:[36mprint_parquet_tree[0m:[36m360[0m - [1mInput Parquet Structure:[0m


└── 📁 data/test_data_scored.oswpq
    ├── 📁 test_data_scored.oswpq
    │   ├── 📄 precursors_features.parquet
    │   └── 📄 transition_features.parquet


[32m2025-06-21 21:57:17.533[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.split_parquet[0m:[36m_read_pyp_peakgroup_precursor[0m:[36m66[0m - [1mReading precursor-level data ...[0m


In [7]:
osw_data

Unnamed: 0,feature_id,ms2_peakgroup_pep,ms1_precursor_pep,ms2_precursor_pep
0,-9078977811506172301,0.003654135376405,,
1,-9059007664292712863,0.091383787890556,,
2,-9009602369958523731,0.003654135376405,,
3,-8990894093332793487,0.003654135376405,,
4,-8915955323477460297,0.003654135376405,,
...,...,...,...,...
367,9100151962100689925,0.003654135376405,,
368,9138695304810091733,0.003654135376405,,
369,9179217567582710769,0.003654135376405,,
370,9182851156161074378,0.003654135376405,,


In [8]:
parquet_data

Unnamed: 0,feature_id,ms2_peakgroup_pep,ms1_precursor_pep,ms2_precursor_pep
0,-9078977811506172301,0.003654135391116,,
1,-9059007664292712863,0.091383785009384,,
2,-9009602369958523731,0.003654135391116,,
3,-8990894093332793487,0.003654135391116,,
4,-8915955323477460297,0.003654135391116,,
...,...,...,...,...
367,9100151962100689925,0.003654135391116,,
368,9138695304810091733,0.003654135391116,,
369,9179217567582710769,0.003654135391116,,
370,9182851156161074378,0.003654135391116,,


In [9]:
split_parquet_data

Unnamed: 0,feature_id,ms2_peakgroup_pep,ms1_precursor_pep,ms2_precursor_pep
0,-9078977811506172301,0.003654135376405,,
1,-9059007664292712863,0.091383787890556,,
2,-9009602369958523731,0.003654135376405,,
3,-8990894093332793487,0.003654135376405,,
4,-8915955323477460297,0.003654135376405,,
...,...,...,...,...
367,9100151962100689925,0.003654135376405,,
368,9138695304810091733,0.003654135376405,,
369,9179217567582710769,0.003654135376405,,
370,9182851156161074378,0.003654135376405,,


In [8]:
split_parquet_data

Unnamed: 0,feature_id,ms2_peakgroup_pep,ms1_precursor_pep,ms2_precursor_pep
0,-9078977811506172301,0.003142319081467,,
1,-9059007664292712863,0.361467984298225,,
2,-9009602369958523731,0.003142319081467,,
3,-8990894093332793487,0.003142319081467,,
4,-8915955323477460297,0.003142319081467,,
...,...,...,...,...
384,9100151962100689925,0.003142319081467,,
385,9138695304810091733,0.003142319081467,,
386,9179217567582710769,0.003142319081467,,
387,9182851156161074378,0.003142319081467,,


## Transition

In [8]:
osw_config = create_reader_config(
    "transition", "data/test_data.osw", "data/test_data.osw"
)
parquet_config = create_reader_config(
    "transition", "data/test_data.parquet", "data/test_data.parquet"
)
split_parquet_config = create_reader_config(
    "transition",
    "data/test_data_scored.oswpq",
    "data/test_data_scored.oswpq",
)

osw_reader = ReaderDispatcher.get_reader(osw_config)
parquet_reader = ReaderDispatcher.get_reader(parquet_config)
split_parquet_reader = ReaderDispatcher.get_reader(split_parquet_config)

# osw_data = osw_reader.read("transition").sort_values(by=["feature_id", "transition_id", "pep"]).reset_index(drop=True)
# parquet_data = parquet_reader.read("transition").sort_values(by=["feature_id", "transition_id", "pep"]).reset_index(drop=True)
split_parquet_data = split_parquet_reader.read("transition").sort_values(by=["feature_id", "transition_id", "pep"]).reset_index(drop=True)

[32m2025-06-20 00:15:55.456[0m | [1mINFO    [0m | [36mpyprophet.io.util[0m:[36mprint_parquet_tree[0m:[36m359[0m - [1mDetected 1 split_parquet run files[0m
[32m2025-06-20 00:15:55.457[0m | [1mINFO    [0m | [36mpyprophet.io.util[0m:[36mprint_parquet_tree[0m:[36m360[0m - [1mInput Parquet Structure:[0m


└── 📁 data/test_data_scored.oswpq
    ├── 📁 test_data_scored.oswpq
    │   ├── 📄 precursors_features.parquet
    │   └── 📄 transition_features.parquet


[32m2025-06-20 00:15:55.461[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.split_parquet[0m:[36m_read_pyp_transition[0m:[36m173[0m - [1mReading peptidoform-level data ...[0m
[32m2025-06-20 00:15:55.468[0m | [34m[1mDEBUG   [0m | [36mpyprophet.io.ipf.split_parquet[0m:[36m_read_pyp_transition[0m:[36m335[0m - [34m[1mProcessing features 0-1000: 1000 features[0m
[32m2025-06-20 00:15:55.501[0m | [34m[1mDEBUG   [0m | [36mpyprophet.io.ipf.split_parquet[0m:[36m_read_pyp_transition[0m:[36m406[0m - [34m[1mProcessed features 0-1000: 2344 rows[0m
[32m2025-06-20 00:15:55.502[0m | [34m[1mDEBUG   [0m | [36mpyprophet.io.ipf.split_parquet[0m:[36m_read_pyp_transition[0m:[36m335[0m - [34m[1mProcessing features 1000-2000: 1000 features[0m
[32m2025-06-20 00:15:55.526[0m | [34m[1mDEBUG   [0m | [36mpyprophet.io.ipf.split_parquet[0m:[36m_read_pyp_transition[0m:[36m406[0m - [34m[1mProcessed features 1000-2000: 2864 rows[0m
[32m2025-06-20 00:15:55.5

In [11]:
osw_data

Unnamed: 0,feature_id,transition_id,pep,peptide_id,bmask,num_peptidoforms
0,-9078977811506172301,3275,0.007548884271154,305,1.000000000000000,1
1,-9078977811506172301,3275,0.007548884271154,-1,0.000000000000000,1
2,-9078977811506172301,3276,0.013393521510058,305,1.000000000000000,1
3,-9078977811506172301,3276,0.013393521510058,-1,0.000000000000000,1
4,-9078977811506172301,3277,0.004850020600794,305,1.000000000000000,1
...,...,...,...,...,...,...
9310,9218597765662578237,5082,0.292798299079015,-1,0.000000000000000,1
9311,9218597765662578237,5083,0.000743727340380,513,1.000000000000000,1
9312,9218597765662578237,5083,0.000743727340380,-1,0.000000000000000,1
9313,9218597765662578237,5085,0.000743727340380,513,1.000000000000000,1


In [12]:
parquet_data

Unnamed: 0,feature_id,transition_id,pep,peptide_id,bmask,num_peptidoforms
0,-9078977811506172301,3275,0.007548884488642,305,1.000000000000000,1
1,-9078977811506172301,3275,0.007548884488642,-1,0.000000000000000,1
2,-9078977811506172301,3276,0.013393520377576,305,1.000000000000000,1
3,-9078977811506172301,3276,0.013393520377576,-1,0.000000000000000,1
4,-9078977811506172301,3277,0.004850020632148,305,1.000000000000000,1
...,...,...,...,...,...,...
9310,9218597765662578237,5082,0.292809933423996,-1,0.000000000000000,1
9311,9218597765662578237,5083,0.000743727316149,513,1.000000000000000,1
9312,9218597765662578237,5083,0.000743727316149,-1,0.000000000000000,1
9313,9218597765662578237,5085,0.000743727316149,513,1.000000000000000,1


In [13]:
split_parquet_data

Unnamed: 0,feature_id,transition_id,pep,peptide_id,bmask,num_peptidoforms
0,-9078977811506172301,3275,0.007548884271154,305,1.000000000000000,1
1,-9078977811506172301,3275,0.007548884271154,-1,0.000000000000000,1
2,-9078977811506172301,3276,0.013393521510058,305,1.000000000000000,1
3,-9078977811506172301,3276,0.013393521510058,-1,0.000000000000000,1
4,-9078977811506172301,3277,0.004850020600794,305,1.000000000000000,1
...,...,...,...,...,...,...
9310,9218597765662578237,5082,0.292798299079015,-1,0.000000000000000,1
9311,9218597765662578237,5083,0.000743727340380,513,1.000000000000000,1
9312,9218597765662578237,5083,0.000743727340380,-1,0.000000000000000,1
9313,9218597765662578237,5085,0.000743727340380,513,1.000000000000000,1


In [9]:
split_parquet_data

Unnamed: 0,feature_id,transition_id,pep,peptide_id,bmask,num_peptidoforms
0,-9078977811506172301,3275,0.007527684725314,305,1,1
1,-9078977811506172301,3275,0.007527684725314,-1,0,1
2,-9078977811506172301,3276,0.013336601138495,305,1,1
3,-9078977811506172301,3276,0.013336601138495,-1,0,1
4,-9078977811506172301,3277,0.004828594421420,305,1,1
...,...,...,...,...,...,...
9316,9218597765662578237,5082,0.293300720162134,-1,0,1
9317,9218597765662578237,5083,0.000740899956141,513,1,1
9318,9218597765662578237,5083,0.000740899956141,-1,0,1
9319,9218597765662578237,5085,0.000740899956141,513,1,1


## Test batching

In [4]:
file = "/home/singjc/Documents/github/synth_phospho/Justin_Synth_PhosPep/results/fragpipe_oswbench_20220512/data/arycal/merged.osw"

osw_config = create_reader_config(
    "peakgroup_precursor", file, file
)

# split_parquet_config = create_reader_config(
#     "peakgroup_precursor",
#     "data/test_data_scored.oswpq",
#     "data/test_data_scored.oswpq",
# )

osw_reader = ReaderDispatcher.get_reader(osw_config)
# split_parquet_reader = ReaderDispatcher.get_reader(split_parquet_config)

osw_data = osw_reader.read("peakgroup_precursor").sort_values(by=["feature_id", "ms2_peakgroup_pep", "ms1_precursor_pep", "ms2_precursor_pep"]).reset_index(drop=True)
# split_parquet_data = split_parquet_reader.read("peakgroup_precursor").sort_values(by=["feature_id", "ms2_peakgroup_pep", "ms1_precursor_pep", "ms2_precursor_pep"]).reset_index(drop=True)

[32m2025-06-26 17:21:39.925[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_peakgroup_precursor_duckdb[0m:[36m214[0m - [1mReading precursor-level data ...[0m


In [6]:
config = osw_config
reader = osw_reader
batch_size = config.batch_size
peptide_ids = reader.read(level="peptide_ids")["peptide_id"].unique()

print(f"Batch size: {batch_size}")
print(f"Number of unique peptides: {len(peptide_ids)}")


Batch size: 100000
Number of unique peptides: 684


In [9]:
from pyprophet.ipf import precursor_inference, peptidoform_inference

In [10]:
precursor_table = reader.read(
            level="peakgroup_precursor", peptide_ids=peptide_ids
        )
precursor_data = precursor_inference(
    precursor_table,
    config.ipf_ms1_scoring,
    config.ipf_ms2_scoring,
    config.ipf_max_precursor_pep,
    config.ipf_max_precursor_peakgroup_pep,
)

[32m2025-06-26 11:28:37.769[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_peakgroup_precursor_duckdb[0m:[36m214[0m - [1mReading precursor-level data ...[0m
[32m2025-06-26 11:28:38.079[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mprecursor_inference[0m:[36m376[0m - [1mSkipping precursor-level inference.[0m


In [18]:
precursor_data = precursor_data.sort_values('feature_id')
precursor_data

Unnamed: 0,feature_id,precursor_peakgroup_pep
69,-9078977811506172301,0.003142319081467
14,-9059007664292712863,0.361467984298225
71,-9009602369958523731,0.003142319081467
333,-8990894093332793487,0.003142319081467
270,-8915955323477460297,0.003142319081467
...,...,...
82,9100151962100689925,0.003142319081467
97,9138695304810091733,0.003142319081467
145,9179217567582710769,0.003142319081467
314,9182851156161074378,0.003142319081467


In [12]:
batch_size = 100

# Iterate over peptide_ids in batches to avoid memory issues
if batch_size > 0:
    peptide_id_batches = [
        peptide_ids[i : i + batch_size]
        for i in range(0, len(peptide_ids), batch_size)
    ]
else:
    peptide_id_batches = [peptide_ids]

all_peptidoform_data = []
for peptide_ids_batch in peptide_id_batches:
    print(
        f"Processing peptide IDs batch: {peptide_ids_batch[0]} to {peptide_ids_batch[-1]} of {len(peptide_ids)}..."
    )

    # precursor level
    precursor_table = reader.read(
        level="peakgroup_precursor", peptide_ids=peptide_ids_batch
    )
    tmp_precursor_data = precursor_inference(
        precursor_table,
        config.ipf_ms1_scoring,
        config.ipf_ms2_scoring,
        config.ipf_max_precursor_pep,
        config.ipf_max_precursor_peakgroup_pep,
    )
    
    all_peptidoform_data.append(tmp_precursor_data)
    

precursor_data_batched = pd.concat(all_peptidoform_data, ignore_index=True)

[32m2025-06-26 11:32:42.573[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_peakgroup_precursor_duckdb[0m:[36m214[0m - [1mReading precursor-level data ...[0m


Processing peptide IDs batch: 32 to 838 of 684...


[32m2025-06-26 11:32:42.865[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mprecursor_inference[0m:[36m376[0m - [1mSkipping precursor-level inference.[0m
[32m2025-06-26 11:32:43.046[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_peakgroup_precursor_duckdb[0m:[36m214[0m - [1mReading precursor-level data ...[0m


Processing peptide IDs batch: 924 to 1108 of 684...


[32m2025-06-26 11:32:43.441[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mprecursor_inference[0m:[36m376[0m - [1mSkipping precursor-level inference.[0m
[32m2025-06-26 11:32:43.548[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_peakgroup_precursor_duckdb[0m:[36m214[0m - [1mReading precursor-level data ...[0m


Processing peptide IDs batch: 1115 to 1295 of 684...


[32m2025-06-26 11:32:43.955[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mprecursor_inference[0m:[36m376[0m - [1mSkipping precursor-level inference.[0m


Processing peptide IDs batch: 1299 to 557 of 684...


[32m2025-06-26 11:32:44.186[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_peakgroup_precursor_duckdb[0m:[36m214[0m - [1mReading precursor-level data ...[0m
[32m2025-06-26 11:32:44.565[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mprecursor_inference[0m:[36m376[0m - [1mSkipping precursor-level inference.[0m
[32m2025-06-26 11:32:44.707[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_peakgroup_precursor_duckdb[0m:[36m214[0m - [1mReading precursor-level data ...[0m


Processing peptide IDs batch: 483 to 1279 of 684...


[32m2025-06-26 11:32:44.953[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mprecursor_inference[0m:[36m376[0m - [1mSkipping precursor-level inference.[0m
[32m2025-06-26 11:32:45.063[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_peakgroup_precursor_duckdb[0m:[36m214[0m - [1mReading precursor-level data ...[0m


Processing peptide IDs batch: 1310 to 54 of 684...


[32m2025-06-26 11:32:45.277[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mprecursor_inference[0m:[36m376[0m - [1mSkipping precursor-level inference.[0m
[32m2025-06-26 11:32:45.419[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_peakgroup_precursor_duckdb[0m:[36m214[0m - [1mReading precursor-level data ...[0m


Processing peptide IDs batch: 105 to 1343 of 684...


[32m2025-06-26 11:32:45.730[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mprecursor_inference[0m:[36m376[0m - [1mSkipping precursor-level inference.[0m


In [19]:
precursor_data_batched = precursor_data_batched.sort_values('feature_id')
precursor_data_batched

Unnamed: 0,feature_id,precursor_peakgroup_pep
58,-9078977811506172301,0.003142319081467
11,-9059007664292712863,0.361467984298225
60,-9009602369958523731,0.003142319081467
197,-8990894093332793487,0.003142319081467
324,-8915955323477460297,0.003142319081467
...,...,...
72,9100151962100689925,0.003142319081467
126,9138695304810091733,0.003142319081467
169,9179217567582710769,0.003142319081467
100,9182851156161074378,0.003142319081467


In [22]:
merged_df = pd.merge(precursor_data, precursor_data_batched, on='feature_id', how='outer', indicator=True)
merged_df

Unnamed: 0,feature_id,precursor_peakgroup_pep_x,precursor_peakgroup_pep_y,_merge
0,-9078977811506172301,0.003142319081467,0.003142319081467,both
1,-9059007664292712863,0.361467984298225,0.361467984298225,both
2,-9009602369958523731,0.003142319081467,0.003142319081467,both
3,-8990894093332793487,0.003142319081467,0.003142319081467,both
4,-8915955323477460297,0.003142319081467,0.003142319081467,both
...,...,...,...,...
384,9100151962100689925,0.003142319081467,0.003142319081467,both
385,9138695304810091733,0.003142319081467,0.003142319081467,both
386,9179217567582710769,0.003142319081467,0.003142319081467,both
387,9182851156161074378,0.003142319081467,0.003142319081467,both


In [23]:
merged_df[merged_df['_merge'] != 'both']

Unnamed: 0,feature_id,precursor_peakgroup_pep_x,precursor_peakgroup_pep_y,_merge


## Transition

In [24]:
peptidoform_table = reader.read(
            level="transition", peptide_ids=peptide_ids
        )
peptidoform_data = peptidoform_inference(
            peptidoform_table,
            precursor_data,
            config.ipf_grouped_fdr,
            config.propagate_signal_across_runs,
            config.across_run_confidence_threshold,
        )
peptidoform_data = peptidoform_data.sort_values('feature_id')
peptidoform_data

[32m2025-06-26 11:38:26.542[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_transition_duckdb[0m:[36m335[0m - [1mInfo: Reading peptidoform-level data ...[0m
[32m2025-06-26 11:38:26.917[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mpeptidoform_inference[0m:[36m414[0m - [1mPreparing peptidoform-level data ... [0m
[32m2025-06-26 11:38:26.989[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mpeptidoform_inference[0m:[36m420[0m - [1mConducting peptidoform-level inference ... [0m


Unnamed: 0,feature_id,hypothesis,likelihood_prior,likelihood_sum,posterior,pep,qvalue,precursor_peakgroup_pep
0,-9078977811506172301,-1,0.000000000000000,0.486771939725330,0.000000000000000,1.000000000000000,0.500000000000000,0.003142319081467
1,-9078977811506172301,305,0.486771939725330,0.486771939725330,1.000000000000000,0.000000000000000,0.000000000000000,0.003142319081467
2,-9009602369958523731,-1,0.000000000000000,0.357278539044749,0.000000000000000,1.000000000000000,0.500000000000000,0.003142319081467
3,-9009602369958523731,309,0.357278539044749,0.357278539044749,1.000000000000000,0.000000000000000,0.000000000000000,0.003142319081467
4,-8990894093332793487,-1,0.000000000000000,0.715743158621281,0.000000000000000,1.000000000000000,0.500000000000000,0.003142319081467
...,...,...,...,...,...,...,...,...
659,9179217567582710769,366,0.801162190369319,0.801162190369319,1.000000000000000,0.000000000000000,0.000000000000000,0.003142319081467
660,9182851156161074378,-1,0.000000000000000,0.141255891796635,0.000000000000000,1.000000000000000,0.500000000000000,0.003142319081467
661,9182851156161074378,975,0.141255891796635,0.141255891796635,1.000000000000000,0.000000000000000,0.000000000000000,0.003142319081467
662,9218597765662578237,-1,0.000000000000000,0.640933354407969,0.000000000000000,1.000000000000000,0.500000000000000,0.003142319081467


In [25]:
batch_size = 100

# Iterate over peptide_ids in batches to avoid memory issues
if batch_size > 0:
    peptide_id_batches = [
        peptide_ids[i : i + batch_size]
        for i in range(0, len(peptide_ids), batch_size)
    ]
else:
    peptide_id_batches = [peptide_ids]

all_peptidoform_data = []
for peptide_ids_batch in peptide_id_batches:
    print(
        f"Processing peptide IDs batch: {peptide_ids_batch[0]} to {peptide_ids_batch[-1]} of {len(peptide_ids)}..."
    )

    # precursor level
    precursor_table = reader.read(
        level="peakgroup_precursor", peptide_ids=peptide_ids_batch
    )
    tmp_precursor_data = precursor_inference(
        precursor_table,
        config.ipf_ms1_scoring,
        config.ipf_ms2_scoring,
        config.ipf_max_precursor_pep,
        config.ipf_max_precursor_peakgroup_pep,
    )
    
    peptidoform_table = reader.read(
            level="transition", peptide_ids=peptide_ids_batch
        )
    
    tmp_peptidoform_data = peptidoform_inference(
            peptidoform_table,
            tmp_precursor_data,
            config.ipf_grouped_fdr,
            config.propagate_signal_across_runs,
            config.across_run_confidence_threshold,
        )
    
    all_peptidoform_data.append(tmp_peptidoform_data)
    

peptidoform_data_batched = pd.concat(all_peptidoform_data, ignore_index=True)
peptidoform_data_batched = peptidoform_data_batched.sort_values('feature_id')
peptidoform_data_batched


[32m2025-06-26 11:40:58.509[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_peakgroup_precursor_duckdb[0m:[36m214[0m - [1mReading precursor-level data ...[0m


Processing peptide IDs batch: 32 to 838 of 684...


[32m2025-06-26 11:40:58.891[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mprecursor_inference[0m:[36m376[0m - [1mSkipping precursor-level inference.[0m
[32m2025-06-26 11:40:59.084[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_transition_duckdb[0m:[36m335[0m - [1mInfo: Reading peptidoform-level data ...[0m
[32m2025-06-26 11:40:59.586[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mpeptidoform_inference[0m:[36m414[0m - [1mPreparing peptidoform-level data ... [0m
[32m2025-06-26 11:40:59.617[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mpeptidoform_inference[0m:[36m420[0m - [1mConducting peptidoform-level inference ... [0m
[32m2025-06-26 11:40:59.783[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_peakgroup_precursor_duckdb[0m:[36m214[0m - [1mReading precursor-level data ...[0m


Processing peptide IDs batch: 924 to 1108 of 684...


[32m2025-06-26 11:41:00.075[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mprecursor_inference[0m:[36m376[0m - [1mSkipping precursor-level inference.[0m
[32m2025-06-26 11:41:00.208[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_transition_duckdb[0m:[36m335[0m - [1mInfo: Reading peptidoform-level data ...[0m
[32m2025-06-26 11:41:00.758[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mpeptidoform_inference[0m:[36m414[0m - [1mPreparing peptidoform-level data ... [0m
[32m2025-06-26 11:41:00.782[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mpeptidoform_inference[0m:[36m420[0m - [1mConducting peptidoform-level inference ... [0m
[32m2025-06-26 11:41:00.948[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_peakgroup_precursor_duckdb[0m:[36m214[0m - [1mReading precursor-level data ...[0m


Processing peptide IDs batch: 1115 to 1295 of 684...


[32m2025-06-26 11:41:01.208[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mprecursor_inference[0m:[36m376[0m - [1mSkipping precursor-level inference.[0m
[32m2025-06-26 11:41:01.317[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_transition_duckdb[0m:[36m335[0m - [1mInfo: Reading peptidoform-level data ...[0m
[32m2025-06-26 11:41:01.789[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mpeptidoform_inference[0m:[36m414[0m - [1mPreparing peptidoform-level data ... [0m
[32m2025-06-26 11:41:01.829[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mpeptidoform_inference[0m:[36m420[0m - [1mConducting peptidoform-level inference ... [0m
[32m2025-06-26 11:41:02.058[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_peakgroup_precursor_duckdb[0m:[36m214[0m - [1mReading precursor-level data ...[0m


Processing peptide IDs batch: 1299 to 557 of 684...


[32m2025-06-26 11:41:02.295[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mprecursor_inference[0m:[36m376[0m - [1mSkipping precursor-level inference.[0m
[32m2025-06-26 11:41:02.424[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_transition_duckdb[0m:[36m335[0m - [1mInfo: Reading peptidoform-level data ...[0m
[32m2025-06-26 11:41:02.696[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mpeptidoform_inference[0m:[36m414[0m - [1mPreparing peptidoform-level data ... [0m
[32m2025-06-26 11:41:02.718[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mpeptidoform_inference[0m:[36m420[0m - [1mConducting peptidoform-level inference ... [0m
[32m2025-06-26 11:41:02.935[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_peakgroup_precursor_duckdb[0m:[36m214[0m - [1mReading precursor-level data ...[0m


Processing peptide IDs batch: 483 to 1279 of 684...


[32m2025-06-26 11:41:03.332[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mprecursor_inference[0m:[36m376[0m - [1mSkipping precursor-level inference.[0m
[32m2025-06-26 11:41:03.523[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_transition_duckdb[0m:[36m335[0m - [1mInfo: Reading peptidoform-level data ...[0m
[32m2025-06-26 11:41:03.835[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mpeptidoform_inference[0m:[36m414[0m - [1mPreparing peptidoform-level data ... [0m
[32m2025-06-26 11:41:03.850[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mpeptidoform_inference[0m:[36m420[0m - [1mConducting peptidoform-level inference ... [0m
[32m2025-06-26 11:41:04.036[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_peakgroup_precursor_duckdb[0m:[36m214[0m - [1mReading precursor-level data ...[0m


Processing peptide IDs batch: 1310 to 54 of 684...


[32m2025-06-26 11:41:04.352[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mprecursor_inference[0m:[36m376[0m - [1mSkipping precursor-level inference.[0m
[32m2025-06-26 11:41:04.557[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_transition_duckdb[0m:[36m335[0m - [1mInfo: Reading peptidoform-level data ...[0m
[32m2025-06-26 11:41:04.899[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mpeptidoform_inference[0m:[36m414[0m - [1mPreparing peptidoform-level data ... [0m
[32m2025-06-26 11:41:04.926[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mpeptidoform_inference[0m:[36m420[0m - [1mConducting peptidoform-level inference ... [0m


Processing peptide IDs batch: 105 to 1343 of 684...


[32m2025-06-26 11:41:05.184[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_peakgroup_precursor_duckdb[0m:[36m214[0m - [1mReading precursor-level data ...[0m
[32m2025-06-26 11:41:05.600[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mprecursor_inference[0m:[36m376[0m - [1mSkipping precursor-level inference.[0m
[32m2025-06-26 11:41:05.771[0m | [1mINFO    [0m | [36mpyprophet.io.ipf.osw[0m:[36m_read_pyp_transition_duckdb[0m:[36m335[0m - [1mInfo: Reading peptidoform-level data ...[0m
[32m2025-06-26 11:41:06.105[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mpeptidoform_inference[0m:[36m414[0m - [1mPreparing peptidoform-level data ... [0m
[32m2025-06-26 11:41:06.123[0m | [1mINFO    [0m | [36mpyprophet.ipf[0m:[36mpeptidoform_inference[0m:[36m420[0m - [1mConducting peptidoform-level inference ... [0m


Unnamed: 0,feature_id,hypothesis,likelihood_prior,likelihood_sum,posterior,pep,qvalue,precursor_peakgroup_pep
0,-9078977811506172301,-1.000000000000000,0.000000000000000,0.486771939725330,0.000000000000000,1.000000000000000,0.500000000000000,0.003142319081467
1,-9078977811506172301,305.000000000000000,0.486771939725330,0.486771939725330,1.000000000000000,0.000000000000000,0.000000000000000,0.003142319081467
3,-9009602369958523731,309.000000000000000,0.357278539044749,0.357278539044749,1.000000000000000,0.000000000000000,0.000000000000000,0.003142319081467
2,-9009602369958523731,-1.000000000000000,0.000000000000000,0.357278539044749,0.000000000000000,1.000000000000000,0.500000000000000,0.003142319081467
197,-8990894093332793487,1169.000000000000000,0.715743158621281,0.715743158621281,1.000000000000000,0.000000000000000,0.000000000000000,0.003142319081467
...,...,...,...,...,...,...,...,...
382,9179217567582710769,-1.000000000000000,0.000000000000000,0.801162190369319,0.000000000000000,1.000000000000000,0.500000000000000,0.003142319081467
194,9182851156161074378,-1.000000000000000,0.000000000000000,0.141255891796635,0.000000000000000,1.000000000000000,0.500000000000000,0.003142319081467
195,9182851156161074378,975.000000000000000,0.141255891796635,0.141255891796635,1.000000000000000,0.000000000000000,0.000000000000000,0.003142319081467
384,9218597765662578237,-1.000000000000000,0.000000000000000,0.640933354407969,0.000000000000000,1.000000000000000,0.500000000000000,0.003142319081467


In [26]:
merged_df = pd.merge(precursor_data, precursor_data_batched, on='feature_id', how='outer', indicator=True)
merged_df

Unnamed: 0,feature_id,precursor_peakgroup_pep_x,precursor_peakgroup_pep_y,_merge
0,-9078977811506172301,0.003142319081467,0.003142319081467,both
1,-9059007664292712863,0.361467984298225,0.361467984298225,both
2,-9009602369958523731,0.003142319081467,0.003142319081467,both
3,-8990894093332793487,0.003142319081467,0.003142319081467,both
4,-8915955323477460297,0.003142319081467,0.003142319081467,both
...,...,...,...,...
384,9100151962100689925,0.003142319081467,0.003142319081467,both
385,9138695304810091733,0.003142319081467,0.003142319081467,both
386,9179217567582710769,0.003142319081467,0.003142319081467,both
387,9182851156161074378,0.003142319081467,0.003142319081467,both


In [27]:
merged_df[merged_df['_merge'] != 'both']

Unnamed: 0,feature_id,precursor_peakgroup_pep_x,precursor_peakgroup_pep_y,_merge
