In [1]:
import pathlib

# Input locations for this analysis.
# NOTE(review): these are absolute paths on one machine; point them at your
# own data directory before re-running.
#
# `Path.glob` yields paths in no particular order, and the per-run lookup
# further down takes the *first* metadata file whose name contains the run
# name. Sorting keeps that choice stable across machines and re-runs.
metadata_files = sorted(
    str(x) for x in pathlib.Path("/home/jspaezp/tmp/asdad/spec_metadata").glob("*.csv")
)
spectrast_in = "/home/jspaezp/tmp/asdad/spectrast_in/ProteomeToolsTP.spectrast.mokapot.psms.tsv"


In [2]:
import pathlib
import pandas as pd
import elfragmentador.constants as CONSTANTS
from elfragmentador.evaluate import polyfit, apply_polyfit


def _read_spectrast_in(spectrast_in, only_irt=False):
    """Read the PSM table and derive stripped-peptide and run-name columns.

    Parameters
    ----------
    spectrast_in : str, path or file-like
        Tab-separated table handed straight to ``pandas.read_csv``. Only the
        ``SpecId``, ``ScanNr`` and ``Peptide`` columns are loaded.
    only_irt : bool, default False
        When true, keep only rows whose stripped peptide is a key of
        ``CONSTANTS.IRT_PEPTIDES`` and add an ``iRT`` column taken from that
        entry's ``'irt'`` value.

    Returns
    -------
    pandas.DataFrame
        The loaded columns plus ``StripPeptide``, ``iRT`` (only when
        ``only_irt`` is true) and ``File`` (the stem of ``SpecId`` interpreted
        as a path). The index is a fresh ``0..n-1`` range.
    """
    df = pd.read_csv(spectrast_in, sep="\t", usecols=["SpecId", "ScanNr", "Peptide"])

    # 'R.PCYCSSGCGSSCCQSSCCK.S/2' -> 'PCYCSSGCGSSCCQSSCCK'
    # Drop the charge by splitting on the last '/', rather than assuming it is
    # a single character, so '.../10' is handled too. Then trim the flanking
    # 'X.' prefix and '.Y' suffix.
    df["StripPeptide"] = [x.rsplit("/", 1)[0][2:-2] for x in df["Peptide"]]

    if only_irt:
        # Use an explicitly boolean Series as the row mask. A plain (possibly
        # empty) Python list would be read as a list of column labels when the
        # table has no rows, silently dropping every column.
        is_irt = pd.Series(
            [x in CONSTANTS.IRT_PEPTIDES for x in df["StripPeptide"]],
            index=df.index,
            dtype=bool,
        )
        df = df.loc[is_irt].reset_index(drop=True)
        df["iRT"] = [CONSTANTS.IRT_PEPTIDES[x]["irt"] for x in df["StripPeptide"]]

    df["File"] = [pathlib.Path(x).stem for x in df["SpecId"]]
    return df


def calculate_irt_coefficients(spectrast_in, metadata_files):
    """Fit a retention-time -> iRT calibration for every run in the PSM table.

    Parameters
    ----------
    spectrast_in
        PSM table location, forwarded to ``_read_spectrast_in`` with
        ``only_irt=True`` so only the reference iRT peptides are used.
    metadata_files : list of str
        Paths to per-run CSV files with ``ScanNr`` and ``RetentionTime``
        columns. The file for a run is the first entry whose path contains
        the run name as a substring.

    Returns
    -------
    dict
        Maps each run name (the ``File`` column) to whatever ``polyfit``
        returns for that run. The notebook output shows entries with
        ``"polynomial"`` and ``"determination"`` keys.

    Raises
    ------
    IndexError
        If no entry of ``metadata_files`` matches a run name.
    KeyError
        If a PSM scan number is missing from the run's metadata file.
    """
    df = _read_spectrast_in(spectrast_in=spectrast_in, only_irt=True)

    coefficients_dict = {}
    for tmp_file_name, tmp_df in df.groupby("File"):
        # NOTE(review): substring matching can pick the wrong file when one
        # run name is a prefix of another - confirm run names are unambiguous.
        curr_metadata_file_name = next(
            (x for x in metadata_files if tmp_file_name in x), None
        )
        if curr_metadata_file_name is None:
            # Same exception type as the original `[...][0]`, but it now says
            # which run is missing its metadata.
            raise IndexError(f"No metadata file found for run {tmp_file_name!r}")

        tmp_metadata_df = pd.read_csv(curr_metadata_file_name)
        scan_times = dict(zip(tmp_metadata_df["ScanNr"], tmp_metadata_df["RetentionTime"]))

        # Build the RT values as a standalone Series instead of writing a new
        # column into the groupby sub-frame, which is chained assignment and
        # can raise SettingWithCopyWarning.
        retention_times = pd.Series(
            [scan_times[x] for x in tmp_df["ScanNr"]],
            index=tmp_df.index,
            name="RT",
        )
        coefficients_dict[tmp_file_name] = polyfit(x=retention_times, y=tmp_df["iRT"])

    return coefficients_dict


def calculate_irt_spectrast_in(spectrast_in, coefficients_dict, metadata_files):
    """Annotate every PSM with its retention time and calibrated iRT.

    Parameters
    ----------
    spectrast_in
        PSM table location, forwarded to ``_read_spectrast_in`` with
        ``only_irt=False`` so all peptides are kept.
    coefficients_dict : dict
        Per-run calibration keyed by run name. Each value must expose a
        ``'polynomial'`` entry, which is passed to ``apply_polyfit``.
    metadata_files : list of str
        Paths to per-run CSV files with ``ScanNr`` and ``RetentionTime``
        columns. The file for a run is the first entry whose path contains
        the run name as a substring.

    Returns
    -------
    pandas.DataFrame
        The PSM table with added ``RT`` and ``iRT`` columns, rows ordered by
        run and re-indexed ``0..n-1``. An empty PSM table yields an empty
        frame with the same columns.

    Raises
    ------
    IndexError
        If no entry of ``metadata_files`` matches a run name.
    KeyError
        If a run has no entry in ``coefficients_dict`` or a scan number is
        missing from the run's metadata file.
    """
    df = _read_spectrast_in(spectrast_in=spectrast_in, only_irt=False)

    outs = []

    for tmp_file_name, tmp_df in df.groupby("File"):
        # NOTE(review): substring matching can pick the wrong file when one
        # run name is a prefix of another - confirm run names are unambiguous.
        curr_metadata_file_name = next(
            (x for x in metadata_files if tmp_file_name in x), None
        )
        if curr_metadata_file_name is None:
            # Same exception type as the original `[...][0]`, but it now says
            # which run is missing its metadata.
            raise IndexError(f"No metadata file found for run {tmp_file_name!r}")

        tmp_metadata_df = pd.read_csv(curr_metadata_file_name)
        scan_times = dict(zip(tmp_metadata_df["ScanNr"], tmp_metadata_df["RetentionTime"]))

        # `assign` returns a new frame, so nothing is written into the groupby
        # sub-frame (no chained assignment / SettingWithCopyWarning).
        annotated = tmp_df.assign(RT=[scan_times[x] for x in tmp_df["ScanNr"]])
        annotated = annotated.assign(
            iRT=apply_polyfit(
                annotated["RT"],
                polynomial=coefficients_dict[tmp_file_name]["polynomial"],
            )
        )
        outs.append(annotated)

    if not outs:
        # `pd.concat([])` raises ValueError; return an empty, correctly shaped
        # frame instead so downstream aggregation still works.
        return df.assign(
            RT=pd.Series(dtype=float), iRT=pd.Series(dtype=float)
        ).reset_index(drop=True)

    return pd.concat(outs).reset_index(drop=True)


coefficients_dict = calculate_irt_coefficients(
    spectrast_in=spectrast_in,
    metadata_files=metadata_files)

import json

with open("out_json_dicts.json", "w") as f:
    json.dump(
        coefficients_dict,
        fp=f,
        indent=2)

! head out_json_dicts.json

{
  "01812a_GA3-TUM_third_pool_1_01_01-2xIT_2xHCD-1h-R1": {
    "polynomial": [
      3.235674262893933,
      -78.27250749435467
    ],
    "determination": 0.9975691609710435
  },
  "01812a_GA3-TUM_third_pool_1_01_01-3xHCD-1h-R1": {
    "polynomial": [


In [10]:
# Reload the calibration saved to disk by the fitting step.
with open("./out_json_dicts.json", "r") as f:
    coefficients_dict = json.load(f)

# Annotate every PSM with its calibrated iRT, then summarise per stripped
# peptide: mean / min / max iRT and the number of PSMs behind them.
out_df = (
    calculate_irt_spectrast_in(
        spectrast_in=spectrast_in,
        coefficients_dict=coefficients_dict,
        metadata_files=metadata_files,
    )
    .groupby(["StripPeptide"])["iRT"]
    .agg(["mean", "min", "max", "count"])
    .reset_index()
)
out_df.to_csv("asdad.csv", index=False)