FlashLFQ now enables selection of peptides to be quantified (#779)

* Added optional arg to FlashLFQ that allows users to select which peptides will be quantified/reported * added some comments * increasing test coverage * Increased test coverage * Renamed, added comment --------- Co-authored-by: Edwin Laboy <63374885+elaboy@users.noreply.github.com>
smith-chem-wisc · May 20, 2024 · 84b2e9d · 84b2e9d
1 parent a93dfa2
commit 84b2e9d
Show file tree

Hide file tree

Showing 3 changed files with 124 additions and 23 deletions.
diff --git a/mzLib/FlashLFQ/FlashLFQResults.cs b/mzLib/FlashLFQ/FlashLFQResults.cs
@@ -1,4 +1,5 @@
-using MathNet.Numerics.Statistics;
+using Easy.Common.Extensions;
+using MathNet.Numerics.Statistics;
 using System;
 using System.Collections.Generic;
 using System.IO;
@@ -13,20 +14,28 @@ public class FlashLfqResults
         public readonly Dictionary<string, Peptide> PeptideModifiedSequences;
         public readonly Dictionary<string, ProteinGroup> ProteinGroups;
         public readonly Dictionary<SpectraFileInfo, List<ChromatographicPeak>> Peaks;
+        private readonly HashSet<string> _peptideModifiedSequencesToQuantify;
 
-        public FlashLfqResults(List<SpectraFileInfo> spectraFiles, List<Identification> identifications)
+        public FlashLfqResults(List<SpectraFileInfo> spectraFiles, List<Identification> identifications, HashSet<string> peptides = null)
         {
             SpectraFiles = spectraFiles;
             PeptideModifiedSequences = new Dictionary<string, Peptide>();
             ProteinGroups = new Dictionary<string, ProteinGroup>();
             Peaks = new Dictionary<SpectraFileInfo, List<ChromatographicPeak>>();
+            if(peptides == null || !peptides.Any())
+            {
+                peptides = identifications.Select(id => id.ModifiedSequence).ToHashSet();
+            }
+            _peptideModifiedSequencesToQuantify = peptides;
 
             foreach (SpectraFileInfo file in spectraFiles)
             {
                 Peaks.Add(file, new List<ChromatographicPeak>());
             }
 
-            foreach (Identification id in identifications)
+
+            // Only quantify peptides within the set of valid peptide modified (full) sequences. This is done to enable pepitde-level FDR control of reported results
+            foreach (Identification id in identifications.Where(id => peptides.Contains(id.ModifiedSequence)))
             {
                 if (!PeptideModifiedSequences.TryGetValue(id.ModifiedSequence, out Peptide peptide))
                 {
@@ -120,6 +129,7 @@ public void CalculatePeptideResults(bool quantifyAmbiguousPeptides)
                 var groupedPeaks = filePeaks.Value
                     .Where(p => p.NumIdentificationsByFullSeq == 1)
                     .GroupBy(p => p.Identifications.First().ModifiedSequence)
+                    .Where(group => _peptideModifiedSequencesToQuantify.Contains(group.Key))
                     .ToList();
 
                 foreach (var sequenceWithPeaks in groupedPeaks)

diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs
@@ -11,6 +11,7 @@
 using System.Threading.Tasks;
 using UsefulProteomicsDatabases;
 using System.Runtime.CompilerServices;
+using Easy.Common.Extensions;
 
 [assembly: InternalsVisibleTo("TestFlashLFQ")]
 
@@ -57,6 +58,12 @@ public class FlashLfqEngine
         private Stopwatch _globalStopwatch;
         private List<Identification> _allIdentifications;
         /// <summary>
+        /// These peptides will be reported in the QuantifiedPeptides output and used for protein quant.
+        /// Other peptides may appear in the QuantifiedPeaks output, but this list is used to enable
+        /// peptide-level FDR filtering
+        /// </summary>
+        public HashSet<string> PeptidesModifiedSequencesToQuantify { get; init; }
+        /// <summary>
         /// Dictionary linking a modified sequence to a List of tuples containing
         /// the mass shifts (isotope mass - monoisotopic mass) and normalized abundances for the
         /// isotopes for a given peptide
@@ -67,6 +74,13 @@ public class FlashLfqEngine
         internal Dictionary<SpectraFileInfo, Ms1ScanInfo[]> _ms1Scans;
         internal PeakIndexingEngine _peakIndexingEngine;
 
+        /// <summary>
+        /// Create an instance of FlashLFQ that will quantify peptides based on their precursor intensity in MS1 spectra
+        /// </summary>
+        /// <param name="allIdentifications">A list of identifications corresponding to MS2 peptide detections. One ID per peptide per file</param>
+        /// <param name="integrate">Optional. Bool indicating whether peaks should be integrated before quantification. It is HIGHLY recommended this is set to FALSE</param>
+        /// <param name="peptideSequencesToUse">Optional. A list of strings corresponding to the modified sequences of peptides that should be quantified/used for
+        /// protein level quant. Reccommended use is to pass in the full sequence of every peptide at 1% peptide-level FDR</param>
         public FlashLfqEngine(
             List<Identification> allIdentifications,
             bool normalize = false,
@@ -93,6 +107,7 @@ public FlashLfqEngine(
             int mcmcBurninSteps = 1000,
             bool useSharedPeptidesForProteinQuant = false,
             bool pairedSamples = false,
+            List<string> peptideSequencesToUse = null,
             int? randomSeed = null)
         {
             Loaders.LoadElements();
@@ -128,6 +143,8 @@ public FlashLfqEngine(
             McmcSteps = mcmcSteps;
             McmcBurninSteps = mcmcBurninSteps;
             UseSharedPeptidesForProteinQuant = useSharedPeptidesForProteinQuant;
+            PeptidesModifiedSequencesToQuantify = peptideSequencesToUse.IsNotNullOrEmpty() ? new HashSet<string>(peptideSequencesToUse) 
+                : allIdentifications.Select(id => id.ModifiedSequence).ToHashSet();
             RandomSeed = randomSeed;
 
             if (MaxThreads == -1 || MaxThreads >= Environment.ProcessorCount)
@@ -149,7 +166,7 @@ public FlashLfqResults Run()
         {
             _globalStopwatch.Start();
             _ms1Scans = new Dictionary<SpectraFileInfo, Ms1ScanInfo[]>();
-            _results = new FlashLfqResults(_spectraFileInfo, _allIdentifications);
+            _results = new FlashLfqResults(_spectraFileInfo, _allIdentifications, PeptidesModifiedSequencesToQuantify);
 
             // build m/z index keys
             CalculateTheoreticalIsotopeDistributions();
@@ -588,7 +605,8 @@ private MbrScorer BuildMbrScorer(List<ChromatographicPeak> acceptorFileIdentifie
             // Construct a distribution of ppm errors for all MSMS peaks in the acceptor file
             var apexToAcceptorFilePeakDict = new Dictionary<IndexedMassSpectralPeak, ChromatographicPeak>();
             List<double> ppmErrors = new List<double>();
-            foreach (var peak in acceptorFileIdentifiedPeaks.Where(p => p.Apex != null))
+            foreach (var peak in acceptorFileIdentifiedPeaks.Where(p => p.Apex != null
+                && PeptidesModifiedSequencesToQuantify.Contains(p.Identifications.First().ModifiedSequence))) 
             {
                 if (!apexToAcceptorFilePeakDict.ContainsKey(peak.Apex.IndexedPeak))
                 {
@@ -677,6 +695,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile)
                     !p.IsMbrPeak
                     && p.NumIdentificationsByFullSeq == 1
                     && p.IsotopicEnvelopes.Any()
+                    && PeptidesModifiedSequencesToQuantify.Contains(p.Identifications.First().ModifiedSequence) // Only do MBR for peptides that we want to quantify
                     && !acceptorFileIdentifiedSequences.Contains(p.Identifications.First().ModifiedSequence)
                     && (!RequireMsmsIdInCondition || p.Identifications.Any(v => v.ProteinGroups.Any(g => thisFilesMsmsIdentifiedProteins.Contains(g))))).ToList();
 
@@ -1060,11 +1079,35 @@ private void RunErrorChecking(SpectraFileInfo spectraFile)
 
                     if (!tryPeak.IsMbrPeak && !storedPeak.IsMbrPeak)
                     {
-                        storedPeak.MergeFeatureWith(tryPeak, Integrate);
+                        if (PeptidesModifiedSequencesToQuantify.Contains(tryPeak.Identifications.First().ModifiedSequence))
+                        {
+                            if (PeptidesModifiedSequencesToQuantify.Contains(storedPeak.Identifications.First().ModifiedSequence))
+                            {
+                                storedPeak.MergeFeatureWith(tryPeak, Integrate);
+                            }
+                            else
+                            {
+                                // If the stored peak id isn't in the list of peptides to quantify, overwrite it
+                                errorCheckedPeaksGroupedByApex[tryPeak.Apex.IndexedPeak] = tryPeak;
+                            }
+                        }
+                        else
+                        {
+                            continue; // if the tryPeak id isn't in the list of peptides to quantify, it is discarded
+                        }
+
                     }
                     else if (tryPeak.IsMbrPeak && !storedPeak.IsMbrPeak)
                     {
-                        continue;
+                        if(PeptidesModifiedSequencesToQuantify.Contains(storedPeak.Identifications.First().ModifiedSequence))
+                        {
+                            continue;
+                        }
+                        else
+                        {
+                            // If the stored peak id isn't in the list of peptides to quantify, overwrite it
+                            errorCheckedPeaksGroupedByApex[tryPeak.Apex.IndexedPeak] = tryPeak;
+                        }
                     }
                     else if (tryPeak.IsMbrPeak && storedPeak.IsMbrPeak)
                     {

diff --git a/mzLib/TestFlashLFQ/TestFlashLFQ.cs b/mzLib/TestFlashLFQ/TestFlashLFQ.cs
@@ -15,6 +15,7 @@
 using UsefulProteomicsDatabases;
 using ChromatographicPeak = FlashLFQ.ChromatographicPeak;
 using Stopwatch = System.Diagnostics.Stopwatch;
+using TopDownProteomics;
 
 namespace Test
 {
@@ -475,23 +476,39 @@ public static void TestFlashLfqMergeResults()
             Assert.AreEqual(4, resultsA.SpectraFiles.Count);
         }
 
+
+        /// <summary>
+        /// This test MatchBetweenRuns by creating two fake mzML files and a list of fake IDs. 
+        /// There are multiple sets of IDs, where most are shared between the two runs but one+ is/are missing
+        /// MBR is tested by ensuring that IDs are transferred between runs
+        /// </summary>
         [Test]
         public static void TestFlashLfqMatchBetweenRuns()
         {
             List<string> filesToWrite = new List<string> { "mzml_1", "mzml_2" };
-            List<string> pepSequences = new List<string> { "PEPTIDE", "PEPTIDEV", "PEPTIDEVV", "PEPTIDEVVV", "PEPTIDEVVVV", "PEPTIDEVVVVA", "PEPTIDEVVVVAA" };
+            List<string> pepSequences = new List<string> 
+                { 
+                "PEPTIDE", 
+                "PEPTIDEV", 
+                "PEPTIDEVV", 
+                "TARGETPEP",
+                "PEPTIDEVVV",
+                "PEPTIDEVVVV", 
+                "PEPTIDEVVVVA", 
+                "PEPTIDEVVVVAA"
+            };
             double intensity = 1e6;
 
-            double[] file1Rt = new double[] { 1.01, 1.02, 1.03, 1.035, 1.04, 1.045, 1.05 };
-            double[] file2Rt = new double[] { 1.00, 1.025, 1.03, 1.035, 1.04, 1.055, 1.07 };
+            double[] file1Rt = new double[] { 1.01, 1.02, 1.03, 1.033, 1.035, 1.04, 1.045, 1.05 };
+            double[] file2Rt = new double[] { 1.00, 1.025, 1.03, 1.031, 1.035, 1.04, 1.055, 1.07 };
 
             Loaders.LoadElements();
 
             // generate mzml files (5 peptides each)
             for (int f = 0; f < filesToWrite.Count; f++)
             {
                 // 1 MS1 scan per peptide
-                MsDataScan[] scans = new MsDataScan[7];
+                MsDataScan[] scans = new MsDataScan[8];
 
                 for (int p = 0; p < pepSequences.Count; p++)
                 {
@@ -533,39 +550,53 @@ public static void TestFlashLfqMatchBetweenRuns()
             Identification id3 = new Identification(file1, "PEPTIDEVV", "PEPTIDEVV",
                 new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVV").MonoisotopicMass, file1Rt[2] + 0.001, 1, new List<ProteinGroup> { pg });
             Identification id4 = new Identification(file1, "PEPTIDEVVV", "PEPTIDEVVV",
-                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file1Rt[3] + 0.001, 1, new List<ProteinGroup> { pg });
+                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file1Rt[4] + 0.001, 1, new List<ProteinGroup> { pg });
             Identification id5 = new Identification(file1, "PEPTIDEVVVV", "PEPTIDEVVVV",
-                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file1Rt[4] + 0.001, 1, new List<ProteinGroup> { pg });
+                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file1Rt[5] + 0.001, 1, new List<ProteinGroup> { pg });
 
             Identification id6 = new Identification(file2, "PEPTIDE", "PEPTIDE",
                 new Proteomics.AminoAcidPolymer.Peptide("PEPTIDE").MonoisotopicMass, file2Rt[0] + 0.001, 1, new List<ProteinGroup> { pg });
             Identification id7 = new Identification(file2, "PEPTIDEV", "PEPTIDEV",
                 new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEV").MonoisotopicMass, file2Rt[1] + 0.001, 1, new List<ProteinGroup> { pg });
-            // missing ID 8 - MBR feature
+            // missing ID 8 - MBR feature - "PEPTIDEVV"
+
             Identification id9 = new Identification(file2, "PEPTIDEVVV", "PEPTIDEVVV",
-                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file2Rt[3] + 0.001, 1, new List<ProteinGroup> { pg });
+                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file2Rt[4] + 0.001, 1, new List<ProteinGroup> { pg });
             Identification id10 = new Identification(file2, "PEPTIDEVVVV", "PEPTIDEVVVV",
-                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file2Rt[4] + 0.001, 1, new List<ProteinGroup> { pg });
+                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file2Rt[5] + 0.001, 1, new List<ProteinGroup> { pg });
 
             // Adding additional peaks to check interquartile range
             Identification id11 = new Identification(file1, "PEPTIDEVVVVA", "PEPTIDEVVVVA",
-                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVA").MonoisotopicMass, file1Rt[5] + 0.001, 1, new List<ProteinGroup> { pg });
+                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVA").MonoisotopicMass, file1Rt[6] + 0.001, 1, new List<ProteinGroup> { pg });
             Identification id12 = new Identification(file1, "PEPTIDEVVVVAA", "PEPTIDEVVVVAA",
-                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVAA").MonoisotopicMass, file1Rt[6] + 0.001, 1, new List<ProteinGroup> { pg });
-
+                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVAA").MonoisotopicMass, file1Rt[7] + 0.001, 1, new List<ProteinGroup> { pg });
             Identification id13 = new Identification(file2, "PEPTIDEVVVVA", "PEPTIDEVVVVA",
-                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVA").MonoisotopicMass, file2Rt[5] + 0.001, 1, new List<ProteinGroup> { pg });
+                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVA").MonoisotopicMass, file2Rt[6] + 0.001, 1, new List<ProteinGroup> { pg });
             Identification id14 = new Identification(file2, "PEPTIDEVVVVAA", "PEPTIDEVVVVAA",
-                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVAA").MonoisotopicMass, file2Rt[6] + 0.001, 1, new List<ProteinGroup> { pg });
+                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVAA").MonoisotopicMass, file2Rt[7] + 0.001, 1, new List<ProteinGroup> { pg });
 
+            // Additional peaks, check that a non-confident ID (i.e., one that isn't included in the peptides to quantify list) is overwritten
+            // by a MBR transfer of a more confident ID
+            Identification id15 = new Identification(file1, "TARGETPEP", "TARGETPEP",
+                new Proteomics.AminoAcidPolymer.Peptide("TARGETPEP").MonoisotopicMass, file1Rt[3] + 0.001, 1, new List<ProteinGroup> { pg });
+            Identification id16 = new Identification(file2, "DECOYPEP", "DECOYPEP",
+                new Proteomics.AminoAcidPolymer.Peptide("TARGETPEP").MonoisotopicMass, file2Rt[3] + 0.001, 1, new List<ProteinGroup> { pg });
+
+            // Additional peaks to ensure that non-confident IDs aren't merged with confident IDs at the peak level
+            Identification id17 = new Identification(file1, "DECOYPEP", "DECOYPEP",
+                new Proteomics.AminoAcidPolymer.Peptide("TARGETPEP").MonoisotopicMass, file1Rt[3] + 0.001, 1, new List<ProteinGroup> { pg });
+            Identification id18 = new Identification(file1, "DECOYPEP2", "DECOYPEP2",
+                new Proteomics.AminoAcidPolymer.Peptide("TARGETPEP").MonoisotopicMass, file1Rt[3] + 0.001, 1, new List<ProteinGroup> { pg });
 
             // create the FlashLFQ engine
             FlashLfqEngine engine = new FlashLfqEngine(new List<Identification> { id1, id2, id3, id4, id5, id6, id7, id9, id10 }, matchBetweenRuns: true);
             FlashLfqEngine interquartileEngine = new FlashLfqEngine(
                 new List<Identification> { id1, id2, id3, id4, id5, id11, id12, id6, id7, id9, id10, id13, id14 }, matchBetweenRuns: true);
+            FlashLfqEngine engineAmbiguous = new FlashLfqEngine(new List<Identification> { id1, id2, id3, id4, id5, id6, id7, id9, id10, id18, id15, id16, id17 }, matchBetweenRuns: true,
+                peptideSequencesToUse: pepSequences);
 
 
-            // run the engine
+            //run the engine
             var results = engine.Run();
 
             Assert.That(results.Peaks[file2].Count == 5);
@@ -606,6 +637,15 @@ public static void TestFlashLfqMatchBetweenRuns()
             Assert.That(!peak.RtStdDev.HasValue);
             Assert.That(peak.RtInterquartileRange.HasValue);
             Assert.That(peak.RtInterquartileRange, Is.EqualTo(rtDiffs.InterquartileRange()).Within(0.01));
+
+            // The ambiguous engine tests that a non-confident ID (i.e., a PSM that didn't make the peptide level fdr cutoff) 
+            // gets overwritten by a MBR transfer of a confident ID, and that non-confident IDs are overwriteen by confident MS2 ids
+            results = engineAmbiguous.Run();
+            Assert.False(results.PeptideModifiedSequences.Select(kvp => kvp.Key).Contains("DECOYPEP"));
+            Assert.False(results.Peaks[file1].Any(peak => peak.Identifications.Any(id => id.ModifiedSequence.Contains("DECOYPEP"))));
+            Assert.That(results.Peaks[file2].Any(peak => peak.Identifications.First().ModifiedSequence == "TARGETPEP"));
+            Assert.AreEqual(results.Peaks[file2].Count(peak => peak.IsMbrPeak), 2);
+
         }
 
         [Test]
@@ -1382,6 +1422,13 @@ public static void RealDataMbrTest()
             {
                 Assert.That(results.ProteinGroups[protein.ProteinGroupName].GetIntensity(f1r2) == 0);
             }
+
+            List<string> peptidesToUse = ids.Select(id => id.ModifiedSequence).Take(400).Distinct().ToList();
+            engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: true, maxThreads: 1, peptideSequencesToUse: peptidesToUse);
+            results = engine.Run();
+            var test = results.PeptideModifiedSequences.Select(kvp => !peptidesToUse.Contains(kvp.Key)).ToList();
+
+            CollectionAssert.AreEquivalent(results.PeptideModifiedSequences.Select(kvp => kvp.Key), peptidesToUse);
         }
 
         [Test]
@@ -1620,7 +1667,8 @@ public static void TestAmbiguousFraction()
             peak1.CalculateIntensityForThisFeature(false);
             peak2.CalculateIntensityForThisFeature(false);
 
-            FlashLfqResults res = new FlashLfqResults(new List<SpectraFileInfo> { fraction1, fraction2 }, new List<Identification> { id1, id2, id3 });
+            FlashLfqResults res = new FlashLfqResults(new List<SpectraFileInfo> { fraction1, fraction2 }, new List<Identification> { id1, id2, id3 },
+                new HashSet<string> { "peptide1", "peptide2"});
             res.Peaks[fraction1].Add(peak1);
             res.Peaks[fraction2].Add(peak2);
             res.CalculatePeptideResults(quantifyAmbiguousPeptides: false);