<div style="display: flex; align-items: center;">
  <div style="flex: 1; max-width: 70%;">
    MSCI is a Python package designed to evaluate the information content of peptide fragmentation spectra. Our objective was to compute an information-content index for all peptides within a given proteome. This would allow us to devise data acquisition and analysis strategies that generate and prioritize the most informative fragment ions for peptide quantification.
  </div>
  <img src="https://github.com/proteomicsunitcrg/MSCI/raw/main/docs/MSCI_logo.png" width="300" style="margin-left: 20px;">
</div>


#Download MSCI package and necessary installations

In [1]:
!git clone https://github.com/proteomicsunitcrg/MSCI.git
! pip install matchms
# do not restart session
%cd MSCI
import sys
sys.path.append('/content/MSCI')


Cloning into 'MSCI'...
remote: Enumerating objects: 279, done.[K
remote: Counting objects: 100% (279/279), done.[K
remote: Compressing objects: 100% (264/264), done.[K
remote: Total 279 (delta 156), reused 53 (delta 3), pack-reused 0[K
Receiving objects: 100% (279/279), 1.59 MiB | 4.17 MiB/s, done.
Resolving deltas: 100% (156/156), done.
Collecting matchms
  Downloading matchms-0.27.0-py3-none-any.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting deprecated>=1.2.14 (from matchms)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting pickydict>=0.4.0 (from matchms)
  Downloading pickydict-0.4.0-py3-none-any.whl (6.1 kB)
Collecting pillow!=9.4.0 (from matchms)
  Downloading pillow-10.4.0-cp310-cp310-manylinux_2_28_x86_64.whl (4.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecti

/content/MSCI


# Import


In [2]:
from MSCI.Preprocessing.Koina import PeptideProcessor
from MSCI.Grouping_MS1.Grouping_mw_irt import process_peptide_combinations
from MSCI.Preprocessing.read_msp_file import read_msp_file
from MSCI.Similarity.spectral_angle_similarity import process_spectra_pairs
from matchms.importing import load_from_msp
import random
import numpy as np
import pandas as pd


# Genrate predicted library

## Download the list of peptides of interest


In [3]:
# List of standard amino acids
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

# Function to generate a single tryptic peptide
def generate_tryptic_peptide(min_length=8, max_length=20):
    length = random.randint(min_length, max_length - 1)
    peptide = ''.join(random.choices(amino_acids, k=length))
    peptide += random.choice('KR')
    return peptide

# Generate a list of 100 random tryptic peptides
tryptic_peptides = [generate_tryptic_peptide() for _ in range(100)]

# Optional: Ensure the last peptide does not necessarily end with K or R
last_peptide_length = random.randint(5, 20)
last_peptide = ''.join(random.choices(amino_acids, k=last_peptide_length))
tryptic_peptides[-1] = last_peptide

# Save the peptides to a text file
with open('random_tryptic_peptides.txt', 'w') as f:
    for peptide in tryptic_peptides:
        f.write(f"{peptide}\n")

print("Generated 100 random tryptic peptides and saved to 'random_tryptic_peptides.txt'.")


Generated 100 random tryptic peptides and saved to 'random_tryptic_peptides.txt'.


## Predict with Koina

In [4]:
# Usage example
processor = PeptideProcessor(
    input_file="random_tryptic_peptides.txt",
    collision_energy=30,
    charge=1,
    model_intensity="Prosit_2020_intensity_HCD",
    model_irt="Prosit_2019_irt"
)

processor.process('output.msp')

# Load dataset


In [None]:
# You can use your own spectra
File= 'output.msp'
spectra = list(load_from_msp(File))

# Group within MS1 tolerance

In [11]:
process_peptide_combinations

tolerance1 = 5
tolerance2 = 50


mz_irt_df = read_msp_file(File)
Groups_df = process_peptide_combinations(mz_irt_df, tolerance1, tolerance2, use_ppm=False)

Groups_df

Unnamed: 0,index1,index2,peptide 1,peptide 2,m/z 1,m/z 2,iRT 1,iRT 2
0,23,64,GSLRHIEKTK/1,DLFRHEKVK/1,1168.679682,1171.658218,-14.610874,4.206432
1,24,81,PYFGMKATGFDK/1,TQIPSMTKVVKK/1,1361.655837,1359.802835,60.071228,24.020927
2,12,20,CLCDMEKHWQQFDWKVWDK/1,FACEPPFSYGIFKYHWPLYR/1,2525.119906,2521.216173,107.217804,134.617645
3,53,84,NLEIVNAMK/1,KICVICPEK/1,1031.555392,1032.558036,69.312454,25.48625
4,31,32,ILEFLRYPILGLVR/1,MWLTCCCFILNTEK/1,1702.041425,1704.761641,160.060791,145.835297
5,11,95,KSYYHCAIHAMMYK/1,FPAYGAFSEPKADADR/1,1745.796053,1741.818027,33.251549,58.834625
6,46,60,FMWACGGHMHKYK/1,PVPEWNMNWHQR/1,1595.706844,1593.737942,30.2792,73.798477
7,8,10,WQCDMRIVEYWK/1,VVCPMFLDRIAEMK/1,1656.766132,1651.836854,101.917091,122.498734
8,69,77,WCGAWCIRSVPRKTQYKKR/1,HKNSKNVNQHLAHNGYFTVR/1,2366.248493,2364.20682,28.35783,5.662121
9,15,75,TIWACWQYDADK/1,MRAEDGFDSWMR/1,1499.662379,1500.635845,99.425255,77.851349


# Calculate similarity within fragment tolerance


In [12]:
Groups_df.columns = Groups_df.columns.str.strip()
index_array = Groups_df[['index1','index2']].values.astype(int)
result = process_spectra_pairs(index_array, spectra,  mz_irt_df, tolerance =0, ppm=10)
result

Unnamed: 0,index1,index2,peptide 1,peptide 2,m/z 1,m/z 2,iRT 1,iRT 2,similarity_score
0,23,64,GSLRHIEKTK/1,DLFRHEKVK/1,1168.679682,1171.658218,-14.610874,4.206432,3e-05
1,24,81,PYFGMKATGFDK/1,TQIPSMTKVVKK/1,1361.655837,1359.802835,60.071228,24.020927,0.0
2,12,20,CLCDMEKHWQQFDWKVWDK/1,FACEPPFSYGIFKYHWPLYR/1,2525.119906,2521.216173,107.217804,134.617645,0.0
3,53,84,NLEIVNAMK/1,KICVICPEK/1,1031.555392,1032.558036,69.312454,25.48625,0.0
4,31,32,ILEFLRYPILGLVR/1,MWLTCCCFILNTEK/1,1702.041425,1704.761641,160.060791,145.835297,0.0
5,11,95,KSYYHCAIHAMMYK/1,FPAYGAFSEPKADADR/1,1745.796053,1741.818027,33.251549,58.834625,0.00018
6,46,60,FMWACGGHMHKYK/1,PVPEWNMNWHQR/1,1595.706844,1593.737942,30.2792,73.798477,0.0
7,8,10,WQCDMRIVEYWK/1,VVCPMFLDRIAEMK/1,1656.766132,1651.836854,101.917091,122.498734,0.0
8,69,77,WCGAWCIRSVPRKTQYKKR/1,HKNSKNVNQHLAHNGYFTVR/1,2366.248493,2364.20682,28.35783,5.662121,0.013045
9,15,75,TIWACWQYDADK/1,MRAEDGFDSWMR/1,1499.662379,1500.635845,99.425255,77.851349,0.0


# Plot results