# Set working dir

In [2]:
import os 
import warnings
import platform

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Choose the repository root for the current host: the local checkout when the
# platform string mentions macOS, otherwise the shared path used on Fiji.
running_on_mac = "macOS" in platform.platform()
repo_root = (
    "/Users/erjo3868/repos/hypedsearch/hypedsearch"
    if running_on_mac
    else "/scratch/Shares/layer/hypedsearch/hypedsearch"  # Running on Fiji
)
# All relative paths used later in the notebook resolve against this directory.
os.chdir(repo_root)


# Index protein-product ion DB

In [None]:
%%time
from src.lookups.protein_product_ion_db import load_existing_protein_product_ion_db
import os
db_path = "dbs/Uniprot_mouse.fasta_max_k=30_charges=(1, 2, 3).db"
bytes_per_gib = 1024 ** 3

# On-disk size of the DB file before the index is created.
size_gb = os.path.getsize(db_path) / bytes_per_gib
print(f"DB size BEFORE indexing = {size_gb}")

# Open the existing DB and ask it to create the index on product-ion mass.
db = load_existing_protein_product_ion_db(db_path=db_path)
db.create_index_on_product_ion_mass()

# Measure the same file again so the two printed sizes can be compared.
size_gb = os.path.getsize(db_path) / bytes_per_gib
print(f"DB size AFTER indexing = {size_gb}")

DB size BEFORE indexing = 56.392478942871094


# Create protein-product ion DBs

In [None]:
import pandas as pd
from src.erik import load_comet_data
from src.constants import PRODUCT_ION_TABLE
from src.fasta_utils import get_proteins_from_fasta, get_specific_protein_from_fasta
from src.lookups.protein_product_ion_db import create_protein_product_ion_db, get_average_mass_search_time
from experiments.ions_per_mass import main

# Paths are relative to the working directory chosen at the top of the notebook.
# Output: the small protein-product ion DB created in the next cell.
db_path = "dbs/test.proteinproduction.db"
# Input: the FASTA file the proteins are read from.
fasta_path = "fastas/Uniprot_mouse.fasta"

In [None]:
# Keep only the first 10 proteins from the FASTA so the test DB stays small.
proteins = list(get_proteins_from_fasta(fasta_path=fasta_path))[:10]
protein_seqs = [entry.sequence for entry in proteins]

# Single-protein alternative, kept for reference:
# protein_name = "sp|P99027|RLA2_MOUSE"
# protein = get_specific_protein_from_fasta(fasta_path=fasta_path, protein_name=protein_name)

# Create the DB at db_path from those sequences, for charges 1 and 2 and a
# maximum k-mer length of 30.
db = create_protein_product_ion_db(
    db_path=db_path,
    protein_seqs=protein_seqs,
    charges_to_consider=[1, 2],
    max_kmer_len=30,
)

In [5]:
# Run the ions-per-mass experiment against the DB at db_path with a sample of
# 10 and a 10 ppm tolerance; output_dir=None is passed as the output directory.
df = main(
    db_path=db_path,
    sample_size=10,
    ppm_tolerance=10,
    output_dir=None,
)
# Last expression of the cell, so the notebook renders the returned table.
df

Generating random sample...
Finished generating random sample
Ion 1 of 10
Ion 2 of 10
Ion 3 of 10
Ion 4 of 10
Ion 5 of 10
Ion 6 of 10
Ion 7 of 10
Ion 8 of 10
Ion 9 of 10
Ion 10 of 10


Unnamed: 0,mass,time,num_matching_ions,len_search_ion,num_matching_ions_by_charge
0,2527.323138,0.823784,24,23,{1: 24}
1,982.41123,0.417797,8,8,"{1: 7, 2: 1}"
2,1631.340411,0.20817,18,28,{2: 18}
3,3724.859914,0.415554,3,30,{1: 3}
4,1719.840417,0.817771,19,15,"{2: 5, 1: 14}"
5,1868.837106,1.221565,10,16,{1: 10}
6,50.541483,0.819681,310,1,{2: 310}
7,3390.959707,0.807122,2,30,{1: 2}
8,487.248152,1.229158,24,8,"{1: 15, 2: 9}"
9,455.738771,0.832291,11,8,{2: 11}


In [None]:
# Point db_path at the full mouse DB (k up to 30, charges 1-3 per the file
# name) and repeat the ions-per-mass experiment with the same settings.
db_path = "dbs/Uniprot_mouse.fasta_max_k=30_charges=(1, 2, 3).db"
df = main(
    db_path=db_path,
    sample_size=10,
    ppm_tolerance=10,
    output_dir=None,
)

Generating random sample...


# How long does it take to query the protein-product ion DB by mass? 

In [2]:
from src.lookups.protein_product_ion_db import load_existing_protein_product_ion_db

# Open the already-built full mouse DB.
db_path = "dbs/Uniprot_mouse.fasta_max_k=30_charges=(1, 2, 3).db"
db = load_existing_protein_product_ion_db(
    db_path=db_path,
)
# Disabled random-sample query, kept for reference.
# NOTE(review): MASS and sample_size are not defined in the cells visible here;
# define or import them before re-enabling these lines.
# query = f"SELECT {MASS} FROM {PRODUCT_ION_TABLE} ORDER BY RANDOM() LIMIT {sample_size};"
# db.run_query(query=query)

In [None]:
# Show what get_charges_in_db() returns for the loaded DB; as the last
# expression of the cell, the result is rendered by the notebook.
db.get_charges_in_db()

In [None]:
# Run the ions-per-mass experiment against the DB at db_path with a sample of
# 10 and a 10 ppm tolerance; output_dir=None is passed as the output directory.
df = main(
    db_path=db_path,
    sample_size=10,
    ppm_tolerance=10,
    output_dir=None,
)

# Get all k-mers in FASTA

In [None]:
# Imports
from src.erik import generate_kmers
from src.constants import MOUSE_PROTEOME
from src.fasta_utils import get_proteins_from_fasta
from tqdm import tqdm
from Bio import SeqIO
import numpy as np

In [None]:
# Read every protein in the mouse proteome FASTA.
proteins = list(get_proteins_from_fasta(fasta_path=MOUSE_PROTEOME))

max_k = 30
uniq_kmers = set()
for protein in tqdm(proteins):
    # The only statement in this loop body had been commented out, which left
    # the `for` without a suite and made the whole cell fail to compile.
    # Restored here (as a set comprehension) so the cell runs again.
    # This is slow: the recorded progress bar shows roughly 9 minutes for
    # about half of the proteins.
    kmers = {kmer.seq for kmer in generate_kmers(peptide=protein.sequence, max_k=max_k)}
    # NOTE(review): merging into uniq_kmers is the presumed intent, since the
    # set is initialised above but was never filled - confirm.
    uniq_kmers.update(kmers)


 49%|████▉     | 10644/21759 [08:57<05:11, 35.71it/s]  

The loop above was really slow: it was only about 49% done after roughly 9 minutes.
Let's try the approach ChatGPT suggested instead.

In [21]:
from Bio import SeqIO
from src.constants import MOUSE_PROTEOME

# NOTE(review): `proteins` is not used by the rest of this cell;
# get_unique_kmers below reads the FASTA file itself. Kept in case later cells
# rely on it.
proteins = list(get_proteins_from_fasta(fasta_path=MOUSE_PROTEOME))

def get_unique_kmers(fasta_file, k):
    """Return the set of distinct length-``k`` substrings in a FASTA file.

    Args:
        fasta_file: FASTA input handed to ``SeqIO.parse(..., "fasta")``.
        k: Window length. It is not validated here; callers are expected to
            pass a positive integer.

    Returns:
        A ``set`` of strings, one per distinct window found across all
        records. A record shorter than ``k`` contributes nothing.
    """
    seen = set()

    # Records are consumed one at a time from the parser.
    for record in SeqIO.parse(fasta_file, "fasta"):
        residues = str(record.seq)
        # Number of window start positions; zero or negative gives an empty range.
        n_windows = len(residues) - k + 1
        seen.update(residues[start:start + k] for start in range(n_windows))

    return seen

# One set of unique k-mers per window length k = 1..30, keyed by k.
# The keys are NumPy integers because they come from np.arange.
# Each call goes through get_unique_kmers, which parses the FASTA file again.
k_values = np.arange(1, 31)
uniq_kmers = {}
for k in tqdm(k_values):
    uniq_kmers[k] = get_unique_kmers(fasta_file=MOUSE_PROTEOME, k=k)



100%|██████████| 30/30 [01:21<00:00,  2.71s/it]


In [23]:
# NOTE(review): cum_sum is initialised but not used in the cells visible here.
cum_sum = {}

# Peek at the first (k, k-mers) entry of uniq_kmers.
# The tuple used to sit inside a `for ... break` loop, where it was evaluated
# and thrown away: the notebook only renders a cell's last top-level
# expression. Taking the first item directly and ending the cell with the
# tuple makes the (k, count) pair actually display.
# Raises StopIteration if uniq_kmers is empty.
k, kmers = next(iter(uniq_kmers.items()))
k, len(kmers)

(1, 24)