In [1]:
import itertools
import os

import numpy as np
import pandas as pd

import sys          
# add ../ms_processing to sys.path so that ms_functions_and_defs can be imported
sys.path.append('../ms_processing')
import ms_functions_and_defs as msf

In [2]:
# --- Configuration -----------------------------------------------------------
# Number of amino acids drawn (with replacement) for each combination.
no_of_residues = 5

# Folder in which the generated combination tables are cached as CSV files.
csv_save_folder = 'combinations_csvs'

In [3]:
# Elemental composition (atom counts) of each amino acid, keyed by its
# three-letter code. Every entry carries the same element keys in the same
# order (C, H, N, O, S) so that `elements` below can be read off any entry.
aa_formula_dict = {
    'Ala': {'C':  3,'H':  7,'N': 1,'O': 2,'S':0},
    'Arg': {'C':  6,'H': 14,'N': 4,'O': 2,'S':0},
    # Asparagine is C4H8N2O3 (the oxygen count was previously entered as 0).
    'Asn': {'C':  4,'H':  8,'N': 2,'O': 3,'S':0},
    'Asp': {'C':  4,'H':  7,'N': 1,'O': 4,'S':0},
    'Cys': {'C':  3,'H':  7,'N': 1,'O': 2,'S':1},
    'Gln': {'C':  5,'H': 10,'N': 2,'O': 3,'S':0},
    'Glu': {'C':  5,'H':  9,'N': 1,'O': 4,'S':0},
    'Gly': {'C':  2,'H':  5,'N': 1,'O': 2,'S':0},
    'His': {'C':  6,'H':  9,'N': 3,'O': 2,'S':0},
    # Ile is deliberately omitted: it has the same formula as Leu
    # (C6H13NO2) and therefore the same mass, so the two are
    # indistinguishable here. Leu stands in for both.
    'Leu': {'C':  6,'H': 13,'N': 1,'O': 2,'S':0},
    'Lys': {'C':  6,'H': 14,'N': 2,'O': 2,'S':0},
    'Met': {'C':  5,'H': 11,'N': 1,'O': 2,'S':1},
    'Phe': {'C':  9,'H': 11,'N': 1,'O': 2,'S':0},
    'Pro': {'C':  5,'H':  9,'N': 1,'O': 2,'S':0},
    'Ser': {'C':  3,'H':  7,'N': 1,'O': 3,'S':0},
    'Thr': {'C':  4,'H':  9,'N': 1,'O': 3,'S':0},
    'Trp': {'C': 11,'H': 12,'N': 2,'O': 2,'S':0},
    'Tyr': {'C':  9,'H': 11,'N': 1,'O': 3,'S':0},
    'Val': {'C':  5,'H': 11,'N': 1,'O': 2,'S':0},
}

# Ordered list of amino-acid codes; this order defines the column order of
# every table derived from the dictionary.
aas = list(aa_formula_dict.keys())
# Ordered list of element symbols, taken from the first entry.
elements = list(aa_formula_dict[aas[0]].keys())

In [4]:
# Mass of every amino acid, obtained by passing its elemental formula to
# msf.calc_mass with mode='monoisotopic'. Keys follow the order of `aas`.
aa_mass_dict = {
    code: msf.calc_mass(aa_formula_dict[code], mode='monoisotopic')
    for code in aas
}

In [5]:
# Build (or load from cache) a table with one row per multiset of
# `no_of_residues` amino acids. Columns, in order:
#   - one count column per amino acid in `aas`
#   - one total-atom-count column per element in `elements`
#   - one '<element>/C' ratio column for every element other than carbon
#   - 'mass': sum over amino acids of count * aa_mass_dict[aa]
#
# NOTE(review): a cached CSV is trusted as-is. Delete it after changing the
# formula/mass tables or this computation so that it is regenerated.
# NOTE(review): 'mass' is the plain sum of the per-amino-acid masses; no
# correction (e.g. for water lost on peptide-bond formation) is applied here.
# Confirm that this is what downstream code expects.
csv_file_name = f'combinations_of_{no_of_residues}_aas.csv'
csv_path = os.path.join(csv_save_folder, csv_file_name)

# Make sure the cache folder exists so that neither the lookup nor the
# final to_csv fails on a fresh checkout.
os.makedirs(csv_save_folder, exist_ok=True)

if os.path.isfile(csv_path):

    combinations_df = pd.read_csv(csv_path)

else:

    combinations_list = list(itertools.combinations_with_replacement(aas, r=no_of_residues))

    # (n_combinations x n_amino_acids) matrix of how often each amino acid
    # occurs in each combination. Built in one go instead of appending rows
    # to a DataFrame one at a time.
    count_matrix = np.array(
        [[combo.count(aa) for aa in aas] for combo in combinations_list],
        dtype=np.int64,
    )
    combinations_df = pd.DataFrame(count_matrix, columns=aas)

    # (n_amino_acids x n_elements) matrix of atom counts per amino acid.
    formula_matrix = np.array(
        [[aa_formula_dict[aa][e] for e in elements] for aa in aas],
        dtype=np.int64,
    )

    # Total atoms of each element per combination.
    element_totals = count_matrix @ formula_matrix
    for k, e in enumerate(elements):
        combinations_df[e] = element_totals[:, k]

    # Element-to-carbon ratios for every element except carbon itself.
    elements_minus_C = [e for e in elements if e != 'C']
    for e in elements_minus_C:
        combinations_df[f'{e}/C'] = combinations_df[e].to_numpy() / combinations_df['C'].to_numpy()

    # Mass uses ONLY the amino-acid count columns. Summing across the whole
    # frame would also add the element totals and ratio columns to the mass.
    aa_masses = np.array([aa_mass_dict[aa] for aa in aas], dtype=float)
    combinations_df['mass'] = count_matrix.astype(float) @ aa_masses

    combinations_df.to_csv(csv_path, index=False)