In [1]:
import pandas as pd
import re
import dms_tools2

### Get list of targeted, viable mutations from primer pool

In [2]:
# load targeted primer dataset
targeted_primers = pd.read_csv('hk19_primers.csv',
                              header=None).rename(columns={0:'name', 1:'primer'})

targeted_primers.tail()

Unnamed: 0,name,primer
5005,hk19-rev-mut502S,gacttcagctcaacTGACTTGATCTGGAACC
5006,hk19-rev-mut502T,gacttcagctcaacTGTCTTGATCTGGAACC
5007,hk19-rev-mut502V,acttcagctcaacCACCTTGATCTGGAAC
5008,hk19-rev-mut502W,acttcagctcaacCCACTTGATCTGGAAC
5009,hk19-rev-mut502Y,tgacttcagctcaacATACTTGATCTGGAACC


In [3]:
# set up function to pull targeted mutations from hk19_primers.csv
def get_targeted_sites(primer_name, re_primer_prefix, targeted_muts_list):
    primer_name = re_primer_prefix.sub('', primer_name) # gets rid of prefix
    site_number = int((re.findall(r'\d+', primer_name))[0]) # pull site number
    chimeric_site = site_number + 21 # converts to chimeric sequential numbering
    
    aa_mut = primer_name[-1]
    targeted_muts_list.append(str(chimeric_site)+aa_mut)
    
    return targeted_muts_list

In [4]:
# extract a list of targeted sites + aa mutations based on primer names
re_primer_prefix = re.compile(r'^(.*mut)')
targeted_muts_list = []
targeted_primers.apply(lambda x: get_targeted_sites(
    x['name'], re_primer_prefix, targeted_muts_list), axis=1)

# then, concat into df
# targeted_muts = pd.DataFrame({'site':targeted_muts_sites, 'amino_acid':targeted_muts_aa})
targeted_muts_list[:10]

['22A', '22D', '22E', '22F', '22G', '22H', '22K', '22L', '22M', '22N']

In [8]:
# convert to DF, with all labeled 'viable_mutation'
targeted_muts = pd.DataFrame({'mutation':targeted_muts_list, 'mutation_type':'viable_mutation'})

# split mutation into site and amino acid
targeted_muts['site'] = targeted_muts['mutation'].str.replace('([A-Z]+)', '').astype(int)
targeted_muts['amino_acid'] = targeted_muts['mutation'].str.extract('([A-Z]+)')
targeted_muts = targeted_muts.drop(['mutation'], axis=1)

# set column order
cols = ['site', 'amino_acid', 'mutation_type']
# targeted_muts = targeted_muts[cols]
targeted_muts = targeted_muts[cols].drop_duplicates()
targeted_muts

Unnamed: 0,site,amino_acid,mutation_type
0,22,A,viable_mutation
1,22,D,viable_mutation
2,22,E,viable_mutation
3,22,F,viable_mutation
4,22,G,viable_mutation
...,...,...,...
2500,523,S,viable_mutation
2501,523,T,viable_mutation
2502,523,V,viable_mutation
2503,523,W,viable_mutation


### Get list of single and paired epitope mutations
Note that the final csv file should list each individual mutation. So we need to take the list of epitope sites, and then expand each site into 19 possible amino acid mutations.

In [9]:
# set up function to pull epitope sites and convert to mutation list
# handles both single and paired epitopes
def get_epi_muts(epitope_df, aa_list, mutation_type):
    stripped_name_numeric = epitope_df['name'].str.findall(r'\d+').tolist()
    
    epitope_sites = []
    for entry in stripped_name_numeric:
        entry = entry[1:] # get rid of '19' from hk19
        
        # for handling single or paired epitope primers
        for site in entry:
            site=int(site)+3 # converts to chimeric sequential numbering
            if site not in epitope_sites:
                epitope_sites.append(site)
    
    
    epitope_muts = []
    for site in epitope_sites:
        for aa in aa_list:
            epitope_muts.append(str(site)+aa)
            
    # convert to DF, with all labeled 'viable_mutation'
    muts_df = pd.DataFrame({'mutation':epitope_muts, 'mutation_type':mutation_type})

    # split mutation into site and amino acid
    muts_df['site'] = muts_df['mutation'].str.replace('(\D+)', '').astype(int)
    muts_df['amino_acid'] = muts_df['mutation'].str.extract('(\D+)')
    muts_df = muts_df.drop(['mutation'], axis=1)

    # set column order
    cols = ['site', 'amino_acid', 'mutation_type']
    muts_df = muts_df[cols]
    
    return muts_df

In [10]:
# read in df of single epitope primers
single_epi_df = pd.read_csv('hk19_single_epitope_primers.csv', 
                              header=None).rename(columns={0:'name', 1:'primer'})

paired_epi_df = pd.read_csv('hk19_paired_epitope_primers.csv', 
                              header=None).rename(columns={0:'name', 1:'primer'})

In [11]:
# there's probably a biopython tool for this??
# would avoid having to incorporate dms_tools2 + updated pandas in same environment
aa_list = dms_tools2.AAS_WITHSTOP

In [13]:
single_epi_muts = get_epi_muts(single_epi_df, aa_list, 'epitope_mutation').drop_duplicates()
single_epi_muts

Unnamed: 0,site,amino_acid,mutation_type
0,111,A,epitope_mutation
1,111,C,epitope_mutation
2,111,D,epitope_mutation
3,111,E,epitope_mutation
4,111,F,epitope_mutation
...,...,...,...
457,102,T,epitope_mutation
458,102,V,epitope_mutation
459,102,W,epitope_mutation
460,102,Y,epitope_mutation


In [15]:
paired_epi_muts = get_epi_muts(paired_epi_df, aa_list, 'paired_epitope_mutation').drop_duplicates()
paired_epi_muts

Unnamed: 0,site,amino_acid,mutation_type
0,150,A,paired_epitope_mutation
1,150,C,paired_epitope_mutation
2,150,D,paired_epitope_mutation
3,150,E,paired_epitope_mutation
4,150,F,paired_epitope_mutation
...,...,...,...
247,217,T,paired_epitope_mutation
248,217,V,paired_epitope_mutation
249,217,W,paired_epitope_mutation
250,217,Y,paired_epitope_mutation


### Concat into final df

In [17]:
aggregated_mutations = pd.concat([targeted_muts, single_epi_muts, paired_epi_muts])
aggregated_mutations

Unnamed: 0,site,amino_acid,mutation_type
0,22,A,viable_mutation
1,22,D,viable_mutation
2,22,E,viable_mutation
3,22,F,viable_mutation
4,22,G,viable_mutation
...,...,...,...
247,217,T,paired_epitope_mutation
248,217,V,paired_epitope_mutation
249,217,W,paired_epitope_mutation
250,217,Y,paired_epitope_mutation


In [18]:
aggregated_mutations.to_csv('aggregated_mutations.csv', index=False)

scratch code -

In [None]:
def get_targeted_muts_df(primer_df):
    mut_sites=[]
    mut_aa=[]
    re_primer_prefix = re.compile(r'^(.*mut)')
    
    primer_name_list = primer_df['name'].str.tolist()
    for primer in primer_name_list:
        primer_name = re_primer_prefix.sub('', primer_name) # gets rid of prefix
        site_number = int((re.findall(r'\d+', primer_name))[0]) # pull site number
        mut_sites.append(site_number+21) #converts to chimeric sequential numbering

        aa_mut = primer_name[-1]
        mut_aa.append(aa_mut)
    
    primer_

# set up function to pull epitope sites and convert to mutation list
# handles both single and paired epitopes
def get_muts_df(primer_df, mut_type, aa_list):
    
    if mut_type = targeted:
        mut_sites=[]
        mut_aa=[]
        primer_name_list = primer_df['name'].str.tolist()
        for primer in primer_name_list:
            re_primer_prefix = re.compile(r'^(.*mut)')
            primer_name = re_primer_prefix.sub('', primer_name) # gets rid of prefix
            site_number = int((re.findall(r'\d+', primer_name))[0]) # pull site number
            mut_sites.append(site_number+21) #converts to chimeric sequential numbering
            
            aa_mut = primer_name[-1]
            mut_aa.append(aa_mut)
        
    else:
        numeric_site_list = primer_df['name'].str.findall(r'\d+').tolist()
        
        mut_sites=[]
        for entry in stripped_name_numeric:
            entry = entry[1:] # get rid of '19' from hk19
            
            for site in entry:
                site=int(site)+3 # converts to chimeric sequential numbering
                if site not in mut_sites:
                    mut_sites.append(site)
                    
    
    
        primer_name = re_primer_prefix.sub('', primer_name) # gets rid of prefix
    site_number = int((re.findall(r'\d+', primer_name))[0]) # pull site number
    chimeric_site = site_number + 21 # converts to chimeric sequential numbering
    targeted_muts_sites.append(str(targeted_muts_sites))
    
    aa_mut = primer_name[-1]
    targeted_muts_aa.append(aa_mut)
    
    
    # note that the first number per primer is from hk19
    stripped_name_numeric = epitope_df['name'].str.findall(r'\d+').tolist()
        
        
    mut_sites = []
    for entry in stripped_name_numeric:
        entry = entry[1:] # get rid of '19' from hk19
        
        # for handling single or paired epitope primers
        for site in entry:
            site=int(site)+3 # converts to chimeric sequential numbering
            if site not in epitope_sites:
                epitope_sites.append(site)
    
    
    epitope_muts = []
    for site in epitope_sites:
        for aa in aa_list:
            epitope_muts.append(str(site)+aa)
    
    return epitope_muts

In [None]:
# convert to df and remove duplicates from rev primers in original df
targeted_muts = pd.DataFrame({})

In [None]:
# get list of paired epitopes from these primer names
stripped_name = paired_epitope_df['name'].str.findall(r'\d+').tolist()

paired_epitopes = []
for entry in stripped_name:
    entry = entry[1:]
    for site in entry:
        site=int(site)+3 # to account for numbering shift in chimeras
        if site not in paired_epitopes:
            paired_epitopes.append(site)

paired_epitopes