In [1]:
import pandas as pd

pull dataframe listing all possible mutations, whether they're targeted or not, and with mutation type defined. This was originally generated in `barcoded_h3_dms/mutation_distribution.ipynb`.

In [4]:
mut_counts_full = pd.read_csv('mut_counts_epitope_labels.csv')
mut_counts_full

Unnamed: 0,library,sample,mutation,mut_count,mutation_type,site,epitope,stripped_mut,targeted
0,libA_rep1,barcoded variants,Y178S,206,nonsynonymous,178,paired_epitope,178S,targeted
1,libA_rep1,barcoded variants,T179S,195,nonsynonymous,179,paired_epitope,179S,targeted
2,libA_rep1,barcoded variants,T179K,193,nonsynonymous,179,paired_epitope,179K,targeted
3,libA_rep1,barcoded variants,T179R,187,nonsynonymous,179,paired_epitope,179R,targeted
4,libA_rep1,barcoded variants,I211S,162,nonsynonymous,211,paired_epitope,211S,targeted
...,...,...,...,...,...,...,...,...,...
45675,libB_rep2,barcoded variants,Y530R,0,nonsynonymous,530,other,530R,off-target
45676,libB_rep2,barcoded variants,Y530S,0,nonsynonymous,530,other,530S,off-target
45677,libB_rep2,barcoded variants,Y530T,0,nonsynonymous,530,other,530T,off-target
45678,libB_rep2,barcoded variants,Y530V,0,nonsynonymous,530,other,530V,off-target


trim to just the mutations targeted by primers from a single library.

In [50]:
mut_counts = mut_counts_full.loc[(mut_counts_full['library']=='libA_rep1') & 
                                 (mut_counts_full['targeted'] == 'targeted')]

mut_counts.tail()

Unnamed: 0,library,sample,mutation,mut_count,mutation_type,site,epitope,stripped_mut,targeted
3432,libA_rep1,barcoded variants,I233D,1,nonsynonymous,233,other,233D,targeted
3572,libA_rep1,barcoded variants,S133G,1,nonsynonymous,133,other,133G,targeted
3573,libA_rep1,barcoded variants,S133V,1,nonsynonymous,133,other,133V,targeted
3608,libA_rep1,barcoded variants,T49G,1,nonsynonymous,49,other,49G,targeted
10295,libA_rep1,barcoded variants,T31L,0,nonsynonymous,31,other,31L,targeted


pull just the column defining site/AA, and the column describing type of mutation. Convert to df needed for analysis.

In [80]:
aggregated_muts = mut_counts[['stripped_mut', 'epitope']]

# split 'stripped_mut' into site and aa columns
aggregated_muts[['site', 'amino_acid']] = aggregated_muts['stripped_mut'].str.extract('(\d+\.?\d+)([a-zA-Z*]+)',
                                                                                      expand=True)

# convert 'epitope' column into appropriate mutation type labels
mutation_type_conversion = {'paired_epitope':'paired_epitope_mutation', 
                            'epitope':'epitope_mutation',
                            'other':'viable_mutation'}

aggregated_muts['mutation_type'] = aggregated_muts['epitope'].map(mutation_type_conversion)

# drop unneeded columns
aggregated_muts = aggregated_muts.drop(['stripped_mut', 'epitope'], axis=1)

# sort in site order
aggregated_muts['site']=aggregated_muts['site'].astype(int)
aggregated_muts = aggregated_muts.sort_values(by='site').reset_index(drop=True)

aggregated_muts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aggregated_muts['mutation_type'] = aggregated_muts['epitope'].map(mutation_type_conversion)


Unnamed: 0,site,amino_acid,mutation_type
0,22,Y,viable_mutation
1,22,R,viable_mutation
2,22,G,viable_mutation
3,22,E,viable_mutation
4,22,A,viable_mutation
...,...,...,...
2684,523,N,viable_mutation
2685,523,R,viable_mutation
2686,523,C,viable_mutation
2687,523,S,viable_mutation


In [82]:
aggregated_muts.to_csv('aggregated_mutations.csv', index=False)