In [1]:
import pandas as pd

In [2]:
codon_to_residue = { 
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', 
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',                  
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', 
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', 
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', 
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', 
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', 
        'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', 
        'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W', 
    } 

In [3]:
codon_tbl_df = pd.DataFrame({"residue" : codon_to_residue.values(), "codon" : codon_to_residue.keys()})
codon_tbl_df = codon_tbl_df.sort_values(by = "residue")
codon_tbl_df

Unnamed: 0,residue,codon
37,A,GCC
39,A,GCT
36,A,GCA
38,A,GCG
61,C,TGT
...,...,...
56,Y,TAC
57,Y,TAT
62,_,TGA
58,_,TAA


In [4]:
residues = set(codon_tbl_df["residue"])
residues

{'A',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'K',
 'L',
 'M',
 'N',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'V',
 'W',
 'Y',
 '_'}

In [5]:
expec_df_list = []

for residue in residues:
    residue_df = codon_tbl_df[codon_tbl_df["residue"] == residue]
    codons = residue_df["codon"]

    base_options = ["A", "T", "G", "C"]
    counts = {}

    # Counting all possible paths between one residue to another
    
    # Exploring all codons corresponding to the residue
    for codon in codons:
        
        # Changing each position of codon
        for position in range(3):
            curr_base = codon[position]
            
            # Substitute each position with one of 3 other base options
            for base in base_options:
                if curr_base != base:
                    
                    # Translate new codon, record/update count of path 
                    new_codon = codon[:position] + base + codon[position + 1:]
                    new_residue = codon_to_residue[new_codon]
                    if new_residue in counts.keys():
                        counts[new_residue] += 1
                    else:
                        counts[new_residue] = 1
                        
    # Convert counts to a dataframe to save results                    
    counts_df = pd.DataFrame.from_dict(counts, orient = 'index').reset_index().rename(columns = {0 : "paths",
                                                                                                "index" : "new_AA"})
    
    # Remove rows where nonsense or synonymous mutation
    counts_df_no_stop = counts_df[counts_df["new_AA"] != "_"]
    counts_df_no_stop["orig_AA"] = residue
    counts_df_no_stop_or_same = counts_df_no_stop[counts_df_no_stop["orig_AA"] != counts_df_no_stop["new_AA"]]
    
    # Calcuating expected frequency using sum of observed paths as denominator
    counts_df_no_stop_or_same["exp_freq"] = counts_df_no_stop_or_same["paths"] / sum(counts_df_no_stop_or_same["paths"])
    counts_df_no_stop_or_same = counts_df_no_stop_or_same[["orig_AA", "new_AA", "paths", "exp_freq"]]
    
    expec_df_list.append(counts_df_no_stop_or_same)
    
# Put together results from each residue
expec_df = pd.concat(expec_df_list)
expec_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  counts_df_no_stop_or_same["exp_freq"] = counts_df_no_stop_or_same["paths"] / sum(counts_df_no_stop_or_same["paths"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  counts_df_no_stop["orig_AA"] = residue
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  counts_df_no_stop_or_same["exp_freq"] = counts_df

Unnamed: 0,orig_AA,new_AA,paths,exp_freq
0,F,I,2,0.125000
1,F,V,2,0.125000
2,F,L,6,0.375000
3,F,Y,2,0.125000
4,F,C,2,0.125000
...,...,...,...,...
0,W,R,2,0.285714
1,W,G,1,0.142857
3,W,L,1,0.142857
4,W,S,1,0.142857


In [6]:
expec_df

Unnamed: 0,orig_AA,new_AA,paths,exp_freq
0,F,I,2,0.125000
1,F,V,2,0.125000
2,F,L,6,0.375000
3,F,Y,2,0.125000
4,F,C,2,0.125000
...,...,...,...,...
0,W,R,2,0.285714
1,W,G,1,0.142857
3,W,L,1,0.142857
4,W,S,1,0.142857


In [7]:
expec_df.to_csv("../data/residue_change_expec.csv")
expec_df

Unnamed: 0,orig_AA,new_AA,paths,exp_freq
0,F,I,2,0.125000
1,F,V,2,0.125000
2,F,L,6,0.375000
3,F,Y,2,0.125000
4,F,C,2,0.125000
...,...,...,...,...
0,W,R,2,0.285714
1,W,G,1,0.142857
3,W,L,1,0.142857
4,W,S,1,0.142857


In [8]:
expec_df.groupby("orig_AA").agg(sum)

  expec_df.groupby("orig_AA").agg(sum)


Unnamed: 0_level_0,new_AA,paths,exp_freq
orig_AA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,TSPDVGE,24,1.0
C,SGRYFW,14,1.0
D,NYHVGAE,16,1.0
E,KQVGAD,14,1.0
F,IVLYCS,16,1.0
G,REVASCDW,23,1.0
H,NYDLRPQ,16,1.0
I,LVKRTMFNS,21,1.0
K,EQIRTNM,14,1.0
L,MVWSFIHRPQ,33,1.0


In [9]:
expec_df[expec_df["orig_AA"] == "I"]

Unnamed: 0,orig_AA,new_AA,paths,exp_freq
0,I,L,4,0.190476
1,I,V,3,0.142857
2,I,K,1,0.047619
3,I,R,1,0.047619
4,I,T,3,0.142857
6,I,M,3,0.142857
7,I,F,2,0.095238
8,I,N,2,0.095238
9,I,S,2,0.095238


In [10]:
expec_df[expec_df["orig_AA"] == "A"]

Unnamed: 0,orig_AA,new_AA,paths,exp_freq
0,A,T,4,0.166667
1,A,S,4,0.166667
2,A,P,4,0.166667
3,A,D,2,0.083333
4,A,V,4,0.166667
5,A,G,4,0.166667
7,A,E,2,0.083333


In [11]:
expec_df[expec_df["orig_AA"] == "T"]

Unnamed: 0,orig_AA,new_AA,paths,exp_freq
0,T,S,6,0.25
1,T,A,4,0.166667
2,T,P,4,0.166667
3,T,K,2,0.083333
4,T,I,3,0.125
5,T,R,2,0.083333
7,T,N,2,0.083333
8,T,M,1,0.041667


In [12]:
expec_df[expec_df["orig_AA"] == "D"]

Unnamed: 0,orig_AA,new_AA,paths,exp_freq
0,D,N,2,0.125
1,D,Y,2,0.125
2,D,H,2,0.125
3,D,V,2,0.125
4,D,G,2,0.125
5,D,A,2,0.125
6,D,E,4,0.25


In [13]:
expec_df[expec_df["orig_AA"] == "E"]

Unnamed: 0,orig_AA,new_AA,paths,exp_freq
0,E,K,2,0.142857
2,E,Q,2,0.142857
3,E,V,2,0.142857
4,E,G,2,0.142857
5,E,A,2,0.142857
6,E,D,4,0.285714


In [14]:
expec_df[expec_df["orig_AA"] == "A"]

Unnamed: 0,orig_AA,new_AA,paths,exp_freq
0,A,T,4,0.166667
1,A,S,4,0.166667
2,A,P,4,0.166667
3,A,D,2,0.083333
4,A,V,4,0.166667
5,A,G,4,0.166667
7,A,E,2,0.083333
