#### 与四种模式生物进行密码子偏好性比较分析
（1）先从http://www.kazusa.or.jp/codon 下载模式生物的密码子偏好统计数据，并纠正格式；

（2）计算待分析物种的密码子使用频率 = （某个密码子数量/所有密码子总数量）\*1000

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
!head -n 4 Compare_analysis/Arabidopsis_thaliana.txt

UUU 21.8(678320)  UCU 25.2(782818)  UAU 14.6(455089)  UGU 10.5(327640)
UUC 20.7(642407)  UCC 11.2(348173)  UAC 13.7(427132)  UGC  7.2(222769)
UUA 12.7(394867)  UCA 18.3(568570)  UAA  0.9( 29405)  UGA  1.2( 36260)
UUG 20.9(649150)  UCG  9.3(290158)  UAG  0.5( 16417)  UGG 12.5(388049)


In [3]:
model_org_dir = 'Compare_analysis'
my_org_dir = 'RSCU'

model_orgs = [os.path.join(model_org_dir, f) for f in os.listdir(model_org_dir) 
                  if f.endswith('.txt')]
my_orgs = [os.path.join(my_org_dir, f) for f in os.listdir(my_org_dir) 
                  if f.endswith('.txt')]

model_orgs

['Compare_analysis/Arabidopsis_thaliana.txt',
 'Compare_analysis/Populus_trichocarpa.txt',
 'Compare_analysis/Escherichia_coli.txt',
 'Compare_analysis/Saccharomyces_cerevisiae.txt']

In [7]:
import re

def parse_model_org(infile):
    """解析下载下来的模式物种数据
    """
    codons = []
    freq_kilo = []
    numbers = []
    with open(infile) as f:
        for line in f:
            if line.strip():
                line = [i.strip().replace('U', 'T') for i in re.split(r'\s+|\(|\)', 
                                                                      line.strip()) if i]
                for idx in range(0, len(line), 3):
                    codons.append(line[idx])
                    
                    freq_kilo.append(float(line[idx+1]))
                    numbers.append(int(line[idx+2]))
                    
    df = pd.DataFrame({'Codon': codons, 'freq_kilo': freq_kilo})
    
    return df


def parse_my_org(infile):
    """解析emboss上面计算得到的数据
    """
    df = pd.read_csv(infile, comment='#', header=0, delimiter=r"\s+")

    # 生成氨基酸简称的对应关系
    aa_tri = ['Gly', 'Ala', 'Val', 'Leu', 'Ile', 'Pro', 'Phe', 'Tyr', 'Trp', 'Ser', 
              'Thr', 'Cys', 'Met', 'Asn', 'Gln', 'Asp', 'Glu', 'Lys', 'Arg', 'His', 'Ter']
    aa_sing = ['G', 'A', 'V', 'L', 'I', 'P', 'F', 'Y', 'W', 'S', 'T', 'C', 'M', 'N', 'Q', 
               'D', 'E', 'K', 'R', 'H', '*']
    df_amino = pd.DataFrame({'Amino_acid':aa_tri, 'AA':aa_sing}).sort_values(by='AA')
    df_merge = df.merge(df_amino, on='AA', how='inner')
    
    return df_merge[['Amino_acid', 'Codon', 'Frequency']]

In [8]:
my_orgs

['RSCU/Miscanthus_floridulus.txt',
 'RSCU/Miscanthus_giganteus.txt',
 'RSCU/Miscanthus_sacchariflorus.txt',
 'RSCU/Miscanthus_sinensis.txt',
 'RSCU/Miscanthus_transmorrisonensis.txt',
 'RSCU/Saccharum_spontaneum.txt',
 'RSCU/Sorghum_bicolor.txt']

In [36]:
writer = pd.ExcelWriter('Step3_Compare_analysis_bak.xlsx', engine='xlsxwriter')

from collections import defaultdict

# 密码子评率比较差异性统计（ratio>2 或者 ratio<0.5）的数量
dd = pd.DataFrame(np.zeros((7, 8)))
dd.index = [forg.split('/')[-1].split('.')[0].split('_')[-1] for forg in my_orgs]
dd.columns = sorted([fm.split('/')[-1].split('.')[0].split('_')[-1]+'_gt2' for fm in model_orgs] + \
                [fm.split('/')[-1].split('.')[0].split('_')[-1]+'_lt05' for fm in model_orgs])

for forg in my_orgs:
    my_species = forg.split('/')[-1].split('.')[0].split('_')[-1]
    df_org = parse_my_org(forg)
    for fm in model_orgs:
        model_species = fm.split('/')[-1].split('.')[0].split('_')[-1]
        name = my_species + '--' + model_species
        print(name)
        dfm = parse_model_org(fm)
        df = df_org.merge(dfm, on='Codon', how='inner')
        df['Ratio'] = df['Frequency']/df['freq_kilo']
        df[['Amino_acid', 'Codon', 'Ratio']].to_excel(writer, 
                                                      sheet_name=name, 
                                                      index=False)
        df_gt2 = df[df['Ratio']>2]
        df_lt05 = df[df['Ratio']<0.5]
        dd.loc[dd.index==my_species, dd.columns==model_species+'_gt2'] = len(df_gt2)
        dd.loc[dd.index==my_species, dd.columns==model_species+'_lt05'] = len(df_lt05)
        
#         print("大于2的数量：", len(df_gt2))
#         print("小于0.5的数量：", len(df_lt05))
        
writer.save()
dd.to_excel('Step3_Compare_analysis_stat.xlsx', index=True, header=True)

floridulus--thaliana
大于2的数量： 5
小于0.5的数量： 9
floridulus--trichocarpa
大于2的数量： 6
小于0.5的数量： 7
floridulus--coli
大于2的数量： 12
小于0.5的数量： 15
floridulus--cerevisiae
大于2的数量： 7
小于0.5的数量： 3
giganteus--thaliana
大于2的数量： 5
小于0.5的数量： 8
giganteus--trichocarpa
大于2的数量： 6
小于0.5的数量： 7
giganteus--coli
大于2的数量： 12
小于0.5的数量： 15
giganteus--cerevisiae
大于2的数量： 7
小于0.5的数量： 3
sacchariflorus--thaliana
大于2的数量： 7
小于0.5的数量： 7
sacchariflorus--trichocarpa
大于2的数量： 6
小于0.5的数量： 6
sacchariflorus--coli
大于2的数量： 12
小于0.5的数量： 15
sacchariflorus--cerevisiae
大于2的数量： 8
小于0.5的数量： 3
sinensis--thaliana
大于2的数量： 7
小于0.5的数量： 7
sinensis--trichocarpa
大于2的数量： 6
小于0.5的数量： 7
sinensis--coli
大于2的数量： 12
小于0.5的数量： 15
sinensis--cerevisiae
大于2的数量： 8
小于0.5的数量： 3
transmorrisonensis--thaliana
大于2的数量： 5
小于0.5的数量： 9
transmorrisonensis--trichocarpa
大于2的数量： 6
小于0.5的数量： 7
transmorrisonensis--coli
大于2的数量： 12
小于0.5的数量： 15
transmorrisonensis--cerevisiae
大于2的数量： 7
小于0.5的数量： 3
spontaneum--thaliana
大于2的数量： 4
小于0.5的数量： 9
spontaneum--trichocarpa
大于2的数量： 4
小于0.5的数量： 8


In [52]:
len(set(df_org['Amino_acid']))

21

In [53]:
len(set(df['Amino_acid']))

21

In [35]:
dd

Unnamed: 0,cerevisiae_gt2,cerevisiae_lt05,coli_gt2,coli_lt05,thaliana_gt2,thaliana_lt05,trichocarpa_gt2,trichocarpa_lt05
floridulus,7.0,3.0,12.0,15.0,5.0,9.0,6.0,7.0
giganteus,7.0,3.0,12.0,15.0,5.0,8.0,6.0,7.0
sacchariflorus,8.0,3.0,12.0,15.0,7.0,7.0,6.0,6.0
sinensis,8.0,3.0,12.0,15.0,7.0,7.0,6.0,7.0
transmorrisonensis,7.0,3.0,12.0,15.0,5.0,9.0,6.0,7.0
spontaneum,5.0,4.0,12.0,15.0,4.0,9.0,4.0,8.0
bicolor,5.0,4.0,12.0,15.0,5.0,9.0,4.0,8.0


In [17]:
dd = pd.DataFrame(np.zeros((7, 8)))
dd

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
dd.index = [forg.split('/')[-1].split('.')[0].split('_')[-1] for forg in my_orgs]
dd

Unnamed: 0,0,1,2,3,4,5,6,7
floridulus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
giganteus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sacchariflorus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sinensis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
transmorrisonensis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
spontaneum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bicolor,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
dd.columns = sorted([fm.split('/')[-1].split('.')[0].split('_')[-1]+'_gt2' for fm in model_orgs] + \
                [fm.split('/')[-1].split('.')[0].split('_')[-1]+'_lt05' for fm in model_orgs])
dd

Unnamed: 0,cerevisiae_gt2,cerevisiae_lt05,coli_gt2,coli_lt05,thaliana_gt2,thaliana_lt05,trichocarpa_gt2,trichocarpa_lt05
floridulus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
giganteus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sacchariflorus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sinensis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
transmorrisonensis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
spontaneum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bicolor,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
dd.loc[dd.index=='sinensis', dd.columns=='cerevisiae_gt2']

Unnamed: 0,cerevisiae_gt2
sinensis,0.0
