# GC content Deviation

In [1]:
from data.utils.NCBI.data_loader import NCBIDataLoader
# from data.utils.HGTREE.data_loader import HGTREEDataLoader
# test out combination
import math 
from Bio.SeqUtils import GC123


# Genome object used by every cell below. 'AE000657' is handed to NCBIDataLoader
# as the record identifier (presumably an NCBI accession -- TODO confirm).
aquifex_aeolicus_VF5 = NCBIDataLoader('AE000657')


In [2]:
def GC_Content_Deviation(genome, return_genomic_strips = False, min_length = 300,
                         sd_threshold = 1.5, window_size = 11,
                         min_extraneous_in_window = 5):
    """Flag genes whose GC content deviates from the genome-wide values.

    Part 1 (per gene): a gene whose sequence is longer than ``min_length`` is
    tagged as extraneous when either

    * its total GC content (``'GCT'``) differs from ``genome.mean_GCT`` by more
      than ``sd_threshold * genome.std_GCT``, or
    * its ``'GC1'`` and ``'GC3'`` deviations have the same sign and at least one
      of them exceeds ``sd_threshold`` times the matching genome std.

    Part 2 (sliding window): every run of ``window_size`` consecutive genes (in
    ``genome.genes`` order, no wrap-around) is reduced to its genes longer than
    ``min_length``. When at least ``min_extraneous_in_window`` of them are
    already tagged, the sequences of the window are concatenated, ``GC123`` is
    evaluated on the result, and every untagged gene in the window whose stored
    ``'SDT'``, ``'SD1'``, ``'SD2'`` and ``'SD3'`` values all share the sign of
    the window's deviations is tagged as well. Genes tagged this way count
    towards the windows that follow.

    Parameters
    ----------
    genome
        Object exposing ``genes`` (a mapping keyed by locus tag),
        ``genome[locus_tag]`` (per-gene mapping with the keys used above) and
        the ``mean_GC*`` / ``std_GC*`` attributes.
        # NOTE(review): per-gene 'SD*' values are presumably pre-computed by the
        # data loader -- confirm.
    return_genomic_strips : bool
        When true, also return the qualifying windows.
    min_length : int
        Genes must be strictly longer than this (in sequence characters).
    sd_threshold : float
        Number of standard deviations a gene must exceed in part 1.
    window_size : int
        Number of consecutive genes per window in part 2.
    min_extraneous_in_window : int
        Minimum number of tagged genes for a window to qualify.

    Returns
    -------
    list, or (list, list)
        Locus tags in the order they were tagged; with
        ``return_genomic_strips`` also the list of qualifying windows, each a
        dict mapping locus tag -> gene record.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    # Ordered result plus a set mirror: the list keeps the tagging order callers
    # see, the set gives O(1) membership tests inside the window loops.
    list_of_extraneous_genes = []
    tagged = set()

    def _tag(locus_tag):
        list_of_extraneous_genes.append(locus_tag)
        tagged.add(locus_tag)

    # part 1: GCT, GC1, GC3 deviation
    for locus_tag in genome.genes:
        record = genome[locus_tag]

        # consider only genes longer than min_length
        if len(record['sequence']) <= min_length:
            continue

        dev_GCT = record['GCT'] - genome.mean_GCT
        dev_GC1 = record['GC1'] - genome.mean_GC1
        dev_GC3 = record['GC3'] - genome.mean_GC3

        if abs(dev_GCT) > (sd_threshold*genome.std_GCT):
            _tag(locus_tag)
        elif dev_GC1*dev_GC3 > 0:
            # GC1 and GC3 deviate in the same direction; one large deviation is enough
            if (abs(dev_GC1) > (sd_threshold*genome.std_GC1)
                    or abs(dev_GC3) > (sd_threshold*genome.std_GC3)):
                _tag(locus_tag)

    # part 2: sliding window over consecutive genes
    list_of_genes = list(genome.genes.keys())
    genomic_strips = []
    sd_keys = ('SDT', 'SD1', 'SD2', 'SD3')

    for start in range(len(list_of_genes) - window_size + 1):
        # keep only the genes of this window that are longer than min_length
        window = {}
        for locus_tag in list_of_genes[start:start + window_size]:
            record = genome[locus_tag]
            if len(record['sequence']) > min_length:
                window[locus_tag] = record

        # count the genes of the window that are already tagged
        extraneous_counter = sum(1 for locus_tag in window if locus_tag in tagged)
        if extraneous_counter < min_extraneous_in_window:
            continue

        # Concatenate with += (not str.join) so sequence objects that merely
        # support concatenation with str keep working.
        window_sequences = ''
        for record in window.values():
            window_sequences += record['sequence']

        # deviation of the whole strip, in units of the genome std
        GCT, GC1, GC2, GC3 = GC123(window_sequences)
        strip_SD = {
            'SDT': (GCT - genome.mean_GCT)/genome.std_GCT,
            'SD1': (GC1 - genome.mean_GC1)/genome.std_GC1,
            'SD2': (GC2 - genome.mean_GC2)/genome.std_GC2,
            'SD3': (GC3 - genome.mean_GC3)/genome.std_GC3,
        }

        # tag untagged genes that deviate in the same direction as their strip
        for locus_tag, record in window.items():
            if locus_tag in tagged:
                continue
            if all(record[key]*strip_SD[key] > 0 for key in sd_keys):
                _tag(locus_tag)

        genomic_strips.append(window)

    if return_genomic_strips:
        return list_of_extraneous_genes, genomic_strips
    return list_of_extraneous_genes

In [3]:
# Run the GC-content screen, then report how many genes were tagged and which.
hgt_candidates = GC_Content_Deviation(aquifex_aeolicus_VF5)
print(len(hgt_candidates), hgt_candidates, sep='\n')

261
['aq_005', 'aq_013', 'aq_018', 'aq_032', 'aq_035', 'aq_050', 'aq_059', 'aq_062', 'aq_081', 'aq_082', 'aq_118', 'aq_125', 'aq_132', 'aq_141', 'aq_156', 'aq_171', 'aq_172', 'aq_182', 'aq_183', 'aq_200', 'aq_220', 'aq_221', 'aq_254', 'aq_255', 'aq_264', 'aq_293', 'aq_326', 'aq_340', 'aq_351', 'aq_359', 'aq_369', 'aq_372', 'aq_376', 'aq_377', 'aq_378', 'aq_380', 'aq_381', 'aq_382', 'aq_383', 'aq_384', 'aq_385', 'aq_386', 'aq_387', 'aq_388', 'aq_402', 'aq_407', 'aq_418', 'aq_434', 'aq_451', 'aq_465', 'aq_473', 'aq_476', 'aq_488', 'aq_503', 'aq_504', 'aq_505', 'aq_506', 'aq_507', 'aq_509', 'aq_510', 'aq_511', 'aq_515', 'aq_516', 'aq_518', 'aq_519', 'aq_520', 'aq_522', 'aq_531', 'aq_558', 'aq_573', 'aq_585', 'aq_599', 'aq_626', 'aq_629', 'aq_645', 'aq_647', 'aq_652', 'aq_662', 'aq_666', 'aq_673', 'aq_678', 'aq_706', 'aq_711', 'aq_734', 'aq_737', 'aq_740', 'aq_771', 'aq_780', 'aq_782', 'aq_812', 'aq_820', 'aq_821', 'aq_833', 'aq_837', 'aq_850', 'aq_862', 'aq_895', 'aq_908', 'aq_917', 'aq_9

# Amino Acid (AA) Deviation

In [4]:
# Codon table: lower-case codon -> one-letter amino acid, with '*' marking
# stop codons. The table is generated rather than spelled out; codons are
# produced with the first base outermost, then the third base, then the second
# base, so the insertion order is ttt, tct, tat, tgt, ttc, tcc, ...
# Each 16-letter chunk below covers one first base (t, c, a, g in turn).
CODE = dict(zip(
    (first + second + third
     for first in 'tcag'
     for third in 'tcag'
     for second in 'tcag'),
    'FSYCFSYCLS**LS*W'
    'LPHRLPHRLPQRLPQR'
    'ITNSITNSITKRMTKR'
    'VADGVADGVAEGVAEG',
))

In [5]:
# Group the upper-cased codons by the amino acid they encode ('*' = stop).
# Despite its name this is a dict: one-letter code -> list of codons.
# It is built in a single pass over CODE so the key order follows the first
# appearance of each amino acid in CODE. Going through a set of strings (as
# before) made the key order change from one kernel session to the next.
list_of_amino_acid = {}
for codon, amino_acid in CODE.items():
    list_of_amino_acid.setdefault(amino_acid, []).append(codon.upper())
list_of_amino_acid

{'K': ['AAA', 'AAG'],
 'P': ['CCT', 'CCC', 'CCA', 'CCG'],
 '*': ['TAA', 'TGA', 'TAG'],
 'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
 'D': ['GAT', 'GAC'],
 'F': ['TTT', 'TTC'],
 'C': ['TGT', 'TGC'],
 'Y': ['TAT', 'TAC'],
 'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
 'E': ['GAA', 'GAG'],
 'A': ['GCT', 'GCC', 'GCA', 'GCG'],
 'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
 'W': ['TGG'],
 'I': ['ATT', 'ATC', 'ATA'],
 'V': ['GTT', 'GTC', 'GTA', 'GTG'],
 'Q': ['CAA', 'CAG'],
 'H': ['CAT', 'CAC'],
 'M': ['ATG'],
 'T': ['ACT', 'ACC', 'ACA', 'ACG'],
 'N': ['AAT', 'AAC'],
 'G': ['GGT', 'GGC', 'GGA', 'GGG']}

In [6]:
def calculate_amino_acid_content_genome_mean(genome):
    """Compute the genome-wide amino-acid composition and store it on ``genome``.

    For every gene in ``genome.genes`` the per-codon values found under the
    gene's ``'cub'`` entry (presumably codon counts -- confirm against the data
    loader) are added up per amino acid, using the codon groups in
    ``list_of_amino_acid``. Each total is then expressed as a percentage of the
    grand total over all amino acids (the ``'*'`` stop group included).

    The resulting dict (one-letter code -> percentage) is assigned to
    ``genome.mean_AA``; nothing is returned.

    Raises ``ZeroDivisionError`` when the grand total is zero (e.g. a genome
    without genes).
    """
    amino_acid_count_genome = {amino_acid: 0 for amino_acid in list_of_amino_acid}
    for locus_tag in genome.genes:
        # Read from the genome that was passed in; the previous body used the
        # notebook-level aquifex_aeolicus_VF5 regardless of the argument.
        codon_usage = genome[locus_tag]['cub']
        for amino_acid, codons in list_of_amino_acid.items():
            for codon in codons:
                amino_acid_count_genome[amino_acid] += codon_usage[codon]

    total_amino_acid_count = sum(amino_acid_count_genome.values())
    for amino_acid in amino_acid_count_genome:
        amino_acid_count_genome[amino_acid] = (
            amino_acid_count_genome[amino_acid]*100/total_amino_acid_count
        )

    genome.mean_AA = amino_acid_count_genome

In [7]:
def calculate_amino_acid_content_gene_mean(genome):
    """Store each gene's amino-acid composition (in percent) on the gene record.

    For every locus tag in ``genome.genes`` the values under the gene's
    ``'cub'`` entry are added up per amino acid (codon groups taken from
    ``list_of_amino_acid``) and divided by the gene's grand total. The resulting
    dict (one-letter code -> percentage) is written to
    ``genome[locus_tag]['AA_Content_mean']``. Nothing is returned.
    """
    for locus_tag in genome.genes:
        codon_usage = genome[locus_tag]['cub']

        # tally per amino acid for this gene
        tally_by_amino_acid = {}
        for amino_acid, synonymous_codons in list_of_amino_acid.items():
            tally = 0
            for codon in synonymous_codons:
                tally += codon_usage[codon]
            tally_by_amino_acid[amino_acid] = tally

        # turn the tallies into percentages of the gene's total
        gene_total = sum(list(tally_by_amino_acid.values()))
        genome[locus_tag]['AA_Content_mean'] = {
            amino_acid: tally*100/gene_total
            for amino_acid, tally in tally_by_amino_acid.items()
        }

In [8]:
def calculate_amino_acid_content_genome_std(genome):
    """Store the per-amino-acid spread of gene compositions in ``genome.std_AA``.

    For each amino acid, the squared differences between every gene's
    ``'AA_Content_mean'`` value and ``genome.mean_AA`` are accumulated, divided
    by ``len(genome)`` and square-rooted. Requires ``genome.mean_AA`` and the
    per-gene ``'AA_Content_mean'`` entries to exist already.
    # NOTE(review): len(genome) is presumably the number of genes -- confirm.
    """
    spread_by_amino_acid = {}

    for amino_acid in list_of_amino_acid:
        reference = genome.mean_AA[amino_acid]

        # Plain running total on purpose: built-in sum() may round floats differently.
        squared_total = 0
        for locus_tag in genome.genes:
            offset = genome[locus_tag]['AA_Content_mean'][amino_acid] - reference
            squared_total += offset*offset

        spread_by_amino_acid[amino_acid] = math.sqrt(squared_total/len(genome))

    genome.std_AA = spread_by_amino_acid

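The calls below run the functions defined above, which attach new attributes to the genome and to each gene:

- `calculate_amino_acid_content_genome_mean`: stores the genome-wide amino acid percentages in `genome.mean_AA`
- `calculate_amino_acid_content_gene_mean`: stores each gene's amino acid percentages in `genome[locus_tag]['AA_Content_mean']`
- `calculate_amino_acid_content_genome_std`: stores the per-amino-acid standard deviation in `genome.std_AA`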

In [9]:
# Order matters: the std computation reads genome.mean_AA and every gene's
# 'AA_Content_mean', which are created by the first two calls.
calculate_amino_acid_content_genome_mean(aquifex_aeolicus_VF5)
calculate_amino_acid_content_gene_mean(aquifex_aeolicus_VF5)
calculate_amino_acid_content_genome_std(aquifex_aeolicus_VF5)

In [10]:
def check_amino_acid_deviation(genome):
    """Return the locus tags of genes with a strongly over-represented amino acid.

    A gene is listed (once, in ``genome.genes`` order) as soon as one amino acid
    in ``list_of_amino_acid`` has a per-gene percentage that exceeds the
    genome-wide percentage by more than three times ``genome.std_AA``.
    # NOTE(review): only positive deviations are tested (no abs()); confirm
    # that under-representation is meant to be ignored.
    """
    def _exceeds_threshold(locus_tag):
        composition = genome[locus_tag]['AA_Content_mean']
        # any() stops at the first amino acid over the threshold
        return any(
            composition[amino_acid] - genome.mean_AA[amino_acid] > 3*genome.std_AA[amino_acid]
            for amino_acid in list_of_amino_acid
        )

    return [locus_tag for locus_tag in genome.genes if _exceeds_threshold(locus_tag)]

In [11]:
# Genes flagged by the amino-acid screen; the next step removes them from the
# GC-deviation candidates.
list_of_non_extraneous_genes_AA = check_amino_acid_deviation(aquifex_aeolicus_VF5)
list_of_non_extraneous_genes_AA

['aq_008',
 'aq_012',
 'aq_020',
 'aq_022',
 'aq_039',
 'aq_045',
 'aq_055',
 'aq_063',
 'aq_064a',
 'aq_064b',
 'aq_064c',
 'aq_067',
 'aq_069',
 'aq_073',
 'aq_074',
 'aq_075',
 'aq_075a',
 'aq_080',
 'aq_087',
 'aq_088',
 'aq_090',
 'aq_098',
 'aq_106',
 'aq_108a',
 'aq_108b',
 'aq_124a',
 'aq_125',
 'aq_141',
 'aq_156',
 'aq_157',
 'aq_158',
 'aq_175a',
 'aq_176',
 'aq_177',
 'aq_183a',
 'aq_194',
 'aq_250',
 'aq_252',
 'aq_254',
 'aq_267',
 'aq_293',
 'aq_314',
 'aq_355',
 'aq_357a',
 'aq_363',
 'aq_380',
 'aq_388',
 'aq_389',
 'aq_391',
 'aq_392',
 'aq_394',
 'aq_401',
 'aq_406',
 'aq_419',
 'aq_420',
 'aq_449',
 'aq_453',
 'aq_481',
 'aq_488',
 'aq_504',
 'aq_507',
 'aq_509',
 'aq_519',
 'aq_531',
 'aq_538',
 'aq_539',
 'aq_553',
 'aq_577',
 'aq_591',
 'aq_615',
 'aq_628',
 'aq_652',
 'aq_655',
 'aq_665',
 'aq_666',
 'aq_671',
 'aq_678',
 'aq_702',
 'aq_722',
 'aq_735',
 'aq_737',
 'aq_754',
 'aq_758',
 'aq_766',
 'aq_770',
 'aq_780',
 'aq_792',
 'aq_792a',
 'aq_808',
 'aq_818',

Find the GC-deviation candidates that must be taken out because they also show an extreme amino acid deviation.

In [12]:
# Split the GC-deviation candidates into those also flagged by the amino-acid
# screen (take_out_list) and the remaining HGT candidates (list_of_hgt).
# A set makes each membership test O(1) instead of scanning the list; both
# results keep the order of hgt_candidates.
amino_acid_flagged = set(list_of_non_extraneous_genes_AA)
take_out_list = [locus_tag for locus_tag in hgt_candidates if locus_tag in amino_acid_flagged]
list_of_hgt = [locus_tag for locus_tag in hgt_candidates if locus_tag not in amino_acid_flagged]

print(len(take_out_list))
print(len(list_of_hgt))

58
203


In [13]:
# Remaining HGT candidates: GC-deviation hits not flagged by the amino-acid screen.
list_of_hgt

['aq_005',
 'aq_013',
 'aq_018',
 'aq_032',
 'aq_035',
 'aq_050',
 'aq_059',
 'aq_062',
 'aq_081',
 'aq_082',
 'aq_118',
 'aq_132',
 'aq_171',
 'aq_172',
 'aq_182',
 'aq_183',
 'aq_200',
 'aq_220',
 'aq_221',
 'aq_255',
 'aq_264',
 'aq_326',
 'aq_340',
 'aq_351',
 'aq_359',
 'aq_369',
 'aq_372',
 'aq_376',
 'aq_377',
 'aq_378',
 'aq_381',
 'aq_382',
 'aq_383',
 'aq_384',
 'aq_385',
 'aq_386',
 'aq_387',
 'aq_402',
 'aq_407',
 'aq_418',
 'aq_434',
 'aq_451',
 'aq_465',
 'aq_473',
 'aq_476',
 'aq_503',
 'aq_505',
 'aq_506',
 'aq_510',
 'aq_511',
 'aq_515',
 'aq_516',
 'aq_518',
 'aq_520',
 'aq_522',
 'aq_558',
 'aq_573',
 'aq_585',
 'aq_599',
 'aq_626',
 'aq_629',
 'aq_645',
 'aq_647',
 'aq_662',
 'aq_673',
 'aq_706',
 'aq_711',
 'aq_734',
 'aq_740',
 'aq_771',
 'aq_782',
 'aq_812',
 'aq_820',
 'aq_821',
 'aq_833',
 'aq_837',
 'aq_850',
 'aq_862',
 'aq_895',
 'aq_908',
 'aq_920',
 'aq_926',
 'aq_940',
 'aq_944',
 'aq_946',
 'aq_993',
 'aq_996',
 'aq_1008',
 'aq_1018',
 'aq_1019',
 'aq_10

In [14]:
# Genome-wide amino-acid composition in percent, keyed by one-letter code ('*' = stop codons).
aquifex_aeolicus_VF5.mean_AA

{'K': 9.407543465954214,
 'P': 4.0494915226625725,
 '*': 0.3164492520475529,
 'S': 4.7950003461800055,
 'D': 4.304646550214428,
 'F': 5.135274928014923,
 'C': 0.7892904124429312,
 'Y': 4.133185629864339,
 'R': 4.911479736251563,
 'E': 9.608327869119522,
 'A': 5.858180233528959,
 'L': 10.569079201912542,
 'W': 0.9346860147350501,
 'I': 7.3015472209890975,
 'V': 7.923042148433841,
 'Q': 2.04083294982019,
 'H': 1.5419264713668408,
 'M': 1.865706594118198,
 'T': 4.201199819171856,
 'N': 3.603530221514476,
 'G': 6.7095794116568985}

In [15]:
# Spread of the per-gene percentages around the genome-wide value, per amino acid.
aquifex_aeolicus_VF5.std_AA

{'K': 2.9440643095706154,
 'P': 1.547124235237513,
 '*': 0.2661738967919316,
 'S': 1.7614338898397268,
 'D': 1.6051740744098855,
 'F': 2.387013257951703,
 'C': 1.0981742652812894,
 'Y': 1.6028704011973323,
 'R': 2.0304425684913956,
 'E': 3.283793599655885,
 'A': 2.2809981503501815,
 'L': 3.080711147423685,
 'W': 0.8144887685797604,
 'I': 1.99739850196166,
 'V': 2.151390016150716,
 'Q': 1.2232858395522948,
 'H': 1.03805532649652,
 'M': 1.0923907381102593,
 'T': 1.4265356828926017,
 'N': 1.477769120166431,
 'G': 2.2531517690362106}

In [16]:
# Amino-acid percentages stored for the single gene 'aq_132'.
aquifex_aeolicus_VF5['aq_132']['AA_Content_mean']

{'K': 4.516129032258065,
 'P': 2.5806451612903225,
 '*': 0.6451612903225806,
 'S': 4.516129032258065,
 'D': 4.516129032258065,
 'F': 3.225806451612903,
 'C': 0.6451612903225806,
 'Y': 1.2903225806451613,
 'R': 7.096774193548387,
 'E': 9.03225806451613,
 'A': 12.258064516129032,
 'L': 9.67741935483871,
 'W': 1.2903225806451613,
 'I': 9.67741935483871,
 'V': 7.096774193548387,
 'Q': 1.2903225806451613,
 'H': 2.5806451612903225,
 'M': 1.2903225806451613,
 'T': 4.516129032258065,
 'N': 2.5806451612903225,
 'G': 9.67741935483871}