# Mouse-to-Human alignment/mutation modeling

This notebook checks whether mutations in the MSK-IMPACT dataset can be modeled in mouse, based on amino acid conservation between the human gene and its mouse ortholog.

It produces files named "flanksize_[flank size value].csv", which quantify mutational concordance/homology between human and mouse as a function of flank size. The flank size is the homology requirement on the codons flanking the site of the mutation: that many codons on each side of the mutated codon must match between human and mouse for the site to count as homologous.

## Loading in required files

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv
from Bio import SeqIO
import gzip
from Bio.Seq import Seq
import re
import gffutils



In [2]:
# Show up to 50 columns when a DataFrame is displayed in the notebook.
pd.set_option('display.max_columns', 50)

In [3]:
# Load the MSK-IMPACT mutation table (tab-separated, one row per mutation).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; chunked inference yields mixed-type columns (and a
# DtypeWarning) when a column mixes values such as 'GRCh37' and '37'.
filepath = '/Volumes/Sam_G_SSD/2020-06-16-MSK-IMPACT_EDITED.txt'
impact_data = pd.read_csv(filepath, sep='\t', low_memory=False)
impact_data

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,dbSNP_RS,dbSNP_Val_Status,Tumor_Sample_Barcode,Matched_Norm_Sample_Barcode,Match_Norm_Seq_Allele1,Match_Norm_Seq_Allele2,Tumor_Validation_Allele1,Tumor_Validation_Allele2,Match_Norm_Validation_Allele1,Match_Norm_Validation_Allele2,Verification_Status,...,MOTIF_SCORE_CHANGE,PHENO,PICK,PUBMED,PolyPhen,SAS_MAF,SIFT,SOMATIC,SWISSPROT,SYMBOL,SYMBOL_SOURCE,TREMBL,TSL,Transcript,UNIPARC,VARIANT_CLASS,all_effects,amino_acid_change,cDNA_Change,cDNA_position,cdna_change,comments,n_depth,t_depth,transcript
0,BRCA2,675,MSKCC,GRCh37,13,32937315,32937315,+,splice_acceptor_variant,Splice_Site,SNP,G,G,C,rs81002874,,P-0029279-T01-IM6,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
1,BRCA2,0,MSKCC,37,13,32914437,32914438,+,,,DEL,GT,GT,G,rs80359550,,P-0034227-T01-IM6,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
2,MUTYH,4595,MSKCC,GRCh37,1,45798475,45798475,+,missense_variant,Missense_Mutation,SNP,T,T,C,rs34612342,,P-0030735-T01-IM6,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
3,BRCA2,675,MSKCC,GRCh37,13,32893302,32893302,+,frameshift_variant,Frame_Shift_Ins,INS,T,T,GCCGGGCGCGGTGG,,,P-0038798-T01-IM6,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
4,BRCA1,0,MSKCC,37,17,41251824,41251825,+,,,DEL,TG,TG,T,rs80357872,,P-0030162-T01-IM6,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422817,SMARCA4,6597,MSKCC,GRCh37,19,11144132,11144132,+,missense_variant,Missense_Mutation,SNP,C,C,G,,,P-0052864-T01-XS1,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
422818,BRAF,673,MSKCC,GRCh37,7,140453149,140453149,+,missense_variant,Missense_Mutation,SNP,C,C,G,rs121913361,,P-0052867-T01-XS1,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
422819,NRAS,4893,MSKCC,GRCh37,1,115258747,115258747,+,missense_variant,Missense_Mutation,SNP,C,C,T,rs121913237,,P-0052951-T01-XS1,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
422820,TERT,7015,MSKCC,GRCh37,5,1295521,1295521,+,upstream_gene_variant,5'Flank,SNP,A,A,T,,,P-0052951-T01-XS1,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,


In [4]:
# Human -> mouse homology table, one row per gene.
# NOTE: allow_pickle=True unpickles arbitrary Python objects; only load trusted files.
homology_df = pd.DataFrame(
    np.load('/Volumes/Sam_G_SSD/homology_table.npy', allow_pickle=True),
    columns=[
        'gene',
        'human gene name',
        'mouse gene name',
        'mouse id',
        'mouse id version',
        'mouse transcript',
    ],
)

homology_df

Unnamed: 0,gene,human gene name,mouse gene name,mouse id,mouse id version,mouse transcript
0,ABL1,ABL1,Abl1,ENSMUSG00000026842,ENSMUSG00000026842.16,ENSMUST00000028190.12
1,AC004906.3,NONE,NONE,NONE,NONE,NONE
2,AC008738.1,NONE,NONE,NONE,NONE,NONE
3,ACTG1,ACTG1,Actg1,ENSMUSG00000062825,ENSMUSG00000062825.15,ENSMUST00000071555.12
4,ACVR1,ACVR1,Acvr1,ENSMUSG00000026836,ENSMUSG00000026836.15,ENSMUST00000056376.11
...,...,...,...,...,...,...
589,XRCC2,XRCC2,Xrcc2,ENSMUSG00000028933,ENSMUSG00000028933.11,ENSMUST00000030773.11
590,YAP1,YAP1,Yap1,ENSMUSG00000053110,ENSMUSG00000053110.13,ENSMUST00000086580.11
591,YES1,YES1,Yes1,ENSMUSG00000014932,ENSMUSG00000014932.15,ENSMUST00000168707.5
592,ZFHX3,ZFHX3,Zfhx3,ENSMUSG00000038872,ENSMUSG00000038872.10,ENSMUST00000043896.9


In [5]:
# Per-gene annotation table for the human genes (gene/transcript ids, coordinates, strand).
# Later cells look up a gene's human strand in df1 via its 'gene' and 'strand' columns.
filename1 = '/Users/samgould/Desktop/FSR Lab/2022-03-17/gene_info.csv'
df1 = pd.read_csv(filename1)
df1

Unnamed: 0,gene,gene_id,transcript_id,chrom,gene_start,gene_end,transcript_start,transcript_end,strand
0,ABL1,ENSG00000097007.13,ENST00000318560.5,chr9,133589333,133763062,133710453,133763062,+
1,AC004906.3,ENSG00000237286.1,ENST00000423194.1,chr7,2983669,2986725,2983669,2986725,+
2,AC008738.1,ENSG00000230259.2,ENST00000425420.2,chr19,33790853,33793430,33790853,33793430,-
3,ACTG1,ENSG00000184009.5,ENST00000575842.1,chr17,79476997,79490873,79477015,79479807,-
4,ACVR1,ENSG00000115170.9,ENST00000263640.3,chr2,158592958,158732374,158592958,158731623,-
...,...,...,...,...,...,...,...,...,...
589,XRCC2,ENSG00000196584.2,ENST00000359321.1,chr7,152341864,152373250,152343589,152373250,-
590,YAP1,ENSG00000137693.9,ENST00000282441.5,chr11,101981192,102104154,101981192,102104154,+
591,YES1,ENSG00000176105.9,ENST00000314574.4,chr18,721588,812547,721748,812239,-
592,ZFHX3,ENSG00000140836.10,ENST00000268489.5,chr16,72816784,73093597,72816784,73082274,-


In [16]:
#loading in annotation databases for human and mouse
# Both files are opened as existing gffutils databases (nothing is built here).
# The file names suggest GENCODE v19 (human) and GENCODE vM25 (mouse) -- confirm.
file = '/Volumes/Sam_G_SSD/gencode_v19.db'
db = gffutils.FeatureDB(file)

# db_mouse is queried later by mouse transcript id to obtain the transcript's strand.
file_mouse = '/Volumes/Sam_G_SSD/GRCm38.p6 (mouse)/gencode_vM25.db'
db_mouse = gffutils.FeatureDB(file_mouse)

In [6]:
#loading in necessary genes for human
# unique_genes holds the gene names; its positions are used as the gene index
# into the per-gene arrays loaded below (codon locations, codon sequences, amino acids).
# impact_genes is presumably the gene sequence +/- 5000 bp (per the file name) -- confirm.
path = '/Volumes/Sam_G_SSD/human genome GrCh37 IMPACT genes/'
impact_genes = np.load(path + 'human_impact_genes_plusminus5000.npy', allow_pickle=True)
unique_genes = np.load(path + 'human_impact_genes_NAMES.npy', allow_pickle=True)

In [18]:
#loading in mouse genes
# Mouse counterparts of the human gene arrays loaded in the previous cell.
# Note that `path` is rebound here to the mouse directory.
path = '/Volumes/Sam_G_SSD/mouse genome GRCm38.p6 IMPACT genes/'
mouse_genes = np.load(path + 'mouse_impact_genes_plusminus5000.npy', allow_pickle=True)
mouse_gene_names = np.load(path + 'mouse_impact_genes_NAMES.npy', allow_pickle=True)

In [19]:
# Sanity check: look up one mouse feature by id and display its strand.
db_mouse['ENSMUSG00000026842.16'].strand

'+'

In [20]:
#loading in orthologous protein alignment
# For each gene index, human_prot_align / mouse_prot_align hold parallel sequences of
# codon (amino-acid) indices: entry k of the human one is aligned to entry k of the mouse one.

path = '/Volumes/Sam_G_SSD/human_mouse_alignments/'
human_prot_align = np.load(path+'human_alignment_idx.npy', allow_pickle=True)
mouse_prot_align = np.load(path+'mouse_alignment_idx.npy', allow_pickle=True)

In [21]:
#loading in mapping between codons and amino acids
# All arrays are indexed by gene index first, then by codon index:
#   *_codon_locations[gene] -- genomic positions of each codon's bases; searched with
#                              np.where to get (codon index, position within codon)
#   *_codon_seqs[gene]      -- the codon sequences
#   *_aa[gene]              -- the amino-acid sequence
path = '/Volumes/Sam_G_SSD/human_mouse_alignments/'

human_codon_locations = np.load(path+'human_codon_locations.npy', allow_pickle=True)
human_codon_seqs = np.load(path+'human_codon_seqs.npy', allow_pickle=True)
human_aa = np.load(path+'human_aa.npy', allow_pickle=True)

mouse_codon_locations = np.load(path+'mouse_codon_locations.npy', allow_pickle=True)
mouse_codon_seqs = np.load(path+'mouse_codon_seqs.npy', allow_pickle=True)
mouse_aa = np.load(path+'mouse_aa.npy', allow_pickle=True)

# Genes whose mouse amino-acid entry has length 1 are set aside; mutations in
# these genes are later flagged as excluded instead of being analysed.
# (Presumably a length-1 entry is a placeholder for "no mouse ortholog" -- confirm.)
excluded_genes = [
    unique_genes[gene_pos]
    for gene_pos, mouse_protein in enumerate(mouse_aa)
    if len(mouse_protein) == 1
]


# Putting everything together to determine if homologous mutation can be modeled

- In the below script, I am able to determine whether a given mutation falls in a region of alignment (i.e. homology) at varying stringencies of homologous flanking region.
- However, the script is currently incapable of modeling the effects of each mutation in human/mouse. I do, however, record if both the amino acid and DNA sequence are conserved, but this does not take into account potentially synonymous mutations with differing DNA sequences and conserved amino acid sequences. In short, it needs to be fixed, but it is not the current focus of this analysis.
    - Note: the main issue preventing it from being fixed easily is that all mutations are reported on the + strand, but not all genes are transcribed in the + direction (and this directionality doesn't need to match between species). Essentially, it is a difficult indexing problem that could be fixed if enough time were devoted to it.

In [22]:
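#function that takes in the original human/mouse codon(s) and a mutation record, models the mutation
#in both species, and reports whether the outcome is concordant (amino-acid level and DNA level),
#together with the reference and mutant amino acids
#see the notes above for the known limitations of this function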

def _translate_on_strand(seq, strand):
    """Translate a DNA ``Seq`` the way this notebook does for a gene on ``strand``.

    Sequences of '-' strand genes are complemented before being transcribed and
    translated; '+' strand sequences are transcribed and translated as they are.
    """
    if strand == '-':
        return seq.complement().transcribe().translate()
    if strand == '+':
        return seq.transcribe().translate()
    raise ValueError("unexpected strand value: " + repr(strand))


def mutation_modeling(h_codon, m_codon, codon_locs, codon_idx,human_aa_idx, mouse_aa_idx, within_codon_idx, mutation, gene_idx):
    """Model one mutation at an aligned codon in human and mouse and compare the outcomes.

    Args:
        h_codon: human codon sequence at the mutated position.
        m_codon: mouse codon sequence aligned to ``h_codon``.
        codon_locs: genomic positions of the codon bases of the human gene; a
            lookup with ``np.where`` yields (codon index, position within codon).
        codon_idx: index of the human codon containing the mutation start.
        human_aa_idx: index of the mutated codon in the human protein.
        mouse_aa_idx: index of the aligned codon in the mouse protein.
        within_codon_idx: position (0, 1 or 2) of the mutation start in its codon.
        mutation: one-row DataFrame for the mutation; the columns
            ``Variant_Type``, ``End_Position``, ``Tumor_Seq_Allele2`` and
            ``Hugo_Symbol`` are read.
        gene_idx: index of the gene in the per-gene global arrays.

    Returns:
        Tuple ``(aa_concordant, dna_concordant, ref_aa, mut_aa_h, mut_aa_m)``:
        two 0/1 flags, the reference amino acid(s), and the mutant amino acids
        in human and mouse (the string ``'none'`` when they were not modeled).

    Besides its arguments the function reads the notebook globals ``human_aa``,
    ``mouse_aa``, ``human_codon_seqs``, ``mouse_codon_seqs``, ``human_prot_align``,
    ``mouse_prot_align``, ``homology_df``, ``df1`` and ``db_mouse``.

    Known limitation (see the notebook notes): mutations are reported on the
    + strand while genes may be transcribed from either strand, so the modeled
    consequence is only approximate.
    """
    variant_type = mutation['Variant_Type'].values[0]
    end = mutation['End_Position'].values[0]
    mut_allele = mutation['Tumor_Seq_Allele2'].values[0]
    # The gene name is taken from the mutation record itself. It used to be read
    # from an undefined/stale global, which made every strand lookup below either
    # fail with NameError or use the strand of an unrelated gene.
    gene_name = mutation['Hugo_Symbol'].values[0]

    #for recording
    aa_concordant = 0
    dna_concordant = 0
    ref_aa = human_aa[gene_idx][codon_idx]
    mut_aa_h = 'none'
    mut_aa_m = 'none'

    #defining which strand the gene falls on in mouse and human
    tx = homology_df[homology_df['gene']==gene_name]['mouse transcript'].values[0]
    strand_m = db_mouse[tx].strand
    strand_h = df1[df1['gene']==gene_name]['strand'].values[0]
    same_strand = strand_h==strand_m

    #where does the END of the mutation fall (which codon, and where within it)
    codon_locs_arr = np.array(codon_locs)
    end_ind = np.where(codon_locs_arr == end)
    codon_idx_end = end_ind[0][0]
    within_codon_idx_end = end_ind[1][0]

    #number of additional codons spanned by the mutation
    diff = abs(codon_idx_end - codon_idx)

    #human reference sequence over the spanned codons (needed by every variant type)
    seq_ref_h = sum(human_codon_seqs[gene_idx][human_aa_idx:human_aa_idx+diff+1], Seq(''))

    if variant_type=='DEL':
        #deletions: only the reference sequences over the deleted span are compared
        aa_ref_h = sum(human_aa[gene_idx][human_aa_idx:human_aa_idx+diff+1], Seq(''))
        aa_ref_m = sum(mouse_aa[gene_idx][mouse_aa_idx:mouse_aa_idx+diff+1], Seq(''))
        seq_ref_m = sum(mouse_codon_seqs[gene_idx][mouse_aa_idx:mouse_aa_idx+diff+1], Seq(''))

        ref_aa = aa_ref_h #record reference amino acids that are spanned

        # NOTE(review): only the mouse sequence is complemented here, the human
        # one is not -- kept as in the original analysis.
        if strand_m=='-':
            seq_ref_m = seq_ref_m.complement()

        if aa_ref_h == aa_ref_m:
            aa_concordant = 1
        if seq_ref_h == seq_ref_m:
            dna_concordant = 1

    elif variant_type=='INS':
        within_codon_idx_h = within_codon_idx
        if strand_h=='-': #flip indexing if reverse strand
            within_codon_idx_h = 2 - within_codon_idx

        # NOTE(review): this compares genomic positions with a codon *index*, so it
        # is rarely true and most insertions are left unmodeled ('none').
        # Kept unchanged because the intended check is not clear from the code.
        ind_2 = np.where(codon_locs_arr == codon_idx+diff)
        if len(ind_2[0])>0:
            #the inserted allele is complemented for mouse when the strands differ
            allele_m = mut_allele if same_strand else Seq(mut_allele).complement()
            mutant_seq_h = h_codon[0:within_codon_idx_h] + mut_allele + human_codon_seqs[gene_idx][codon_idx+diff][within_codon_idx_h:]
            mutant_seq_m = m_codon[0:within_codon_idx_h] + allele_m + mouse_codon_seqs[gene_idx][mouse_aa_idx+diff][within_codon_idx_h:]

            #add one flanking codon on each side of the mutated sequence
            right_flank_idx_h = min(human_aa_idx+1, max(human_prot_align[gene_idx]))
            right_flank_idx_m = min(mouse_aa_idx+1, max(mouse_prot_align[gene_idx]))
            mutant_seq_h_flanked = human_codon_seqs[gene_idx][human_aa_idx-1] + mutant_seq_h + human_codon_seqs[gene_idx][right_flank_idx_h]
            mutant_seq_m_flanked = mouse_codon_seqs[gene_idx][mouse_aa_idx-1] + mutant_seq_m + mouse_codon_seqs[gene_idx][right_flank_idx_m]

            mut_aa_h = _translate_on_strand(mutant_seq_h_flanked, strand_h)
            ref_aa = _translate_on_strand(seq_ref_h, strand_h)
            mut_aa_m = _translate_on_strand(mutant_seq_m_flanked, strand_m)

            aa_concordant = 1 if mut_aa_h==mut_aa_m else 0

            #need to double check that this is correct...
            if same_strand:
                dna_concordant = 1 if mutant_seq_m_flanked == mutant_seq_h_flanked else 0
            else:
                dna_concordant = 1 if mutant_seq_m_flanked.complement() == mutant_seq_h_flanked else 0

    else: #SNPs, ONPs, DNPs
        within_codon_idx_h = within_codon_idx
        within_codon_idx_end_h = within_codon_idx_end
        if strand_h=='-': #flip indexing if reverse strand
            within_codon_idx_h = 2 - within_codon_idx
            within_codon_idx_end_h = 2 - within_codon_idx_end

        #the substituted allele is complemented for mouse when the strands differ
        allele_m = mut_allele if same_strand else Seq(mut_allele).complement()
        mutant_seq_h = h_codon[0:within_codon_idx_h] + mut_allele + human_codon_seqs[gene_idx][codon_idx+diff][within_codon_idx_end_h+1:]
        mutant_seq_m = m_codon[0:within_codon_idx_h] + allele_m + mouse_codon_seqs[gene_idx][mouse_aa_idx+diff][within_codon_idx_end_h+1:]

        #need to double check that this is correct...
        mut_aa_h = _translate_on_strand(mutant_seq_h, strand_h)
        ref_aa = _translate_on_strand(seq_ref_h, strand_h)
        mut_aa_m = _translate_on_strand(mutant_seq_m, strand_m)

        if same_strand:
            dna_concordant = 1 if mutant_seq_m == mutant_seq_h else 0
        else:
            dna_concordant = 1 if mutant_seq_m.complement() == mutant_seq_h else 0

        aa_concordant = 1 if mut_aa_h==mut_aa_m else 0

    #all of the amino acids at the target site are concordant at this point;
    #the flags returned here refer to the CONSEQUENCE of the mutation
    return aa_concordant, dna_concordant, ref_aa, mut_aa_h, mut_aa_m

In [23]:
#flank_size=2
def ortholog_PE(flank_size):
    """Classify every mutation in ``impact_data`` by human/mouse homology.

    A mutation is classified, in this order, as
      * excluded       -- its gene is listed in ``excluded_genes``;
      * non-coding     -- its start or end position is not found among the
                          codon positions of the human gene;
      * non-homologous -- its codon is not in the aligned region, or the
                          ``flank_size`` amino acids on either side of it
                          (plus the site itself) differ between human and mouse;
      * homologous     -- otherwise; the mutation is then passed to
                          ``mutation_modeling`` to record its consequence.

    Args:
        flank_size: number of amino acids on each side of the mutated codon
            that must be identical between human and mouse.

    Returns:
        DataFrame with one row per mutation (same order as ``impact_data``) and
        the columns ``homologous``, ``non_homologous``, ``coding_mutations``,
        ``non_coding_mutations``, ``excluded_mutations``, ``aa_concordant``,
        ``dna_concordant`` (all 0/1 flags) plus ``original_aa``, ``new_aa_h``
        and ``new_aa_m`` (strings; ``'none'`` unless the mutation is homologous).

    Raises:
        ValueError: if a mutation's gene is not present in ``unique_genes``.

    Reads the notebook globals ``impact_data``, ``unique_genes``,
    ``excluded_genes``, ``human_codon_locations``, ``human_codon_seqs``,
    ``mouse_codon_seqs``, ``human_aa``, ``mouse_aa``, ``human_prot_align`` and
    ``mouse_prot_align``.
    """
    num_mutations = len(impact_data)

    #gene name -> gene index (first occurrence wins, like list.index)
    gene_index = {}
    for idx, name in enumerate(unique_genes):
        gene_index.setdefault(name, idx)
    excluded = set(excluded_genes)

    #columns needed for every mutation, pulled out once instead of once per row
    gene_names = impact_data['Hugo_Symbol'].values
    starts = impact_data['Start_Position'].values
    ends = impact_data['End_Position'].values

    excluded_mutations = np.zeros(num_mutations)
    non_coding_mutations = np.zeros(num_mutations)

    coding_mutations = np.zeros(num_mutations)
    homologous = np.zeros(num_mutations) #refers to homologous at site of mutation (not consequence)
    non_homologous = np.zeros(num_mutations)

    #recording mutational consequences
    aa_concordant_record = np.zeros(num_mutations)
    dna_concordant_record = np.zeros(num_mutations)

    #only recorded for homologous mutations; every other mutation keeps 'none'
    original_aa = ['none'] * num_mutations
    new_aa_h = ['none'] * num_mutations
    new_aa_m = ['none'] * num_mutations

    #codon-position arrays are identical for all mutations of a gene: build each once
    codon_array_cache = {}

    #iterate over the mutations
    for i in range(num_mutations):
        gene_name = gene_names[i]
        gene_idx = gene_index.get(gene_name)
        if gene_idx is None:
            raise ValueError(repr(gene_name) + " is not in unique_genes")

        #first check if the mutation falls in an excluded gene
        if gene_name in excluded:
            excluded_mutations[i] = 1
            continue

        codon_locs = human_codon_locations[gene_idx]
        codon_arr = codon_array_cache.get(gene_idx)
        if codon_arr is None:
            codon_arr = np.array(codon_locs)
            codon_array_cache[gene_idx] = codon_arr

        #where do the start and end of the mutation fall (which codon)
        ind = np.where(codon_arr == starts[i])
        end_ind = np.where(codon_arr == ends[i])

        #then check if both ends fall in a coding sequence
        if len(ind[0]) == 0 or len(end_ind[0]) == 0:
            non_coding_mutations[i] = 1
            continue

        #this leaves the coding mutations
        coding_mutations[i] = 1

        codon_idx = ind[0][0] #which codon is it
        within_codon_idx = ind[1][0] #where does it fall in the codon (0,1,2)

        #checking homology
        #see if the codon falls in a region of alignment (list of codons/AAs that fall in region of hom.)
        aligned_human = human_prot_align[gene_idx]
        if codon_idx not in aligned_human:
            non_homologous[i] = 1
            continue

        aln_idx = aligned_human.index(codon_idx)

        #finding index in mouse & human prot. sequence
        human_aa_idx = aligned_human[aln_idx]
        mouse_aa_idx = mouse_prot_align[gene_idx][aln_idx]

        #checking if flanking region is homologous
        #(max(0, ...) keeps a negative start from wrapping around to the end of the sequence)
        min_mouse = max(0, mouse_aa_idx-flank_size)
        min_human = max(0, human_aa_idx-flank_size)
        mouse_aa_flank = mouse_aa[gene_idx][min_mouse:mouse_aa_idx+flank_size+1]
        human_aa_flank = human_aa[gene_idx][min_human:human_aa_idx+flank_size+1]

        #flank doesn't match == non-homologous
        if not (mouse_aa_flank == human_aa_flank):
            non_homologous[i] = 1
            continue

        #flank matches = homologous at AA level
        homologous[i] = 1

        #mutation_modeling expects the mutation as a one-row DataFrame
        mutation = impact_data.iloc[[i]]
        h_codon = human_codon_seqs[gene_idx][human_aa_idx]
        m_codon = mouse_codon_seqs[gene_idx][mouse_aa_idx]

        aa_concordant, dna_concordant, ref_aa, mut_aa_h, mut_aa_m = mutation_modeling(h_codon, m_codon, codon_locs, codon_idx,human_aa_idx, mouse_aa_idx, within_codon_idx, mutation, gene_idx)

        aa_concordant_record[i] = aa_concordant
        dna_concordant_record[i] = dna_concordant
        original_aa[i] = str(ref_aa)
        new_aa_h[i] = str(mut_aa_h)
        new_aa_m[i] = str(mut_aa_m)

    df = pd.DataFrame({
        'homologous': homologous,
        'non_homologous': non_homologous,
        'coding_mutations': coding_mutations,
        'non_coding_mutations': non_coding_mutations,
        'excluded_mutations': excluded_mutations,
        'aa_concordant': aa_concordant_record,
        'dna_concordant': dna_concordant_record,
        'original_aa': original_aa,
        'new_aa_h': new_aa_h,
        'new_aa_m': new_aa_m,
    })

    return df


# Looking at each mutation in the dataset and quantifying homology
- Iterating over different homology thresholds (flank sizes) for the region flanking the mutation of interest

In [24]:
# Run the homology analysis for flank sizes 0 through 20 and write one CSV per flank size.
path = '/Volumes/Sam_G_SSD/'
for i in range(21):
    flank_size = i
    df = ortholog_PE(flank_size)
    # attach the variant type of every mutation (rows are in impact_data order)
    df['variant_type'] = np.asarray(impact_data['Variant_Type'])
    df.to_csv(f'{path}flanksize_{i}.csv')
    
    #these files are provided as well in the dropbox link

