### Load Python libraries and define functions
- Links included for instructions on how to install libraries using pip or conda (if libraries not included with conda)

In [None]:
from Bio import SeqIO     # https://biopython.org/wiki/SeqIO
from Bio.SeqIO.FastaIO import SimpleFastaParser

import pandas as pd
import numpy as np
import os
import random
import pickle
import gzip
import re
from liftover import ChainFile    # https://pypi.org/project/liftover/
from liftover import get_lifter

import kaleido    # https://github.com/plotly/Kaleido
import plotly     # https://plotly.com/python/getting-started/
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default = 'iframe'
pio.templates.default = "none"

In [None]:
bases = ['A', 'T', 'G', 'C']
def reverse_complement(dna):
    """Return the reverse complement of ``dna``.

    Only the upper-case symbols A, C, G, T and N are recognised; any other
    symbol raises ``KeyError``.
    """
    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    # Walk the sequence back-to-front and swap each base for its partner.
    return ''.join(pairing[base] for base in reversed(dna))
def repeat_frames_RC(input_seq):
    """List every cyclic rotation of ``input_seq`` on both strands.

    Returns the sorted, de-duplicated list containing each rotation ("frame")
    of the motif together with the reverse complement of each rotation.
    """
    unit_len = len(input_seq)
    doubled = ''.join(input_seq * 2)
    # Sliding a window of one unit over the doubled motif yields each rotation.
    rotations = [doubled[offset:offset + unit_len] for offset in range(unit_len)]
    both_strands = rotations + [reverse_complement(frame) for frame in rotations]
    return sorted(set(both_strands))

In [None]:
def get_sequence(chrom, start, end, genome):
    """Return ``genome[chrom][start:end]``, tolerating several chromosome spellings.

    Parameters
    ----------
    chrom : str or int
        Chromosome identifier. It is tried as given, then with its first three
        characters removed (e.g. 'chr7' -> '7'), then as an integer.
    start, end : number
        Slice bounds. They may be passed in either order; the smaller one is
        used as the start. Negative bounds are clamped to 0 so that a window
        reaching past the beginning of a chromosome does not wrap around to
        its end.
    genome : mapping
        Chromosome key -> sequence.

    Returns
    -------
    The sliced sequence, or ``np.nan`` when no spelling of ``chrom`` is a key
    of ``genome``.
    """
    lower = max(int(min(start, end)), 0)
    upper = max(int(max(start, end)), 0)

    # Build the list of keys to try, in priority order.
    candidates = [chrom]
    if isinstance(chrom, str):
        stripped = chrom[3:]
        candidates.append(stripped)
        for text in (stripped, chrom):
            # Non-numeric names such as 'X' simply have no integer spelling.
            try:
                candidates.append(int(text))
            except ValueError:
                pass

    for key in candidates:
        if key in genome:
            return genome[key][lower:upper]
    return np.nan

### Load reference genome

In [None]:
# Autosomes 1-22 as integers; the XY variant appends the sex chromosomes as strings.
chrom_list = [number for number in range(1, 23)]
chrom_list_XY = chrom_list + ['X', 'Y']

#### T2T-CHM13 genome

In [None]:
# Load the T2T-CHM13 assembly into memory, keeping only chromosomes 1-22
# (integer keys) and X/Y (string keys). Sequences are stored upper-cased.
CHM13_genome = {}
autosome_names = {str(number) for number in chrom_list}
with gzip.open('genomes/hs1.fa.gz', 'rt') as fasta_file:
    for header, residues in SimpleFastaParser(fasta_file):
        # Drop the first three characters of the record title (the 'chr' prefix).
        chrom = header[3:]
        if chrom in autosome_names:
            key = int(chrom)
        elif chrom in ('X', 'Y'):
            key = chrom
        else:
            continue
        CHM13_genome[key] = residues.upper()
        print(chrom, end=', ')

#### hg38 genome
- download "hg38.fa.gz" from http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/

In [None]:
# Load the hg38 assembly into memory, keeping only chromosomes 1-22
# (integer keys) and X/Y (string keys). Sequences are stored upper-cased.
hg38_genome = {}
autosome_names = {str(number) for number in chrom_list}
with gzip.open('genomes/hg38.fa.gz', 'rt') as fasta_file:
    for header, residues in SimpleFastaParser(fasta_file):
        # Drop the first three characters of the record title (the 'chr' prefix);
        # other records (alt/random/unplaced contigs) fall through and are skipped.
        chrom = header[3:]
        if chrom in autosome_names:
            key = int(chrom)
        elif chrom in ('X', 'Y'):
            key = chrom
        else:
            continue
        hg38_genome[key] = residues.upper()
        print(chrom, end=', ')

# de novo instability rate measurement

## Prepare de novo mutation database
- Gather de novo data from all available public sources (trio/family sequencing)
- Place files in directory './denovo/download/'

### de novo data aligned to hg19

#### data from Goldmann et al. 2016 (hg19)
- Parent-of-origin-specific signatures of de novo mutations
- phased
- 816 trios

In [None]:
# pandas ignores the order of `usecols` and returns the columns in file order,
# so the columns are renamed by name (not by position) to guarantee that each
# label lands on the intended column.
trio_gold = pd.read_excel(
    './denovo/download/41588_2016_BFng3597_MOESM69_ESM_hg19.xlsx',
    usecols=['Chromosome', 'Start.position', 'Reference', 'Variant', 'parentOfOrigin'],
)
trio_gold = trio_gold.rename(columns={
    'Chromosome': 'chrom',
    'Start.position': 'pos',
    'Reference': 'ref',
    'Variant': 'alt',
    'parentOfOrigin': 'parent',
})

#### data from Goes et al 2021 (hg19)
- De novo variation in bipolar disorder
- unphased
- 97 trios

In [None]:
# Variants are encoded as a single 'chr_bp_ref_alt' string; split it into its
# four fields, drop rows with any missing value and add the 'chr' prefix.
variant_fields = ['chrom', 'pos', 'ref', 'alt']
goes2019 = pd.read_excel(
    './denovo/download/41380_2019_611_MOESM3_ESM.xlsx',
    usecols=['chr_bp_ref_alt', 'SNV'],
)
goes2019[variant_fields] = goes2019['chr_bp_ref_alt'].str.split('_', expand=True)
goes2019 = goes2019.dropna()[variant_fields]
goes2019['chrom'] = goes2019['chrom'].map(lambda name: 'chr' + str(name))

#### data from Yuen et al. 2016 (hg19)
- Genome-wide characteristics of de novo mutations in autism
- 192 trios
- phased

In [None]:
# pandas ignores the order of `usecols` and returns the columns in file order,
# so the columns are renamed by name (not by position) to guarantee that each
# label lands on the intended column.
yuen_trios = pd.read_excel(
    './denovo/download/41525_2016_BFnpjgenmed201627_MOESM431_ESM.xlsx',
    sheet_name='Table S4',
    skiprows=1,
    usecols=['Chromosome', 'Start', 'Reference', 'Allel', 'Parental Origin'],
)
yuen_trios = yuen_trios.rename(columns={
    'Chromosome': 'chrom',
    'Start': 'pos',
    'Reference': 'ref',
    'Allel': 'alt',
    'Parental Origin': 'parent',
})

#### data from Sasani 2019 (hg19)
- Large, three-generation human families reveal post-zygotic mosaicism and variability in germline mutation accumulation
- phased
- 350 3rd generation offspring, 70 2nd generation offspring (420 genomes)

In [None]:
# The four CEPH DNM tables share one directory and one set of columns.
ceph_dir = './denovo/download/ceph-dnm-manuscript-master/data/'
ceph_cols = ['chrom', 'start', 'ref', 'alt', 'paternal_age_at_birth', 'maternal_age_at_birth', 'phase']

dnm_ceph_gen2 = pd.read_csv(ceph_dir + 'second_gen.dnms.txt', sep='\t', usecols=ceph_cols)
dnm_ceph_gen3 = pd.read_csv(ceph_dir + 'third_gen.dnms.txt', sep='\t', usecols=ceph_cols)
dnm_ceph_gen2_gon = pd.read_csv(ceph_dir + 'gonosomal.dnms.txt', sep='\t', usecols=ceph_cols)
dnm_ceph_gen3_gon = pd.read_csv(ceph_dir + 'post-pgcs.dnms.txt', sep='\t', usecols=ceph_cols)

dnm_ceph = pd.concat([dnm_ceph_gen2, dnm_ceph_gen2_gon, dnm_ceph_gen3, dnm_ceph_gen3_gon])
dnm_ceph['chrom'] = ['chr' + str(chrom) for chrom in dnm_ceph['chrom']]

# pandas returns `usecols` columns in file order, not in the order requested,
# so rename by name rather than by position to keep every label on its column.
dnm_ceph = dnm_ceph.rename(columns={
    'start': 'pos',
    'paternal_age_at_birth': 'paternal_age',
    'maternal_age_at_birth': 'maternal_age',
})

#### Liftover hg19 data to hg38

In [None]:
# Pool the four hg19 call sets into one table and make positions integer.
trio_hg19 = pd.concat([trio_gold, goes2019, dnm_ceph, yuen_trios]).reset_index(drop = True)
trio_hg19['pos'] = trio_hg19['pos'].astype(int)

# Liftover to hg38: convert every (chrom, pos) pair and keep the first hit's
# chromosome and position; coordinates with no hit become NaN.
lo = get_lifter('hg19', 'hg38')
trio_hg19['hg38_lo'] = [lo.convert_coordinate(chrom, pos) for chrom,pos in zip(trio_hg19['chrom'], trio_hg19['pos'])]
trio_hg19['hg38_chr'] = [pos[0][0] if len(pos) >0 else np.nan for pos in trio_hg19['hg38_lo']]
trio_hg19['hg38_pos'] = [pos[0][1] if len(pos) >0 else np.nan for pos in trio_hg19['hg38_lo']]

# Drop variants that failed to lift over, keep autosomes (chr1-chr22) only,
# and store the chromosome as an integer without the 'chr' prefix.
trio_hg19 = trio_hg19.dropna(subset = ['hg38_chr'], axis=0)
trio_hg19 = trio_hg19.loc[trio_hg19['hg38_chr'].isin(['chr'+str(n) for n in range(1,23)])].copy()
trio_hg19['hg38_chr'] = [chrom[3:] for chrom in trio_hg19['hg38_chr']]
trio_hg19['hg38_chr'] = trio_hg19['hg38_chr'].astype(int)
trio_hg19['hg38_pos'] = trio_hg19['hg38_pos'].astype(int)
# Keep only variants whose reported ref allele equals the looked-up hg38 base.
# NOTE(review): `reference_lookup` is not defined in the visible part of this
# notebook — confirm it is defined elsewhere and which coordinate convention
# (0- or 1-based) it expects.
trio_hg19['hg38_base'] = [reference_lookup(c,p,0, hg38_genome) for c,p in zip(trio_hg19['hg38_chr'], trio_hg19['hg38_pos'])]
trio_hg19 = trio_hg19.loc[trio_hg19['ref'] == trio_hg19['hg38_base']].copy()
# Only 'parent' is carried forward as phasing information; the 'phase' and
# parental-age columns contributed by dnm_ceph are dropped here.
trio_hg19 = trio_hg19[['hg38_chr', 'hg38_pos', 'ref', 'alt', 'parent']].copy()
trio_hg19.columns = ['chrom', 'pos', 'ref', 'alt', 'parent']

### de novo data aligned to hg38

#### data from Halldorsson 2019 in Science (hg38)
- Characterizing mutagenic effects of recombination through a sequence-level genetic map
- phased
- 2976 trios

In [None]:
trio_set1 = pd.read_csv(
    './denovo/download/aau1043_DataS5_revision1.tsv',
    skiprows=11, sep='\t', low_memory=False,
)
trio_set1_ages = pd.read_csv('./denovo/download/aau1043_DataS7.tsv', skiprows=4, sep='\t')

# Index the age table by proband so every DNM row can look up its parents' ages.
trio_set1_ages.index = trio_set1_ages['Proband_id']
for age_col in ('Father_age', 'Mother_age'):
    age_by_proband = trio_set1_ages[age_col]
    trio_set1[age_col] = [age_by_proband[pro] for pro in trio_set1['Proband_id']]

# Normalise chromosome names: strip a leading 'chr', then store autosomes as
# integers while X and Y stay strings.
bare_names = [chrom[3:] if chrom.startswith('chr') else chrom for chrom in trio_set1['Chr']]
trio_set1['Chr'] = [chrom if chrom in ['X', 'Y'] else int(chrom) for chrom in bare_names]

# Keep the seven columns used downstream and give them the shared names.
trio_set1 = trio_set1[['Chr', 'Pos', 'Ref', 'Alt', 'Phase_combined', 'Father_age', 'Mother_age']].rename(columns={
    'Chr': 'chrom',
    'Pos': 'pos',
    'Ref': 'ref',
    'Alt': 'alt',
    'Phase_combined': 'parent',
    'Father_age': 'paternal_age',
    'Mother_age': 'maternal_age',
})

#### data from Jonsson et al. 2017 in Nature (hg38)
- Parental influence on human germline de novo mutations in 1,548 trios from Iceland
- 1548 trios
- phased

In [None]:
trio_set2 = pd.read_csv(
    './denovo/download/decode_DNMs.tsv',
    sep='\t',
    usecols=['Chr', 'Pos_hg38', 'Ref', 'Alt', 'Discordant_in_3_gen_or_mz_twins',
             'Fathers_age_at_conception', 'Mothers_age_at_conception', 'Phase_combined'],
)

# Normalise chromosome names: strip a leading 'chr', then store autosomes as
# integers while X and Y stay strings.
trio_set2['Chr'] = [chrom[3:] if chrom.startswith('chr') else chrom for chrom in trio_set2['Chr']]
trio_set2['Chr'] = [int(chrom) if chrom not in ['X', 'Y'] else chrom for chrom in trio_set2['Chr']]

# pandas returns `usecols` columns in file order, not in the order requested,
# so rename by name rather than by position to keep every label on its column.
trio_set2 = trio_set2.rename(columns={
    'Chr': 'chrom',
    'Pos_hg38': 'pos',
    'Ref': 'ref',
    'Alt': 'alt',
    'Discordant_in_3_gen_or_mz_twins': 'discordant',
    'Fathers_age_at_conception': 'paternal_age',
    'Mothers_age_at_conception': 'maternal_age',
    'Phase_combined': 'parent',
})

# Discard calls flagged 'Discordant', then drop the flag column itself.
trio_set2 = trio_set2.loc[trio_set2['discordant'] != 'Discordant'].drop(columns='discordant')

#### data from An et al. 2018 in Science (hg38)
- Genome-wide de novo risk score implicates promoter variation in autism spectrum disorder
- unphased
- 1902 quartets (3804 genomes)

In [None]:
trio_an = pd.read_excel(
    './denovo/download/aat6576_Table-S2_hg38_notphased.xlsx',
    skiprows=1,
    usecols=['Chr', 'Pos', 'Ref', 'Alt'],
)

# Normalise chromosome names: strip a leading 'chr', then store autosomes as
# integers while X and Y stay strings.
trio_an['Chr'] = [chrom[3:] if chrom.startswith('chr') else chrom for chrom in trio_an['Chr']]
trio_an['Chr'] = [int(chrom) if chrom not in ['X', 'Y'] else chrom for chrom in trio_an['Chr']]

# pandas returns `usecols` columns in file order, not in the order requested,
# so rename by name rather than by position to keep every label on its column.
trio_an = trio_an.rename(columns={'Chr': 'chrom', 'Pos': 'pos', 'Ref': 'ref', 'Alt': 'alt'})

#### data from Jonsson et al 2021 (hg38)
- Differences between germline genomes of monozygotic twins
- 451 offspring in quads
- 608 offspring in three-generation approach

In [None]:
# de novo mutations from quads
# pandas returns `usecols` columns in file order, not in the order requested,
# so both tables are renamed by name rather than by position.

# de novo mutations from quads
jonsson_quads = pd.read_csv(
    './denovo/download/41588_2020_755_MOESM5_ESM.tsv',
    sep='\t',
    usecols=['Chr', 'Pos', 'Ref', 'Alt', 'Child_Fathers_Age_at_birth', 'Child_Mothers_Age_at_birth'],
)
jonsson_quads = jonsson_quads.rename(columns={
    'Chr': 'chrom',
    'Pos': 'pos',
    'Ref': 'ref',
    'Alt': 'alt',
    'Child_Fathers_Age_at_birth': 'paternal_age',
    'Child_Mothers_Age_at_birth': 'maternal_age',
})

# de novo mutations from three-generation approach
jonsson_3gen = pd.read_csv(
    './denovo/download/41588_2020_755_MOESM4_ESM.tsv',
    sep='\t',
    usecols=['Chr', 'Pos', 'Ref', 'Alt'],
)
jonsson_3gen = jonsson_3gen.rename(columns={'Chr': 'chrom', 'Pos': 'pos', 'Ref': 'ref', 'Alt': 'alt'})

# Normalise chromosome names in both tables: strip a leading 'chr', then store
# autosomes as integers while X and Y stay strings.
jonsson_quads['chrom'] = [chrom[3:] if chrom.startswith('chr') else chrom for chrom in jonsson_quads['chrom']]
jonsson_quads['chrom'] = [int(chrom) if chrom not in ['X', 'Y'] else chrom for chrom in jonsson_quads['chrom']]

jonsson_3gen['chrom'] = [chrom[3:] if chrom.startswith('chr') else chrom for chrom in jonsson_3gen['chrom']]
jonsson_3gen['chrom'] = [int(chrom) if chrom not in ['X', 'Y'] else chrom for chrom in jonsson_3gen['chrom']]

### Combine all into single de novo database

#### SNVs

In [None]:
# Stack every call set (hg38-native first, lifted-over hg19 last) into one table.
snv_sources = [trio_set1, trio_set2, trio_an, jonsson_3gen, jonsson_quads, trio_hg19]
denovo_combined = pd.concat(snv_sources).reset_index(drop=True)

# An SNV has exactly one reference base and one alternate base.
single_ref = denovo_combined['ref'].str.len() == 1
single_alt = denovo_combined['alt'].str.len() == 1
denovo_combined = denovo_combined.loc[single_ref & single_alt].copy()

# Lower-case the parent-of-origin labels; rows without one become 'unassigned'.
denovo_combined['parent'] = denovo_combined['parent'].str.lower()
denovo_combined['parent'] = denovo_combined['parent'].fillna('unassigned')

# NOTE(review): `tri_function` is not defined in the visible part of this
# notebook — confirm it is defined elsewhere; downstream cells treat 'tri' as
# a three-letter sequence context string.
denovo_combined['tri'] = [
    tri_function(chrom, pos)
    for chrom, pos in zip(denovo_combined['chrom'], denovo_combined['pos'])
]

# Save database
denovo_combined.to_csv('./denovo/all_denovo_snvs.csv', index=False)

In [None]:
# Offspring counts of the nine SNV call sets, as quoted in the data-set notes
# of this notebook (trios / genomes / offspring per study).
snv_cohort_sizes = [816, 97, 192, 420, 2976, 1548, 3804, 451, 608]
# Doubled — presumably two haploid genomes per offspring; confirm.
denovo_n_genomes_snv = sum(snv_cohort_sizes) * 2

#### Indels
- note: hg19 files do not contain any indels
- all hg38 files contain indels

In [None]:
# Only the hg38-native call sets are pooled for indels.
indel_sources = [trio_set1, trio_set2, trio_an, jonsson_3gen, jonsson_quads]
denovo_combined_indel = pd.concat(indel_sources).reset_index(drop=True)

# Anything that is not one-base-for-one-base is treated as an indel.
ref_not_single = denovo_combined_indel['ref'].str.len() != 1
alt_not_single = denovo_combined_indel['alt'].str.len() != 1
denovo_combined_indel = denovo_combined_indel.loc[ref_not_single | alt_not_single].copy()

# Lower-case the parent-of-origin labels; rows without one become 'unassigned'.
denovo_combined_indel['parent'] = denovo_combined_indel['parent'].str.lower()
denovo_combined_indel['parent'] = denovo_combined_indel['parent'].fillna('unassigned')

# Save database
denovo_combined_indel.to_csv('./denovo/all_denovo_indels.csv', index=False)

In [None]:
# Offspring counts of the five hg38 call sets that contribute indels, as
# quoted in the data-set notes of this notebook.
indel_cohort_sizes = [2976, 1548, 3804, 451, 608]
# Doubled — presumably two haploid genomes per offspring; confirm.
denovo_n_genomes_indel = sum(indel_cohort_sizes) * 2

## Calculate de novo substitution rate considering context for A-mononucleotide repeats

In [None]:
# Restart point: reload the saved SNV table and restate the genome count so
# the rate calculations can run without rebuilding the database.
snv_cohort_sizes = [816, 97, 192, 420, 2976, 1548, 3804, 451, 608]
denovo_n_genomes_snv = sum(snv_cohort_sizes) * 2
denovo_combined_snv = pd.read_csv('denovo/all_denovo_snvs.csv')

In [None]:
# Display the loaded SNV table for a visual check.
denovo_combined_snv

In [None]:
# Collapse each triplet to a two-letter alphabet: 'A' stays, C/T/G become 'B'.
tri_two_state = denovo_combined_snv['tri']
for other_base in ('C', 'T', 'G'):
    tri_two_state = tri_two_state.str.replace(other_base, 'B')
denovo_combined_snv['tri_AB'] = tri_two_state

# Assign every SNV to the nearest multiple of 100 kb and count SNVs per bin.
denovo_combined_snv['pos_100kb'] = denovo_combined_snv['pos'].div(1e5).round().mul(1e5)
snvs_by_bin = denovo_combined_snv.groupby(['chrom', 'pos_100kb'])
count_by_100kb = snvs_by_bin['ref'].count()

In [None]:
# Denominator for the substitution rates: count every triplet in each 100 kb
# window that contains at least one de novo SNV.
triplet_counts_mutregions = dict()
for chrom, pos in count_by_100kb.index:
    # NOTE(review): 'pos' is the SNV position rounded to the nearest 100 kb,
    # while the window read here starts at 'pos' and runs 100 kb forward, so it
    # is offset from the span of positions that round to 'pos' — confirm this
    # is intended. int(chrom) also assumes autosomes only.
    seq = hg38_genome[int(chrom)][int(pos):int(pos+1e5)]
    # Non-overlapping triplets in each of the three reading frames together
    # cover every overlapping triplet of the window.
    triplet_counts_mutregions[(chrom, pos, 'A')] = pd.Series(re.findall('...', seq), dtype=object).value_counts()
    triplet_counts_mutregions[(chrom, pos, 'B')] = pd.Series(re.findall('...', seq[1:]), dtype=object).value_counts()
    triplet_counts_mutregions[(chrom, pos, 'C')] = pd.Series(re.findall('...', seq[2:]), dtype=object).value_counts()
    print ('\r' + 'finished chr'+str(chrom) + ' '+str(pos), end = ' ')
# Sum over all windows and frames, then keep only triplets seen in the SNV table.
triplet_counts_mutregions_sum = pd.concat(triplet_counts_mutregions, axis=1).sum(axis=1).reindex(denovo_combined_snv['tri'].value_counts().index).sort_index()

# Pool the triplet totals by their two-letter (A / not-A) form.
# NOTE(review): the next lines rely on reset_index() producing columns named
# 'tri' and 0, which depends on how the installed pandas names the
# value_counts() index — verify on the pandas version in use.
triplet_counts_mutregions_AB = triplet_counts_mutregions_sum.reset_index()
triplet_counts_mutregions_AB['tri_AB'] = triplet_counts_mutregions_AB['tri'].str.replace('C', 'B').str.replace('T', 'B').str.replace('G', 'B')
triplet_counts_mutregions_AB = triplet_counts_mutregions_AB.groupby(['tri_AB'])[0].sum()

In [None]:
# Per-site, per-genome substitution rates in the two-letter A / not-A alphabet.
# Each rate is (SNVs observed in a context) / (sites with that context in the
# mutated windows) / (number of genomes).
alt_is_A = denovo_combined_snv['alt'] == 'A'
context = denovo_combined_snv['tri_AB']
sites = triplet_counts_mutregions_AB

# SNVs whose alt allele is A, split by the flanking pattern of the triplet.
n_fusion = len(denovo_combined_snv.loc[alt_is_A & (context == 'ABA')])
n_expansion = len(denovo_combined_snv.loc[alt_is_A & (context == 'BBB')])
n_01 = len(denovo_combined_snv.loc[alt_is_A & ((context == 'ABB') | (context == 'BBA'))])
# SNVs of any alt allele, selected only by a triplet with A in the centre.
n_fission = len(denovo_combined_snv.loc[context == 'AAA'])
n_contraction = len(denovo_combined_snv.loc[context == 'BAB'])
n_10 = len(denovo_combined_snv.loc[(context == 'AAB') | (context == 'BAA')])

Afusion = n_fusion / sites['ABA'] / denovo_n_genomes_snv
Aexpansion = n_expansion / sites['BBB'] / denovo_n_genomes_snv
A01 = n_01 / sites[['ABB', 'BBA']].sum() / denovo_n_genomes_snv
Afission = n_fission / sites['AAA'] / denovo_n_genomes_snv
Acontraction = n_contraction / sites['BAB'] / denovo_n_genomes_snv
A10 = n_10 / sites[['AAB', 'BAA']].sum() / denovo_n_genomes_snv

rate_labels = ['Afission', 'Acontraction', 'A10', 'Afusion', 'Aexpansion', 'A01']
rate_values = [Afission, Acontraction, A10, Afusion, Aexpansion, A01]
denovo_mut_freq_AB = pd.Series(rate_values, index=rate_labels)
denovo_mut_freq_AB.to_pickle('denovo/denovo_mut_freq_AB.pickle')

In [None]:
# Restart point: reload the cached A / not-A rates written by this notebook
# and display them. Only unpickle files you produced yourself.
denovo_mut_freq_AB = pd.read_pickle('denovo/denovo_mut_freq_AB.pickle')
denovo_mut_freq_AB

In [None]:
# A>B and B>A rates, ignoring triplet context
ref_is_A = denovo_combined_snv['ref'] == 'A'
alt_is_A = denovo_combined_snv['alt'] == 'A'
snvs_all_A_B = denovo_combined_snv.loc[ref_is_A]['tri'].value_counts()
snvs_all_B_A = denovo_combined_snv.loc[alt_is_A]['tri'].value_counts()

# Each rate divides the SNV total by the number of sites carrying any of the
# triplets in which such SNVs were observed, then by the number of genomes.
sites_A_B = triplet_counts_mutregions_sum.reindex(snvs_all_A_B.index).sum()
sites_B_A = triplet_counts_mutregions_sum.reindex(snvs_all_B_A.index).sum()
rate_A_B = snvs_all_A_B.sum() / sites_A_B / denovo_n_genomes_snv
rate_B_A = snvs_all_B_A.sum() / sites_B_A / denovo_n_genomes_snv

rates_mu_nu = pd.Series([rate_A_B, rate_B_A], index=['A>B', 'B>A'])
rates_mu_nu.to_pickle('denovo/denovo_mut_freq_mu_nu.pickle')

In [None]:
# Restart point: reload the cached context-free A>B / B>A rates written by
# this notebook. Only unpickle files you produced yourself.
rates_mu_nu = pd.read_pickle('denovo/denovo_mut_freq_mu_nu.pickle')

In [None]:
# Same calculation as for A / not-A, but in a C / not-C alphabet:
# 'C' stays, A/T/G become 'D'.
triplet_counts_mutregions_CD = triplet_counts_mutregions_sum.reset_index()
site_tri_two_state = triplet_counts_mutregions_CD['tri']
snv_tri_two_state = denovo_combined_snv['tri']
for other_base in ('A', 'T', 'G'):
    site_tri_two_state = site_tri_two_state.str.replace(other_base, 'D')
    snv_tri_two_state = snv_tri_two_state.str.replace(other_base, 'D')
triplet_counts_mutregions_CD['tri_CD'] = site_tri_two_state
triplet_counts_mutregions_CD = triplet_counts_mutregions_CD.groupby(['tri_CD'])[0].sum()

denovo_combined_snv['tri_CD'] = snv_tri_two_state

alt_is_C = denovo_combined_snv['alt'] == 'C'
context = denovo_combined_snv['tri_CD']
sites = triplet_counts_mutregions_CD

# SNVs whose alt allele is C, split by the flanking pattern of the triplet.
n_fusion = len(denovo_combined_snv.loc[alt_is_C & (context == 'CDC')])
n_expansion = len(denovo_combined_snv.loc[alt_is_C & (context == 'DDD')])
n_01 = len(denovo_combined_snv.loc[alt_is_C & ((context == 'CDD') | (context == 'DDC'))])
# SNVs of any alt allele, selected only by a triplet with C in the centre.
n_fission = len(denovo_combined_snv.loc[context == 'CCC'])
n_contraction = len(denovo_combined_snv.loc[context == 'DCD'])
n_10 = len(denovo_combined_snv.loc[(context == 'CCD') | (context == 'DCC')])

Cfusion = n_fusion / sites['CDC'] / denovo_n_genomes_snv
Cexpansion = n_expansion / sites['DDD'] / denovo_n_genomes_snv
C01 = n_01 / sites[['CDD', 'DDC']].sum() / denovo_n_genomes_snv
Cfission = n_fission / sites['CCC'] / denovo_n_genomes_snv
Ccontraction = n_contraction / sites['DCD'] / denovo_n_genomes_snv
C10 = n_10 / sites[['CCD', 'DCC']].sum() / denovo_n_genomes_snv

# NOTE(review): the C-based rates are stored under the 'A...' labels —
# presumably so downstream code can use one set of keys; confirm this is intended.
rate_labels = ['Afission', 'Acontraction', 'A10', 'Afusion', 'Aexpansion', 'A01']
rate_values = [Cfission, Ccontraction, C10, Cfusion, Cexpansion, C01]
denovo_mut_freq_CD = pd.Series(rate_values, index=rate_labels)
denovo_mut_freq_CD.to_pickle('denovo/denovo_mut_freq_CD.pickle')

In [None]:
# Restart point: reload the cached C / not-C rates written by this notebook.
# Only unpickle files you produced yourself.
denovo_mut_freq_CD = pd.read_pickle('denovo/denovo_mut_freq_CD.pickle')

#### Poisson random sampling for substitution rates

In [None]:
# Poisson resampling of the A / not-A substitution rates: each observed SNV
# count is replaced by a Poisson draw with that count as its mean, 200 times.

# The observed counts and site totals do not change between replicates, so
# they are computed once here instead of re-filtering the table in every pass.
alt_is_A = denovo_combined_snv['alt'] == 'A'
context = denovo_combined_snv['tri_AB']

n_fusion = len(denovo_combined_snv.loc[alt_is_A & (context == 'ABA')])
n_expansion = len(denovo_combined_snv.loc[alt_is_A & (context == 'BBB')])
n_01 = len(denovo_combined_snv.loc[alt_is_A & ((context == 'ABB') | (context == 'BBA'))])
n_fission = len(denovo_combined_snv.loc[context == 'AAA'])
n_contraction = len(denovo_combined_snv.loc[context == 'BAB'])
n_10 = len(denovo_combined_snv.loc[(context == 'AAB') | (context == 'BAA')])

sites_fusion = triplet_counts_mutregions_AB['ABA']
sites_expansion = triplet_counts_mutregions_AB['BBB']
sites_01 = triplet_counts_mutregions_AB[['ABB', 'BBA']].sum()
sites_fission = triplet_counts_mutregions_AB['AAA']
sites_contraction = triplet_counts_mutregions_AB['BAB']
sites_10 = triplet_counts_mutregions_AB[['AAB', 'BAA']].sum()

rate_labels = ['Afission', 'Acontraction', 'A10', 'Afusion', 'Aexpansion', 'A01']

denovo_mut_freq_AB_poisson = dict()
for i in range(200):
    # The draws are made in the same order as before, and the module-level
    # names Afusion ... A10 are still overwritten with the latest replicate.
    Afusion = np.random.poisson(n_fusion) / sites_fusion / denovo_n_genomes_snv
    Aexpansion = np.random.poisson(n_expansion) / sites_expansion / denovo_n_genomes_snv
    A01 = np.random.poisson(n_01) / sites_01 / denovo_n_genomes_snv
    Afission = np.random.poisson(n_fission) / sites_fission / denovo_n_genomes_snv
    Acontraction = np.random.poisson(n_contraction) / sites_contraction / denovo_n_genomes_snv
    A10 = np.random.poisson(n_10) / sites_10 / denovo_n_genomes_snv

    denovo_mut_freq_AB_poisson[i] = pd.Series([Afission, Acontraction, A10, Afusion, Aexpansion, A01], index=rate_labels)
# One column per replicate, one row per rate.
denovo_mut_freq_AB_poisson = pd.concat(denovo_mut_freq_AB_poisson, axis=1)

In [None]:
# Cache the Poisson replicates so the resampling need not be rerun.
denovo_mut_freq_AB_poisson.to_pickle('denovo/denovo_mut_freq_AB_poisson.pickle')

In [None]:
# Restart point: reload the cached Poisson replicates written by this
# notebook. Only unpickle files you produced yourself.
denovo_mut_freq_AB_poisson = pd.read_pickle('denovo/denovo_mut_freq_AB_poisson.pickle')

## Calculate de novo indel rates

#### denominator for repeat lengths (count repeats in sequenceable regions)

In [None]:
def starting_counts(input_seq, input_nuc):
    """Tabulate run lengths of a motif, and of the gaps before each run.

    For a single-character ``input_nuc`` the sequence is scanned character by
    character. For a longer motif the sequence is first cut into consecutive,
    non-overlapping chunks of the motif's length (starting at offset 0), and
    each chunk is compared with the motif.

    Returns
    -------
    A_count : pandas.Series
        Run length (in units) -> number of uninterrupted runs of that length.
    B_count : pandas.Series
        Gap length (in units) -> number of gaps of that length, where a gap is
        the stretch of non-matching units immediately before a run. The
        stretch after the last run is not counted.
    """
    unit_len = len(input_nuc)
    units = re.findall('.' * unit_len, input_seq) if unit_len > 1 else input_seq

    # Indices of the units that equal the motif.
    matches = np.array([unit == input_nuc for unit in units], dtype=bool)
    hit_idx = np.flatnonzero(matches)

    # A hit opens a run when the previous hit is not directly before it, and
    # closes a run when the next hit is not directly after it.
    opens_run = hit_idx != np.roll(hit_idx, 1) + 1
    closes_run = hit_idx != np.roll(hit_idx, -1) - 1
    pos_list = pd.DataFrame({'start': hit_idx[opens_run], 'end': hit_idx[closes_run]})

    pos_list['length'] = pos_list['end'] - pos_list['start'] + 1
    A_count = pos_list['length'].value_counts()

    # Gap before each run; the first run is measured from the sequence start.
    previous_end = pos_list['end'].shift(1).fillna(-1).astype(int)
    pos_list['length_B'] = pos_list['start'] - (previous_end + 1)
    B_count = pos_list['length_B'].value_counts()
    return A_count, B_count

In [None]:
# Repeat motifs of unit length 1 to 4, one representative per motif.
reps_1_4 = [
    # mononucleotides
    'A', 'C',
    # dinucleotides
    'AC', 'AG', 'AT', 'CG',
    # trinucleotides
    'AAC', 'AAG', 'AAT', 'ACC', 'ACG', 'ACT', 'AGC', 'AGG', 'ATC', 'CCG',
    # tetranucleotides
    'AAAC', 'AAAG', 'AAAT', 'AACC', 'AACG', 'AACT', 'AAGC', 'AAGT', 'AATC',
    'AATG', 'AATT', 'ACAG', 'ACAT', 'ACCC', 'ACCT', 'ACGT', 'ACTG', 'AGAT',
    'AGCC', 'AGCT', 'AAGG', 'AGGC', 'ACTC', 'ATCC', 'ATCG', 'ATGC', 'ACGC',
    'AGGG', 'CCCG', 'AGCG', 'ACGG', 'CCGG', 'ACCG',
]
# Trinucleotide motifs that do not contain a CG dinucleotide.
reps_tri_noCpG = [motif for motif in reps_1_4 if len(motif) == 3 and 'CG' not in motif]

In [None]:
# Denominator for the indel rates: for every motif, tabulate run lengths (A)
# and gap lengths (B) in each 100 kb window that contains a de novo SNV.

# The frame/strand variants of a motif never change, so compute them once
# instead of once per window.
frames_by_motif = {motif: repeat_frames_RC(motif) for motif in reps_1_4}

repeat_counts_mutregions_A = dict(); repeat_counts_mutregions_B = dict()
for motif in reps_1_4:
    for repeat in frames_by_motif[motif]:
        repeat_counts_mutregions_A[repeat] = dict(); repeat_counts_mutregions_B[repeat] = dict()
for chrom, pos in count_by_100kb.index:
    seq = hg38_genome[int(chrom)][int(pos):int(pos+1e5)]
    for motif in reps_1_4:
        for repeat in frames_by_motif[motif]:
            # Count the specific frame/strand variant. Passing the canonical
            # motif here (as the code previously did) stored identical counts
            # under every variant, e.g. 'T' received the counts of 'A'.
            repeat_counts_mutregions_A[repeat][(chrom, pos)], repeat_counts_mutregions_B[repeat][(chrom, pos)] = starting_counts(seq, repeat)
    print ('\r' + 'finished chr'+str(chrom) + ' '+str(pos), end = ' ')

# Sum the per-window tables into one length -> count Series per variant.
for motif in reps_1_4:
    for repeat in frames_by_motif[motif]:
        repeat_counts_mutregions_A[repeat] = pd.concat(repeat_counts_mutregions_A[repeat], axis=1).sum(axis=1).sort_index()
        repeat_counts_mutregions_B[repeat] = pd.concat(repeat_counts_mutregions_B[repeat], axis=1).sum(axis=1).sort_index()

In [None]:
# Pool the counts of all frame/strand variants of each motif under the motif's
# representative name, then save one table for runs (A) and one for gaps (B).
repeat_counts_mutregions_A_RC = dict(); repeat_counts_mutregions_B_RC = dict()
for motif in reps_1_4:
    variants = repeat_frames_RC(motif)
    pooled_A = repeat_counts_mutregions_A[variants[0]]
    pooled_B = repeat_counts_mutregions_B[variants[0]]
    for repeat in variants[1:]:
        # Lengths missing from one side count as zero rather than NaN.
        pooled_A = pooled_A.add(repeat_counts_mutregions_A[repeat], fill_value=0)
        pooled_B = pooled_B.add(repeat_counts_mutregions_B[repeat], fill_value=0)
    repeat_counts_mutregions_A_RC[motif] = pooled_A
    repeat_counts_mutregions_B_RC[motif] = pooled_B

# One column per motif, rows indexed by length.
repeat_counts_mutregions_A_RC = pd.concat(repeat_counts_mutregions_A_RC, axis=1).sort_index()
repeat_counts_mutregions_B_RC = pd.concat(repeat_counts_mutregions_B_RC, axis=1).sort_index()

repeat_counts_mutregions_A_RC.to_pickle('denovo/repeat_counts_mutregions_A.pickle')
repeat_counts_mutregions_B_RC.to_pickle('denovo/repeat_counts_mutregions_B.pickle')

In [None]:
# Restart point: reload the cached repeat (A) and gap (B) length tables
# written by this notebook. Only unpickle files you produced yourself.
repeat_counts_mutregions_A_RC = pd.read_pickle('denovo/repeat_counts_mutregions_A.pickle')
repeat_counts_mutregions_B_RC = pd.read_pickle('denovo/repeat_counts_mutregions_B.pickle')

#### indel counts

In [None]:
# Annotate every de novo indel with its flanking reference sequence and with
# the repeat tract (unit length 1-5) that borders it on either side.
denovo_combined_indel = pd.read_csv('denovo/all_denovo_indels.csv')
# Autosomes as integers, X/Y as strings, matching the keys of hg38_genome.
denovo_combined_indel['chrom'] = [int(chrom) if chrom not in ['X', 'Y'] else chrom for chrom in denovo_combined_indel['chrom']]
# 101 reference bases from 'pos' onward, and the 101 bases ending just before 'pos'.
denovo_combined_indel['REF+100'] = [get_sequence(chrom, pos, pos+101, hg38_genome) for chrom, pos in zip(denovo_combined_indel['chrom'], denovo_combined_indel['pos'])]
denovo_combined_indel['REF-100'] = [get_sequence(chrom, pos-101, pos, hg38_genome) for chrom, pos in zip(denovo_combined_indel['chrom'], denovo_combined_indel['pos'])]
# Label as 'del' when the ref allele is longer than one base, otherwise 'ins'.
denovo_combined_indel['indel'] = (denovo_combined_indel['ref'].str.len() > 1).replace(False, 'ins').replace(True, 'del')
# Inserted/deleted bases: both alleles with their first base removed, concatenated
# (presumably one of the two is empty for a simple indel — verify).
denovo_combined_indel['indel_seq'] = denovo_combined_indel['alt'].str[1:] + denovo_combined_indel['ref'].str[1:]
denovo_combined_indel['indel_len'] = denovo_combined_indel['indel_seq'].str.len()

for unit_len in range(1,6):
    # Candidate repeat unit: the first unit_len bases downstream and the last
    # unit_len bases upstream.
    denovo_combined_indel['down_'+str(unit_len)] = denovo_combined_indel['REF+100'].str[:unit_len]
    denovo_combined_indel['up_'+str(unit_len)] = denovo_combined_indel['REF-100'].str[-unit_len:]
    # Longest uninterrupted tandem run of that unit touching the indel site;
    # the upstream side is searched on the reversed strings.
    denovo_combined_indel['repeat_seq_down_'+str(unit_len)] = [re.search('(?:' + motif + ')+', ref)[0] for ref, motif in zip(denovo_combined_indel['REF+100'], denovo_combined_indel['down_'+str(unit_len)])]
    denovo_combined_indel['repeat_seq_up_'+str(unit_len)] = [re.search('(?:' + motif[::-1] + ')+', ref[::-1])[0] for ref, motif in zip(denovo_combined_indel['REF-100'], denovo_combined_indel['up_'+str(unit_len)])]
    # Tract length expressed in repeat units.
    denovo_combined_indel['repeat_units_up_'+str(unit_len)] = denovo_combined_indel['repeat_seq_up_'+str(unit_len)].str.len() / unit_len
    denovo_combined_indel['repeat_units_down_'+str(unit_len)] = denovo_combined_indel['repeat_seq_down_'+str(unit_len)].str.len() / unit_len
    # True when the same unit flanks the site on both sides.
    denovo_combined_indel['same_bothsides_'+str(unit_len)] = denovo_combined_indel['up_'+str(unit_len)] == denovo_combined_indel['down_'+str(unit_len)]
    
    # for indels in the middle of a repeat, put the total count in the 'up' category and remove the 'down' count
    denovo_combined_indel['repeat_units_up_'+str(unit_len)] += (denovo_combined_indel['repeat_units_down_'+str(unit_len)] * denovo_combined_indel['same_bothsides_'+str(unit_len)])
    denovo_combined_indel['repeat_units_down_'+str(unit_len)] -= (denovo_combined_indel['repeat_units_down_'+str(unit_len)] * denovo_combined_indel['same_bothsides_'+str(unit_len)])
    # A zeroed 'down' count becomes NaN so those rows drop out of later groupby counts.
    denovo_combined_indel['repeat_units_down_'+str(unit_len)] = denovo_combined_indel['repeat_units_down_'+str(unit_len)].replace(0, np.nan)
    
    # True when the indel sequence is exactly a whole number of copies of the flanking unit.
    denovo_combined_indel['indel_is_rep_up_'+str(unit_len)] = [''.join(rep * int(indel_len / unit_len)) == indel_seq for rep, indel_len, indel_seq in zip(denovo_combined_indel['up_'+str(unit_len)], denovo_combined_indel['indel_len'], denovo_combined_indel['indel_seq'])]
    denovo_combined_indel['indel_is_rep_down_'+str(unit_len)] = [''.join(rep * int(indel_len / unit_len)) == indel_seq for rep, indel_len, indel_seq in zip(denovo_combined_indel['down_'+str(unit_len)], denovo_combined_indel['indel_len'], denovo_combined_indel['indel_seq'])]

In [None]:
exp_counts = dict(); con_counts = dict(); nonexp_counts = dict()
for unit_len in range(1,6):
    # A>AA
    exp_up = denovo_combined_indel.groupby(['indel', 'indel_is_rep_up_'+str(unit_len), 'indel_len', 'repeat_units_up_'+str(unit_len), 'up_'+str(unit_len)])['chrom'].count()['ins'].reindex([True], level = 0).unstack().droplevel(0)
    exp_down = denovo_combined_indel.groupby(['indel', 'indel_is_rep_down_'+str(unit_len), 'indel_len', 'repeat_units_down_'+str(unit_len), 'down_'+str(unit_len)])['chrom'].count()['ins'].reindex([True], level = 0).unstack().droplevel(0)
    exp_up.index.names = ['indel_len', 'repeat_units']; exp_down.index.names = ['indel_len', 'repeat_units']
    exp_counts[unit_len] = exp_up.add(exp_down, fill_value = 0)
    #AA>A
    con_up = denovo_combined_indel.groupby(['indel', 'indel_is_rep_up_'+str(unit_len), 'indel_len', 'repeat_units_up_'+str(unit_len), 'up_'+str(unit_len)])['chrom'].count()['del'].reindex([True], level = 0).unstack().droplevel(0)
    con_down = denovo_combined_indel.groupby(['indel', 'indel_is_rep_down_'+str(unit_len), 'indel_len', 'repeat_units_down_'+str(unit_len), 'down_'+str(unit_len)])['chrom'].count()['del'].reindex([True], level = 0).unstack().droplevel(0)
    con_up.index.names = ['indel_len', 'repeat_units']; con_down.index.names = ['indel_len', 'repeat_units']
    con_counts[unit_len] = con_up.add(con_down, fill_value = 0)
    # AA>ABA interruptions (requires A on both sides, length is sum of both sides)
    nonexp_counts[unit_len] = denovo_combined_indel.groupby(['indel', 'same_bothsides_'+str(unit_len), 'indel_is_rep_up_'+str(unit_len), 'indel_len', 'repeat_units_up_'+str(unit_len), 'up_'+str(unit_len)])['chrom'].count()['ins'].reindex([True], level = 0).reindex([False], level = 1).unstack().droplevel(0).droplevel(0)
    nonexp_counts[unit_len].index.names = ['indel_len', 'repeat_units']
exp_counts = pd.concat(exp_counts, axis=1).droplevel(0, axis=1); con_counts = pd.concat(con_counts, axis=1).droplevel(0, axis=1); nonexp_counts = pd.concat(nonexp_counts, axis=1).droplevel(0, axis=1)

# BB>BAB (0-class for expansions): insertions for which both indel_is_rep_up_N and
# indel_is_rep_down_N are False, tabulated per indel_len with one column per inserted
# sequence. Columns are restricted to the motifs of this unit length that already exist
# as columns of exp_counts.
B_exp = dict()
for unit_len in range(1,6):
    B_exp[unit_len] = denovo_combined_indel.groupby(['indel', 'indel_is_rep_up_'+str(unit_len), 'indel_is_rep_down_'+str(unit_len), 'indel_len', 'indel_seq'])['chrom'].count()['ins'].reindex([False], level = 0).reindex([False], level = 1).droplevel(0).droplevel(0).unstack().reindex([motif for motif in exp_counts.columns if len(motif) == unit_len], axis=1)
    # Label these rows repeat_units = 0 and append that as a second index level so the
    # index matches the (indel_len, repeat_units) index of exp_counts.
    B_exp[unit_len]['repeat_units'] = 0
    B_exp[unit_len].set_index('repeat_units', append = True, inplace = True)
B_exp = pd.concat(B_exp, axis=1).droplevel(0, axis=1)
# Append the 0-class rows to the expansion table and restore sorted index order.
exp_counts = pd.concat([exp_counts, B_exp]).sort_index()

# BB>B deletions (0-class for contractions).
# B_del_counts holds deletion counts per indel_len (rows) and deleted sequence (columns),
# restricted to the motifs that are columns of con_counts.
B_del = dict()
B_del_counts = denovo_combined_indel.groupby(['indel', 'indel_len', 'indel_seq'])['chrom'].count()['del'].unstack().reindex([col for col in con_counts.columns], axis=1)
for motif in con_counts.columns:
    # For each motif, sum the deletions of every OTHER motif of the same length.
    B_del[motif] = B_del_counts.reindex([col for col in con_counts.columns if (len(col) == len(motif)) & (col != motif)], axis=1).sum(axis=1)
    B_del[motif] = pd.DataFrame(B_del[motif], columns = [motif])
    # Label the rows repeat_units = 0 and append that as a second index level.
    B_del[motif]['repeat_units'] = 0
    B_del[motif].set_index('repeat_units', append = True, inplace = True)
B_del = pd.concat(B_del, axis=1).droplevel(0, axis=1)
# Append the 0-class rows to the contraction table and restore sorted index order.
con_counts = pd.concat([con_counts, B_del]).sort_index()

# B>BB insertions (length-independent) (0-class for non-expansions).
# B_ins_counts holds insertion counts per indel_len (rows) and inserted sequence (columns),
# restricted to the motifs that are columns of exp_counts.
B_ins = dict()
B_ins_counts = denovo_combined_indel.groupby(['indel', 'indel_len', 'indel_seq'])['chrom'].count()['ins'].unstack().reindex([col for col in exp_counts.columns], axis=1)
for motif in exp_counts.columns:
    # For each motif, sum the insertions of every OTHER motif of the same length.
    B_ins[motif] = B_ins_counts.reindex([col for col in exp_counts.columns if (len(col) == len(motif)) & (col != motif)], axis=1).sum(axis=1)
    B_ins[motif] = pd.DataFrame(B_ins[motif], columns = [motif])
    # Label the rows repeat_units = 0 and append that as a second index level.
    B_ins[motif]['repeat_units'] = 0
    B_ins[motif].set_index('repeat_units', append = True, inplace = True)
B_ins = pd.concat(B_ins, axis=1).droplevel(0, axis=1)
# Append the 0-class rows to the non-expansion table and restore sorted index order.
nonexp_counts = pd.concat([nonexp_counts, B_ins]).sort_index()

In [None]:
# Collapse every motif column onto the first entry returned by repeat_frames_RC (the
# sorted list of its cyclic shifts and their reverse complements), summing the counts of
# all equivalent motifs. Each canonical motif is computed only once.
exp_counts_std = {}
con_counts_std = {}
nonexp_counts_std = {}
for repeat in exp_counts:
    equivalent_motifs = repeat_frames_RC(repeat)
    canonical_motif = equivalent_motifs[0]
    if canonical_motif in exp_counts_std:
        continue
    exp_counts_std[canonical_motif] = exp_counts.reindex(equivalent_motifs, axis=1).sum(axis=1)
    con_counts_std[canonical_motif] = con_counts.reindex(equivalent_motifs, axis=1).sum(axis=1)
    nonexp_counts_std[canonical_motif] = nonexp_counts.reindex(equivalent_motifs, axis=1).sum(axis=1)
exp_counts_std = pd.concat(exp_counts_std, axis=1)
con_counts_std = pd.concat(con_counts_std, axis=1)
nonexp_counts_std = pd.concat(nonexp_counts_std, axis=1)

In [None]:
# Save the canonical-motif count tables (all indel lengths) to disk.
exp_counts_std.to_pickle('denovo/denovo_combined_expansion_counts_all.pickle')
con_counts_std.to_pickle('denovo/denovo_combined_contraction_deletion_counts_all.pickle')
nonexp_counts_std.to_pickle('denovo/denovo_combined_nonexpansion_insertion_counts_all.pickle')

In [None]:
# Reload the canonical-motif count tables written by the previous cell.
exp_counts_std = pd.read_pickle('denovo/denovo_combined_expansion_counts_all.pickle')
con_counts_std = pd.read_pickle('denovo/denovo_combined_contraction_deletion_counts_all.pickle')
nonexp_counts_std = pd.read_pickle('denovo/denovo_combined_nonexpansion_insertion_counts_all.pickle')

In [None]:
# Keep only the "in-frame" events for each motif: the rows whose first index level
# (indel_len) equals the motif length. The result is indexed by repeat_units.
exp_counts_inframe = pd.concat(
    {motif: exp_counts_std[motif].loc[len(motif)] for motif in exp_counts_std},
    axis=1,
)
con_counts_inframe = pd.concat(
    {motif: con_counts_std[motif].loc[len(motif)] for motif in exp_counts_std},
    axis=1,
)
nonexp_counts_inframe = pd.concat(
    {motif: nonexp_counts_std[motif].loc[len(motif)] for motif in exp_counts_std},
    axis=1,
)

In [None]:
# Save the in-frame count tables to disk.
exp_counts_inframe.to_pickle('denovo/denovo_combined_expansion_counts.pickle')
con_counts_inframe.to_pickle('denovo/denovo_combined_contraction_deletion_counts.pickle')
nonexp_counts_inframe.to_pickle('denovo/denovo_combined_nonexpansion_insertion_counts.pickle')

In [None]:
# Reload the in-frame count tables written by the previous cell.
exp_counts_inframe = pd.read_pickle('denovo/denovo_combined_expansion_counts.pickle')
con_counts_inframe = pd.read_pickle('denovo/denovo_combined_contraction_deletion_counts.pickle')
nonexp_counts_inframe = pd.read_pickle('denovo/denovo_combined_nonexpansion_insertion_counts.pickle')

#### Calculate rates

In [None]:
# Denominators for the indel rates. Each count table is multiplied row-wise by its own
# index label (presumably the repeat length in units -- TODO confirm), turning counts of
# repeats into totals of repeat units.
repeat_counts_mutregions_totalbases = dict()
repeat_counts_mutregions_totalbases['A'] = repeat_counts_mutregions_A_RC.mul(repeat_counts_mutregions_A_RC.index, axis=0) # rate per repeat unit
repeat_counts_mutregions_totalbases['B'] = repeat_counts_mutregions_B_RC.mul(repeat_counts_mutregions_B_RC.index, axis=0)
# The expansion denominators need their own copy: previously forexp and forcon were two
# names for the SAME DataFrame, so the forcon row-0 assignment below silently overwrote
# the forexp row-0 value. forcon keeps referencing totalbases['A'] exactly as before, so
# the final contents of that object are unchanged for any later use.
repeat_counts_mutregions_forexp = repeat_counts_mutregions_totalbases['A'].copy()
repeat_counts_mutregions_forcon = repeat_counts_mutregions_totalbases['A']
# BB>BAB, denominator is number of B bases excluding single Bs
# NOTE(review): [2:] is a positional row slice (it drops the first two rows); confirm
# that this matches "excluding single Bs" for the index of the B table.
repeat_counts_mutregions_forexp.loc[0] = repeat_counts_mutregions_totalbases['B'][2:].sum()
# B>BB insertions, treating as length-independent, denominator is total number of B bases
# B>_ deletions, treating as length-independent, denominator is total number of B bases, includes fusion events (subtract later if distinguishing)
repeat_counts_mutregions_forcon.loc[0] = repeat_counts_mutregions_totalbases['B'].sum()

# Row 0 was appended at the end; sort so it comes first.
repeat_counts_mutregions_forexp = repeat_counts_mutregions_forexp.sort_index()
repeat_counts_mutregions_forcon = repeat_counts_mutregions_forcon.sort_index()

In [None]:
# Per-motif rates: event count / denominator / number of genomes. Rows that are NaN in
# every column are dropped. Non-expansion insertions use the same denominators as
# contractions (repeat_counts_mutregions_forcon).
denovo_exp_rate = (exp_counts_inframe[repeat_counts_mutregions_forexp.columns] / repeat_counts_mutregions_forexp / denovo_n_genomes_indel).dropna(how = 'all', axis=0)
denovo_con_rate = (con_counts_inframe[repeat_counts_mutregions_forcon.columns] / repeat_counts_mutregions_forcon / denovo_n_genomes_indel).dropna(how = 'all', axis=0)
denovo_nonexp_rate = (nonexp_counts_inframe[repeat_counts_mutregions_forcon.columns] / repeat_counts_mutregions_forcon / denovo_n_genomes_indel).dropna(how = 'all', axis=0)

In [None]:
# Display the motif columns of the expansion-rate table (inspection only).
denovo_exp_rate.columns

In [None]:
# Pooled rates per unit length: sum counts and denominators over all motif columns whose
# name has the given length, then divide. Results are stored as extra 'unit_N' columns.
for unit_len in [1,2,3,4]:
    # Column names of length unit_len; previously added 'unit_N' columns are 6 characters
    # long and therefore never selected here.
    current_reps = [rep for rep in denovo_exp_rate.columns if len(rep) == unit_len]
    denovo_exp_rate['unit_'+str(unit_len)] = (exp_counts_inframe[current_reps].sum(axis=1) / repeat_counts_mutregions_forexp[current_reps].sum(axis=1) / denovo_n_genomes_indel).dropna(how = 'all', axis=0)
    denovo_con_rate['unit_'+str(unit_len)] = (con_counts_inframe[current_reps].sum(axis=1) / repeat_counts_mutregions_forcon[current_reps].sum(axis=1) / denovo_n_genomes_indel).dropna(how = 'all', axis=0)
    denovo_nonexp_rate['unit_'+str(unit_len)] = (nonexp_counts_inframe[current_reps].sum(axis=1) / repeat_counts_mutregions_forcon[current_reps].sum(axis=1) / denovo_n_genomes_indel).dropna(how = 'all', axis=0)

In [None]:
# Combine counts for all non-CpG trinucleotide repeats: counts and denominators are summed
# over the motifs listed in reps_tri_noCpG before dividing, and the pooled rate is stored
# in a 'tri_noCpG' column of each rate table.
denovo_exp_rate['tri_noCpG'] = (exp_counts_inframe[reps_tri_noCpG].sum(axis=1) / repeat_counts_mutregions_forexp[reps_tri_noCpG].sum(axis=1) / denovo_n_genomes_indel).dropna(how = 'all', axis=0)
denovo_con_rate['tri_noCpG'] = (con_counts_inframe[reps_tri_noCpG].sum(axis=1) / repeat_counts_mutregions_forcon[reps_tri_noCpG].sum(axis=1) / denovo_n_genomes_indel).dropna(how = 'all', axis=0)
denovo_nonexp_rate['tri_noCpG'] = (nonexp_counts_inframe[reps_tri_noCpG].sum(axis=1) / repeat_counts_mutregions_forcon[reps_tri_noCpG].sum(axis=1) / denovo_n_genomes_indel).dropna(how = 'all', axis=0)

In [None]:
# Save the rate tables (per motif plus the pooled 'unit_N' and 'tri_noCpG' columns).
denovo_exp_rate.to_pickle('denovo/denovo_exp_rate.pickle')
denovo_con_rate.to_pickle('denovo/denovo_con_rate.pickle')
denovo_nonexp_rate.to_pickle('denovo/denovo_nonexp_rate.pickle')

In [None]:
# Reload the rate tables written by the previous cell.
denovo_exp_rate = pd.read_pickle('denovo/denovo_exp_rate.pickle')
denovo_con_rate = pd.read_pickle('denovo/denovo_con_rate.pickle')
denovo_nonexp_rate = pd.read_pickle('denovo/denovo_nonexp_rate.pickle')

#### Poisson random sampling for indel rates

In [None]:
# 200 Poisson replicates of the per-motif rates: every observed count (NaN treated as 0)
# is replaced by a Poisson draw with that count as its mean, then converted to a rate in
# the same way as the point estimates.
# NOTE(review): np.random is not seeded here, so replicates differ between runs.
denovo_exp_rate_poisson = dict()
denovo_con_rate_poisson = dict()
denovo_nonexp_rate_poisson = dict()
for i in range(200):
    denovo_exp_rate_poisson[i] = (exp_counts_inframe[repeat_counts_mutregions_forexp.columns].fillna(0).apply(lambda x: np.random.poisson(x)) / repeat_counts_mutregions_forexp / denovo_n_genomes_indel).dropna(how = 'all', axis=0)
    denovo_con_rate_poisson[i] = (con_counts_inframe[repeat_counts_mutregions_forcon.columns].fillna(0).apply(lambda x: np.random.poisson(x)) / repeat_counts_mutregions_forcon / denovo_n_genomes_indel).dropna(how = 'all', axis=0)
    denovo_nonexp_rate_poisson[i] = (nonexp_counts_inframe[repeat_counts_mutregions_forcon.columns].fillna(0).apply(lambda x: np.random.poisson(x)) / repeat_counts_mutregions_forcon / denovo_n_genomes_indel).dropna(how = 'all', axis=0)

# Stack the replicates; the replicate number becomes the outer index level.
denovo_exp_rate_poisson = pd.concat(denovo_exp_rate_poisson)
denovo_con_rate_poisson = pd.concat(denovo_con_rate_poisson)
denovo_nonexp_rate_poisson = pd.concat(denovo_nonexp_rate_poisson)

In [None]:
# 200 Poisson replicates of the pooled per-unit-length rates. For each unit length the
# counts and denominators are summed over all motif columns of that length, each pooled
# count (NaN treated as 0) is replaced by a Poisson draw, and the stacked replicates are
# stored as a 'unit_N' column with zero rates turned into NaN.
denovo_exp_rate_poisson_units = {}
denovo_con_rate_poisson_units = {}
denovo_nonexp_rate_poisson_units = {}
for unit_len in [1,2,3,4]:
    unit_column = 'unit_'+str(unit_len)
    current_reps = [rep for rep in denovo_exp_rate.columns if len(rep) == unit_len]
    # Pooled counts and denominators do not change between replicates.
    exp_pooled_units = exp_counts_inframe[current_reps].sum(axis=1).fillna(0)
    con_pooled_units = con_counts_inframe[current_reps].sum(axis=1).fillna(0)
    nonexp_pooled_units = nonexp_counts_inframe[current_reps].sum(axis=1).fillna(0)
    exp_denominator_units = repeat_counts_mutregions_forexp[current_reps].sum(axis=1)
    con_denominator_units = repeat_counts_mutregions_forcon[current_reps].sum(axis=1)
    for i in range(200):
        exp_draw = exp_pooled_units.apply(lambda x: np.random.poisson(x))
        denovo_exp_rate_poisson_units[i] = (exp_draw / exp_denominator_units / denovo_n_genomes_indel).dropna(how = 'all', axis=0)
        con_draw = con_pooled_units.apply(lambda x: np.random.poisson(x))
        denovo_con_rate_poisson_units[i] = (con_draw / con_denominator_units / denovo_n_genomes_indel).dropna(how = 'all', axis=0)
        nonexp_draw = nonexp_pooled_units.apply(lambda x: np.random.poisson(x))
        denovo_nonexp_rate_poisson_units[i] = (nonexp_draw / con_denominator_units / denovo_n_genomes_indel).dropna(how = 'all', axis=0)
    denovo_exp_rate_poisson[unit_column] = pd.concat(denovo_exp_rate_poisson_units, axis=0).replace(0, np.nan)
    denovo_con_rate_poisson[unit_column] = pd.concat(denovo_con_rate_poisson_units, axis=0).replace(0, np.nan)
    denovo_nonexp_rate_poisson[unit_column] = pd.concat(denovo_nonexp_rate_poisson_units, axis=0).replace(0, np.nan)

In [None]:
# 200 Poisson replicates of the pooled non-CpG trinucleotide rates: counts and
# denominators are summed over reps_tri_noCpG, each pooled count (NaN treated as 0) is
# replaced by a Poisson draw, and the stacked replicates are stored as a 'tri_noCpG'
# column with zero rates turned into NaN.
denovo_exp_rate_poisson_tri_noCpG = {}
denovo_con_rate_poisson_tri_noCpG = {}
denovo_nonexp_rate_poisson_tri_noCpG = {}
# Pooled counts and denominators do not change between replicates.
exp_pooled_tri = exp_counts_inframe[reps_tri_noCpG].sum(axis=1).fillna(0)
con_pooled_tri = con_counts_inframe[reps_tri_noCpG].sum(axis=1).fillna(0)
nonexp_pooled_tri = nonexp_counts_inframe[reps_tri_noCpG].sum(axis=1).fillna(0)
exp_denominator_tri = repeat_counts_mutregions_forexp[reps_tri_noCpG].sum(axis=1)
con_denominator_tri = repeat_counts_mutregions_forcon[reps_tri_noCpG].sum(axis=1)
for i in range(200):
    exp_draw = exp_pooled_tri.apply(lambda x: np.random.poisson(x))
    denovo_exp_rate_poisson_tri_noCpG[i] = (exp_draw / exp_denominator_tri / denovo_n_genomes_indel).dropna(how = 'all', axis=0)
    con_draw = con_pooled_tri.apply(lambda x: np.random.poisson(x))
    denovo_con_rate_poisson_tri_noCpG[i] = (con_draw / con_denominator_tri / denovo_n_genomes_indel).dropna(how = 'all', axis=0)
    nonexp_draw = nonexp_pooled_tri.apply(lambda x: np.random.poisson(x))
    denovo_nonexp_rate_poisson_tri_noCpG[i] = (nonexp_draw / con_denominator_tri / denovo_n_genomes_indel).dropna(how = 'all', axis=0)
denovo_exp_rate_poisson['tri_noCpG'] = pd.concat(denovo_exp_rate_poisson_tri_noCpG, axis=0).replace(0, np.nan)
denovo_con_rate_poisson['tri_noCpG'] = pd.concat(denovo_con_rate_poisson_tri_noCpG, axis=0).replace(0, np.nan)
denovo_nonexp_rate_poisson['tri_noCpG'] = pd.concat(denovo_nonexp_rate_poisson_tri_noCpG, axis=0).replace(0, np.nan)

In [None]:
# Save the Poisson replicate tables.
denovo_exp_rate_poisson.to_pickle('denovo/denovo_exp_rate_poisson.pickle')
denovo_con_rate_poisson.to_pickle('denovo/denovo_con_rate_poisson.pickle')
denovo_nonexp_rate_poisson.to_pickle('denovo/denovo_nonexp_rate_poisson.pickle')

In [None]:
# Reload the Poisson replicate tables written by the previous cell.
denovo_exp_rate_poisson = pd.read_pickle('denovo/denovo_exp_rate_poisson.pickle')
denovo_con_rate_poisson = pd.read_pickle('denovo/denovo_con_rate_poisson.pickle')
denovo_nonexp_rate_poisson = pd.read_pickle('denovo/denovo_nonexp_rate_poisson.pickle')

### Insertion/deletion size per event

In [None]:
# Signed indel length: +indel_len for insertions ('ins'), -indel_len for everything else.
denovo_combined_indel['indel_len_dir'] = [nt if indel == 'ins' else -nt for nt, indel in zip(denovo_combined_indel['indel_len'], denovo_combined_indel['indel'])]

In [None]:
# For unit lengths 1-4: tabulate events by repeat_units_down_N (rows) and signed indel
# length (columns), then divide each row by its total so that every row sums to 1.
unit_fraction_of_events_per_length = dict()
for unit_len in range(1, 5):
    events_by_signed_length = (
        denovo_combined_indel
        .groupby(['repeat_units_down_'+str(unit_len), 'indel_len_dir'])['chrom']
        .count()
        .unstack()
        .fillna(0)
    )
    row_totals = events_by_signed_length.sum(axis=1)
    unit_fraction_of_events_per_length[unit_len] = events_by_signed_length.div(row_totals, axis=0)

In [None]:
# Stacked bar chart, one row per unit length (1-4): for each repeat length the fraction of
# events per signed indel size. Insertions are drawn upward, deletions downward (negated).
fig = make_subplots(rows = 4, cols = 1, vertical_spacing = 0.015, shared_xaxes = True, row_titles = ['unit=1', 'unit=2', 'unit=3', 'unit=4'], x_title = 'repeat length (units)', y_title = 'fraction of events per length bin')
for unit_len in range(1,5):
    # Insertions of 1-4 nt; the legend entry is shown only for the first subplot row.
    for inslen in [1,2,3,4]:
        fig.add_trace(go.Bar(x = unit_fraction_of_events_per_length[unit_len].index, y = unit_fraction_of_events_per_length[unit_len][inslen], name = str(np.abs(inslen)), legendgroup = str(np.abs(inslen)), showlegend = True if unit_len == 1 else False), row = unit_len, col = 1)
    # All insertions longer than 4 nt, summed into a single '>4' bar.
    fig.add_trace(go.Bar(x = unit_fraction_of_events_per_length[unit_len].index, y = unit_fraction_of_events_per_length[unit_len][[col for col in unit_fraction_of_events_per_length[unit_len].columns if col > 4]].sum(axis=1), name = '>4', legendgroup = 'other', showlegend = True if unit_len == 1 else False), row = unit_len, col = 1)
    # Deletions of 1-4 nt, negated so they stack below zero; they share legend groups
    # with the insertions of the same size.
    for inslen in [-1,-2,-3,-4]:
        fig.add_trace(go.Bar(x = unit_fraction_of_events_per_length[unit_len].index, y = -unit_fraction_of_events_per_length[unit_len][inslen], name = str(np.abs(inslen)), legendgroup = str(np.abs(inslen)), showlegend = False), row = unit_len, col = 1)
    # All deletions longer than 4 nt, summed and negated.
    fig.add_trace(go.Bar(x = unit_fraction_of_events_per_length[unit_len].index, y = - unit_fraction_of_events_per_length[unit_len][[col for col in unit_fraction_of_events_per_length[unit_len].columns if col < -4]].sum(axis=1), name = '>4', legendgroup = 'other', showlegend = False), row = unit_len, col = 1)
    fig.update_xaxes(range = [0.5,10.5], dtick = 1, row = unit_len, col = 1)
fig.update_yaxes(range = [-1,1], tickvals = [-0.5, 0, 0.5])
# Only five colours in the colorway, so the ten traces per row reuse the same colour for
# an insertion and a deletion of the same size.
fig.update_layout(font=dict(family = 'Arial', size = 14), height = 400, width = 600, title = 'de novo', colorway = plotly.colors.DEFAULT_PLOTLY_COLORS[:5], legend_title_text='nt added', margin={'t':30,'l':80,'b':60,'r':20})
# 'relative' stacks positive and negative bars on opposite sides of zero.
fig.update_layout(barmode = 'relative')
fig.show()

In [None]:
# Export the figure as a PDF.
fig.write_image('plots/figS4a.pdf')

## Calculate substitution context rates for trinucleotide repeats

#### Count mutations, checking each triplet frame for repeat status

In [None]:
def check_tri_context(seq):
    """Classify the middle triplet of *seq* relative to its two neighbours.

    *seq* is cut into consecutive 3-character units and only the first three are
    used. The first return value is a three-letter label whose middle letter is
    always 'A'; the first/last letter is 'A' when the upstream/downstream unit is
    identical to the middle unit and 'B' otherwise ('AAA', 'AAB', 'BAA' or 'BAB').
    The second return value is the first entry of repeat_frames_RC for the middle
    unit.
    """
    units = re.findall('.'*3, seq)
    upstream, centre, downstream = units[0], units[1], units[2]
    left_flag = 'A' if upstream == centre else 'B'
    right_flag = 'A' if centre == downstream else 'B'
    return left_flag + 'A' + right_flag, repeat_frames_RC(centre)[0]

In [None]:
# 9-mer context in three frames -> 11nt context needed
# The 11-mer is fetched as get_sequence(chrom, pos-6, pos+5, ...), so the variant base is
# at index 5 and lies in the middle triplet of each of the three frames below.
denovo_combined_snv['11nt'] = [get_sequence(chrom, pos-6, pos+5, hg38_genome) for chrom, pos in zip(denovo_combined_snv['chrom'], denovo_combined_snv['pos'])]
denovo_combined_snv['11nt_AB_1'] = [check_tri_context(seq) for seq in denovo_combined_snv['11nt']]
denovo_combined_snv['11nt_AB_2'] = [check_tri_context(seq[1:]) for seq in denovo_combined_snv['11nt']]
denovo_combined_snv['11nt_AB_3'] = [check_tri_context(seq[2:]) for seq in denovo_combined_snv['11nt']]

# Count SNVs per (context label, canonical motif) tuple and add up the three frames.
denovo_triplet_context_count = denovo_combined_snv.groupby(['11nt_AB_1'])['pos'].count().add(denovo_combined_snv.groupby(['11nt_AB_2'])['pos'].count(), fill_value = 0).add(denovo_combined_snv.groupby(['11nt_AB_3'])['pos'].count(), fill_value = 0)
# Turn the tuple index into a MultiIndex, then reshape to motifs (rows) x context labels (columns).
denovo_triplet_context_count.index = pd.MultiIndex.from_tuples(denovo_triplet_context_count.index)
denovo_triplet_context_count = denovo_triplet_context_count.unstack().transpose()
# AAB and BAA are merged into one class.
# NOTE(review): plain + gives NaN when a motif has counts on only one side; consider
# .add(..., fill_value=0) if that case occurs.
denovo_triplet_context_count['AAB'] = denovo_triplet_context_count['AAB'] + denovo_triplet_context_count['BAA']
del denovo_triplet_context_count['BAA']
# Positional rename; assumes the remaining columns are ordered AAA, AAB, BAB.
denovo_triplet_context_count.columns = ['Afission', 'Acontraction', 'A10']

In [None]:
# Same classification on the mutated sequence: index 5 of the 11-mer is replaced by the alt allele.
denovo_combined_snv['11nt_alt'] = [el[0:5]+alt+el[6:] for el, alt in zip(denovo_combined_snv['11nt'], denovo_combined_snv['alt'])]
denovo_combined_snv['11nt_alt_AB_1'] = [check_tri_context(seq) for seq in denovo_combined_snv['11nt_alt']]
denovo_combined_snv['11nt_alt_AB_2'] = [check_tri_context(seq[1:]) for seq in denovo_combined_snv['11nt_alt']]
denovo_combined_snv['11nt_alt_AB_3'] = [check_tri_context(seq[2:]) for seq in denovo_combined_snv['11nt_alt']]

# Count SNVs per (context label, canonical motif) tuple and add up the three frames.
denovo_triplet_context_count_alt = denovo_combined_snv.groupby(['11nt_alt_AB_1'])['pos'].count().add(denovo_combined_snv.groupby(['11nt_alt_AB_2'])['pos'].count(), fill_value = 0).add(denovo_combined_snv.groupby(['11nt_alt_AB_3'])['pos'].count(), fill_value = 0)
denovo_triplet_context_count_alt.index = pd.MultiIndex.from_tuples(denovo_triplet_context_count_alt.index)
denovo_triplet_context_count_alt = denovo_triplet_context_count_alt.unstack().transpose()
# AAB and BAA are merged into one class.
# NOTE(review): plain + gives NaN when a motif has counts on only one side.
denovo_triplet_context_count_alt['AAB'] = denovo_triplet_context_count_alt['AAB'] + denovo_triplet_context_count_alt['BAA']
del denovo_triplet_context_count_alt['BAA']
# Positional rename; assumes the remaining columns are ordered AAA, AAB, BAB. Here the
# labels describe the sequence AFTER the mutation, hence the different class names.
denovo_triplet_context_count_alt.columns = ['Afusion', 'Aexpansion', 'A01']

In [None]:
# Reference-based and alt-based context counts side by side (six columns, one row per motif).
denovo_triplet_context_count_all = pd.concat([denovo_triplet_context_count, denovo_triplet_context_count_alt], axis=1)

#### Count all 9-mer contexts in the sequenceable genome

In [None]:
# Count every 9-mer in each 100,000-nt window listed in count_by_100kb.index.
# re.findall with nine dots cuts the string into consecutive non-overlapping 9-character
# chunks; repeating this for start offsets 0-8 covers every 9-mer start position.
# Windows are processed independently, so 9-mers spanning a window boundary are not counted.
nine_counts_mutregions = dict()
for chrom, pos in count_by_100kb.index:
    seq = hg38_genome[int(chrom)][int(pos):int(pos+1e5)]
    nine_counts_mutregions[(chrom, pos)] = pd.Series(re.findall('.........', seq) + re.findall('.........', seq[1:]) + re.findall('.........', seq[2:]) + re.findall('.........', seq[3:]) + re.findall('.........', seq[4:]) + re.findall('.........', seq[5:]) + re.findall('.........', seq[6:]) + re.findall('.........', seq[7:]) + re.findall('.........', seq[8:]), dtype = 'object').value_counts()
    # Single-line progress indicator ('\r' returns to the start of the line).
    print ('\r' + 'finished chr'+str(chrom) + ' '+str(pos), end = ' ')

In [None]:
# Add up the per-window 9-mer counts into one genome-wide Series and save it.
nine_counts_mutregions_sum = pd.Series(dtype = 'int64')
counter = 0
for part in list(nine_counts_mutregions.keys()):
    # fill_value=0 keeps 9-mers that are absent from either operand.
    nine_counts_mutregions_sum = nine_counts_mutregions_sum.add(nine_counts_mutregions[part], fill_value = 0)
    counter +=1; print('\r' + str(counter), end = ' ')
nine_counts_mutregions_sum.to_pickle('denovo/ninelet_totals_hg38_mutregions.pickle')

In [None]:
# Reload the genome-wide 9-mer totals written by the previous cell.
nine_counts_mutregions_sum = pd.read_pickle('denovo/ninelet_totals_hg38_mutregions.pickle')

In [None]:
# Split every 9-mer into its three triplets and expose them as columns: after
# reset_index() the triplets are in 'level_0', 'level_1', 'level_2' and the count is in
# the column named 0.
nine_counts_mutregions_sum_triindex = nine_counts_mutregions_sum.copy()
nine_counts_mutregions_sum_triindex.index = pd.MultiIndex.from_tuples([re.findall('...', seq) for seq in nine_counts_mutregions_sum.index])
nine_counts_mutregions_sum_triindex = nine_counts_mutregions_sum_triindex.reset_index()

In [None]:
# Genome-wide number of 9-mers in each context class, per canonical motif. For every
# cyclic shift / reverse complement of the motif, 9-mers are classified by which of their
# three triplets equal that motif, and their counts are accumulated into the class column.
nine_counts_triplet_context_count_all = pd.DataFrame(0, index = denovo_triplet_context_count_all.index, columns = denovo_triplet_context_count_all.columns)
ninemer_totals = nine_counts_mutregions_sum_triindex[0]
ninemer_first = nine_counts_mutregions_sum_triindex['level_0']
ninemer_middle = nine_counts_mutregions_sum_triindex['level_1']
ninemer_last = nine_counts_mutregions_sum_triindex['level_2']
for repeat in nine_counts_triplet_context_count_all.index:
    for motif in repeat_frames_RC(repeat):
        first_is_A = ninemer_first == motif
        middle_is_A = ninemer_middle == motif
        last_is_A = ninemer_last == motif
        # (class column, selection of 9-mers); order matches the accumulation order above.
        class_masks = [
            ('Afission', first_is_A & middle_is_A & last_is_A),
            ('Acontraction', first_is_A & middle_is_A & ~last_is_A),
            ('Acontraction', ~first_is_A & middle_is_A & last_is_A),
            ('A10', ~first_is_A & middle_is_A & ~last_is_A),
            ('Afusion', first_is_A & ~middle_is_A & last_is_A),
            ('Aexpansion', first_is_A & ~middle_is_A & ~last_is_A),
            ('Aexpansion', ~first_is_A & ~middle_is_A & last_is_A),
            ('A01', ~first_is_A & ~middle_is_A & ~last_is_A),
        ]
        for context_class, selected in class_masks:
            nine_counts_triplet_context_count_all.loc[repeat, context_class] += ninemer_totals[selected].sum()

#### Calculate rates

In [None]:
# Substitution rate per context class: SNV count / 3 / genome-wide 9-mer count / number of genomes.
# NOTE(review): the /3 presumably compensates for each SNV being tallied once in each of
# the three triplet frames -- confirm.
denovo_substitution_context_rate = denovo_triplet_context_count_all /3 / nine_counts_triplet_context_count_all / denovo_n_genomes_snv
# Pooled rate over all motifs in reps_tri_noCpG (numerators and denominators summed first).
denovo_substitution_context_rate.loc['tri_noCpG'] = denovo_triplet_context_count_all.loc[reps_tri_noCpG].sum() /3 / nine_counts_triplet_context_count_all.loc[reps_tri_noCpG].sum() / denovo_n_genomes_snv
# Rows 'A' and 'C' come from denovo_mut_freq_AB / denovo_mut_freq_CD, which are defined elsewhere.
denovo_substitution_context_rate.loc['A'] = denovo_mut_freq_AB
denovo_substitution_context_rate.loc['C'] = denovo_mut_freq_CD
denovo_substitution_context_rate.to_pickle('denovo/denovo_mut_freq_triplets.pickle')

In [None]:
# Reload the substitution context rates written by the previous cell.
denovo_substitution_context_rate = pd.read_pickle('denovo/denovo_mut_freq_triplets.pickle')

In [None]:
# 200 Poisson replicates of the trinucleotide substitution context rates: each count/3 is
# used as the mean of a Poisson draw before converting to a rate.
# NOTE(review): np.random.poisson raises on NaN means; this relies on the count table
# containing no NaN.
denovo_mut_freq_triplets_poisson = dict()
for i in range(200):
    denovo_mut_freq_triplets_poisson[i] = (denovo_triplet_context_count_all /3).apply(np.random.poisson) / nine_counts_triplet_context_count_all / denovo_n_genomes_snv
    # Pooled non-CpG replicate, drawn from the summed counts.
    denovo_mut_freq_triplets_poisson[i].loc['tri_noCpG'] = (denovo_triplet_context_count_all.loc[reps_tri_noCpG].sum() /3).apply(np.random.poisson) / nine_counts_triplet_context_count_all.loc[reps_tri_noCpG].sum() / denovo_n_genomes_snv
# Stack the replicates; the replicate number becomes the outer index level.
denovo_mut_freq_triplets_poisson = pd.concat(denovo_mut_freq_triplets_poisson)
denovo_mut_freq_triplets_poisson.to_pickle('denovo/denovo_mut_freq_triplets_poisson.pickle')

In [None]:
# Put triplets and As into one file: the mononucleotide Poisson replicates are transposed,
# re-indexed as (replicate, 'A') and appended to the triplet replicates.
# NOTE(review): this reads and overwrites the same pickle, so re-running this cell on its
# own appends the 'A' rows a second time.
denovo_mut_freq_triplets_poisson = pd.read_pickle('denovo/denovo_mut_freq_triplets_poisson.pickle')
denovo_mut_freq_AB_poisson = pd.read_pickle('denovo/denovo_mut_freq_AB_poisson.pickle')
denovo_mut_freq_AB_poisson = denovo_mut_freq_AB_poisson.transpose()
denovo_mut_freq_AB_poisson.index = pd.MultiIndex.from_tuples([(i, 'A') for i in denovo_mut_freq_AB_poisson.index])
denovo_mut_freq_triplets_poisson = pd.concat([denovo_mut_freq_triplets_poisson, denovo_mut_freq_AB_poisson], axis=0).sort_index()
denovo_mut_freq_triplets_poisson.to_pickle('denovo/denovo_mut_freq_triplets_poisson.pickle')

## Calculate substitution context rates for dinucleotide repeats

#### Count mutations, checking each dinucleotide frame for repeat status

In [None]:
def check_di_context(seq):
    """Classify the middle dinucleotide of *seq* relative to its two neighbours.

    *seq* is cut into consecutive 2-character units and only the first three are
    used. The first return value is a three-letter label whose middle letter is
    always 'A'; the first/last letter is 'A' when the upstream/downstream unit is
    identical to the middle unit and 'B' otherwise ('AAA', 'AAB', 'BAA' or 'BAB').
    The second return value is the first entry of repeat_frames_RC for the middle
    unit.
    """
    units = re.findall('.'*2, seq)
    upstream, centre, downstream = units[0], units[1], units[2]
    left_flag = 'A' if upstream == centre else 'B'
    right_flag = 'A' if centre == downstream else 'B'
    return left_flag + 'A' + right_flag, repeat_frames_RC(centre)[0]

In [None]:
# 6-mer context in two frames -> 7nt context needed
# The 7-mer is fetched as get_sequence(chrom, pos-4, pos+3, ...), so the variant base is
# at index 3 and lies in the middle dinucleotide of both frames below.
denovo_combined_snv['7nt'] = [get_sequence(chrom, pos-4, pos+3, hg38_genome) for chrom, pos in zip(denovo_combined_snv['chrom'], denovo_combined_snv['pos'])]
denovo_combined_snv['7nt_AB_1'] = [check_di_context(seq) for seq in denovo_combined_snv['7nt']]
denovo_combined_snv['7nt_AB_2'] = [check_di_context(seq[1:]) for seq in denovo_combined_snv['7nt']]

# Count SNVs per (context label, canonical motif) tuple and add up the two frames.
denovo_dinuc_context_count = denovo_combined_snv.groupby(['7nt_AB_1'])['pos'].count().add(denovo_combined_snv.groupby(['7nt_AB_2'])['pos'].count(), fill_value = 0)
# Turn the tuple index into a MultiIndex, then reshape to motifs (rows) x context labels (columns).
denovo_dinuc_context_count.index = pd.MultiIndex.from_tuples(denovo_dinuc_context_count.index)
denovo_dinuc_context_count = denovo_dinuc_context_count.unstack().transpose()
# AAB and BAA are merged into one class.
# NOTE(review): plain + gives NaN when a motif has counts on only one side.
denovo_dinuc_context_count['AAB'] = denovo_dinuc_context_count['AAB'] + denovo_dinuc_context_count['BAA']
del denovo_dinuc_context_count['BAA']
# Positional rename; assumes the remaining columns are ordered AAA, AAB, BAB.
denovo_dinuc_context_count.columns = ['Afission', 'Acontraction', 'A10']

In [None]:
# Same classification on the mutated sequence. The 7-mer is fetched as
# get_sequence(chrom, pos-4, pos+3, ...), so the variant base sits at index 3 (the centre),
# just as the 11-mer and 15-mer cells replace their centre indices 5 and 7. The previous
# slice (sev[0:4]+alt+sev[5:]) replaced index 4, i.e. the base AFTER the variant, which is
# outside the middle dinucleotide of the first frame.
denovo_combined_snv['7nt_alt'] = [sev[0:3]+alt+sev[4:] for sev, alt in zip(denovo_combined_snv['7nt'], denovo_combined_snv['alt'])]
denovo_combined_snv['7nt_alt_AB_1'] = [check_di_context(seq) for seq in denovo_combined_snv['7nt_alt']]
denovo_combined_snv['7nt_alt_AB_2'] = [check_di_context(seq[1:]) for seq in denovo_combined_snv['7nt_alt']]

# Count SNVs per (context label, canonical motif) tuple and add up the two frames.
denovo_dinuc_context_count_alt = denovo_combined_snv.groupby(['7nt_alt_AB_1'])['pos'].count().add(denovo_combined_snv.groupby(['7nt_alt_AB_2'])['pos'].count(), fill_value = 0)
denovo_dinuc_context_count_alt.index = pd.MultiIndex.from_tuples(denovo_dinuc_context_count_alt.index)
denovo_dinuc_context_count_alt = denovo_dinuc_context_count_alt.unstack().transpose()
# AAB and BAA are merged into one class.
# NOTE(review): plain + gives NaN when a motif has counts on only one side.
denovo_dinuc_context_count_alt['AAB'] = denovo_dinuc_context_count_alt['AAB'] + denovo_dinuc_context_count_alt['BAA']
del denovo_dinuc_context_count_alt['BAA']
# Positional rename; assumes the remaining columns are ordered AAA, AAB, BAB. Here the
# labels describe the sequence AFTER the mutation, hence the different class names.
denovo_dinuc_context_count_alt.columns = ['Afusion', 'Aexpansion', 'A01']

In [None]:
# Reference-based and alt-based context counts side by side (six columns, one row per motif).
denovo_dinuc_context_count_all = pd.concat([denovo_dinuc_context_count, denovo_dinuc_context_count_alt], axis=1)

#### Count all 6-mer contexts in the sequenceable genome

In [None]:
# Count every 6-mer in each 100,000-nt window listed in count_by_100kb.index.
# re.findall with six dots cuts the string into consecutive non-overlapping 6-character
# chunks; repeating this for start offsets 0-5 covers every 6-mer start position.
# Windows are processed independently, so 6-mers spanning a window boundary are not counted.
six_counts_mutregions = dict()
for chrom, pos in count_by_100kb.index:
    seq = hg38_genome[int(chrom)][int(pos):int(pos+1e5)]
    six_counts_mutregions[(chrom, pos)] = pd.Series(re.findall('......', seq) + re.findall('......', seq[1:]) + re.findall('......', seq[2:]) + re.findall('......', seq[3:]) + re.findall('......', seq[4:]) + re.findall('......', seq[5:]), dtype = 'object').value_counts()
    # Single-line progress indicator ('\r' returns to the start of the line).
    print ('\r' + 'finished chr'+str(chrom) + ' '+str(pos), end = ' ')

In [None]:
# Add up the per-window 6-mer counts into one genome-wide Series and save it.
six_counts_mutregions_sum = pd.Series(dtype = 'int64')
counter = 0
for part in list(six_counts_mutregions.keys()):
    # fill_value=0 keeps 6-mers that are absent from either operand.
    six_counts_mutregions_sum = six_counts_mutregions_sum.add(six_counts_mutregions[part], fill_value = 0)
    counter +=1; print('\r' + str(counter), end = ' ')
six_counts_mutregions_sum.to_pickle('denovo/sixlet_totals_hg38_mutregions.pickle')

In [None]:
# Reload the genome-wide 6-mer totals written by the previous cell.
six_counts_mutregions_sum = pd.read_pickle('denovo/sixlet_totals_hg38_mutregions.pickle')

In [None]:
# Split every 6-mer into its three dinucleotides and expose them as columns: after
# reset_index() the dinucleotides are in 'level_0', 'level_1', 'level_2' and the count is
# in the column named 0.
six_counts_mutregions_sum_diindex = six_counts_mutregions_sum.copy()
six_counts_mutregions_sum_diindex.index = pd.MultiIndex.from_tuples([re.findall('..', seq) for seq in six_counts_mutregions_sum.index])
six_counts_mutregions_sum_diindex = six_counts_mutregions_sum_diindex.reset_index()

In [None]:
# Genome-wide number of 6-mers in each context class, per canonical motif. For every
# cyclic shift / reverse complement of the motif, 6-mers are classified by which of their
# three dinucleotides equal that motif, and their counts are accumulated into the class column.
six_counts_dinuc_context_count_all = pd.DataFrame(0, index = denovo_dinuc_context_count_all.index, columns = denovo_dinuc_context_count_all.columns)
sixmer_totals = six_counts_mutregions_sum_diindex[0]
sixmer_first = six_counts_mutregions_sum_diindex['level_0']
sixmer_middle = six_counts_mutregions_sum_diindex['level_1']
sixmer_last = six_counts_mutregions_sum_diindex['level_2']
for repeat in six_counts_dinuc_context_count_all.index:
    for motif in repeat_frames_RC(repeat):
        first_is_A = sixmer_first == motif
        middle_is_A = sixmer_middle == motif
        last_is_A = sixmer_last == motif
        # (class column, selection of 6-mers); order matches the accumulation order above.
        class_masks = [
            ('Afission', first_is_A & middle_is_A & last_is_A),
            ('Acontraction', first_is_A & middle_is_A & ~last_is_A),
            ('Acontraction', ~first_is_A & middle_is_A & last_is_A),
            ('A10', ~first_is_A & middle_is_A & ~last_is_A),
            ('Afusion', first_is_A & ~middle_is_A & last_is_A),
            ('Aexpansion', first_is_A & ~middle_is_A & ~last_is_A),
            ('Aexpansion', ~first_is_A & ~middle_is_A & last_is_A),
            ('A01', ~first_is_A & ~middle_is_A & ~last_is_A),
        ]
        for context_class, selected in class_masks:
            six_counts_dinuc_context_count_all.loc[repeat, context_class] += sixmer_totals[selected].sum()

#### Calculate rates

In [None]:
# Substitution rate per context class: SNV count / 2 / genome-wide 6-mer count / number of genomes.
# NOTE(review): the /2 presumably compensates for each SNV being tallied once in each of
# the two dinucleotide frames -- confirm.
denovo_substitution_context_rate_dinuc = denovo_dinuc_context_count_all /2 / six_counts_dinuc_context_count_all / denovo_n_genomes_snv
# Only the rows 'AC', 'AG', 'AT' and 'CG' are written to disk.
denovo_substitution_context_rate_dinuc.loc[['AC', 'AG', 'AT', 'CG']].to_pickle('denovo/denovo_mut_freq_dinuc.pickle')

In [None]:
# Append the dinucleotide rows to the combined rate table.
# NOTE(review): re-running this cell appends the dinucleotide rows again.
denovo_substitution_context_rate = pd.concat([denovo_substitution_context_rate, denovo_substitution_context_rate_dinuc])

In [None]:
# Overwrites the trinucleotide rate pickle with the combined (tri- plus dinucleotide) table.
denovo_substitution_context_rate.to_pickle('denovo/denovo_mut_freq_triplets.pickle')

In [None]:
# 200 Poisson replicates of the TRINUCLEOTIDE substitution context rates.
# NOTE(review): although this cell sits in the dinucleotide section, it uses only the
# triplet / 9-mer tables and rewrites the triplet Poisson pickle with fresh random draws;
# no dinucleotide replicates are produced here. Confirm whether the dinucleotide tables
# (denovo_dinuc_context_count_all / six_counts_dinuc_context_count_all, divisor 2) were intended.
denovo_mut_freq_triplets_poisson = dict()
for i in range(200):
    denovo_mut_freq_triplets_poisson[i] = (denovo_triplet_context_count_all /3).apply(np.random.poisson) / nine_counts_triplet_context_count_all / denovo_n_genomes_snv
    # Pooled non-CpG replicate, drawn from the summed counts.
    denovo_mut_freq_triplets_poisson[i].loc['tri_noCpG'] = (denovo_triplet_context_count_all.loc[reps_tri_noCpG].sum() /3).apply(np.random.poisson) / nine_counts_triplet_context_count_all.loc[reps_tri_noCpG].sum() / denovo_n_genomes_snv
# Stack the replicates; the replicate number becomes the outer index level.
denovo_mut_freq_triplets_poisson = pd.concat(denovo_mut_freq_triplets_poisson)
denovo_mut_freq_triplets_poisson.to_pickle('denovo/denovo_mut_freq_triplets_poisson.pickle')

In [None]:
# Put triplets and As into one file: the mononucleotide Poisson replicates are transposed,
# re-indexed as (replicate, 'A') and appended to the triplet replicates.
# NOTE(review): this reads and overwrites the same pickle, so re-running this cell on its
# own appends the 'A' rows a second time.
denovo_mut_freq_triplets_poisson = pd.read_pickle('denovo/denovo_mut_freq_triplets_poisson.pickle')
denovo_mut_freq_AB_poisson = pd.read_pickle('denovo/denovo_mut_freq_AB_poisson.pickle')
denovo_mut_freq_AB_poisson = denovo_mut_freq_AB_poisson.transpose()
denovo_mut_freq_AB_poisson.index = pd.MultiIndex.from_tuples([(i, 'A') for i in denovo_mut_freq_AB_poisson.index])
denovo_mut_freq_triplets_poisson = pd.concat([denovo_mut_freq_triplets_poisson, denovo_mut_freq_AB_poisson], axis=0).sort_index()
denovo_mut_freq_triplets_poisson.to_pickle('denovo/denovo_mut_freq_triplets_poisson.pickle')

## Calculate substitution context rates for tetranucleotide repeats

#### Count mutations, checking each quad frame for repeat status

In [None]:
def check_quad_context(seq):
    """Classify the repeat context of the middle 4-mer of a 12-nt window.

    seq is cut into consecutive 4-mers (any leftover bases are ignored) and the
    first three are used. Returns (label, motif): label is 'AAA', 'AAB', 'BAA'
    or 'BAB', where the outer letters say whether the left / right 4-mer equals
    the middle one, and motif is the first entry of repeat_frames_RC() for the
    middle 4-mer.
    """
    quads = re.findall('.'*4, seq)
    left_flag = 'A' if quads[0] == quads[1] else 'B'
    right_flag = 'A' if quads[1] == quads[2] else 'B'
    return left_flag + 'A' + right_flag, repeat_frames_RC(quads[1])[0]

In [None]:
# A 12-mer around the mutated base can start at four offsets, so 15 nt of reference context are fetched
denovo_combined_snv['15nt'] = [get_sequence(chrom, pos-8, pos+7, hg38_genome) for chrom, pos in zip(denovo_combined_snv['chrom'], denovo_combined_snv['pos'])]
ref_frame_columns = ['15nt_AB_' + str(frame + 1) for frame in range(4)]
for frame, column in enumerate(ref_frame_columns):
    denovo_combined_snv[column] = [check_quad_context(seq[frame:]) for seq in denovo_combined_snv['15nt']]

# count SNVs per (context label, motif) and add the four frames together
denovo_quad_context_count = denovo_combined_snv.groupby([ref_frame_columns[0]])['pos'].count()
for column in ref_frame_columns[1:]:
    denovo_quad_context_count = denovo_quad_context_count.add(denovo_combined_snv.groupby([column])['pos'].count(), fill_value = 0)
denovo_quad_context_count.index = pd.MultiIndex.from_tuples(denovo_quad_context_count.index)
# rows: motif, columns: context label
denovo_quad_context_count = denovo_quad_context_count.unstack().transpose()
# the two one-sided contexts are pooled into the 'AAB' column
one_sided_total = denovo_quad_context_count['AAB'] + denovo_quad_context_count['BAA']
denovo_quad_context_count = denovo_quad_context_count.drop(columns = 'BAA')
denovo_quad_context_count['AAB'] = one_sided_total
denovo_quad_context_count.columns = ['Afission', 'Acontraction', 'A10']

In [None]:
# Same classification on the mutated sequence: index 7 of the 15-nt context is replaced by the alt base
denovo_combined_snv['15nt_alt'] = [el[0:7]+alt+el[8:] for el, alt in zip(denovo_combined_snv['15nt'], denovo_combined_snv['alt'])]
alt_frame_columns = ['15nt_alt_AB_' + str(frame + 1) for frame in range(4)]
for frame, column in enumerate(alt_frame_columns):
    denovo_combined_snv[column] = [check_quad_context(seq[frame:]) for seq in denovo_combined_snv['15nt_alt']]

# count SNVs per (context label, motif) and add the four frames together
denovo_quad_context_count_alt = denovo_combined_snv.groupby([alt_frame_columns[0]])['pos'].count()
for column in alt_frame_columns[1:]:
    denovo_quad_context_count_alt = denovo_quad_context_count_alt.add(denovo_combined_snv.groupby([column])['pos'].count(), fill_value = 0)
denovo_quad_context_count_alt.index = pd.MultiIndex.from_tuples(denovo_quad_context_count_alt.index)
# rows: motif, columns: context label
denovo_quad_context_count_alt = denovo_quad_context_count_alt.unstack().transpose()
# the two one-sided contexts are pooled into the 'AAB' column
one_sided_total_alt = denovo_quad_context_count_alt['AAB'] + denovo_quad_context_count_alt['BAA']
denovo_quad_context_count_alt = denovo_quad_context_count_alt.drop(columns = 'BAA')
denovo_quad_context_count_alt['AAB'] = one_sided_total_alt
denovo_quad_context_count_alt.columns = ['Afusion', 'Aexpansion', 'A01']

In [None]:
# Place the reference-context and alt-context count columns side by side, aligned on motif.
denovo_quad_context_count_all = pd.concat([denovo_quad_context_count, denovo_quad_context_count_alt], axis=1)

In [None]:
# Cache the tetranucleotide mutation counts; later cells reload them from this file.
denovo_quad_context_count_all.to_pickle('denovo/denovo_quad_context_count_all.pickle')

#### Count all 12-mer contexts in the sequenceable genome

In [None]:
twelve_counts_mutregions = dict()
for chrom, pos in count_by_100kb.index:
    seq = hg38_genome[int(chrom)][int(pos):int(pos+1e5)]
    # collect the 12-mers of the window: non-overlapping matches from each of the 12 start offsets
    twelvemers = []
    for offset in range(12):
        twelvemers += re.findall('............', seq[offset:])
    twelve_counts_mutregions[(chrom, pos)] = pd.Series(twelvemers, dtype = 'object').value_counts()
    # progress indicator, overwritten in place
    print ('\r' + 'finished chr'+str(chrom) + ' '+str(pos), end = ' ')

In [None]:
# Fixed ordering of the (chrom, pos) region keys, used to slice the regions into batches below.
twelve_regions_list = list(twelve_counts_mutregions.keys())

In [None]:
# Sum the per-region 12-mer counts in batches of regions, so that only one batch of
# columns is aligned in a single concat at a time.
twelve_counts_mutregions_sum = dict()
counter = 0
batch_size = 100
# Stepping with range() never yields an empty slice. The previous `len // 100 + 1` loop produced an
# empty final batch whenever the number of regions was a multiple of 100 (or zero), and
# pd.concat raises on an empty selection.
for n, batch_start in enumerate(range(0, len(twelve_regions_list), batch_size)):
    batch_keys = twelve_regions_list[batch_start:batch_start + batch_size]
    twelve_counts_mutregions_sum[n] = pd.concat(twelve_counts_mutregions, keys = batch_keys, axis=1).sum(axis=1)
    counter +=1; print('\r' + str(counter), end = ' ')

In [None]:
# Fold the per-batch totals into one genome-wide 12-mer count series and cache it
twelve_counts_mutregions_sum_all = pd.Series(dtype = 'int64')
counter = 0
for batch_total in twelve_counts_mutregions_sum.values():
    # fill_value=0 keeps 12-mers that are missing from one side of the addition
    twelve_counts_mutregions_sum_all = twelve_counts_mutregions_sum_all.add(batch_total, fill_value = 0)
    counter += 1
    print('\r' + str(counter), end = ' ')
twelve_counts_mutregions_sum_all.to_pickle('denovo/twelvemer_totals_hg38_mutregions.pickle')

In [None]:
# Reload the cached 12-mer totals. Note this rebinds twelve_counts_mutregions_sum from the
# per-batch dict built above to the single summed series that was written to the pickle.
twelve_counts_mutregions_sum = pd.read_pickle('denovo/twelvemer_totals_hg38_mutregions.pickle')

In [None]:
# Reload the cached tetranucleotide mutation counts.
denovo_quad_context_count_all = pd.read_pickle('denovo/denovo_quad_context_count_all.pickle')

In [None]:
# Re-key every 12-mer by its three consecutive 4-mers, then flatten the index so the 4-mers
# become the columns 'level_0', 'level_1', 'level_2' and the count becomes column 0.
twelve_counts_mutregions_sum_quadindex = twelve_counts_mutregions_sum.copy()
twelve_counts_mutregions_sum_quadindex.index = pd.MultiIndex.from_tuples([re.findall('....', seq) for seq in twelve_counts_mutregions_sum.index])
twelve_counts_mutregions_sum_quadindex = twelve_counts_mutregions_sum_quadindex.reset_index()

In [None]:
# Denominators: for every motif, total the 12-mers whose three 4-mers show each match pattern
twelve_counts_quad_context_count_all = pd.DataFrame(0, index = denovo_quad_context_count_all.index, columns = denovo_quad_context_count_all.columns)
quad_table = twelve_counts_mutregions_sum_quadindex
for repeat in twelve_counts_quad_context_count_all.index:
    # every frame and reverse complement of the motif counts towards the same row
    for motif in repeat_frames_RC(repeat):
        left = quad_table['level_0'] == motif
        centre = quad_table['level_1'] == motif
        right = quad_table['level_2'] == motif
        # (output column, which of the three 4-mers equal the motif), in accumulation order
        context_masks = [
            ('Afission', left & centre & right),
            ('Acontraction', left & centre & ~right),
            ('Acontraction', ~left & centre & right),
            ('A10', ~left & centre & ~right),
            ('Afusion', left & ~centre & right),
            ('Aexpansion', left & ~centre & ~right),
            ('Aexpansion', ~left & ~centre & right),
            ('A01', ~left & ~centre & ~right),
        ]
        for column, mask in context_masks:
            twelve_counts_quad_context_count_all.loc[repeat, column] += quad_table.loc[mask][0].sum()

#### Calculate rates

In [None]:
# Tetranucleotide substitution rate per context: the de novo counts are divided by 4, by the
# matching 12-mer context totals and by denovo_n_genomes_snv, then cached.
denovo_quad_context_count_all = pd.read_pickle('denovo/denovo_quad_context_count_all.pickle')
denovo_substitution_context_rate_quad = denovo_quad_context_count_all / 4 / twelve_counts_quad_context_count_all / denovo_n_genomes_snv
denovo_substitution_context_rate_quad.to_pickle('denovo/denovo_mut_freq_quads.pickle')

In [None]:
# Display the tetranucleotide rate table as the cell output.
denovo_substitution_context_rate_quad

In [None]:
# Reload the combined rate table saved earlier, before the tetranucleotide rows are appended to it.
denovo_substitution_context_rate = pd.read_pickle('denovo/denovo_mut_freq_triplets.pickle')

In [None]:
# Append the tetranucleotide rows below the rows already held in the combined rate table.
denovo_substitution_context_rate = pd.concat([denovo_substitution_context_rate, denovo_substitution_context_rate_quad])

In [None]:
# Overwrite the combined rate file, which now also contains the tetranucleotide rows.
# NOTE(review): re-running the reload/append/save cells appends the same rows again, because
# they are read back from this same file.
denovo_substitution_context_rate.to_pickle('denovo/denovo_mut_freq_triplets.pickle')

In [None]:
# Poisson-resample the triplet counts 200 times and convert every replicate to a rate.
# NOTE(review): this cell is a verbatim repeat of the triplet resampling cell in the trinucleotide
# section; it draws fresh triplet replicates and overwrites the pickle, and resamples no
# tetranucleotide counts — confirm whether a tetranucleotide version was intended here.
denovo_mut_freq_triplets_poisson = dict()
for i in range(200):
    denovo_mut_freq_triplets_poisson[i] = (denovo_triplet_context_count_all /3).apply(np.random.poisson) / nine_counts_triplet_context_count_all / denovo_n_genomes_snv
    # pooled row for the motifs listed in reps_tri_noCpG
    denovo_mut_freq_triplets_poisson[i].loc['tri_noCpG'] = (denovo_triplet_context_count_all.loc[reps_tri_noCpG].sum() /3).apply(np.random.poisson) / nine_counts_triplet_context_count_all.loc[reps_tri_noCpG].sum() / denovo_n_genomes_snv
# dict keys (replicate number) become the outer index level
denovo_mut_freq_triplets_poisson = pd.concat(denovo_mut_freq_triplets_poisson)
denovo_mut_freq_triplets_poisson.to_pickle('denovo/denovo_mut_freq_triplets_poisson.pickle')

In [None]:
# put triplets and As into one file
denovo_mut_freq_triplets_poisson = pd.read_pickle('denovo/denovo_mut_freq_triplets_poisson.pickle')
denovo_mut_freq_AB_poisson = pd.read_pickle('denovo/denovo_mut_freq_AB_poisson.pickle')
denovo_mut_freq_AB_poisson = denovo_mut_freq_AB_poisson.transpose()
# give the mononucleotide rows the same two-level (replicate, motif) index as the triplet table
denovo_mut_freq_AB_poisson.index = pd.MultiIndex.from_tuples([(i, 'A') for i in denovo_mut_freq_AB_poisson.index])
# This cell writes to the same file it reads. If it is re-run on its own, the 'A' rows are already
# in the loaded table, so drop them first instead of appending a second copy.
already_merged = denovo_mut_freq_triplets_poisson.index.isin(denovo_mut_freq_AB_poisson.index)
denovo_mut_freq_triplets_poisson = denovo_mut_freq_triplets_poisson.loc[~already_merged]
denovo_mut_freq_triplets_poisson = pd.concat([denovo_mut_freq_triplets_poisson, denovo_mut_freq_AB_poisson], axis=0).sort_index()
denovo_mut_freq_triplets_poisson.to_pickle('denovo/denovo_mut_freq_triplets_poisson.pickle')

# deCODE instability rate measurement
- Download the supplementary data from Kristmundsdottir et al. 2023 and extract it into a folder named 'decode'
- https://github.com/DecodeGenetics/mDNM_analysisAndData

### Process original data file
- used to calculate denominator for rates

In [None]:
# Load the per-locus table; reset_index() moves the parsed index into regular columns
# so that the twelve names below line up with the data.
decode = pd.read_csv('decode/mDNM_analysisAndData-main/mutRateDataAll.gz', compression = 'gzip', sep = '\t', skiprows = 0).reset_index()
decode.columns = ['chrom', 'start', 'end', 'motif', 'missing', 'DNMs', 'Correct', 'geneticDiversity', 'motifLength', 'refLen', 'purity', 'GCcontentInMotif']

def _decode_chrom_key(chrom):
    # drop the three-character prefix; autosomes become ints, 'X' / 'Y' stay strings
    suffix = chrom[3:]
    return suffix if suffix in ['X', 'Y'] else int(suffix)

decode['chrom'] = [_decode_chrom_key(chrom) for chrom in decode['chrom']]
# canonical motif: first entry of the sorted frames / reverse complements
decode['motif_std'] = [repeat_frames_RC(rep)[0] for rep in decode['motif']]
decode['seq'] = [get_sequence(chrom, start, end, hg38_genome) for chrom, start, end in zip(decode['chrom'], decode['start'], decode['end'])]

In [None]:
# Find longest pure repeat in sequence
def _longest_pure_run(motif, seq):
    """Return the length in nt of the longest uninterrupted tandem run of motif in seq.

    Returns 0 when the motif does not occur at all; a bare max() over the empty
    match list would raise ValueError and abort the whole column.
    """
    return max((len(run) for run in re.findall('(?:' + motif + ')+', seq)), default = 0)

# the search window is the listed locus extended by 10 nt on each side
decode['seq+-10'] = [get_sequence(chrom, start-10, end+10, hg38_genome) for chrom, start, end in zip(decode['chrom'], decode['start'], decode['end'])]
decode['max_pure_length'] = [_longest_pure_run(motif, seq) for motif, seq in zip(decode['motif'], decode['seq+-10'])]
decode['max_pure_units'] = decode['max_pure_length'] / decode['motifLength']
decode['ref_units'] = decode['refLen'] / decode['motifLength']
# whole repeat units only (partial units are rounded down)
decode['ref_units_int'] = decode['ref_units'] //1
decode.to_pickle('decode/mutRateDataAll.pickle')

In [None]:
# Reload the processed per-locus table from its cache.
decode = pd.read_pickle('decode/mutRateDataAll.pickle')

In [None]:
# loci where popSTR-listed reference repeat length is a perfect repeat of the appropriate motif
# (the whole-unit reference length equals the longest pure run found in the reference sequence); display only
decode.loc[decode['ref_units_int'] == decode['max_pure_units']]

In [None]:
# Display: number of loci in total, and number that pass the pure-repeat filter.
len(decode), len(decode.loc[decode['ref_units_int'] == decode['max_pure_units']])

In [None]:
# 'Correct' column specifies number of families where the locus was sequenced with sufficient quality
# denominator by motif and reference length for rate calculations
# Because parental genotypes are not available for all loci (just for mutated loci), the denominator is approximated using this distribution of reference length counts.
# Only pure-repeat loci are kept; 'Correct' is summed per (reference length in whole units, canonical motif)
# and unstack() puts the motifs in columns, leaving reference length as the row index.
motif_counts_pure_units = decode.loc[decode['ref_units_int'] == decode['max_pure_units']].groupby(['ref_units_int', 'motif_std'])['Correct'].sum().unstack()

### Process extended data file
- used to calculate numerator for rates

In [None]:
dec_ext = pd.read_csv('decode/mDNM_analysisAndData-main/bpInvolved_extended', sep = '\t')

# split each 'a/b' genotype string into two numeric allele columns, rounded to one decimal
for member in ['Proband', 'Father', 'Mother']:
    allele_pair = dec_ext[member + 'Gt'].str.split('/', expand = True)
    dec_ext[[member + '_1', member + '_2']] = allele_pair.astype(float).round(1)
dec_ext['Motif_len'] = dec_ext['Motif'].str.len()
# 'X' marks a missing value in the Phase and BpInvolved columns
dec_ext['Phase'] = dec_ext['Phase'].replace('X', 0).astype(int)
bp_involved = dec_ext['BpInvolved'].replace('X', np.nan).astype(float)
dec_ext['units_Involved'] = (bp_involved / dec_ext['Motif_len']).round(1)

def _change_direction(change):
    # positive change: expansion, negative: contraction, zero or missing: NaN
    if change > 0:
        return 'exp'
    if change < 0:
        return 'con'
    return np.nan

dec_ext['exp_con'] = [_change_direction(change) for change in dec_ext['units_Involved']]

# canonical motif: first entry of the sorted frames / reverse complements
dec_ext['motif_std'] = [repeat_frames_RC(rep)[0] for rep in dec_ext['Motif']]

def _dec_ext_chrom_key(chrom):
    # drop the three-character prefix; autosomes become ints, 'X' / 'Y' stay strings
    suffix = chrom[3:]
    return suffix if suffix in ['X', 'Y'] else int(suffix)

dec_ext['Chrom'] = [_dec_ext_chrom_key(chrom) for chrom in dec_ext['Chrom']]

In [None]:
# get sequence upstream and downstream of mutation position based on given hg38 RefLen
dec_ext['seq'] = [get_sequence(chrom, pos-reflen-10, pos+reflen+10, hg38_genome) for chrom, pos, reflen in zip(dec_ext['Chrom'], dec_ext['Pos'], dec_ext['RefLen'])]

# Find longest pure repeat in sequence
def _longest_pure_run(motif, seq):
    """Return the length in nt of the longest uninterrupted tandem run of motif in seq.

    Returns 0 when the motif does not occur at all; a bare max() over the empty
    match list would raise ValueError and abort the whole column.
    """
    return max((len(run) for run in re.findall('(?:' + motif + ')+', seq)), default = 0)

dec_ext['RefLen_pure'] = [_longest_pure_run(motif, seq) for motif, seq in zip(dec_ext['Motif'], dec_ext['seq'])]
# listed reference length and longest pure run, both expressed in repeat units
dec_ext['RefLen_units'] = dec_ext['RefLen'] / dec_ext['Motif_len']
dec_ext['RefLen_units_pure'] = dec_ext['RefLen_pure'] / dec_ext['Motif_len']

#### Find closest parental allele
In the absence of phasing, assume the mutation originated from the parental allele with the nearest length.

In [None]:
def check_p(F1, UI, P1):
    """Return whether parental length F1 plus the change UI reproduces proband length P1.

    The comparison allows an absolute difference of up to 0.11.
    """
    mismatch = np.abs(F1 + UI - P1)
    return mismatch <= 0.11
def _phased_parent_allele(F1, F2, M1, M2, P1, P2, UI, phase):
    # Phase 1 tests the father's alleles and phase 2 the mother's; any other phase is 'unknown'.
    # The first parental allele that reproduces a proband allele (P1 is tried before P2) wins.
    if phase == 1:
        candidates = (('F1', F1), ('F2', F2))
    elif phase == 2:
        candidates = (('M1', M1), ('M2', M2))
    else:
        return 'unknown'
    for proband_length in (P1, P2):
        for label, parent_length in candidates:
            if check_p(parent_length, UI, proband_length):
                return label
    return 'unknown'

dec_ext['parent_allele'] = [_phased_parent_allele(F1, F2, M1, M2, P1, P2, UI, phase) for F1, F2, M1, M2, P1, P2, UI, phase in zip(dec_ext['Father_1'], dec_ext['Father_2'], dec_ext['Mother_1'], dec_ext['Mother_2'], dec_ext['Proband_1'], dec_ext['Proband_2'], dec_ext['units_Involved'], dec_ext['Phase'])]

In [None]:
# For each proband allele, flag whether an identical length is present among the mother's / father's alleles
parental_match_flags = [
    ('P1_in_M', 'Proband_1', 'Mother_1', 'Mother_2'),
    ('P2_in_M', 'Proband_2', 'Mother_1', 'Mother_2'),
    ('P1_in_F', 'Proband_1', 'Father_1', 'Father_2'),
    ('P2_in_F', 'Proband_2', 'Father_1', 'Father_2'),
]
for flag, proband, parent_a, parent_b in parental_match_flags:
    dec_ext[flag] = [(allele in [first, second]) for allele, first, second in zip(dec_ext[proband], dec_ext[parent_a], dec_ext[parent_b])]

# Encode the four flags as one string such as '[0, 0, 1, 1]' (order: P1 in F, P2 in F, P1 in M, P2 in M)
match_rows = dec_ext[['P1_in_F','P2_in_F', 'P1_in_M', 'P2_in_M']].astype(int).values.tolist()
dec_ext['P1F_P2F_P1M_P2M'] = [str(row) for row in match_rows]

# find probable parental and proband alleles contributing to instability
# assumes the unstable allele comes from the closest length parental allele, after identifying the stable allele by a match between any proband and parental allele
#
# Keys are P1F_P2F_P1M_P2M match patterns (P1 in F, P2 in F, P1 in M, P2 in M). Values are
# (proband alleles that may carry the change, parental alleles it may come from, in tie-break order).
# Patterns not listed here are reported as 'unknown'.
# This table replaces two chained conditionals whose branches for '[0, 0, 0, 1]' were listed twice;
# the second copies could never be reached and are dropped.
_unstable_allele_candidates = {
    '[0, 0, 1, 1]': (('P1', 'P2'), ('F1', 'F2')),
    '[1, 1, 0, 0]': (('P1', 'P2'), ('M1', 'M2')),
    '[0, 0, 0, 1]': (('P1',), ('F1', 'F2')),
    '[0, 0, 1, 0]': (('P2',), ('F1', 'F2')),
    '[0, 1, 0, 0]': (('P1',), ('M1', 'M2')),
    '[1, 0, 0, 0]': (('P2',), ('M1', 'M2')),
    '[1, 0, 1, 0]': (('P2',), ('F1', 'F2', 'M1', 'M2')),
    '[0, 1, 0, 1]': (('P1',), ('F1', 'F2', 'M1', 'M2')),
}

def _probable_parent_allele(lengths, pattern):
    """Return the label of the candidate parental allele closest in length to a candidate proband allele.

    lengths maps 'F1', 'F2', 'M1', 'M2', 'P1', 'P2' to allele lengths. Ties go to the
    first parental allele in the table order. Returns 'unknown' for an unlisted
    pattern or when no comparison succeeds (e.g. missing lengths).
    """
    if pattern not in _unstable_allele_candidates:
        return 'unknown'
    probands, parents = _unstable_allele_candidates[pattern]
    closest = min([abs(lengths[p] - lengths[q]) for p in probands for q in parents])
    for q in parents:
        if min([abs(lengths[p] - lengths[q]) for p in probands]) == closest:
            return q
    return 'unknown'

def _probable_proband_allele(lengths, pattern):
    """Return the label of the candidate proband allele that lies closest to a candidate parental allele.

    Same inputs and 'unknown' rules as _probable_parent_allele. Parental alleles are
    scanned in table order and, for each, P1 is checked before P2.
    """
    if pattern not in _unstable_allele_candidates:
        return 'unknown'
    probands, parents = _unstable_allele_candidates[pattern]
    closest = min([abs(lengths[p] - lengths[q]) for p in probands for q in parents])
    for q in parents:
        for p in probands:
            if abs(lengths[p] - lengths[q]) == closest:
                return p
    return 'unknown'

_allele_lengths = [dict(F1=F1, F2=F2, M1=M1, M2=M2, P1=P1, P2=P2) for F1, F2, M1, M2, P1, P2 in zip(dec_ext['Father_1'], dec_ext['Father_2'], dec_ext['Mother_1'], dec_ext['Mother_2'], dec_ext['Proband_1'], dec_ext['Proband_2'])]

dec_ext['probable_parent_allele'] = [_probable_parent_allele(lengths, FFMM) for lengths, FFMM in zip(_allele_lengths, dec_ext['P1F_P2F_P1M_P2M'])]
dec_ext['probable_proband_allele'] = [_probable_proband_allele(lengths, FFMM) for lengths, FFMM in zip(_allele_lengths, dec_ext['P1F_P2F_P1M_P2M'])]

In [None]:
# use known allele when available, and likely allele if not
dec_ext['known_or_likely_parent_allele'] = [parent if parent != 'unknown' else probable for parent, probable in zip(dec_ext['parent_allele'], dec_ext['probable_parent_allele'])]
# look up the length of whichever parental allele was chosen (NaN when it is still 'unknown')
dec_ext['known_or_likely_parent_length'] = [{'F1': F1, 'F2': F2, 'M1': M1, 'M2': M2}.get(likely, np.nan) for F1, F2, M1, M2, likely in zip(dec_ext['Father_1'], dec_ext['Father_2'], dec_ext['Mother_1'], dec_ext['Mother_2'], dec_ext['known_or_likely_parent_allele'])]
# round down unit length of partial repeats
dec_ext['known_or_likely_parent_length_int'] = dec_ext['known_or_likely_parent_length'] //1

# Genotype lengths are in repeat units (check_p adds units_Involved to them), so the
# proband-minus-parent difference is in units too. It is rounded to one decimal like
# units_Involved and converted to bp; previously the unit difference was stored as bp and
# divided by Motif_len a second time below, which shrank every inferred change for motifs longer than 1 nt.
dec_ext['likely_BpInvolved'] = [round(P1 - parent, 1) * motif_len if likely == 'P1' else round(P2 - parent, 1) * motif_len if likely == 'P2' else np.nan for P1, P2, likely, parent, motif_len in zip(dec_ext['Proband_1'], dec_ext['Proband_2'], dec_ext['probable_proband_allele'], dec_ext['known_or_likely_parent_length'], dec_ext['Motif_len'])]
# the reported value wins; 'X' marks a missing value and falls back to the inferred one
dec_ext['known_or_likely_BpInvolved'] = [float(bp) if bp != 'X' else probable for bp, probable in zip(dec_ext['BpInvolved'], dec_ext['likely_BpInvolved'])]
dec_ext['known_or_likely_units_Involved'] = dec_ext['known_or_likely_BpInvolved'] / dec_ext['Motif_len']
dec_ext['known_or_likely_exp_con'] = ['exp' if change > 0 else 'con' if change < 0 else 'unknown' for change in dec_ext['known_or_likely_units_Involved']]

In [None]:
# Cache the annotated extended table.
dec_ext.to_pickle('decode/DECODE_extended_data.pickle')

In [None]:
# Reload the annotated extended table from its cache.
dec_ext = pd.read_pickle('decode/DECODE_extended_data.pickle')

In [None]:
# Reference length in repeat units, and the same rounded down to whole units
dec_ext['ref_units'] = dec_ext['RefLen'] / dec_ext['Motif_len']
dec_ext['ref_units_int'] = dec_ext['ref_units'] //1
# keep events whose whole-unit reference length equals the longest pure run in the reference
is_pure_locus = dec_ext['ref_units_int'] == dec_ext['RefLen_units_pure']
dec_ext_pure = dec_ext.loc[is_pure_locus].copy()

# whole-unit versions of the four parental allele lengths
for parent_column in ['Mother_1', 'Mother_2', 'Father_1', 'Father_2']:
    dec_ext_pure[parent_column + '_int'] = dec_ext_pure[parent_column] // 1

In [None]:
# Display: number of events in total, and number at pure-repeat loci.
len(dec_ext), len(dec_ext_pure)

In [None]:
# choose only single unit indels
# (exact equality: only rows whose known-or-likely change is exactly +1 or -1 repeat unit are kept)
dec_ext_pure = dec_ext_pure.loc[dec_ext_pure['known_or_likely_units_Involved'].abs() == 1]
len(dec_ext_pure)

### Calculate rates

In [None]:
# DECODE data
decode_exp_rate = dict(); decode_con_rate = dict()
decode_exp_counts = dict(); decode_con_counts = dict()
decode_exp_rate_poisson = dict(); decode_con_rate_poisson = dict()
# expansions and contractions are processed identically; each direction fills its own three dicts
direction_stores = [
    ('exp', decode_exp_counts, decode_exp_rate, decode_exp_rate_poisson),
    ('con', decode_con_counts, decode_con_rate, decode_con_rate_poisson),
]
for motif in reps_1_4:
    motif_events = dec_ext_pure.loc[dec_ext_pure['motif_std'] == motif]
    # number of trios where all three members pass quality filters, multiplied by 2 parental alleles
    loci_count = motif_counts_pure_units[motif] *2
    for direction, counts_store, rate_store, poisson_store in direction_stores:
        # events are binned by the (known or closest) parental allele length in whole units;
        # binning by 'ref_units_int' instead would assume the reference length is the parental length
        direction_events = motif_events.loc[motif_events['known_or_likely_exp_con'] == direction]
        counts_store[motif] = direction_events['known_or_likely_parent_length_int'].value_counts()
        # using DECODE population information for denominator
        rate_store[motif] = counts_store[motif].div(loci_count).dropna()
        if len(rate_store[motif]) > 0:
            # 200 Poisson resamples of the counts, stacked with the replicate number as outer index level
            poisson_store[motif] = pd.concat({i: counts_store[motif].apply(lambda x: np.random.poisson(x)).div(loci_count) for i in range(200)})
# one column per motif, rows sorted by index
decode_exp_counts = pd.concat(decode_exp_counts, axis=1).sort_index()
decode_con_counts = pd.concat(decode_con_counts, axis=1).sort_index()
decode_exp_rate = pd.concat(decode_exp_rate, axis=1).sort_index()
decode_con_rate = pd.concat(decode_con_rate, axis=1).sort_index()
decode_exp_rate_poisson = pd.concat(decode_exp_rate_poisson, axis=1).sort_index()
decode_con_rate_poisson = pd.concat(decode_con_rate_poisson, axis=1).sort_index()

In [None]:
# combine units
for unit_len in [1,2,3,4]:
    unit_label = 'unit_'+str(unit_len)
    current_reps = [rep for rep in decode_exp_rate.columns if len(rep) == unit_len]
    # Filter and count once per unit length; these were previously recomputed inside
    # every one of the 200 resampling iterations below although they never change.
    in_unit = dec_ext_pure['motif_std'].isin(current_reps)
    exp_counts_unit = dec_ext_pure.loc[in_unit & (dec_ext_pure['known_or_likely_exp_con'] == 'exp')]['known_or_likely_parent_length_int'].value_counts()
    con_counts_unit = dec_ext_pure.loc[in_unit & (dec_ext_pure['known_or_likely_exp_con'] == 'con')]['known_or_likely_parent_length_int'].value_counts()
    decode_exp_counts[unit_label] = exp_counts_unit
    decode_con_counts[unit_label] = con_counts_unit
    # pooled denominator over all motifs of this length, times 2 parental alleles
    loci_count_units = motif_counts_pure_units[current_reps].sum(axis=1) *2
    # using DECODE parental information for denominator
    decode_exp_rate[unit_label] = decode_exp_counts[unit_label].div(loci_count_units).dropna()
    decode_con_rate[unit_label] = decode_con_counts[unit_label].div(loci_count_units).dropna()

    # 200 Poisson resamples of the pooled counts (expansion drawn before contraction in each replicate)
    decode_exp_rate_units = dict(); decode_con_rate_units = dict()
    for i in range(200):
        decode_exp_rate_units[i] = exp_counts_unit.apply(np.random.poisson).div(loci_count_units).dropna()
        decode_con_rate_units[i] = con_counts_unit.apply(np.random.poisson).div(loci_count_units).dropna()
    # zero draws are stored as missing
    decode_exp_rate_poisson[unit_label] = pd.concat(decode_exp_rate_units, axis=0).replace(0, np.nan)
    decode_con_rate_poisson[unit_label] = pd.concat(decode_con_rate_units, axis=0).replace(0, np.nan)

In [None]:
# combine non-CpG trinucleotide repeats
# The filtered counts are computed once and reused by the resampling loop; they were
# previously recomputed in every one of the 200 iterations although they never change.
in_tri_noCpG = dec_ext_pure['motif_std'].isin(reps_tri_noCpG)
decode_exp_count_tri_noCpG = dec_ext_pure.loc[in_tri_noCpG & (dec_ext_pure['known_or_likely_exp_con'] == 'exp')]['known_or_likely_parent_length_int'].value_counts()
decode_con_count_tri_noCpG = dec_ext_pure.loc[in_tri_noCpG & (dec_ext_pure['known_or_likely_exp_con'] == 'con')]['known_or_likely_parent_length_int'].value_counts()
# pooled denominator over the listed motifs, times 2 parental alleles
loci_count_tri_noCpG = motif_counts_pure_units[reps_tri_noCpG].sum(axis=1) *2
# using DECODE parental information for denominator
decode_exp_rate['tri_noCpG'] = decode_exp_count_tri_noCpG.div(loci_count_tri_noCpG).dropna()
decode_con_rate['tri_noCpG'] = decode_con_count_tri_noCpG.div(loci_count_tri_noCpG).dropna()

# 200 Poisson resamples of the pooled counts (expansion drawn before contraction in each replicate)
decode_exp_rate_tri_noCpG = dict(); decode_con_rate_tri_noCpG = dict()
for i in range(200):
    decode_exp_rate_tri_noCpG[i] = decode_exp_count_tri_noCpG.apply(np.random.poisson).div(loci_count_tri_noCpG).dropna()
    decode_con_rate_tri_noCpG[i] = decode_con_count_tri_noCpG.apply(np.random.poisson).div(loci_count_tri_noCpG).dropna()
# zero draws are stored as missing
decode_exp_rate_poisson['tri_noCpG'] = pd.concat(decode_exp_rate_tri_noCpG, axis=0).replace(0, np.nan)
decode_con_rate_poisson['tri_noCpG'] = pd.concat(decode_con_rate_tri_noCpG, axis=0).replace(0, np.nan)

In [None]:
# not adjusting for motif length here... calculate per unit instead of per nt
# Each row is divided by its repeat length in units: the row index for the rate tables, and
# index level 1 (below the replicate number) for the Poisson tables.
# NOTE(review): the tables are overwritten in place, so running this cell twice divides twice.
decode_exp_rate = decode_exp_rate.div(decode_exp_rate.index, axis=0) 
decode_con_rate = decode_con_rate.div(decode_con_rate.index, axis=0)
decode_exp_rate_poisson = decode_exp_rate_poisson.div(decode_exp_rate_poisson.index.get_level_values(1), axis=0)
decode_con_rate_poisson = decode_con_rate_poisson.div(decode_con_rate_poisson.index.get_level_values(1), axis=0)

In [None]:
# Cache the expansion and contraction event counts.
decode_exp_counts.to_pickle('decode/decode_expansion_counts.pickle')
decode_con_counts.to_pickle('decode/decode_contraction_counts.pickle')

In [None]:
# Cache the rate tables and their Poisson-resampled counterparts.
decode_exp_rate.to_pickle('decode/decode_expansion_rates.pickle')
decode_con_rate.to_pickle('decode/decode_contraction_rates.pickle')

decode_exp_rate_poisson.to_pickle('decode/decode_expansion_rates_poisson.pickle')
decode_con_rate_poisson.to_pickle('decode/decode_contraction_rates_poisson.pickle')

In [None]:
# Reload the cached counts, rates and Poisson-resampled rates.
decode_exp_counts = pd.read_pickle('decode/decode_expansion_counts.pickle')
decode_con_counts = pd.read_pickle('decode/decode_contraction_counts.pickle')

decode_exp_rate = pd.read_pickle('decode/decode_expansion_rates.pickle')
decode_con_rate = pd.read_pickle('decode/decode_contraction_rates.pickle')

decode_exp_rate_poisson = pd.read_pickle('decode/decode_expansion_rates_poisson.pickle')
decode_con_rate_poisson = pd.read_pickle('decode/decode_contraction_rates_poisson.pickle')

# Give the contraction table the same columns, in the same order, as the expansion table;
# columns missing from the contraction table are filled with NaN by reindex.
decode_con_rate_poisson = decode_con_rate_poisson.reindex(decode_exp_rate_poisson.columns, axis=1)

# Figures

#### Units added per event

In [None]:
# signed size of each event in bp; 'X' marks a missing value
dec_ext['BpInvolved_withsign'] = dec_ext['BpInvolved'].replace('X', np.nan).astype(float)
# The denominator below groups by 'BpInvolved_abs', which none of the cells shown here create,
# so it is derived from the signed column when absent instead of failing with a KeyError.
if 'BpInvolved_abs' not in dec_ext.columns:
    dec_ext['BpInvolved_abs'] = dec_ext['BpInvolved_withsign'].abs()
# events per (motif length, pure reference length), one column per signed event size
events_by_size = dec_ext.dropna(subset = ['BpInvolved_withsign']).groupby(['Motif_len', 'RefLen_pure', 'BpInvolved_withsign'])['Chrom'].count().unstack()
# total events per (motif length, pure reference length) bin
events_per_bin = dec_ext.groupby(['Motif_len', 'RefLen_pure', 'BpInvolved_abs'])['Chrom'].count().unstack().sum(axis=1)
# fraction of the events in each bin that have each signed size
decode_unit_len_per_event = events_by_size.div(events_per_bin, axis=0)

In [None]:
# One stacked bar panel per motif length: insertions are drawn upwards and deletions downwards
fig = make_subplots(rows = 4, cols = 1, shared_xaxes = True, vertical_spacing = 0.015, row_titles = ['unit=1', 'unit=2', 'unit=3', 'unit=4'], x_title = 'repeat length (units)', y_title = 'fraction of events per length bin')
for unit_len in range(1,5):
    current_units = decode_unit_len_per_event.loc[unit_len]
    repeat_units = current_units.index / unit_len
    # positive sizes first, then negative, so that the trace order (and colour cycling) is unchanged
    for direction in (1, -1):
        # legend entries come only from the upward bars of the first panel
        in_legend = True if (unit_len == 1 and direction == 1) else False
        for inslen in [direction * size for size in (1, 2, 3, 4)]:
            heights = current_units[inslen] if direction == 1 else -current_units[inslen]
            fig.add_trace(go.Bar(x = repeat_units, y = heights, name = str(np.abs(inslen)), legendgroup = str(np.abs(inslen)), showlegend = in_legend), row = unit_len, col = 1)
        # every event size beyond 4 in this direction is pooled into one '>4' bar
        tail_columns = [col for col in current_units.columns if col * direction > 4]
        tail_total = current_units[tail_columns].sum(axis=1)
        fig.add_trace(go.Bar(x = repeat_units, y = tail_total if direction == 1 else -tail_total, name = '>4', legendgroup = 'other', showlegend = in_legend), row = unit_len, col = 1)
    fig.update_xaxes(range = [2.5,33.5], dtick = 1, row = unit_len, col = 1)
fig.update_yaxes(range = [-1,1], tickvals = [-0.5, 0, 0.5])
fig.update_layout(font=dict(family = 'Arial', size = 14), height = 400, width = 1000, title = 'popSTR', colorway = plotly.colors.DEFAULT_PLOTLY_COLORS[:5], legend_title_text='nt added', margin={'t':30,'l':60,'b':60,'r':10})
fig.update_layout(barmode = 'relative')
fig.show()

In [None]:
# Export the figure built in the previous cell to PDF.
fig.write_image('plots/figS4b.pdf')

#### Rate curve plots

In [None]:
# (exp, con, nonexp 0-bin rates)
# NOTE(review): this is a bare tuple of literals; it is only shown as the cell output and is not
# bound to any name. Where the three values come from is not shown in this part of the notebook.
4.562984e-12, 2.885531e-10, 1.442766e-10 

In [None]:
def add_rate_trace(rates, name, symbol, colornum, motif='A', rowcol=None, markersize = 7, per = 'unit', xmult = 1, plot_min = 1e-12, fig = None):
    """Add one marker trace (median with 2.5%-97.5% error bars) for a motif's rates.

    Parameters
    ----------
    rates : object indexable by motif; ``rates[motif].unstack()`` must give a table whose
        columns are the x positions. NOTE(review): rows are presumably resampled
        replicates — confirm against where the ``*_rate_poisson`` tables are built.
    name, symbol : legend label and plotly marker symbol.
    colornum : index into ``plotly.colors.DEFAULT_PLOTLY_COLORS``.
    motif : key selecting the motif within ``rates``.
    rowcol : ``None`` for a plain figure, or ``(row, col)`` for a subplot cell; with a
        subplot cell the legend entry is shown only for cell (1, 1).
    markersize : marker size; error-bar thickness and marker line width scale with it.
    per : ``'unit'`` leaves values as they are, ``'repeat'`` multiplies them by the x position,
        ``'nt'`` multiplies them by ``len(motif)``.
    xmult : factor applied to the x positions.
    plot_min : value substituted for a lower error-bar length of exactly 0.
    fig : figure to draw on; defaults to the notebook-global ``fig_rates``.
    """
    target = fig_rates if fig is None else fig

    # rows with labels before 1 are excluded by .loc[1:]; inf is treated as missing
    table = rates[motif].unstack().replace(np.inf, np.nan).dropna(how='all', axis=1).loc[1:]

    def _clean(series):
        # zeros cannot be drawn on a log axis and inf is meaningless: mask both
        return series.replace(0, np.nan).replace(np.inf, np.nan)

    med = _clean(table.median(axis=0)).dropna()
    # Align the quantiles to the points that are actually plotted. Without this,
    # Series.sub() returns the union index, so whenever a median was dropped the
    # error-bar arrays became longer than `med` and plotly (which pairs them with the
    # points positionally) attached them to the wrong points.
    q_low = _clean(table.quantile(0.025)).reindex(med.index)
    q_high = _clean(table.quantile(0.975)).reindex(med.index)
    low = med.sub(q_low)
    high = q_high.sub(med)

    if per == 'repeat':
        positions = med.index
        med = med * positions; low = low * positions; high = high * positions
    elif per == 'nt':
        unit = len(motif)
        med = med * unit; low = low * unit; high = high * unit

    low = low.replace(0, plot_min)

    error_bars = dict(type = 'data', arrayminus = low, array = high, color = 'rgba(0,0,0,0.2)', thickness = (1.8/7) * markersize, width = markersize)
    marker = dict(color = plotly.colors.DEFAULT_PLOTLY_COLORS[colornum], symbol = symbol, size = markersize, line_width = (3/7) * markersize)

    extra = {}
    placement = {}
    if rowcol is not None:
        # in a subplot grid every panel repeats the same traces: one legend entry is enough
        extra['showlegend'] = (rowcol[0] == 1) and (rowcol[1] == 1)
        placement = dict(row = rowcol[0], col = rowcol[1])

    trace = go.Scatter(x = med.index * xmult, y = med, error_y = error_bars, name = name, mode = 'markers', opacity = 0.95, marker = marker, **extra)
    target.add_trace(trace, **placement)

In [None]:
# Figure 2a: de novo and popSTR rate estimates for the default motif ('A'), log-log axes.
fig_rates = go.Figure()
add_rate_trace(denovo_exp_rate_poisson, 'Expansion (de novo)', 'square-open', colornum = 0)
add_rate_trace(denovo_con_rate_poisson, 'Contraction (de novo)', 'square-open', colornum = 1)
add_rate_trace(denovo_nonexp_rate_poisson, 'Non-motif insertion (de novo)', 'square-open', colornum = 2)
add_rate_trace(decode_exp_rate_poisson, 'Expansion/insertion (popSTR)', 'circle-open', colornum = 0)
add_rate_trace(decode_con_rate_poisson, 'Contraction (popSTR)', 'circle-open', colornum = 1)

# Horizontal reference lines at the two substitution rates. The y list is sized from x
# (previously 39 x values were paired with 40 y values).
sub_x = list(range(1,40))
for sub_key, sub_dash in (('A>B', 'dash'), ('B>A', 'dot')):
    fig_rates.add_trace(go.Scatter(x = sub_x, y = [rates_mu_nu[sub_key]] * len(sub_x), name = 'substitutions ' + sub_key, line = dict(color = 'rgba(50,50,50,0.5)', dash = sub_dash), showlegend = False))

# axis ranges on log axes are given as log10 exponents
fig_rates.update_xaxes(type = 'log', title = 'repeat length (nt)', range = [-0.03,1.6], tickvals = [1,5,10,15,20,25,30], gridcolor = 'rgba(0,0,0,0.2)')
fig_rates.update_yaxes(type = 'log', title = 'rate (per nt per generation)', range = [-11.6, -4.8], exponentformat =  'e', dtick = 1, gridcolor = 'rgba(0,0,0,0.2)')
fig_rates.update_layout(margin={'t':20,'l':60,'b':35,'r':10}, height = 320, width = 540)
fig_rates.update_layout(font=dict(family = 'Arial', size = 12), legend=dict(yanchor="top", y=0.97, xanchor="left", x=0.073))

In [None]:
# Export the current `fig_rates` in both vector formats (SVG and PDF).
fig_rates.write_image('plots/fig2a_rates.svg')
fig_rates.write_image('plots/fig2a_rates.pdf')

In [None]:
# Figure S3: the same five rate traces as the single-motif figure, one panel per motif,
# laid out row by row on a 6 x 4 grid.
motif_list = ['A', 'C', 'CG', 'AC', 'AG', 'AT', 'AAC', 'AAG', 'AAT', 'CCG', 'AGC', 'AGG', 'ACT', 'ATC', 'ACC', 'AAAC', 'AAAT', 'AAAG', 'AGAT', 'ACAT', 'AAGG', 'ACAG', 'AATG', 'ATCC']
fig_rates = make_subplots(
    rows = 6, cols = 4, shared_xaxes = True, shared_yaxes = True,
    horizontal_spacing = 0.025, vertical_spacing = 0.05, subplot_titles = motif_list,
    x_title = 'repeat length (nt)', y_title = 'rate (per nt per generation)',
)
# (rate table, legend label, marker symbol, colour index) for each trace of a panel
panel_traces = [
    (denovo_exp_rate_poisson, 'Expansion (de novo)', 'square-open', 0),
    (denovo_con_rate_poisson, 'Contraction (de novo)', 'square-open', 1),
    (denovo_nonexp_rate_poisson, 'Non-motif insertion (de novo)', 'square-open', 2),
    (decode_exp_rate_poisson, 'Expansion/insertion (popSTR)', 'circle-open', 0),
    (decode_con_rate_poisson, 'Contraction (popSTR)', 'circle-open', 1),
]
for panel_idx, motif in enumerate(motif_list):
    # 0-based panel number -> 1-based (row, col) in a 4-column grid
    row_counter, col_counter = divmod(panel_idx, 4)
    row_counter += 1; col_counter += 1
    for rate_table, label, marker_symbol, colour_idx in panel_traces:
        add_rate_trace(rate_table, label, marker_symbol, colornum = colour_idx, motif = motif, markersize = 4, rowcol = (row_counter, col_counter))
fig_rates.update_xaxes(type = 'log', range = [-0.1,2], tickvals = [1,5,10,25])
fig_rates.update_yaxes(type = 'log', range = [-12.1, -2.4], exponentformat =  'e', dtick = 2)
fig_rates.update_layout(
    font = dict(family = 'Arial', size = 12), margin = {'t':20,'l':80,'b':70,'r':10},
    legend = dict(orientation = 'h', xanchor = 'center', x = 0.5, y= -0.08),
    height = 840, width = 640, colorway = plotly.colors.DEFAULT_PLOTLY_COLORS[:3],
)

In [None]:
# Export the current `fig_rates` in both vector formats (SVG and PDF).
fig_rates.write_image('plots/figS3_rates.svg')
fig_rates.write_image('plots/figS3_rates.pdf')

#### Save rate data for Supplementary File SF2

In [None]:
# Median rate per motif for every rate table, keyed by the Excel sheet name it is
# saved under. Zeros and inf are blanked, and rows that end up entirely empty are dropped.
# NOTE(review): unlike add_rate_trace, inf is masked after (not before) taking the median
# and no rows are excluded with .loc[1:] — confirm the saved medians are meant to be
# computed this way.
SF2_data = dict()
for sheet_name, rate_table in [
    ('denovo_expansions', denovo_exp_rate_poisson),
    ('denovo_contractions', denovo_con_rate_poisson),
    ('denovo_non_motif_insertions', denovo_nonexp_rate_poisson),
    ('popSTR_insertions', decode_exp_rate_poisson),
    ('popSTR_contractions', decode_con_rate_poisson),
]:
    motif_medians = rate_table[motif_list].unstack().median(axis=0)
    # second unstack + transpose puts the motifs in the columns
    per_motif_columns = motif_medians.unstack().transpose()
    SF2_data[sheet_name] = per_motif_columns.replace(0, np.nan).replace(np.inf, np.nan).dropna(how='all')

In [None]:
# Write every SF2 table to its own sheet. The context manager saves and closes the
# workbook even if one of the to_excel calls raises (the explicit close() was skipped then).
with pd.ExcelWriter('decode/SF2_rate_data.xlsx') as writer:
    for sheet, frame in SF2_data.items():
        frame.to_excel(writer, sheet_name = sheet)

## Regression on popSTR data

In [None]:
from scipy.stats import linregress
from scipy.stats import t


def tinv(p, df):
    """Two-sided inverse Student's t-distribution.

    p  - total two-tailed probability (e.g. 0.05 for a 95% interval)
    df - degrees of freedom

    Returns the positive critical value, i.e. |t.ppf(p/2, df)|.
    """
    lower_tail_quantile = t.ppf(p / 2, df)
    return abs(lower_tail_quantile)

In [None]:
mult = 8
# de novo expansion rate at index 8 for motif 'A', scaled by `mult` and re-labelled as 9
Lmult = denovo_exp_rate['A'].loc[8:8] * mult
Lmult.index = [9]

# power-law fit (straight line in log10-log10 space) over the popSTR points labelled 12..19
exp_portion = decode_exp_rate['A'].dropna().loc[12:19]
log10_len_e = np.log10(exp_portion.index)
log10_rate_e = np.log10(exp_portion)
linreg_e = linregress(log10_len_e, log10_rate_e)
print(linreg_e)

# mult: fitted value at 9 relative to the de novo rate at 8
fit_at_9_e = 10**linreg_e.intercept * 9**linreg_e.slope
print(fit_at_9_e / denovo_exp_rate['A'][8])

In [None]:
# 95% confidence half-widths for the expansion fit: t critical value (n - 2 degrees of
# freedom) times the standard error. p was 0.001 here, i.e. a 99.9% interval, which
# contradicted both the *_c95_* names and the contraction fit (p = 0.05).
tcrit_e = tinv(0.05, len(exp_portion.index)-2)
slope_c95_e = tcrit_e*linreg_e.stderr
intercept_c95_e = tcrit_e*linreg_e.intercept_stderr
print(str(round(linreg_e.slope, 3)) + '+/-' + str(round(slope_c95_e,3)))

In [None]:
# upper and lower bound of the expansion slope, comma separated
slope_hi_e = round(linreg_e.slope + slope_c95_e, 3)
slope_lo_e = round(linreg_e.slope - slope_c95_e, 3)
print(f"{slope_hi_e},{slope_lo_e}")

In [None]:
# de novo contraction rate at index 8 for motif 'A', scaled by `mult` and re-labelled as 9
Lmult = denovo_con_rate['A'].loc[8:8] * mult
Lmult.index = [9]
# alternative fitting windows that were tried:
#con_portion = decode_con_rate['A'].dropna()
con_portion = decode_con_rate['A'].dropna().loc[12:19]
#con_portion = decode_con_rate['A'].dropna().loc[23:29]
#con_portion = pd.concat([denovo_con_rate['A'].loc[8:8] * mult, decode_con_rate['A'].dropna()])#.loc[12:19]])
# power-law fit: straight line in log10-log10 space
linreg_c = linregress(np.log10(con_portion.index), np.log10(con_portion))
print(con_portion)
print(linreg_c)
# mult: fitted value at 9 relative to the de novo rate at 8. The fit is in log10, so the
# intercept is back-transformed with base 10 (np.e was used here before, which is wrong
# for a log10 regression and inconsistent with the expansion cell).
print(10**linreg_c.intercept * 9**linreg_c.slope / denovo_con_rate['A'][8])

In [None]:
# 95% confidence half-widths for the contraction fit: t critical value (n - 2 degrees of
# freedom) times the standard error of the slope / intercept
tcrit_c = tinv(0.05, len(con_portion.index) - 2)
slope_c95_c = tcrit_c * linreg_c.stderr
intercept_c95_c = tcrit_c * linreg_c.intercept_stderr
print(f"{round(linreg_c.slope, 3)}+/-{round(slope_c95_c, 3)}")

In [None]:
# upper and lower bound of the contraction slope, comma separated
slope_hi_c = round(linreg_c.slope + slope_c95_c, 3)
slope_lo_c = round(linreg_c.slope - slope_c95_c, 3)
print(f"{slope_hi_c},{slope_lo_c}")

In [None]:
# Rate estimates for the default motif ('A') with the two power-law fits and their bands.
fig_rates = go.Figure()
add_rate_trace(denovo_exp_rate_poisson, 'Exp. (de novo)', 'square-open', colornum = 0)
add_rate_trace(denovo_con_rate_poisson, 'Con. (de novo)', 'square-open', colornum = 1)
#add_rate_trace(denovo_nonexp_rate_poisson, 'Ins. (de novo)', 'square-open')
add_rate_trace(decode_exp_rate_poisson, 'Exp. (popSTR)', 'circle-open', colornum = 0)
add_rate_trace(decode_con_rate_poisson, 'Con. (popSTR)', 'circle-open', colornum = 1)
fig_rates.update_xaxes(type = 'log', title = 'repeat length (nt)', range = [-0.03,1.8], gridcolor = 'rgba(0,0,0,0.2)')
fig_rates.update_yaxes(type = 'log', title = 'rate (per generation)', range = [-11.6, -4.5], exponentformat =  'e', dtick = 1, gridcolor = 'rgba(0,0,0,0.2)')
#fig_rates.update_layout(legend=dict(yanchor="top", y=0.97, xanchor="left", x=0.075))

fit_lengths = list(range(5,60))

def _power_law(intercept, slope):
    """Evaluate 10**intercept * L**slope for every L in fit_lengths (indexed by L)."""
    return pd.Series([10**intercept * i**slope for i in fit_lengths], index = fit_lengths)

def _add_fit_band(upper, mean, lower, color, group):
    """Draw a fitted line with a shaded band: invisible upper edge, visible mean, invisible lower edge."""
    # trace order matters: 'tonexty' fills towards the trace added immediately before
    fig_rates.add_trace(go.Scatter(x = upper.index, y = upper, line = dict(color = color, width = 0), name = 'CI', showlegend = False, legendgroup = group))
    fig_rates.add_trace(go.Scatter(x = mean.index, y = mean, line = dict(color = color), name = group, legendgroup = group, fill = 'tonexty'))
    fig_rates.add_trace(go.Scatter(x = lower.index, y = lower, line = dict(color = color, width = 0), name = 'CI', showlegend = False,  legendgroup = group, fill = 'tonexty'))

# Band edges pair the steeper slope with the lower intercept and vice versa.
# NOTE(review): presumably chosen because slope and intercept errors are anti-correlated — confirm.
mean_e = _power_law(linreg_e.intercept, linreg_e.slope)
upper_e = _power_law(linreg_e.intercept - intercept_c95_e, linreg_e.slope + slope_c95_e)
lower_e = _power_law(linreg_e.intercept + intercept_c95_e, linreg_e.slope - slope_c95_e)
_add_fit_band(upper_e, mean_e, lower_e, plotly.colors.DEFAULT_PLOTLY_COLORS[0], 'exp_regress')

mean_c = _power_law(linreg_c.intercept, linreg_c.slope)
upper_c = _power_law(linreg_c.intercept - intercept_c95_c, linreg_c.slope + slope_c95_c)
lower_c = _power_law(linreg_c.intercept + intercept_c95_c, linreg_c.slope - slope_c95_c)
# the contraction line was previously labelled 'exp_regress' (copy-paste slip)
_add_fit_band(upper_c, mean_c, lower_c, plotly.colors.DEFAULT_PLOTLY_COLORS[1], 'con_regress')

# final layout (an earlier margin/height/width setting was overridden by these values and is dropped)
fig_rates.update_layout(font=dict(family = 'Arial', size = 14), margin={'t':20,'l':80,'b':40,'r':10}, height = 300, width = 640, colorway = plotly.colors.DEFAULT_PLOTLY_COLORS[:2])

In [None]:
# Export the current `fig_rates` as a static PDF.
fig_rates.write_image('plots/fig_rates_regression.pdf')

In [None]:
# Displayed as cell output: (upper, mean, lower) expansion fit at 9, each divided by the
# de novo expansion rate at 8 for motif 'A'.
# NOTE(review): the fit is read at 9 but the de novo rate at 8 — confirm the offset is intended.
upper_e[9] / denovo_exp_rate['A'][8], mean_e[9] / denovo_exp_rate['A'][8], lower_e[9] / denovo_exp_rate['A'][8]

In [None]:
# Displayed as cell output: (upper, mean, lower) contraction fit at 9, each divided by the
# de novo contraction rate at 8 for motif 'A'.
# NOTE(review): the fit is read at 9 but the de novo rate at 8 — confirm the offset is intended.
upper_c[9] / denovo_con_rate['A'][8], mean_c[9] / denovo_con_rate['A'][8], lower_c[9] / denovo_con_rate['A'][8]