# Large, three-generation CEPH families reveal post-zygotic mosaicism and variability in germline mutation accumulation

### Thomas A. Sasani, Brent S. Pedersen, Ziyue Gao, Lisa M. Baird, Molly Przeworski, Lynn B. Jorde, Aaron R. Quinlan

### Before recreating the figures, import the necessary libraries and set global plot aesthetics.

In [None]:
import matplotlib
import matplotlib.pyplot as plt
from collections import defaultdict, OrderedDict
import scipy.stats as ss
from statsmodels.stats import multitest as sms
import seaborn as sns
import numpy as np
import pandas as pd
%matplotlib inline

### Read in files containing DNMs identified in the F1 and F2 generations, as well as putative gonosomal and post-PGCS mosaic DNMs.

### Additionally, read in "summary" files for the F1 and F2 generations.

In [None]:
# per-DNM tables (tab-separated): one row per DNM, annotated with sample ID,
# reference allele, alternate allele, depth, etc.
f1_dnms, f2_dnms, gm_dnms, pz_dnms = (
    pd.read_csv(path, sep="\t")
    for path in (
        "../data/f1.dnms.txt",
        "../data/f2.dnms.txt",
        "../data/post-pgcs.dnms.txt",
        "../data/gonosomal.dnms.txt",
    )
)

# per-sample summary tables (comma-separated): summary statistics for each
# sample, including its total number of DNMs, its callable fraction, etc.
f1_simple, f2_simple = (
    pd.read_csv(path)
    for path in (
        "../data/f1.dnms.summary.csv",
        "../data/f2.dnms.summary.csv",
    )
)

### Figure 1. Estimating the rate of germline mutation using multigenerational CEPH/Utah pedigrees
> B) Total numbers of DNMs (both SNVs and indels) identified across F1 CEPH/Utah individuals and stratified by parent-of-origin.

In [None]:
def dnm_distrib(df):
    """
    Draw a swarm plot overlaid with a box plot showing
    how DNM counts are distributed across individuals,
    with one category each for the total, paternal and
    maternal counts.

    df: pd.DataFrame() object with 'all_dnms', 'dad_dnms'
        and 'mom_dnms' columns
    """

    # create the figure and the single axes everything is drawn on
    fig, ax = plt.subplots(figsize=(10,8))

    # pull out the per-individual counts for each parent of origin
    paternal, maternal, total = (
        df[column].values for column in ('dad_dnms', 'mom_dnms', 'all_dnms')
    )
    # x-axis order: total first, then paternal, then maternal
    counts_by_origin = [total, paternal, maternal]

    # both seaborn calls draw on the current axes (the one created above)
    sns.swarmplot(
        data=counts_by_origin,
        palette=['k', "#66c2a5", "#fc8d62"],
        size=10,
        edgecolor='w',
        linewidth=0.25,
    )
    sns.boxplot(data=counts_by_origin, color='w')

    # axis labelling
    ax.set_ylabel('Number of DNMs')
    ax.set_xlabel('Parent of origin')
    ax.set_xticks(np.arange(3))
    ax.set_xticklabels(('Total', 'Paternal', 'Maternal'))

    # fix the y-range, then trim the spines to the tick range
    ax.set_ylim(-10, 120)
    sns.despine(ax=ax, trim=True)

In [None]:
# Figure 1B: box + swarm plots of DNM counts, drawn from the F1 summary table
dnm_distrib(f1_simple)

### Figure 2. Effects of parental age and sex on autosomal DNM counts and mutation types in the F1 generation
> B) Mutation spectra in phased autosomal DNMs identified in the F1 generation

In [None]:
def _count_mutation_types(df, indels):
    """
    Count autosomal DNMs of each mutation type in one dataframe.

    df: pd.DataFrame() object with 'chrom' and 'mut' columns
    indels: boolean, count rows whose 'mut' is 'indel' (True) or skip them (False)

    returns a dict of {mutation type: count}, keyed in first-seen order
    """
    counts = defaultdict(int)
    # walk the two columns we need directly, rather than paying for
    # a full pd.Series per row with iterrows()
    for chrom, mut in zip(df['chrom'], df['mut']):
        if chrom == 'X': continue # only look at autosomes
        if not indels and mut == 'indel': continue
        counts[mut] += 1
    return dict(counts)


def plot_mutation_spectrum(df1, df2, labels=("a", "b"), colors=("#66c2a5", "#fc8d62"), indels=False, save=None):
    """
    Plot a comparison of mutation types in two
    sets of DNMs, and print a per-mutation-type
    test of enrichment in either set.

    df1: pd.DataFrame() object with 'chrom' and 'mut' columns
    df2: pd.DataFrame() object with 'chrom' and 'mut' columns
    labels: 2-element sequence containing the two dataset names
    colors: manually pass in colors for each of the two datasets
    indels: boolean, include indels or not (some datasets are SNV only)
    save: if not None, basename (no extension) under ../figs/ to which
          the figure is written as both .eps and .png

    Rows with chrom == 'X' are always ignored. Mutation types present in
    only one of the two datasets are tested and plotted with a count of
    zero in the other dataset.

    Note: ss.chi2_contingency raises a ValueError when an expected
    frequency in the 2x2 table is zero (e.g. one dataset has no rows left
    after filtering while the other does).
    """

    # make the figure object
    f, ax = plt.subplots(figsize=(12,8))

    # calculate numbers of each mutation type from each
    # of the two dataframes (we'll call them "a" and "b")
    a = _count_mutation_types(df1, indels)
    b = _count_mutation_types(df2, indels)
    a_total, b_total = sum(a.values()), sum(b.values())

    # every mutation type seen in either dataset: types from "a" in
    # first-seen order, followed by types that only occur in "b"
    mut_types = list(a) + [change for change in b if change not in a]

    # calculate significance of enrichment of particular
    # mutations in either of the two datasets
    print ("Mutation enrichment:")

    unadj_p_vals = []
    for change in mut_types:
        # "fore" = count of this mutation type, "back" = count of all others
        a_fore = a.get(change, 0)
        b_fore = b.get(change, 0)
        a_back = a_total - a_fore
        b_back = b_total - b_fore

        # chi-square test of independence; the first return value is the
        # chi-square statistic, which we don't need
        _, p, _, _ = ss.chi2_contingency([ [a_back, a_fore],
                                           [b_back, b_fore] ])

        unadj_p_vals.append((change, p))

    # Benjamini-Hochberg FDR (only the reject/accept flags are used)
    signif, _ = sms.fdrcorrection([p_val for _, p_val in unadj_p_vals])

    # a leading "*" marks mutation types that are significant after FDR
    # correction; the p-value that is printed is the unadjusted one
    for (change, u_p), sig in zip(unadj_p_vals, signif):
        if sig:
            print ("* {}: unadjusted p-value = {}".format(change, u_p))
        else:
            print ("{}: unadjusted p-value = {}".format(change, u_p))

    # use one shared, sorted list of mutation types for both datasets so
    # that the two sets of bars always line up
    ordered_types = sorted(mut_types)
    a_counts = np.array([a.get(change, 0) for change in ordered_types])
    b_counts = np.array([b.get(change, 0) for change in ordered_types])

    # convert raw counts to fractions
    a_frac = a_counts / float(a_total)
    b_frac = b_counts / float(b_total)

    # side-by-side bars: "a" at idx, "b" shifted right by one bar width
    idx = np.arange(len(ordered_types))
    width = 0.425
    ax.bar(idx, a_frac, width, label=labels[0], edgecolor='w', lw=0.5, color=colors[0])
    ax.bar(idx + width, b_frac, width, label=labels[1], edgecolor='w', lw=0.5, color=colors[1])

    # center each tick between its pair of bars
    ax.set_xticks(idx + width / 2.)
    ax.set_xticklabels(ordered_types)
    ax.set_ylabel("Fraction")
    ax.set_xlabel("Mutation type")
    ax.legend()
    sns.despine(ax=ax)
    if save is not None:
        f.savefig('../figs/{}.eps'.format(save), bbox_inches="tight")
        f.savefig('../figs/{}.png'.format(save), dpi=200, bbox_inches="tight")

In [None]:
# Figure 2B: split the F1 DNMs on the `phase` column and compare the two parental spectra
plot_mutation_spectrum(f1_dnms.query('phase == "paternal"'), # first set of DNMs
                       f1_dnms.query('phase == "maternal"'), # second set of DNMs
                       labels=['paternal', 'maternal'],      # labels for the first and second sets of DNMs
                       indels=True)                          # include indels as a mutation "class"

### Figure 4: Identification of post-PGCS mosaicism in the F1 generation
> B) Comparison of mutation spectra in F2 post-PGCS variants and F2 germline *de novo* variants

In [None]:
# Figure 4B: `indels` is left at its default (False), so rows whose `mut` is 'indel' are skipped
plot_mutation_spectrum(f2_dnms,                                             # all F2 germline DNMs, excluding post-PGCS DNMs
                       gm_dnms.drop_duplicates(['chrom', 'start', 'end']),  # F2 post-PGCS DNMs (note that we only count each DNM once)
                       labels=['F2 germline', 'F1 post-PGCS'],              # labels for the first and second sets of DNMs
                       colors=['dodgerblue', 'firebrick'])                  # color scheme

### Figure 5: Identification of gonosomal mutations in the F1 generation
> B) Comparison of mutation spectra in paternal and maternal gonosomal variants.

In [None]:
# Figure 5B: default colors are used, and `indels` is left at its default (False),
# so rows whose `mut` is 'indel' are skipped
plot_mutation_spectrum(pz_dnms.query('phase == "paternal"'), # paternal gonosomal DNMs in the F1 generation
                       pz_dnms.query('phase == "maternal"'), # maternal gonosomal DNMs in the F1 generation
                       labels=['paternal', 'maternal'])      # labels for the first and second sets of DNMs

> C) Comparison of mutation spectra in F1 germline DNMs (non-gonosomal) and putative gonosomal mutations in the F1 generation.

In [None]:
# Figure 5C: `indels` is left at its default (False), so rows whose `mut` is 'indel' are skipped
plot_mutation_spectrum(f1_dnms,                                # all F1 germline DNMs (excluding gonosomals)
                       pz_dnms,                                # all F1 gonosomal DNMs
                       labels=['F1 germline', 'F1 gonosomal'], # labels for the first and second sets of DNMs
                       colors=["dodgerblue", "firebrick"])     # color scheme

### Supplementary Figure 4: Comparison of mutation spectra in children born to older or younger parents
> A: Mutation spectra in children born to fathers younger or older than 29.2y (median)

In [None]:
# Print the median parental ages at conception, used as split points in the cells below.
# NOTE(review): f1_dnms holds one row per DNM, so these medians are taken over DNM rows
# (an individual with more DNMs contributes more values), not over individuals —
# confirm this is the intended definition.
print("median age of fathers: {}".format(np.median(f1_dnms['paternal_age_at_conception'])))
print("median age of mothers: {}".format(np.median(f1_dnms['maternal_age_at_conception'])))

In [None]:
# Supplementary Figure 4A: split DNMs at a paternal age of 29.2 (hard-coded median);
# rows at exactly 29.2 fall in the "older" set
plot_mutation_spectrum(f1_dnms.query('paternal_age_at_conception < 29.2'), 
                       f1_dnms.query('paternal_age_at_conception >= 29.2'), 
                       labels=['younger fathers', 'older fathers'],
                       colors=["goldenrod", "forestgreen"],
                       indels=True)

> B: Mutation spectra in children born to mothers younger or older than 25.7y (median)

In [None]:
# Supplementary Figure 4B: split DNMs at a maternal age of 25.7 (hard-coded median);
# rows at exactly 25.7 fall in the "older" set
plot_mutation_spectrum(f1_dnms.query('maternal_age_at_conception < 25.7'), 
                       f1_dnms.query('maternal_age_at_conception >= 25.7'), 
                       labels=['younger mothers', 'older mothers'],
                       colors=["goldenrod", "forestgreen"],
                       indels=True)

In [None]:
# 25th and 75th percentiles of parental age at conception.
# NOTE(review): f1_dnms holds one row per DNM, so these percentiles are taken over DNM rows,
# not over individuals — confirm this is the intended definition.
upper_quartile_paternal = np.percentile(f1_dnms['paternal_age_at_conception'], 75)
lower_quartile_paternal = np.percentile(f1_dnms['paternal_age_at_conception'], 25)

upper_quartile_maternal = np.percentile(f1_dnms['maternal_age_at_conception'], 75)
lower_quartile_maternal = np.percentile(f1_dnms['maternal_age_at_conception'], 25)

# printed as (lower, upper) for fathers, then for mothers
print(lower_quartile_paternal, upper_quartile_paternal)
print(lower_quartile_maternal, upper_quartile_maternal)

> C: Mutation spectra in children born to fathers younger than 26.4y or older than 34y (lower and upper quartile)

In [None]:
# Supplementary Figure 4C: compare the two tails of paternal age (hard-coded 26.4 and 34.0);
# rows with a paternal age strictly between the two thresholds are in neither set
plot_mutation_spectrum(f1_dnms.query('paternal_age_at_conception <= 26.4'), 
                       f1_dnms.query('paternal_age_at_conception >= 34.0'), 
                       labels=['younger fathers', 'older fathers'],
                       colors=["goldenrod", "forestgreen"],
                       indels=True)

> D: Mutation spectra in children born to mothers younger than 22.5y or older than 31.4y (lower and upper quartile)

In [None]:
# Supplementary Figure 4D: compare the two tails of maternal age (hard-coded 22.5 and 31.4);
# rows with a maternal age strictly between the two thresholds are in neither set
plot_mutation_spectrum(f1_dnms.query('maternal_age_at_conception <= 22.5'), 
                       f1_dnms.query('maternal_age_at_conception >= 31.4'), 
                       labels=['younger mothers', 'older mothers'],
                       colors=["goldenrod", "forestgreen"],
                       indels=True)