# Large, three-generation CEPH families reveal post-zygotic mosaicism and variability in germline mutation accumulation

### Thomas A. Sasani, Brent S. Pedersen, Ziyue Gao, Lisa M. Baird, Molly Przeworski, Lynn B. Jorde, Aaron R. Quinlan

### Read in files containing DNMs identified in the second and third generations, as well as putative gonosomal and post-PGCS mosaic DNMs.

In [None]:
# Load the four DNM summary tables used throughout this notebook.
# All paths are relative to the notebook's working directory.
gen2 <- read.csv("../data/second_gen.dnms.summary.csv")
gen3 <- read.csv("../data/third_gen.dnms.summary.csv")
gonosomal <- read.csv("../data/gonosomal.dnms.summary.csv")
mosaic <- read.csv("../data/post-pgcs.dnms.summary.csv")

### Figure 2. Effects of parental age and sex on autosomal DNM counts and mutation types in the second generation
> A) Numbers of phased paternal and maternal de novo variants as a function of parental age at birth.

In [None]:
# Scatter paternal and maternal DNM counts against parental age and overlay
# an identity-link Poisson regression line, with a +/- 1.96 standard error
# band, for each parent.
#
# Args:
#   df: data frame with columns `dad_age`, `dad_dnms`, `mom_age`, `mom_dnms`.
#   adjust_ar: if TRUE, derive the aspect ratio from the smallest age and the
#     smallest DNM count in `df`; otherwise a fixed ratio of 2.25 is used.
#   alpha: transparency of the two confidence ribbons.
#
# Returns:
#   A ggplot object.
plot_dnms <- function(df, adjust_ar = FALSE, alpha = 1.) {

    library(ggplot2)
    library(cowplot)
    library(MASS)

    dad_colour <- "#66c2a5"
    mom_colour <- "#fc8d62"

    # one regression per parent: DNM count as a linear function of age
    dad_fit <- glm(dad_dnms ~ dad_age, data = df, family = poisson(link = "identity"))
    mom_fit <- glm(mom_dnms ~ mom_age, data = df, family = poisson(link = "identity"))

    # fitted means and their standard errors at the observed ages
    dad_pred <- predict(dad_fit, type = "response", se.fit = TRUE)
    mom_pred <- predict(mom_fit, type = "response", se.fit = TRUE)

    # confidence band: fitted value +/- 1.96 standard errors
    df$dad_ci_lo <- dad_pred$fit - 1.96 * dad_pred$se.fit
    df$dad_ci_hi <- dad_pred$fit + 1.96 * dad_pred$se.fit
    df$mom_ci_lo <- mom_pred$fit - 1.96 * mom_pred$se.fit
    df$mom_ci_hi <- mom_pred$fit + 1.96 * mom_pred$se.fit

    # extremes across both parents, used for axis limits and aspect ratio
    min_age <- min(df$dad_age, df$mom_age)
    max_age <- max(df$dad_age, df$mom_age)
    min_dnm <- min(df$dad_dnms, df$mom_dnms)
    max_dnm <- max(df$dad_dnms, df$mom_dnms)

    # leave 15 counts of headroom on the Y axis, but only when the largest
    # count is itself at least 15
    if (max_dnm >= 15) {
        max_dnm <- max_dnm + 15
    }

    # aspect ratio is purely cosmetic: either scaled from the data minima
    # or fixed at 2.25
    adjust <- if (adjust_ar) (0.075 * min_age / min_dnm) else 2.25

    # data for the regression lines: `df` plus the fitted values
    dad_line <- cbind(df, pred_d = dad_pred$fit)
    mom_line <- cbind(df, pred_m = mom_pred$fit)

    ggplot(df) +
        # raw observations
        geom_point(aes(x = dad_age, y = dad_dnms), size = 3.5, shape = 21,
                   fill = dad_colour, colour = "white", stroke = 0.25) +
        geom_point(aes(x = mom_age, y = mom_dnms), size = 3.5, shape = 21,
                   fill = mom_colour, colour = "white", stroke = 0.25) +
        # fitted regression lines
        geom_line(data = dad_line, aes(x = dad_age, y = pred_d), colour = dad_colour) +
        geom_line(data = mom_line, aes(x = mom_age, y = pred_m), colour = mom_colour) +
        # confidence bands
        geom_ribbon(aes(x = dad_age, ymin = dad_ci_lo, ymax = dad_ci_hi),
                    alpha = alpha, fill = dad_colour) +
        geom_ribbon(aes(x = mom_age, ymin = mom_ci_lo, ymax = mom_ci_hi),
                    alpha = alpha, fill = mom_colour) +
        # labels, font sizes, limits
        xlab("Parental age at birth") +
        ylab("Number of DNMs") +
        theme(text = element_text(size = 16),
              axis.text.x = element_text(size = 16),
              axis.text.y = element_text(size = 16)) +
        xlim(min_age, max_age) +
        ylim(0, max_dnm) +
        coord_fixed(adjust)
}

In [None]:
# Figure 2A: phased paternal and maternal DNM counts vs. parental age in the
# second generation. `TRUE` is spelled out because `T` is an ordinary
# variable that can be reassigned.
plot_dnms(gen2,                # data frame of second-generation DNMs
          adjust_ar=TRUE,      # adjust the aspect ratio (for aesthetics only)
          alpha=0.25)          # transparency value for the `geom_ribbon`

#### Get the slopes of each regression, as well as 95% CIs

In [None]:
# Fit the paternal and maternal age-effect regressions and print their
# summaries and coefficient confidence intervals.
#
# Args:
#   df: data frame with columns `dad_dnms`, `dad_age`, `mom_dnms`, `mom_age`.
#   level: confidence level for the intervals (default 0.95).
#
# Prints the `summary()` of each identity-link Poisson GLM, followed by the
# confidence intervals of each model's coefficients. The value of the final
# `print()` call (the maternal confidence-interval matrix) is returned
# invisibly.
get_model_params <- function(df, level = 0.95) {

    d <- glm(dad_dnms ~ dad_age, data=df, family=poisson(link="identity"))
    m <- glm(mom_dnms ~ mom_age, data=df, family=poisson(link="identity"))

    # summaries of each model
    print(summary(d))
    print(summary(m))

    # confidence intervals. `level` has to be given to `confint()` itself;
    # an argument handed to `print()` never reaches the interval calculation
    print(confint(d, level=level))
    print(confint(m, level=level))
}

In [None]:
# print regression summaries and confidence intervals for the paternal and
# maternal age effects in the second-generation table
get_model_params(gen2)

### Figure 3. Parental age effects on autosomal germline mutation counts vary significantly among CEPH/Utah families

> C: Total number of autosomal DNMs vs. paternal age at birth for each of the 40 CEPH families (i.e., combinations of second-generation parents and their third-generation children). 

> D: The slope of each family's Poisson regression +/- 95% confidence intervals, sorted in ascending order from top to bottom.

In [None]:
library(ggplot2)
library(cowplot)
library(MASS)

# Build `new_df`: the rows of `gen3` with four extra columns describing the
# paternal age effect in each row's family -- the lower and upper 95% CI
# bounds of the slope, the slope itself, and the intercept.
new_df <- data.frame()
for (sp in split(gen3, as.factor(gen3$family_id))) {
    # identity-link Poisson regression fitted to this family's children only
    m <- glm(autosomal_dnms ~ dad_age, data = sp, family = poisson(link = "identity"))
    s <- summary(m)
    ci <- confint(m, level = 0.95)
    # `ci` and the coefficient table are indexed column-major: element 2 is
    # the `dad_age` row of the first column, element 4 the `dad_age` row of
    # the second column
    sp$slope_ci_lo <- ci[[2]]
    sp$slope_ci_hi <- ci[[4]]
    sp$slope <- s$coefficients[[2]]
    sp$intercept <- s$coefficients[[1]]
    new_df <- rbind(new_df, sp)
}

# order rows by ascending family slope
sorted_df <- new_df[order(new_df$slope), ]

# default colours for points and confidence bands ...
sorted_df$pt_color <- "azure4"
sorted_df$ci_color <- "grey"

# ... with families "24_C" and "16" highlighted
sorted_df$pt_color[sorted_df$family_id == "24_C"] <- "dodgerblue"
sorted_df$ci_color[sorted_df$family_id == "24_C"] <- "dodgerblue"

sorted_df$pt_color[sorted_df$family_id == "16"] <- "firebrick"
sorted_df$ci_color[sorted_df$family_id == "16"] <- "firebrick"

# one Poisson regression over all third-generation samples, with a
# separate paternal age slope per family (age-by-family interaction)
m <- glm(autosomal_dnms ~ dad_age * family_id, data = sorted_df, family = poisson(link = "identity"))

# fitted values and standard errors for every row
m_predict <- predict(m, sorted_df, type = "response", se.fit = TRUE)

# confidence band: fitted value +/- 1.96 standard errors
sorted_df$ci_lo <- m_predict$fit - 1.96 * m_predict$se.fit
sorted_df$ci_hi <- m_predict$fit + 1.96 * m_predict$se.fit

# factor whose levels follow the slope-sorted row order, so that facets
# built from it appear in that order
sorted_df$facet_order <- factor(sorted_df$family_id, levels = unique(sorted_df$family_id))

# plot `dad_age` vs. `autosomal_dnms` for each family, and
# add regression lines + confidence bands
# (one facet per family; facets follow the levels of `facet_order`, i.e.
# the slope-sorted order of `sorted_df`)
# NOTE(review): `ci_color` and `pt_color` are passed as per-row vectors
# outside of `aes()`. This presumably relies on ggplot2 keeping the layer's
# rows in the same order as `sorted_df` -- confirm the highlight colours
# land on the intended families.
p1 <- ggplot(sorted_df, aes(x = dad_age, y = autosomal_dnms)) +
      facet_wrap(~facet_order, nrow=10) +
      # plot 95% CI bands, regression lines, and raw data points
      # (bands and lines come from the age-by-family interaction model
      # stored in `m_predict`, not from the per-family fits)
      geom_ribbon(data = sorted_df, aes(x=dad_age, ymin=ci_lo, ymax=ci_hi), fill=sorted_df$ci_color, alpha=1) +
      geom_line(data=cbind(sorted_df, p=m_predict$fit), aes(x=dad_age, y=p), col="white", size=0.1) +
      geom_point(pch=21, fill=sorted_df$pt_color, size=2, col='white', stroke=0.15) +
      # plot aesthetics, labels, font sizes
      xlab("Paternal age at birth") +
      ylab("Number of autosomal DNMs") +
      # fixed axis limits shared by every facet
      ylim(0,135) +
      xlim(15,50) +
      theme(text = element_text(size = 12)) +
      # hide the facet strip labels
      theme(strip.text = element_blank()) +
      # negative spacing pulls neighbouring panels together
      theme(panel.spacing.y = unit(-5, "pt")) +
      theme(panel.spacing.x = unit(-5, "pt")) +
      theme(axis.text.x = element_text(size = 9)) +
      theme(axis.text.y = element_text(size = 9)) +
      theme(axis.line = element_line(colour = 'black', size = 0.5)) +
      theme(axis.ticks = element_line(colour = "black", size = 0.5)) +
      # adjust the aspect ratio of the plot (also for aesthetics)
      coord_fixed(ratio=0.15)

# keep only the first row of each family, so that the per-family slope
# plot draws a single point per family
sorted_df <- sorted_df[!duplicated(sorted_df$family_id), ]
# descending rank: the first (smallest-slope) family gets the largest value
sorted_df$order <- nrow(sorted_df):1

# paternal age effect estimated from all third-generation samples at once,
# ignoring family membership
ovl_age_effect <- glm(autosomal_dnms ~ dad_age, data = gen3, family = poisson(link = "identity"))
ovl_slope <- summary(ovl_age_effect)$coefficients[[2]]
ci <- confint(ovl_age_effect, level = 0.95)
# elements 2 and 4 of the CI matrix are the slope's lower and upper bounds
ovl_slope_ci_lo <- ci[[2]]
ovl_slope_ci_hi <- ci[[4]]

print("Overall slope estimate in the F2")
print(ovl_slope)
print(ovl_slope_ci_lo)
print(ovl_slope_ci_hi)

# plot the slope estimate (+/- 95% CI) for each family separately
# (`sorted_df` holds one row per family at this point; `order` places the
# family with the smallest slope at the top of the Y axis)
# NOTE(review): `ci_color` is passed as a per-row vector outside of `aes()`;
# this presumably relies on ggplot2 preserving the row order of `sorted_df`.
p2 <- ggplot(sorted_df) +
      # plot the overall slope, estimated using all samples together, regardless of family ID
      geom_vline(xintercept=ovl_slope, color = "black", linetype="dashed") +
      # plot the slope estimate for each family
      geom_point(aes(y=order, x=slope), col=sorted_df$ci_color) +
      # label each Y position with the corresponding family ID
      scale_y_continuous(breaks=sorted_df$order, labels=sorted_df$family_id) +
      # plot 95% CI for each family
      geom_errorbarh(aes(y=order, xmax = slope_ci_hi, xmin = slope_ci_lo), col=sorted_df$ci_color) +
      # plot aesthetics, labels, font sizes
      theme(axis.text.x = element_text(size = 10)) +
      theme(axis.text.y = element_text(size = 9)) +
      theme(text = element_text(size = 12)) +
      ylab('Family ID') +
      xlab('Additional DNMs per year of \npaternal age (slope) +/- 95% CI') +
      # vertical grid lines only
      background_grid(major="x", minor="none") + 
      coord_fixed(0.4)

# display both panels
p1
p2 

### Figure 4: Identification of post-PGCS mosaicism in the second generation
> C) Number of post-PGCS mosaic DNMs per sample as a function of paternal age at birth.


In [None]:
# Plot the number of DNMs in `mosaic` against paternal age, with an
# identity-link Poisson regression line and a +/- 1.96 standard error band.
#
# Args:
#   mosaic: data frame with columns `snv_dnms` and `dad_age`.
#   alpha: transparency of the confidence ribbon.
#
# Prints the summary of the fitted model and returns a ggplot object.
mosaic_number_vs_age <- function(mosaic, alpha = 1.) {

    library(ggplot2)
    library(cowplot)

    # regress DNM count on paternal age
    age_fit <- glm(snv_dnms ~ dad_age, data = mosaic, family = poisson(link = "identity"))
    age_pred <- predict(age_fit, mosaic, type = "response", se.fit = TRUE)

    # lower and upper edges of the confidence band
    mosaic$ci_lo <- age_pred$fit - 1.96 * age_pred$se.fit
    mosaic$ci_hi <- age_pred$fit + 1.96 * age_pred$se.fit

    # data for the regression line: `mosaic` plus the fitted values
    line_df <- cbind(mosaic, p = age_pred$fit)

    fig <- ggplot(mosaic, aes(x = dad_age, y = snv_dnms)) +
        # band first, then the fitted line, then the observations on top
        geom_ribbon(aes(ymin = ci_lo, ymax = ci_hi), alpha = alpha, fill = "firebrick") +
        geom_line(data = line_df, aes(x = dad_age, y = p), colour = "firebrick", size = 1) +
        geom_point(fill = "black", colour = "white", shape = 21, size = 3.5, stroke = 0.25) +
        # labels and font sizes
        xlab("Paternal age at birth") +
        ylab("Number of sample's DNMs that is shared with siblings") +
        theme(text = element_text(size = 14),
              axis.text.x = element_text(size = 14),
              axis.text.y = element_text(size = 14)) +
        coord_fixed(3)

    print(summary(age_fit))

    fig
}

In [None]:
# Figure 4C: draw the plot and print the fitted model's summary
mosaic_number_vs_age(mosaic,           # combined dataframe containing both post-PGCS and F2 germline DNMs
                     alpha=0.25)       # alpha value for `geom_ribbon`

> 4D: Comparison of germline and post-PGCS DNM age effects

In [None]:
# Overlay the paternal-age regressions for germline and post-PGCS DNMs.
#
# Args:
#   gen3: data frame with columns `snv_dnms` and `dad_age`; used for the
#     germline fit (drawn in blue).
#   mosaic: data frame with the same two columns; used for the post-PGCS
#     fit (drawn in red).
#
# Returns a ggplot object showing each fitted line with a band of
# +/- 1.96 standard errors around it.
germline_vs_pgcs <- function(gen3, mosaic) {

    library(ggplot2)
    library(cowplot)

    # post-PGCS DNMs: identity-link Poisson regression on paternal age
    pgcs_fit <- glm(snv_dnms ~ dad_age, family = poisson(link = "identity"), data = mosaic)
    pgcs_pred <- predict(pgcs_fit, mosaic, type = "response", se.fit = TRUE)

    mosaic$mosaic_ci_lo <- pgcs_pred$fit - 1.96 * pgcs_pred$se.fit
    mosaic$mosaic_ci_hi <- pgcs_pred$fit + 1.96 * pgcs_pred$se.fit

    # germline DNMs: same model, fitted to the third-generation table
    germ_fit <- glm(snv_dnms ~ dad_age, family = poisson(link = "identity"), data = gen3)
    germ_pred <- predict(germ_fit, gen3, type = "response", se.fit = TRUE)

    gen3$germline_ci_lo <- germ_pred$fit - 1.96 * germ_pred$se.fit
    gen3$germline_ci_hi <- germ_pred$fit + 1.96 * germ_pred$se.fit

    # data for the two regression lines
    germ_line <- cbind(gen3, p = germ_pred$fit)
    pgcs_line <- cbind(mosaic, p = pgcs_pred$fit)

    ggplot() +
        # germline fit and band
        geom_line(data = germ_line, aes(x = dad_age, y = p), colour = "dodgerblue") +
        geom_ribbon(data = gen3, aes(x = dad_age, ymin = germline_ci_lo, ymax = germline_ci_hi),
                    fill = "dodgerblue", alpha = 0.25) +
        # post-PGCS fit and band
        geom_line(data = pgcs_line, aes(x = dad_age, y = p), colour = "firebrick") +
        geom_ribbon(data = mosaic, aes(x = dad_age, ymin = mosaic_ci_lo, ymax = mosaic_ci_hi),
                    fill = "firebrick", alpha = 0.25) +
        # labels, font sizes, limits
        xlab("Paternal age at birth") +
        ylab("Number of DNMs") +
        theme(text = element_text(size = 14),
              axis.text.x = element_text(size = 14),
              axis.text.y = element_text(size = 14)) +
        xlim(18, 50) +
        coord_fixed(0.1)
}

In [None]:
# Figure 4D: overlay the fitted paternal age effects from `gen3` and `mosaic`
germline_vs_pgcs(gen3, mosaic)

> 4E: Fraction of post-PGCS DNMs as a function of paternal age

In [None]:
# Plot, per sample, the fraction of SNV DNMs that come from the `mosaic`
# table against paternal age, with a log-linear fit drawn on top.
#
# Args:
#   gen3: data frame with columns `sample_id`, `snv_dnms`, `dad_age`.
#   mosaic: data frame with columns `sample_id`, `snv_dnms`, `dad_age`.
#
# Prints the summary of the linear model of log(fraction) on paternal age
# and returns a ggplot object.
mosaic_fraction_vs_age <- function(gen3, mosaic) {

    library(ggplot2)
    library(cowplot)

    # Join the two tables on sample ID. Columns present in both tables get a
    # suffix: `.x` for the `gen3` copy, `.y` for the `mosaic` copy.
    combined <- merge(gen3, mosaic, by = "sample_id")

    # fraction of each sample's SNV DNMs contributed by the `mosaic` table
    combined$mosaic_fraction <- with(combined, snv_dnms.y / (snv_dnms.y + snv_dnms.x))

    # total SNV DNM count per sample across both tables
    combined$total_dnms <- with(combined, snv_dnms.x + snv_dnms.y)

    # linear model of the log fraction on paternal age
    frac_fit <- lm(log(mosaic_fraction) ~ dad_age.x, data = combined)

    print(summary(frac_fit))

    # back-transform the fitted log fractions for plotting
    fitted_frac <- exp(predict(frac_fit, combined, type = "response"))
    line_df <- cbind(combined, p_frac = fitted_frac)

    ggplot(combined, aes(x = dad_age.x, y = mosaic_fraction)) +
        geom_point(shape = 21, size = 2, colour = "white", fill = "black", stroke = 0.25) +
        geom_line(data = line_df, aes(x = dad_age.x, y = p_frac), colour = "firebrick", size = 1) +
        # labels, font sizes, limits
        xlab("Paternal age at birth") +
        ylab("Fraction of sample's DNMs that is\n shared with siblings") +
        theme(text = element_text(size = 14),
              axis.text.x = element_text(size = 14),
              axis.text.y = element_text(size = 14)) +
        xlim(18, 50) +
        coord_fixed(80)
}
    

In [None]:
# Figure 4E: draw the plot and print the log-linear model's summary
mosaic_fraction_vs_age(gen3, mosaic)

#### Determine whether the mosaic fraction is dependent on the number of siblings in a family

In [None]:
# Join the third-generation and post-PGCS tables on sample ID. Columns that
# exist in both tables get a suffix: `.x` for the `gen3` copy and `.y` for
# the `mosaic` copy.
combined <- merge(gen3, mosaic, by = "sample_id")

# fraction of each sample's SNV DNMs that comes from the post-PGCS table
combined$mosaic_fraction <- with(combined, snv_dnms.y / (snv_dnms.y + snv_dnms.x))

# linear model of the mosaic fraction on the number of siblings
m <- lm(mosaic_fraction ~ n_sibs.x, data = combined)
summary(m)

In [None]:
# test for correlation between mosaic fraction and number of siblings
# (`cor.test` uses Pearson's product-moment correlation by default)
cor.test(combined$mosaic_fraction, combined$n_sibs.x)

#### Determine whether the mosaic number is dependent on the number of siblings in a family

In [None]:
# linear model of the per-sample post-PGCS SNV count (`snv_dnms.y`) on the
# number of siblings
m <- lm(snv_dnms.y ~ n_sibs.x, data = combined)
summary(m)

# correlation test between the same two variables
cor.test(combined$snv_dnms.y, combined$n_sibs.x)

### Figure 5: Identification of gonosomal mutations in the second generation

> D: Numbers of phased paternal and maternal gonosomal variants as a function of parental age at birth

In [None]:
# Figure 5D: phased paternal and maternal gonosomal DNM counts vs. parental
# age. `FALSE` is spelled out because `F` is an ordinary variable that can
# be reassigned.
plot_dnms(gonosomal,            # data frame of gonosomal DNMs
          adjust_ar=FALSE,      # keep the default fixed aspect ratio
          alpha=0.25)           # transparency value for the `geom_ribbon`

In [None]:
# print regression summaries and confidence intervals for the paternal and
# maternal age effects in the gonosomal table
get_model_params(gonosomal)

### Supplementary Figure 3: Contribution of maternal and paternal age to de novo mutation rates

> A) Contributions of maternal and paternal age to mutation rates in second-generation children

In [None]:
# Scatter maternal age against paternal age, colouring each sample by a
# per-sample rate derived from its autosomal SNV DNM count.
#
# Args:
#   df: data frame with columns `dad_age`, `mom_age`, `snv_autosomal_dnms`
#     and `autosomal_callable_fraction`.
#
# Returns a ggplot object; the diagonal line marks equal parental ages.
age_by_age <- function(df) {

    library(ggplot2)
    library(cowplot)

    # rate = autosomal SNV DNM count / callable fraction / 2
    df$rate <- df$snv_autosomal_dnms / df$autosomal_callable_fraction / 2.

    ggplot(df, aes(x = dad_age, y = mom_age)) +
        geom_point(aes(colour = rate), size = 2) +
        # y = x reference line
        geom_abline(slope = 1, intercept = 0) +
        scale_colour_gradient(low = "blue", high = "red") +
        xlab("Paternal age at birth") +
        ylab("Maternal age at birth") +
        theme(text = element_text(size = 16),
              axis.text.x = element_text(size = 16),
              axis.text.y = element_text(size = 16))
}

In [None]:
# Supplementary Figure 3A: second-generation samples
age_by_age(gen2)

> B) Contributions of maternal and paternal age to mutation rates in third-generation children

In [None]:
# Supplementary Figure 3B: third-generation samples
age_by_age(gen3)