In [2]:
require(tidyverse)
require(data.table)


Loading required package: tidyverse
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.0       ✔ purrr   0.3.1  
✔ tibble  2.0.1       ✔ dplyr   0.8.0.1
✔ tidyr   0.8.3       ✔ stringr 1.4.0  
✔ readr   1.3.1       ✔ forcats 0.4.0  
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: data.table

Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, first, last

The following object is masked from ‘package:purrr’:

    transpose



In [3]:
geno_f <- 'ukb24983_cal_cALL_v2_hg19_ANGPTL7_protein-altering_vars.raw'
geno_df <- fread(
    file=geno_f, sep='\t', data.table=F
)

In [4]:
# (a) INI2734 Number_of_live_births
# (b) INI2405 Number_of_children_fathered

pheno_f <- list()
pheno_f[['INI2734']] <- '/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/9796/24611/phe/INI2734.phe'
pheno_f[['INI2405']] <- '/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/10136/21731/phe/INI2405.phe'


In [5]:
pheno_df <- pheno_f %>% lapply(fread, sep='\t', head=F)

for(k in names(pheno_df)){
    colnames(pheno_df[[k]]) <- c('FID', 'IID', k)
}

In [6]:
covar_f <- '/oak/stanford/groups/mrivas/ukbb24983/sqc/ukb24983_GWAS_covar.phe'
covar_df <- fread(
    cmd=paste0('cat ', covar_f, '| cut -f1-9'), sep='\t', data.table=F
)


In [7]:
joined_df <- pheno_df %>% 
reduce(inner_join, by=c('FID', 'IID')) %>%
inner_join(covar_df, by=c('FID', 'IID')) %>%
inner_join(geno_df, by=c('FID', 'IID')) %>%
select(-PAT, -MAT, -PHENOTYPE) %>%
left_join(
    data.frame(
        rs200058074_A = c(0, 1, 2),
        rs200058074 = c('G/G', 'A/G', 'A/A')   
    ), by='rs200058074_A'
) %>%
left_join(
    data.frame(
        rs28991002_G = c(0, 1, 2),
        rs28991002 = c('A/A', 'G/A', 'G/G')   
    ), by='rs28991002_G'
) %>%
left_join(
    data.frame(
        rs28991009_G = c(0, 1, 2),
        rs28991009 = c('T/T', 'G/T', 'G/G')   
    ), by='rs28991009_G'
) %>%
left_join(
    data.frame(
        rs143435072_C = c(0, 1, 2),
        rs143435072 = c('T/T', 'C/T', 'C/C')   
    ), by='rs143435072_C'
)


In [8]:
joined_df %>% head()

FID,IID,INI2734,INI2405,age,sex,Array,PC1,PC2,PC3,PC4,SEX,rs200058074_A,rs28991002_G,rs28991009_G,rs143435072_C,rs200058074,rs28991002,rs28991009,rs143435072
5441319,5441319,-9,1,70,1,0,-15.0388,3.95816,-4.01181,4.72712,1,2,2,2,2,A/A,G/G,G/G,C/C
5743544,5743544,2,-9,59,0,1,-11.6026,1.85332,-2.30169,-0.0928505,2,2,2,2,2,A/A,G/G,G/G,C/C
3266897,3266897,-9,2,68,1,0,-9.95342,4.77525,1.42974,3.8178,1,2,2,2,2,A/A,G/G,G/G,C/C
3990694,3990694,0,-9,67,0,1,-12.1141,6.77668,-1.24896,-1.20295,2,2,2,2,2,A/A,G/G,G/G,C/C
1080519,1080519,1,-9,75,0,1,-11.7876,3.47476,1.25679,0.071901,2,2,2,2,2,A/A,G/G,G/G,C/C
2611069,2611069,2,-9,73,0,1,-15.2218,5.28979,-1.55469,2.77965,2,2,2,2,2,A/A,G/G,G/G,C/C


In [9]:
show_95CI <- function(estimate, standard_error){
    c(estimate - 1.96 * standard_error, estimate + 1.96 * standard_error) %>% print()
}

## fit lm (dosage models)

In [10]:
lm_INI2734_rs200058074 <- lm (
    INI2734 ~ age + as.factor(Array) + PC1 + PC2 + PC3 + PC4 + as.factor(rs200058074),
    joined_df %>% filter(INI2734 != -9)
)
lm_INI2734_rs28991002 <- lm (
    INI2734 ~ age + as.factor(Array) + PC1 + PC2 + PC3 + PC4 + as.factor(rs28991002),
    joined_df %>% filter(INI2734 != -9)
)
lm_INI2734_rs28991009 <- lm (
    INI2734 ~ age + as.factor(Array) + PC1 + PC2 + PC3 + PC4 + as.factor(rs28991009),
    joined_df %>% filter(INI2734 != -9)
)
lm_INI2734_rs143435072 <- lm (
    INI2734 ~ age + as.factor(Array) + PC1 + PC2 + PC3 + PC4 + as.factor(rs143435072),
    joined_df %>% filter(INI2734 != -9)
)

lm_INI2405_rs200058074 <- lm (
    INI2405 ~ age + as.factor(Array) + PC1 + PC2 + PC3 + PC4 + as.factor(rs200058074),
    joined_df %>% filter(INI2405 != -9)
)
lm_INI2405_rs28991002 <- lm (
    INI2405 ~ age + as.factor(Array) + PC1 + PC2 + PC3 + PC4 + as.factor(rs28991002),
    joined_df %>% filter(INI2405 != -9)
)
lm_INI2405_rs28991009 <- lm (
    INI2405 ~ age + as.factor(Array) + PC1 + PC2 + PC3 + PC4 + as.factor(rs28991009),
    joined_df %>% filter(INI2405 != -9)
)
lm_INI2405_rs143435072 <- lm (
    INI2405 ~ age + as.factor(Array) + PC1 + PC2 + PC3 + PC4 + as.factor(rs143435072),
    joined_df %>% filter(INI2405 != -9)
)


In [11]:
lm_INI2734_rs200058074 %>% summary() %>% print() 
lm_INI2734_rs28991002  %>% summary() %>% print()
lm_INI2734_rs28991009  %>% summary() %>% print()
lm_INI2734_rs143435072 %>% summary() %>% print()



Call:
lm(formula = INI2734 ~ age + as.factor(Array) + PC1 + PC2 + PC3 + 
    PC4 + as.factor(rs200058074), data = joined_df %>% filter(INI2734 != 
    -9))

Residuals:
    Min      1Q  Median      3Q     Max 
-2.2149 -0.7961  0.0976  0.5418 20.0449 

Coefficients:
                           Estimate Std. Error t value Pr(>|t|)    
(Intercept)                0.183575   0.031926   5.750 8.94e-09 ***
age                        0.024790   0.000339  73.123  < 2e-16 ***
as.factor(Array)1          0.005015   0.008911   0.563   0.5736    
PC1                        0.002568   0.001751   1.467   0.1424    
PC2                        0.003521   0.001809   1.946   0.0517 .  
PC3                       -0.004300   0.001738  -2.475   0.0133 *  
PC4                        0.003995   0.000960   4.161 3.17e-05 ***
as.factor(rs200058074)A/G  0.089140   0.078386   1.137   0.2555    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.141 on 180823 degrees of fr

In [27]:
rbind(
    lm_INI2734_rs200058074 %>% summary() %>% coef() %>% tail(1) %>% data.frame() %>% rownames_to_column('variant'),
    lm_INI2734_rs28991002  %>% summary() %>% coef() %>% tail(1) %>% data.frame() %>% rownames_to_column('variant'),
    lm_INI2734_rs28991009  %>% summary() %>% coef() %>% tail(2) %>% data.frame() %>% rownames_to_column('variant'),
    lm_INI2734_rs143435072 %>% summary() %>% coef() %>% tail(1) %>% data.frame() %>% rownames_to_column('variant')
) %>%
data.frame() %>%
rename(
    'Std' = 'Std..Error'
) %>%
mutate(
    l95 = Estimate - 1.96 * Std,
    u95 = Estimate + 1.96 * Std,
    str = paste0(signif(Estimate, digits=2), ' [', signif(l95, digits=2), ', ', signif(u95, digits=2), ']')    
)

variant,Estimate,Std,t.value,Pr...t..,l95,u95,str
as.factor(rs200058074)A/G,0.089140425,0.07838611,1.1371968,0.25545755,-0.06449634,0.24277719,"0.089 [-0.064, 0.24]"
as.factor(rs28991002)G/G,-0.293434537,1.14056632,-0.2572709,0.79696994,-2.52894452,1.94207545,"-0.29 [-2.5, 1.9]"
as.factor(rs28991009)G/T,-0.007591093,0.02154496,-0.3523373,0.72458572,-0.04981921,0.03463703,"-0.0076 [-0.05, 0.035]"
as.factor(rs28991009)T/T,0.22770552,0.36076659,0.6311713,0.52792932,-0.479397,0.93480804,"0.23 [-0.48, 0.93]"
as.factor(rs143435072)C/T,0.202658155,0.0928595,2.182417,0.02908006,0.02065353,0.38466278,"0.2 [0.021, 0.38]"


In [16]:
lm_INI2405_rs200058074 %>% summary() %>% print()
lm_INI2405_rs28991002  %>% summary() %>% print()
lm_INI2405_rs28991009  %>% summary() %>% print()
lm_INI2405_rs143435072 %>% summary() %>% print()



Call:
lm(formula = INI2405 ~ age + as.factor(Array) + PC1 + PC2 + PC3 + 
    PC4 + as.factor(rs200058074), data = joined_df %>% filter(INI2405 != 
    -9))

Residuals:
   Min     1Q Median     3Q    Max 
-2.166 -0.877  0.115  0.567 98.583 

Coefficients:
                            Estimate Std. Error t value Pr(>|t|)    
(Intercept)                0.2775398  0.0373846   7.424 1.14e-13 ***
age                        0.0238867  0.0003903  61.204  < 2e-16 ***
as.factor(Array)1         -0.0217311  0.0097107  -2.238  0.02523 *  
PC1                        0.0054746  0.0020600   2.658  0.00787 ** 
PC2                       -0.0025967  0.0021344  -1.217  0.22376    
PC3                       -0.0024271  0.0020531  -1.182  0.23713    
PC4                        0.0077032  0.0011301   6.816 9.38e-12 ***
as.factor(rs200058074)A/G  0.0063612  0.1025595   0.062  0.95054    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.243 on 154973 degrees of fre

In [26]:
rbind(
    lm_INI2405_rs200058074 %>% summary() %>% coef() %>% tail(1) %>% data.frame() %>% rownames_to_column('variant'),
    lm_INI2405_rs28991002  %>% summary() %>% coef() %>% tail(1) %>% data.frame() %>% rownames_to_column('variant'),
    lm_INI2405_rs28991009  %>% summary() %>% coef() %>% tail(2) %>% data.frame() %>% rownames_to_column('variant'),
    lm_INI2405_rs143435072 %>% summary() %>% coef() %>% tail(1) %>% data.frame() %>% rownames_to_column('variant')
) %>%
data.frame() %>%
rename(
    'Std' = 'Std..Error'
) %>%
mutate(
    l95 = Estimate - 1.96 * Std,
    u95 = Estimate + 1.96 * Std,
    str = paste0(signif(Estimate, digits=2), ' [', signif(l95, digits=2), ', ', signif(u95, digits=2), ']')
)

variant,Estimate,Std,t.value,Pr...t..,l95,u95,str
as.factor(rs200058074)A/G,0.006361193,0.10255951,0.06202441,0.9505435,-0.19465545,0.20737783,"0.0064 [-0.19, 0.21]"
as.factor(rs28991002)G/G,-0.002967125,0.04297386,-0.06904488,0.944954,-0.0871959,0.08126165,"-0.003 [-0.087, 0.081]"
as.factor(rs28991009)G/T,-0.00515247,0.02487,-0.20717608,0.8358726,-0.05389768,0.04359274,"-0.0052 [-0.054, 0.044]"
as.factor(rs28991009)T/T,0.408971665,0.29293059,1.39613849,0.1626748,-0.16517228,0.98311561,"0.41 [-0.17, 0.98]"
as.factor(rs143435072)C/T,-0.051924783,0.11349786,-0.45749571,0.6473154,-0.27438058,0.17053102,"-0.052 [-0.27, 0.17]"


In [22]:
signif(0.00102, digits = 2)