### M2 tables

In [4]:
require('tidyverse')
require('data.table')
options(stringsAsFactors=FALSE)

In [5]:
# Load the variant filter table as a plain data.frame (not a data.table),
# then preview its first rows.
variant_tab <-
  '/oak/stanford/groups/mrivas/private_data/ukbb/variant_filtering/variant_filter_table.new.tsv.gz' %>%
  fread(data.table = FALSE)
variant_tab %>% head()

CHROM,POS,REF,ALT,ID,Gene,Consequence,HGVSp,LoF,LoF_filter,⋯,bileve_only,filter,missingness,hwe,mcpi,gnomad_af,mgi,mgi_notes,all_filters,Gene_symbol
1,723307,C,G,rs28659788,ENSG00000237491,intron_variant,,,,⋯,True,,1,1,0,,,,2,AL669831.5
1,727841,G,A,rs116587930,ENSG00000237491,intron_variant,,,,⋯,False,,1,1,0,,,,2,AL669831.5
1,729632,C,T,rs116720794,ENSG00000237491,intron_variant,,,,⋯,False,,1,1,0,,,,2,AL669831.5
1,752721,A,G,rs3131972,ENSG00000240453,intron_variant,,,,⋯,False,,0,1,0,,,,1,RP11-206L10.10
1,754105,C,T,rs12184325,ENSG00000177757,splice_region_variant,,,,⋯,False,,0,1,0,,,,1,FAM87B
1,756604,A,G,rs3131962,ENSG00000240453,upstream_gene_variant,,,,⋯,False,,0,0,0,,,,0,RP11-206L10.10


In [17]:
# List every available column, then keep only the annotation columns that
# are carried into the final tables.
names(variant_tab)
variant_sm <- select(
  variant_tab,
  ID, POS, REF, ALT, Gene_symbol, maf, HGVSp
)


In [34]:
M2.DIR <- "../data/1020/m2/"

# Find the per-trait result files. `pattern` is a regular expression, not a
# glob, so anchor it: only names of the form "snps4_<trait>.txt" are accepted.
my_files <- list.files(M2.DIR, pattern = "^snps4_.*\\.txt$")
if (length(my_files) == 0) {
  # Fail loudly instead of letting rbind() return NULL and the filter below
  # silently produce an empty table.
  stop("no snps4_<trait>.txt files found in ", M2.DIR, call. = FALSE)
}

# Trait name = file name minus the literal "snps4_" prefix and ".txt" suffix.
# Anchored patterns with an escaped dot avoid truncating traits whose name
# happens to contain something like "_txt".
# Names are set to the file names, as the previous sapply() call did.
m2_biomarkers <- setNames(
  sub("\\.txt$", "", sub("^snps4_", "", my_files)),
  my_files
)

# Read each trait's table, tag its rows with the trait name, and stack them.
snps <- do.call(rbind, lapply(m2_biomarkers, function(biomarker) {
  df <- read.table(
    sprintf("%s/snps4_%s.txt", M2.DIR, biomarker),
    sep = " ", header = TRUE
  )
  df$trait <- rep(biomarker, nrow(df))
  df
}))

# filter variant data to speed up: keep only variants present in the M2 results
variant_short <- variant_sm %>% filter(ID %in% snps$SNP)



ID,POS,REF,ALT,Gene_symbol,maf,HGVSp,CHR,B.f,B.m,⋯,SE.m,P.f,P.m,p1,p2,p3,p4,category,gene,trait
Affx-15837191,3752874,A,G,APBA3,0.45243249,ENSP00000315136.2:p.Cys376Arg,19,-0.0111695,-0.0224609,⋯,1.38868e-05,0.0012617,1.67073e-09,7.516764e-06,1.463738e-11,6.029481e-06,0.9999865,4,APBA3,Alanine_aminotransferase
Affx-19716376,44332888,T,TC,PNPLA3,0.1629523,,22,0.107704,0.109732,⋯,2.480588e-05,6.15256e-119,2.11911e-107,4.133258e-217,5.073164e-108,1.456869e-119,1.0,4,PNPLA3,Alanine_aminotransferase
Affx-20090007,44066247,G,C,ABCG8,0.06540481,ENSP00000272286.2:p.Asp19His,2,0.0588241,0.0324785,⋯,5.571264e-05,2.44843e-17,1.35421e-05,8.457752e-16,6.971725e-08,2.2691209999999997e-19,0.9999999,4,ABCG8,Alanine_aminotransferase
Affx-22413417,50176739,C,A,SEMA3F-AS1,0.25722407,,3,-0.0184957,-0.00766103,⋯,1.781574e-05,2.43861e-06,0.0695206,0.1876171,0.0001524191,1.143639e-08,0.8122305,4,LOC100129060,Alanine_aminotransferase
Affx-26682790,52193237,A,G,ITGA1,0.08134591,,5,0.0178112,0.0366407,⋯,4.581774e-05,0.00452681,6.20354e-08,0.0002804715,3.072548e-10,1.217277e-05,0.9997074,4,ITGA1,Alanine_aminotransferase
Affx-28463585,31830593,A,C,NEU1,0.02988194,,6,0.0401167,0.0346471,⋯,0.0001199748,6.6688e-05,0.001561,0.03699061,3.280478e-06,1.676937e-07,0.9630059,4,NEU1,Alanine_aminotransferase


In [46]:
# The variant table has no X chromosome annotations, so take ID / position /
# alleles for the X and XY rows from one of the GWAS summary-statistic files.
gwas_file <- fread(
  "../gwas1015_ss/ukb24983_v2_hg19.Alanine_aminotransferase_onesex.genotyped.glm.linear",
  data.table = FALSE
)
x_pos <- gwas_file %>%
  rename(CHR = `#CHROM`) %>%
  filter(CHR %in% c("X", "XY")) %>%
  select(ID, CHR, POS, REF, ALT)
head(x_pos)


ID,CHR,POS,REF,ALT
rs34557243,XY,60425,C,A
rs28494123,XY,60454,A,G
rs28590175,XY,61067,A,G
rs117654552,XY,62079,C,G
rs28491545,XY,62615,A,C
rs73174453,XY,167755,G,T


In [57]:
# MAF comes from the per-chromosome QC tables, BP from the GWAS table above;
# gene symbols are looked up separately.
chrX <- read.table("../data/chr_qc/chrX_qc_table.txt", header = TRUE)
chrXY <- read.table("../data/chr_qc/chrXY_qc_table.txt", header = TRUE)

# Stack the SNP/MAF columns of both tables, X rows first.
xxy_maf <- rbind(
  select(chrX, SNP, MAF),
  select(chrXY, SNP, MAF)
)
head(xxy_maf)


SNP,MAF
Affx-34464730,0.821453
Affx-34470541,0.998817
Affx-34480971,0.987194
Affx-34492707,0.988038
Affx-34492709,0.224857
Affx-34494619,0.767836


In [59]:
# Combine positions and MAF for X/XY variants, keeping IDs found in either
# table, and save the result for reuse.
x_tab <- x_pos %>%
  full_join(xxy_maf, by = c("ID" = "SNP"))
x_tab %>% write_tsv("../data/x_variant_info.txt")


In [64]:
# Read the SNP-to-gene lookup and keep only rows for variants in x_tab.
snp_gene <- read.table(
  "../data/snp_gene_table.txt",
  sep = " ", header = TRUE
)
xxy_snp <- filter(snp_gene, snp %in% x_tab$ID)
xxy_snp %>% head()

gene,snp
BTK,Affx-34464730
GPRASP1,Affx-34470541
IL1RAPL2,Affx-34480971
COL4A6,Affx-34492707
COL4A6,Affx-34492709
IRS4,Affx-34494619


In [70]:
# Attach gene names to the X/XY variants; variants without a match get NA.
x_w_g <- x_tab %>%
  left_join(xxy_snp, by = c("ID" = "snp"))

# Per-chromosome counts of variants without a gene ...
x_w_g %>%
  filter(is.na(gene)) %>%
  group_by(CHR) %>%
  count()
# ... and with a gene.
x_w_g %>%
  filter(!is.na(gene)) %>%
  group_by(CHR) %>%
  count()

CHR,n
X,6637
XY,171


CHR,n
X,12220
XY,1186


In [73]:
x_w_g %>% head()
variant_short %>% head()

# Reshape the X/XY table to the same columns (and column order) as
# variant_short: rename to match, add an all-NA HGVSp column, then drop
# everything else (including CHR).
reform_x <- x_w_g %>%
  rename(Gene_symbol = gene, maf = MAF) %>%
  mutate(HGVSp = NA) %>%
  select(names(variant_short))

# Autosomal annotations first, X/XY annotations appended below.
var_w_x <- rbind(variant_short, reform_x)

ID,CHR,POS,REF,ALT,MAF,gene
rs34557243,XY,60425,C,A,0.96139,
rs28494123,XY,60454,A,G,0.481242,
rs28590175,XY,61067,A,G,0.953109,
rs117654552,XY,62079,C,G,0.960478,
rs28491545,XY,62615,A,C,0.478833,
rs73174453,XY,167755,G,T,0.462018,


ID,POS,REF,ALT,Gene_symbol,maf,HGVSp
rs4970383,838555,C,A,AL645608.6,0.24553333,
rs13303065,891059,C,T,NOC2L,0.34523622,
rs3829740,909238,G,C,PLEKHN1,0.43165099,"ENSP00000462558.1:p.Arg52Pro,ENSP00000368720.3:p.Arg487Pro,ENSP00000368719.2:p.Arg539Pro,ENSP00000368717.2:p.Arg452Pro"
rs28869591,920640,C,T,PERM1,0.16179403,
rs2710887,986443,C,T,AGRN,0.08425568,
rs9651273,1031540,A,G,C1orf159,0.27136542,


In [77]:
# Join annotations onto the M2 results. right_join keeps every row of `snps`,
# whether or not an annotation was found for it.
comb_x <- var_w_x %>%
  right_join(snps, by = c("ID" = "SNP"))
comb_x %>% head()


# Build the output table: order rows by trait and position, drop the `gene`
# column from the M2 files, put the key columns first and HGVSp last.
out_tab <- comb_x %>%
  # factor levels make chromosomes sort 1..22, X, XY rather than alphabetically
  mutate(CHR = factor(CHR, levels = c(as.character(1:22), "X", "XY"))) %>%
  arrange(trait, CHR, POS) %>%
  select(-gene) %>%
  select(trait, ID, CHR, POS, everything()) %>%
  # excluding then re-adding HGVSp moves it to the last column
  select(-HGVSp, HGVSp) %>%
  rename(MAF = maf, GENE = Gene_symbol)
    

ID,POS,REF,ALT,Gene_symbol,maf,HGVSp,CHR,B.f,B.m,⋯,SE.m,P.f,P.m,p1,p2,p3,p4,category,gene,trait
Affx-15837191,3752874,A,G,APBA3,0.45243249,ENSP00000315136.2:p.Cys376Arg,19,-0.0111695,-0.0224609,⋯,1.38868e-05,0.0012617,1.67073e-09,7.516764e-06,1.463738e-11,6.029481e-06,0.9999865,4,APBA3,Alanine_aminotransferase
Affx-19716376,44332888,T,TC,PNPLA3,0.1629523,,22,0.107704,0.109732,⋯,2.480588e-05,6.15256e-119,2.11911e-107,4.133258e-217,5.073164e-108,1.456869e-119,1.0,4,PNPLA3,Alanine_aminotransferase
Affx-20090007,44066247,G,C,ABCG8,0.06540481,ENSP00000272286.2:p.Asp19His,2,0.0588241,0.0324785,⋯,5.571264e-05,2.44843e-17,1.35421e-05,8.457752e-16,6.971725e-08,2.2691209999999997e-19,0.9999999,4,ABCG8,Alanine_aminotransferase
Affx-22413417,50176739,C,A,SEMA3F-AS1,0.25722407,,3,-0.0184957,-0.00766103,⋯,1.781574e-05,2.43861e-06,0.0695206,0.1876171,0.0001524191,1.143639e-08,0.8122305,4,LOC100129060,Alanine_aminotransferase
Affx-26682790,52193237,A,G,ITGA1,0.08134591,,5,0.0178112,0.0366407,⋯,4.581774e-05,0.00452681,6.20354e-08,0.0002804715,3.072548e-10,1.217277e-05,0.9997074,4,ITGA1,Alanine_aminotransferase
Affx-28463585,31830593,A,C,NEU1,0.02988194,,6,0.0401167,0.0346471,⋯,0.0001199748,6.6688e-05,0.001561,0.03699061,3.280478e-06,1.676937e-07,0.9630059,4,NEU1,Alanine_aminotransferase


In [78]:
# Preview the first rows of the final table.
out_tab %>% head()

trait,ID,CHR,POS,REF,ALT,GENE,MAF,B.f,B.m,SE.f,SE.m,P.f,P.m,p1,p2,p3,p4,category,HGVSp
Alanine_aminotransferase,rs263526,1,2173504,T,C,SKI,0.38452375,-0.0107959,-0.0140366,1.246062e-05,1.444448e-05,0.00222582,0.000221468,0.49807778,5.86327e-07,4.823192e-06,0.5019168,4,
Alanine_aminotransferase,rs77797313,1,2186115,G,A,SKI,0.03826131,0.0258035,0.0395107,7.992843e-05,9.296026e-05,0.00389956,4.17058e-05,0.06336973,1.13254e-07,7.529861e-06,0.9366226,4,
Alanine_aminotransferase,rs72646048,1,2723193,G,T,TTC34,0.04009857,0.0356744,0.0173326,7.528651e-05,8.845421e-05,3.93329e-05,0.0653435,0.4246497,5.358132e-05,6.069179e-08,0.5752967,4,
Alanine_aminotransferase,rs1884429,1,11112836,T,C,SRM,0.23407125,0.0172101,0.0151433,1.820703e-05,2.153693e-05,5.50205e-05,0.00110234,0.08856655,3.998032e-06,2.345487e-07,0.9114292,4,
Alanine_aminotransferase,rs2982372,1,11372913,T,C,,0.2476737,-0.0162248,-0.0128728,1.58937e-05,1.847822e-05,4.70851e-05,0.00274821,0.1814832,8.871068e-06,1.921977e-07,0.8185077,4,
Alanine_aminotransferase,rs79757554,1,16067710,C,T,SLC25A34,0.05112729,0.0359501,0.0109642,6.007692e-05,7.01346e-05,3.51776e-06,0.190462,0.17364802,0.0001960343,9.529805e-09,0.8261559,4,


In [79]:
OUT.DIR <- "../data/aggreg_1020"

# Write one CSV per category code (2, 3, 4), each sorted by trait and
# chromosome. The names map category code -> output path template.
# NOTE(review): file names suggest 2 = female-specific, 3 = male-specific,
# 4 = shared -- confirm against the M2 model definition.
iwalk(
  c(
    "2" = "%s/m2_results_1020_f_spec.csv",
    "3" = "%s/m2_results_1020_m_spec.csv",
    "4" = "%s/m2_results_1020_shared.csv"
  ),
  function(path_template, category_code) {
    out_tab %>%
      filter(category == as.integer(category_code)) %>%
      arrange(trait, CHR) %>%
      write_csv(sprintf(path_template, OUT.DIR))
  }
)

In [80]:
# Cross-tabulate the number of variants per trait (rows) and category (columns).
table(out_tab[["trait"]], out_tab[["category"]])

                            
                                2    3    4
  Alanine_aminotransferase      1    1  691
  Albumin                       0    0  557
  Alkaline_phosphatase          4    5 1725
  C_reactive_protein            1    0  997
  Creatinine                    2    1 2062
  Creatinine_in_urine           0    0    1
  Cystatin_C                    1    1 1833
  Direct_bilirubin              0    0  314
  Glucose                       0    0  165
  Glycated_haemoglobin_HbA1c    3    3 1956
  HDL_cholesterol               7    2 1448
  Phosphate                     1    1  431
  Sodium_in_urine               0    0    4
  Testosterone                161  651   18

In [81]:
# Number of distinct traits in the final table.
out_tab$trait %>% unique() %>% length()