# Testing annotation with snpGeneSets R package

This is a utility script for testing the usage of snpGeneSets before creating a dedicated workflow for it.

In [None]:
library('snpGeneSets')
library('dplyr')

In [35]:
# Clean workspace
# NOTE(review): rm(list = ls()) only removes objects from the global
# environment; attached packages and options are left untouched.
rm(list = ls())
# Relative paths used below (the input file here and the output table written
# at the end of the notebook) resolve against this directory.
setwd('/gpfs/gibbs/pi/dewan/data/UKBiobank/results/FastGWA_results/results_imputed_data/srt_int_worst')
# Import the gzipped sumstats file as a data frame; the first row holds the
# column names. `TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned.
data <- read.table(
  gzfile('200904_UKBB_SRT_int_worst_cc_srt_int_worst.fastGWA.snp_stats_original_columns.gz'),
  header = TRUE
)
head(data)

Unnamed: 0_level_0,CHR,SNP,POS,A1,A2,N,AF1,BETA,SE,P,INFO
Unnamed: 0_level_1,<int>,<fct>,<int>,<fct>,<fct>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,1,1:692794_CA_C,692794,CA,C,135829,0.888284,-0.0092449,0.0060826,0.128538,0.82737
2,1,rs12238997,693731,A,G,135829,0.883261,-0.00425031,0.00575698,0.460339,0.882547
3,1,rs371890604,707522,G,C,135829,0.901952,-0.00915078,0.00647343,0.157482,0.813483
4,1,rs149887893,714596,T,C,135829,0.968106,0.00414519,0.0109184,0.704205,0.84973
5,1,rs12184267,715265,C,T,135829,0.964543,0.003227,0.00991155,0.744742,0.931906
6,1,rs12184277,715367,A,G,135829,0.964459,0.00326852,0.00987841,0.740739,0.936044


In [36]:
# Filter SNPs with p-val < 1e-05 (the threshold actually applied below)
# Subset data to obtain only chr, pos and snp for gene mapping; the SNP ids
# are converted to character so they can be passed on as plain strings
sig.p <- data %>%
  filter(P < 1e-5) %>%
  mutate(
    chr = CHR,
    pos = POS,
    snp = as.character(SNP)
  ) %>%
  select(chr, pos, snp)
head(sig.p)

Unnamed: 0_level_0,chr,pos,snp
Unnamed: 0_level_1,<int>,<int>,<chr>
1,1,26489360,rs183594486
2,1,26560636,rs565353716
3,1,26597672,rs189266207
4,1,26659329,rs571629135
5,1,26673095,rs181557677
6,1,26722812,rs558456184


In [37]:
# Query snpGeneSets for the map annotation of the significant SNP ids,
# requesting genome assembly GRCh = 37
snpMapAnn37 <- getSNPMap(sig.p[["snp"]], GRCh = 37)

In [38]:
# Preview the first rows of the rsid map and count the entries stored in
# the `other` element of the result
head(snpMapAnn37$rsid_map, n = 6L)
length(snpMapAnn37$other)

Unnamed: 0_level_0,chr,pos,snp
Unnamed: 0_level_1,<chr>,<int>,<chr>
1,4,88770819,rs10017912
2,4,88770890,rs10018094
3,4,88771879,rs10021236
4,4,88772779,rs10024086
5,4,88804039,rs10222708
6,4,88804315,rs10222816


In [39]:
# Map the annotated SNPs to genes. Per the original note, the gene boundary is
# controlled by `up` (upstream region) and `down` (downstream region), both
# defaulting to 2,000 bp; the defaults are used here.
snpGeneMapAnn <- snp2Gene(snpMapAnn37$rsid_map$snp)

# Summarise the mapping: distinct gene ids found, and size of `other`
n_genes <- length(unique(snpGeneMapAnn$map$gene_id))
n_unmapped <- length(snpGeneMapAnn$other)
cat("The unique number of genes is", n_genes, "\n")
cat("The number of variants that could not be mapped to a gene is:", n_unmapped, "\n")

The unique number of genes is 41 
The number of variants that could not be mapped to a gene is: 142 


In [40]:
# Retrieve the gene map (gene name and gene id) for the gene ids that the
# variants were mapped to
gene_lookup <- getGeneMap(snpGeneMapAnn$map$gene_id)
gene_mapped <- gene_lookup$gene_map
head(gene_mapped)

Unnamed: 0_level_0,chr,start,end,strand,gene_name,gene_id
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>,<chr>,<int>
1,22,23522402,23660224,+,BCR,613
2,10,126676418,126849624,-,CTBP2,1488
3,11,57520756,57586652,+,CTNND1,1500
4,6,32605169,32612152,+,HLA-DQA1,3117
5,2,183004762,183387572,-,PDE1A,5136
6,22,19744226,19771116,+,TBX1,6899


In [41]:
# Attach the gene id to every annotated SNP; all.x = TRUE keeps SNPs that
# have no gene match (their gene_id is NA)
snp_gene <- merge(
  x = snpMapAnn37$rsid_map,
  y = snpGeneMapAnn$map[, c("snp", "gene_id")],
  by = "snp",
  all.x = TRUE
)
head(snp_gene)

Unnamed: 0_level_0,snp,chr,pos,gene_id
Unnamed: 0_level_1,<chr>,<chr>,<int>,<int>
1,rs10017912,4,88770819,
2,rs10018094,4,88770890,
3,rs10021236,4,88771879,
4,rs10024086,4,88772779,
5,rs10222708,4,88804039,
6,rs10222816,4,88804315,


In [42]:
# Add the gene name for each gene id; rows without a gene id are kept
snp_gene_2 <- merge(
  x = snp_gene,
  y = gene_mapped[, c("gene_id", "gene_name")],
  by = "gene_id",
  all.x = TRUE
)
head(snp_gene_2)

Unnamed: 0_level_0,gene_id,snp,chr,pos,gene_name
Unnamed: 0_level_1,<int>,<chr>,<chr>,<int>,<chr>
1,613,rs138263187,22,23585867,BCR
2,1488,rs17711828,10,126814086,CTBP2
3,1488,rs17644414,10,126822676,CTBP2
4,1488,rs75979600,10,126819764,CTBP2
5,1488,rs10510140,10,126816124,CTBP2
6,1488,rs17644332,10,126820093,CTBP2


In [43]:
# Rename the key column to `SNP` so it matches the sumstats column name
names(snp_gene_2) <- replace(
  names(snp_gene_2),
  names(snp_gene_2) == 'snp',
  'SNP'
)

# Bring the association statistics from the sumstats back in, keeping every
# annotated SNP
sumstat_cols <- c("A1", "A2", "N", "AF1", "P", "BETA", "SE", "INFO", "SNP")
snp_gene_3 <- merge(
  x = snp_gene_2,
  y = data[, sumstat_cols],
  by = "SNP",
  all.x = TRUE
)
head(snp_gene_3)

Unnamed: 0_level_0,SNP,gene_id,chr,pos,gene_name,A1,A2,N,AF1,P,BETA,SE,INFO
Unnamed: 0_level_1,<chr>,<int>,<chr>,<int>,<chr>,<fct>,<fct>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,rs10017912,,4,88770819,,C,T,135829,0.687338,6.72998e-06,0.0169046,0.00375487,0.99445
2,rs10018094,,4,88770890,,G,A,135829,0.686101,6.10159e-06,0.016959,0.00374963,0.995276
3,rs10021236,,4,88771879,,G,C,135829,0.691783,3.26023e-06,0.0175132,0.00376329,0.997103
4,rs10024086,,4,88772779,,C,T,135829,0.694296,2.40336e-06,0.0178419,0.00378313,0.991256
5,rs10222708,,4,88804039,,T,C,135829,0.690659,1.16324e-06,0.0182764,0.00375919,0.997218
6,rs10222816,,4,88804315,,A,G,135829,0.691428,1.38434e-06,0.0181838,0.0037669,0.994396


In [44]:
# Build the final table: put the columns in sumstats order, rename chr/pos to
# CHR/POS while selecting, and sort by ascending p-value
final_gene_set <- snp_gene_3 %>%
  select(
    CHR = chr, SNP, POS = pos,
    A1, A2, N, AF1, BETA, SE, P, INFO,
    gene_id, gene_name
  ) %>%
  arrange(P)
head(final_gene_set)
dim(final_gene_set)

Unnamed: 0_level_0,CHR,SNP,POS,A1,A2,N,AF1,BETA,SE,P,INFO,gene_id,gene_name
Unnamed: 0_level_1,<chr>,<chr>,<int>,<fct>,<fct>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>
1,11,rs150558458,95313570,G,T,135829,0.9554,-0.0445585,0.00859721,2.18453e-07,0.959707,,
2,6,rs79251081,13999543,G,T,135829,0.983423,-0.0708902,0.0138285,2.95346e-07,0.965011,,
3,22,rs1978060,19749525,A,G,135829,0.383189,0.0185151,0.00362548,3.27433e-07,0.971673,6899.0,TBX1
4,22,rs200354114,19764608,T,TGTAA,135829,0.621157,-0.0186359,0.00365535,3.42804e-07,0.961533,6899.0,TBX1
5,1,rs181557677,26673095,G,A,135829,0.986702,-0.07912,0.0155721,3.75704e-07,0.945189,55057.0,AIM1L
6,3,rs148779301,153007429,G,A,135829,0.98862,-0.084044,0.0166775,4.67093e-07,0.966573,,


In [45]:
# Save the annotated table as tab-separated text, without quoting and without
# row names; the path is relative to the current working directory
write.table(
  x = final_gene_set,
  file = '200904_UKBB_SRT_int_worst_cc_srt_int_worst.fastGWA.geneann',
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)