-
Notifications
You must be signed in to change notification settings - Fork 2
/
vcf2genlight.R
executable file
·142 lines (117 loc) · 4.81 KB
/
vcf2genlight.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#! /usr/bin/Rscript
options(warn=-1) #suppress warnings during execution
args <- commandArgs(TRUE)
#Exception if no argument is provided
if(length(args) < 1) {
args <- "--help"
}
###########################################################################################
# #HELP SECTION
###########################################################################################
if("--help" %in% args) {
cat("
angsd2vcf
_________________________________________________________________________________________________
Converts a VCF file to the SNP input format for adegenet. You can provide a population map (two
columns, one of sample ID and one of population ID, tab-separated).
Arguments:
--vcf= path to the VCF file to convert
--out= output file name and path
--popmap= population map (optional)
--help print this text
Example:
./vcf2genlight.R --vcf=~/path/to/data.vcf --out=~/path/to/outfile.snp
_________________________________________________________________________________________________
")
q(save="no")}
###########################################################################################
# #STANDARD ARGUMENT PARSER
###########################################################################################
parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
argsDF <- as.data.frame(do.call("rbind", parseArgs(args)))
argsL <- as.list(as.character(argsDF$V2))
names(argsL) <- argsDF$V1
##--vcf error
if(is.null(argsL$vcf)) {
stop("No VCF file has been provided")}
## --out error
if(is.null(argsL$out)) {
stop("No output file has been provided")}
##--popmap option
if(is.null(argsL$popmap)==F){
argsL$popmap -> path_popmap
pop_flag<-1
} else{ pop_flag <- 0 }
###########################################################################################
# #SETTING UP THE PATHS
###########################################################################################
argsL$vcf -> path_vcf
argsL$out -> path_out
###########################################################################################
# #CHECKING INPUT
###########################################################################################
#Check the input
if(file.exists(eval(path_vcf))==FALSE){stop("VCF file list not found.")}
#Check the output directory
if(file.exists(dirname(eval(path_out)))==FALSE){stop("Output directory not found")}
###########################################################################################
# #CONVERTING FORMAT
###########################################################################################
cat("\n\tReading VCF file...")
paste("cat ", path_vcf, " | grep -v '##' | cut -f 10- > ", path_vcf, ".tmp", sep='', collapse='')->cmd
system(cmd)
paste(path_vcf, '.tmp', sep='', collapse='') -> path_tmp
read.table(eval(path_tmp), sep='\t', header=T, comment.char='')->vcf
paste("rm ", path_vcf, ".tmp", sep='', collapse='')->cmd
system(cmd)
cat("done.\n")
if(pop_flag==1){
cat("\tReading the population map...")
read.table(path_popmap, sep='\t', header=F)->popmap
cat("done.\n")
}
cat("\tParsing data...")
#A complicated way to keep only the genotypes for each individual
as.list(vcf)->vcf.list
vcf.split <- function(x){strsplit(as.character(x), ':')[[1]][1]}
vcf.apply <- function(x){unlist(lapply(x, vcf.split))}
lapply(vcf.list, vcf.apply)->gen.list
cat("done.\n")
#Then convert them to 012 format
sub.<-function(x){gsub('./.', '-', x)}
sub0<-function(x){gsub('0/0','0',x)}
sub1<-function(x){gsub('0/1','1',x)}
sub2<-function(x){gsub('1/1','2',x)}
cat("\tFormatting SNP file...\n")
cat('\t|## |\t20%')
lapply(gen.list, sub0)->gen.list
cat('\r\t|#### |\t40%')
lapply(gen.list, sub1)->gen.list
cat('\r\t|###### |\t60%')
lapply(gen.list, sub2)->gen.list
cat('\r\t|######## |\t80%')
lapply(gen.list, sub.)->gen.list
cat('\r\t|##########|\t100%\t\n\tLoading data...')
lapply(gen.list, as.numeric)->gen.list
#And transform that into a genotype matrix
do.call(cbind.data.frame, gen.list)->gen.df
t(as.matrix(gen.df))->geno
rep(">", length(rownames(geno)))->prefix
rep("\n", length(rownames(geno)))->return
paste(prefix, rownames(geno), return, sep='')->rownames(geno)
cat('done.\n\tWriting genlight file...')
line1 <- ">>>> begin comments - do not remove this line <<<<"
line2<-paste("Generated with vcf2genlight.R from ", path_vcf, ".", sep='', collapse='')
line3<-">>>> end comments - do not remove this line <<<<"
line6<-">> ploidy\n2"
if(pop_flag==0){
rbind(line1,line2,line3, line6)->header
} else if (pop_flag==1){
line4<-">> population"
line5<-paste(as.character(popmap[,2]), sep=' ', collapse=' ')
rbind(line1,line2,line3,line4,line5, line6)->header
}
write.table(header, path_out, quote=F, sep='', row.names=F, col.names=F, na='-', append=F)
write.table(geno, path_out, quote=F, sep='', col.names=F, na='-', append=T)
cat("done.\n")
cat('\n\tΤΑ ΔΗ ΝΥΝ ΠΑΝΤΑ ΤΕΛΕΙΤΑΙ\n\n')