/
FFdownload.R
144 lines (135 loc) · 7.47 KB
/
FFdownload.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#' @title Downloads Datasets from Kenneth French's Website
#'
#' @description \code{FFdownload} returns an RData file with all (possibility to exclude the large daily) datasets from Kenneth French's Website.
#' Should help researchers to work with the datasets and update them regularly. Allows for reproducible research. Be aware that processing
#' (especially when including daily files) takes quite a long time!
#'
#' @param output_file name of the .RData file to be saved (include path if necessary)
#' @param tempd specify if you want to keep downloaded files somewhere safe. Seems to be necessary for
#' reproducible research as the files on the website do change from time to time
#' @param exclude_daily excludes the daily datasets (are not downloaded) ==> speeds the process up considerably
#' @param download set to TRUE if you actually want to download again. set to false and specify tempd to keep processing the already downloaded files
#' @param download_only set to FALSE if you want to process all your downloaded files at once
#' @param listsave if not NULL, the list of unzipped files is saved here (good for processing only a limited number of files through inputlist).
#' Is written before inputlist is processed.
#' @param inputlist if not NULL, FFdownload tries to match the names from the list with the list of zip-files
#' @param format (set to xts) specify "xts" or "tbl"/"tibble" for the output format of the nested lists
#'
#' @return RData file
#'
#' @examples
#' \dontrun{
#' tempf <- tempfile(fileext = ".RData"); outd <- paste0(tempdir(),"/",format(Sys.time(), "%F_%H-%M"))
#' temptxt <- tempfile(fileext = ".txt")
#'
#' # Example 1: Use FFdownload to get a list of all monthly zip-files. Save that list as temptxt.
#'
#' FFdownload(exclude_daily=TRUE,download=FALSE,download_only=TRUE,listsave=temptxt)
#' read.delim(temptxt,sep = ",")
#' # set vector with only files to download (we try a fuzzyjoin, so "Momentum" should be enough to get
#' # the Momentum Factor)
#' inputlist <- c("Research_Data_Factors","Momentum_Factor","ST_Reversal_Factor","LT_Reversal_Factor")
#' # Now download and process only these files if they can be matched
#' FFdownload(exclude_daily=FALSE,tempd=outd,download=TRUE,download_only=FALSE,
#' inputlist=inputlist,output_file = tempf)
#' list.files(outd)
#' # Then process all the downloaded files
#' FFdownload(output_file = tempf, exclude_daily=TRUE,tempd=outd,download=FALSE,
#' download_only=FALSE,inputlist=inputlist)
#' load(tempf); FFdata$`x_F-F_Momentum_Factor`$monthly$Temp2[1:10]
#'
#' # Example 2: Download all non-daily files and process them
#'
#' # Commented out so that it is not run during testing
#' # tempf2 <- tempfile(fileext = ".RData");
#' # outd2<- paste0(tempdir(),"/",format(Sys.time(), "%F_%H-%M"))
#' # FFdownload(output_file = tempf2,tempd = outd2, exclude_daily = TRUE, download = TRUE,
#' # download_only=FALSE, listsave=temptxt)
#' # load(tempf2)
#' # FFdata$x_25_Portfolios_5x5$monthly$average_value_weighted_returns
#' }
#'
#' @importFrom utils download.file unzip
#' @importFrom xml2 read_html
#' @importFrom rvest html_attr html_nodes
#' @importFrom utils adist write.csv
#' @importFrom plyr mlply
#'
#' @export
FFdownload <- function(output_file = "data.Rdata", tempd=NULL, exclude_daily=FALSE, download=TRUE, download_only=FALSE, listsave=NULL, inputlist=NULL, format="xts") {
  # Workflow: (1) scrape the list of CSV zip-files from the data library page,
  # (2) optionally download them (and copy them to tempd), (3) unless
  # download_only is TRUE, unzip and convert every csv file found in tempd and
  # save the resulting nested list as object `FFdata` in output_file.

  # Fail early on an unsupported output format; it is only needed when files
  # are actually processed, so download-only calls are not affected.
  if (!download_only && !isTRUE(format %in% c("xts", "tbl", "tibble"))) {
    stop("format must be one of \"xts\", \"tbl\" or \"tibble\"", call. = FALSE)
  }

  message("Step 1: getting list of all the csv-zip-files!\n")
  base_url <- "http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/"
  URL <- paste0(base_url, "data_library.html")
  pg <- xml2::read_html(URL)
  Flinks <- rvest::html_attr(rvest::html_nodes(pg, "a"), "href")
  Flinks_csv <- Flinks[grep("CSV.zip", Flinks)]
  # Split with a logical mask: negative indexing with an empty match vector
  # would drop every link when no daily file is listed.
  is_daily_link <- grepl("daily", Flinks_csv, ignore.case = TRUE)
  Flinks_csv_daily <- Flinks_csv[is_daily_link]
  Flinks_csv_nodaily <- Flinks_csv[!is_daily_link]

  # save list of links if listsave!=NULL
  if (!is.null(listsave)) {
    utils::write.csv(gsub("ftp/", "", Flinks_csv), file = listsave)
  }

  # For each entry of inputlist pick the candidate link with the smallest
  # edit distance (case-insensitive).
  match_links <- function(candidates) {
    if (length(candidates) == 0L) {
      return(character(0))
    }
    distances <- utils::adist(x = inputlist, y = candidates, ignore.case = TRUE)
    candidates[apply(distances, 1, which.min)]
  }

  # if there is an input-list
  if (!is.null(inputlist)) {
    Flinks_final <- match_links(Flinks_csv_nodaily)
    if (!exclude_daily) {
      Flinks_final <- c(Flinks_final, match_links(Flinks_csv_daily))
    }
  } else if (exclude_daily) {
    Flinks_final <- Flinks_csv_nodaily
  } else {
    Flinks_final <- Flinks_csv
  }

  if (download) {
    if (length(Flinks_final) == 0L) {
      stop("No zip-files found to download!", call. = FALSE)
    }
    message("Step 2: Downloading ", length(Flinks_final), " zip-files\n")
    temp_download <- tempfile(pattern = "")
    dir.create(temp_download, showWarnings = FALSE)
    src_urls <- paste0(base_url, Flinks_final)
    dest_paths <- file.path(temp_download, gsub(pattern = "ftp/", "", Flinks_final))
    if (capabilities("libcurl")) {
      # libcurl accepts vectors of urls/destfiles and fetches them in one call
      utils::download.file(url = src_urls, destfile = dest_paths, method = "libcurl", quiet = TRUE)
    } else {
      for (i in seq_along(src_urls)) {
        utils::download.file(src_urls[i], dest_paths[i], quiet = TRUE)
      }
    }
    # copy to final tempd if wished
    if (!is.null(tempd)) {
      dir.create(tempd, showWarnings = FALSE, recursive = TRUE)
      file.copy(from = dest_paths, to = tempd, recursive = TRUE)
    }
  }

  # if download_only==TRUE exit
  if (!download_only) {
    if (is.null(tempd)) {
      if (!download) {
        stop("No directory given for reading files!")
      }
      # nothing was copied elsewhere: process the fresh download in place
      tempd <- temp_download
    }

    zip_files <- list.files(tempd, full.names = TRUE, pattern = "\\.zip$", ignore.case = TRUE) # full path
    for (zip_file in zip_files) {
      utils::unzip(zipfile = zip_file, exdir = tempd)
    }
    csv_files <- list.files(tempd, full.names = TRUE, pattern = "\\.csv$", ignore.case = TRUE) # full path
    if (length(csv_files) == 0L) {
      stop("No csv-files found in ", tempd, call. = FALSE)
    }
    csv_files2 <- basename(csv_files) # only filenames

    # list names: "x_" + file name without extension
    vars <- paste0("x_", gsub("(.*)\\..*", "\\1", csv_files2))
    is_daily_csv <- grepl("daily", csv_files2, ignore.case = TRUE)
    vars_daily <- vars[is_daily_csv]
    vars_nodaily <- vars[!is_daily_csv]

    # report the number of files actually processed (may differ from the
    # number of scraped links when download = FALSE)
    message("Step 3: Start processing ", length(csv_files), " csv-files\n")
    if (format == "xts") {
      FFdata <- plyr::mlply(.data = csv_files, .fun = function(y) converter(y), .progress = "text")
    } else {
      FFdata <- plyr::mlply(.data = csv_files, .fun = function(y) converter_tbl(y), .progress = "text")
    }
    names(FFdata) <- vars

    # recombine lists: move the $daily element of a daily dataset into the
    # matching non-daily dataset and drop the separate daily entry
    if (!exclude_daily) {
      for (nodaily_name in vars_nodaily) {
        hits <- grep(nodaily_name, vars_daily)
        if (length(hits) > 0L) {
          daily_name <- vars_daily[hits]
          FFdata[[nodaily_name]]$daily <- FFdata[[daily_name]]$daily
          FFdata[[daily_name]] <- NULL
        }
      }
    }

    save(FFdata, file = output_file)
    message("Be aware that as of version 1.0.6 the saved object is named FFdata rather than FFdownload to not be confused with the corresponding command!")
  }
  invisible(NULL)
}