-
Notifications
You must be signed in to change notification settings - Fork 11
/
nothotdog-find-data.R
83 lines (70 loc) · 3.18 KB
/
nothotdog-find-data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
## Find images to use for the Not Hotdog with Custom Vision application
## DO NOT RUN THIS SCRIPT
## This script generates the supplied hotdogs-good.txt and nothotdogs-good.txt files
## It samples from a collection of ImageNet URLs in various categories
## and eliminates URLs that fail and other inappropriate images
library(tools)
library(httr)
## these lists were downloaded from ImageNet 2011 http://image-net.org/explore_popular.php
## for the "hot dog" and "frankfurter bun" categories, plus the hamburger and
## taco lists that supply the not-hotdog images
## Read each URL list into a character vector (scan() splits on whitespace,
## so every URL becomes one element).
dogs    <- scan(file = "hotdogs.txt",       what = character())
franks  <- scan(file = "frankfurtbuns.txt", what = character())
burgers <- scan(file = "hamburger.txt",     what = character())
tacos   <- scan(file = "tacos.txt",         what = character())
## Fix the RNG state so we can reproduce this later.
## R 3.6.0 changed the default algorithm behind sample(). With the new default
## the same seed selects different URLs, and the hard-coded bad.dogs and
## bad.notdogs indexes further down would no longer match the sampled vectors.
## On R >= 3.6.0 we therefore request the older "Rounding" sampler explicitly.
## NOTE(review): assumes the original run used R < 3.6.0 -- confirm.
## Side effect: the session keeps using the "Rounding" sampler afterwards.
if (getRversion() >= "3.6.0") {
  # set.seed() warns that "Rounding" is non-uniform; that is intentional here
  suppressWarnings(set.seed(3302018, sample.kind = "Rounding"))
} else {
  set.seed(3302018)
}

## We won't need many images to build a Not Hotdog classifier, so let's grab
## 100 images. We'll actually lose about 30% to bad URLs further down the line
Nimages <- 100

## Positive examples: drawn without replacement from the pooled hot dog and
## frankfurter bun URLs. sample() stops with an error if the pool holds fewer
## than Nimages URLs.
hotdogs <- sample(c(dogs, franks), size = Nimages)

## We'll also grab images of things that might be easily mistaken for hotdogs:
## burgers, tacos
nothotdogs <- sample(c(burgers, tacos), size = Nimages)
## Not all of the URLs are valid. URLs may not be accessible, and Flickr may return
## a "This photo is not available" image
## These indexes were calculated using the interactive code at the end of this script and visual inspection.
## They're only valid if you use the same seed, Nimages, and supplied .txt files of URLs
## Positions in `hotdogs` to discard. The first run of numbers has no
## per-item reason; the annotated entries after it each carry a note.
## (89 appears in both parts; a repeated index is harmless when used for
## negative subsetting.)
bad.dogs <- c(
  3, 4, 10, 14, 18, 21, 25, 26, 28, 32, 33, 34, 38, 43, 47, 51,
  56, 68, 71, 73, 75, 76, 79, 80, 81, 82, 85, 89, 91, 94, 95, 96,
  31, # woman eating icecream
  52, # redirected URL
  62, # puppy in a hotdog bun
  88, # loaves of bread
  89  # invalid URL
)

## Positions in `nothotdogs` to discard, laid out the same way.
bad.notdogs <- c(
  8, 9, 12, 26, 27, 31, 34, 50, 52, 56, 61, 69, 71, 72, 83, 86, 95, 97, 98,
  54, # bad extension
  93  # corrupt file
)

## Drop the rejected positions; negative indexes remove those elements.
hotdogs.good    <- hotdogs[-bad.dogs]
nothotdogs.good <- nothotdogs[-bad.notdogs]

## Save the surviving URLs (write() puts one string per line for
## character input).
write(x = hotdogs.good,    file = "hotdogs-good.txt")
write(x = nothotdogs.good, file = "nothotdogs-good.txt")
## This code was run interactively. It's included here so you can see
## how we downloaded the images for review, and created the bad.dogs and
## bad.notdogs objects above. The if (FALSE) guard keeps it from executing
## when the script is sourced.
if (FALSE) {
  ## Download every URL into `dest_dir` and report which ones produced a
  ## usable file.
  ##
  ## urls       character vector of image URLs
  ## dest_dir   directory that receives the files; created if it is missing
  ## known_bad  indexes into `urls` that are not downloaded
  ## min_bytes  a file must be larger than this to count as valid
  ##
  ## Returns a logical vector with one element per URL: TRUE when a file
  ## larger than `min_bytes` exists on disk for that index. Note that the
  ## result reflects what is on disk, so a `known_bad` index whose file was
  ## downloaded on an earlier run is still reported as valid.
  check_downloads <- function(urls, dest_dir, known_bad, min_bytes = 9000) {
    # download.file() cannot create the target directory itself
    dir.create(dest_dir, showWarnings = FALSE, recursive = TRUE)

    attempt <- rep(TRUE, length(urls))
    attempt[known_bad] <- FALSE

    valid <- logical(length(urls))
    for (i in seq_along(urls)) {
      u <- urls[i]
      # prefix with the index so files map back to positions in `urls`
      destfile <- file.path(dest_dir, paste0(i, "-", basename(u)))
      if (attempt[i] && !file.exists(destfile)) {
        # a failed download must not abort the loop; the check below flags it
        try(download.file(u, destfile, mode = "wb", method = "auto"))
      }
      # flag failed downloads and tiny files (flickr error images are 2051 bytes)
      valid[i] <- file.exists(destfile) && (file.size(destfile) > min_bytes)
    }
    valid
  }

  valid.dogs <- check_downloads(hotdogs, "hotdogs", bad.dogs)
  # print index of bad URLs
  cat(which(!valid.dogs), sep = ",")

  valid.notdogs <- check_downloads(nothotdogs, "nothotdogs", bad.notdogs)
  # print index of bad URLs
  cat(which(!valid.notdogs), sep = ",")
}