From fb3dc293ed45e7eccd1013c68d2f9a4a8f3593be Mon Sep 17 00:00:00 2001
From: Rob Miller
Date: Tue, 13 Feb 2018 10:55:55 +0000
Subject: [PATCH] spider: outputs all of the unique URLs on a domain

---
 bin/spider | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100755 bin/spider

diff --git a/bin/spider b/bin/spider
new file mode 100755
index 0000000..da90011
--- /dev/null
+++ b/bin/spider
@@ -0,0 +1,36 @@
+#!/bin/bash
+#
+# spider
+#
+# Author: Rob Miller
+#
+# Outputs the unique URLs of all of the HTML pages on a given domain.
+#
+# Usage: spider <url>
+#
+# Crawls <url> recursively with wget (every download is deleted again
+# straight away) and prints, sorted and one per line, each URL that was
+# answered with "200 OK" and a text/html content type.
+
+if [[ -z "${1:-}" ]]; then
+  printf 'Usage: %s <url>\n' "${0##*/}" >&2
+  exit 2
+fi
+
+if ! command -v wget >/dev/null; then
+  printf '%s: wget is required but was not found\n' "${0##*/}" >&2
+  exit 127
+fi
+
+# wget writes its log to stderr, hence 2>&1. Each request is logged as a
+# "--<date> <time>--  <url>" line, followed by the response status and a
+# line carrying the content type. Tracking that state in awk, instead of
+# relying on fixed amounts of grep -B context, also catches requests whose
+# log has extra "Resolving"/"Connecting" lines before the status.
+wget -r -nd --delete-after -w 0.1 "$1" 2>&1 |
+  awk '
+    /^--[0-9]/         { url = $3; ok = 0; next }  # a new request begins
+    /200 OK/           { ok = 1; next }            # answered successfully
+    ok && /text\/html/ { print url; ok = 0 }       # and is an HTML page
+  ' |
+  sort -u