#!/bin/bash
## This script spiders a website, downloads all HTML pages, and then determines
# what percentage of them have the analytics string installed. It can run
# independently or together with the check.sh script.
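#
# Example invocation (the script filename and analytics string below are only
# illustrative, not taken from the repository):
#   ./percentage.sh "https://www.example.com" "google-analytics.com/ga.js"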
outputdir="web"
address=$1
analytics_string=$2
domain=$(echo "$address" | sed -r "s/^(.+\/\/)([^/]+)(.*)/\2/")
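# e.g. "https://www.example.com/about/team" yields "www.example.com"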
wget -D "$domain" -R .swf,.JPG,.PNG,.GIF,.tiff,.bmp,*smartproxy*,.ppt,.ics,.gz,.xpi,.pdf,.exe,.rss,.js,.png,.css,.gif,.jpg,.ico,.flv,.dmg,.zip,.txt -r -q -l 5 -nc --connect-timeout=5 -Q 10m -P "$outputdir" --no-check-certificate --html-extension -U "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/9.0.1" "$address"
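# Key wget flags above: -r/-l 5 crawl recursively up to five levels deep, -D limits
# the crawl to the target domain, -R rejects non-HTML assets, -Q 10m caps the total
# download at roughly 10 MB, -P writes the mirror into $outputdir, --html-extension
# saves pages with a .html suffix so the find/grep counts below pick them up, and
# -U sends a browser-like User-Agent.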
# Grep for the number of pages that include the analytics string - stop at the first occurrence of the string in each file
finds=$(grep -lri "$analytics_string" "./$outputdir/$domain" --include="*.html" --include="*.htm" | wc -l | sed 's/ //g')
# Find how many HTML pages have been spidered
files=$(find "./$outputdir/$domain" -type f \( -name "*.html" -or -name "*.htm" \) | wc -l | sed 's/ //g')
# Some of the mirrored files sit behind a proxy and are not part of the website itself, but wget still picks them up; grep -E 'string' supports regex matching so they can be filtered out.
ignore_files=$(find "./$outputdir/$domain" -type f \( -name "*.html" -or -name "*.htm" \) | grep -i -E 'smartproxy' | wc -l | sed 's/ //g')
# Subtract the ignored files from files to get a final number.
files=$((files - ignore_files))
#echo "found $finds files with string out of $files files"
if [ $files -ge 1 ]; then
# If at least one page is found, calculate percentage
echo "scale=2; $finds*100/$files" | bc
else
# Return 0 if no pages were spidered.
echo "0"
fi
# Clean up the mirrored files.
rm -rf "./$outputdir/$domain"