#!/bin/bash

## This script will spider a website, download all HTML pages, and then determine
# what percentage of them have analytics installed. It can run independently or together with the check.sh script.
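# Example invocation (hypothetical script name and arguments, for illustration only):
#   ./spider.sh "https://www.example.com" "google-analytics.com/ga.js"
# $1 is the start URL to spider and $2 is the string that identifies the analytics snippet.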

outputdir="web"
address="$1"
analytics_string="$2"
domain=$(echo "$address" | sed -r "s/^(.+\/\/)([^/]+)(.*)/\2/")
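# The sed expression keeps only the host portion of the URL, e.g. an input of
# "https://www.example.com/about" (illustrative value) yields "www.example.com".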

wget -D "$domain" -R ".swf,.JPG,.PNG,.GIF,.tiff,.bmp,*smartproxy*,.ppt,.ics,.gz,.xpi,.pdf,.exe,.rss,.js,.png,.css,.gif,.jpg,.ico,.flv,.dmg,.zip,.txt" -r -q -l 5 -nc --connect-timeout=5 -Q 10m -P "$outputdir" --no-check-certificate --html-extension -U "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/9.0.1" "$address"
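# Key wget options: -r -l 5 recurses up to five levels, -Q 10m caps the total
# download at 10 MB, -nc skips files already on disk, and --html-extension saves
# pages with an .html suffix so the find/grep steps below can match on it.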

# Grep for the number of pages that include the analytics string - stop at the first occurrence of the string in each file

finds=$(grep -lri "$analytics_string" "./$outputdir/$domain" --include="*.html" | wc -l | sed 's/ //g')

# Find how many HTML pages have been spidered

files=$(find "./$outputdir/$domain" -type f \( -name "*.html" -or -name "*.htm" \) | wc -l | sed 's/ //g')

# Some mirrored files sit behind a proxy and are not part of the website, but wget still picks them up. The -E 'string' option enables regex matching.

ignore_files=$(find "./$outputdir/$domain" -type f \( -name "*.html" -or -name "*.htm" \) | grep -i -E 'smartproxy' | wc -l | sed 's/ //g')

# Subtract the ignored files from files to get a final number.

files=$((files - ignore_files))

#echo "found $finds files with string out of $files files"

if [ "$files" -ge 1 ]; then
  # If at least one page was found, calculate the percentage (worked example below)
  echo "scale=2; $finds*100/$files" | bc
else
  # Return 0 if no pages were retrieved at all
  echo "0"
fi
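# Worked example with illustrative numbers: finds=12 and files=48 evaluates
# "scale=2; 12*100/48" in bc and prints 25.00, i.e. 25% of the spidered pages
# contain the analytics string.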

rm -rf "./$outputdir/$domain"