bin/mechmonkey

#!/usr/bin/perl
# This code is a part of Slash, and is released under the GPL.
# Copyright 1997-2005 by Open Source Technology Group. See README
# and COPYING for more information, or see http://slashcode.com/.
# $Id$

# A script to test a Slash site by clicking around in it.  Not
# really intended for load testing but if you run 100 copies of
# this it would probably work pretty well for that purpose.
# Mostly intended to poke around and hit every script on your
# site so you will find errors in your apache log.
#
# Still in early stages of development...

use warnings;
use strict;

use Getopt::Std;
use Time::HiRes;
use LWP::Parallel::UserAgent;
use WWW::Mechanize;

use Slash;
use Slash::Utility;
use Slash::Utility::Data;
use Slash::DB;

use vars qw(
	$VERSION
	%opts
	$quiet
	$pause_factor	$stop_time	$load_images	$dict_file
	$virtuser
	$slashdb
	$mech
	$absolutedir		$basedomain_regex
	$url_tilde_regex	$url_comment_regex	$url_mode_regex
);

($VERSION) = ' $Revision$ ' =~ /\$Revision:\s+([^\s]+)/;

init();
run();

exit 0;

############################################################

sub init {
	my $opts_success = getopts('p:s:d:u:vhqI', \%opts);
	if (!$opts_success) {
		usage('Options used incorrectly');
	}
	usage() if $opts{h};
	version() if $opts{v};

	$virtuser = $opts{u} || 'slash';
	createEnvironment($virtuser);
	my $constants = getCurrentStatic();
	$pause_factor = $opts{p} || 1;
	$stop_time = $^T + ($opts{s} || 3600);
	$quiet = $opts{q} ? 1 : 0;
	$load_images = $opts{I} ? 0 : 1;
	$dict_file = $opts{d} || $constants->{hc_q1_usedict};
	$slashdb = getCurrentDB();

	my $abs = $constants->{absolutedir};
	my $abs_uri = URI->new($abs);
	die "no absolute uri from '$constants->{absolutedir}'" if !$abs_uri;
	$absolutedir = $abs_uri->canonical->as_string;
	my $host = $abs_uri->host;
	my $host_q = "\\b\Q$host\E\$";
	$basedomain_regex = qr{$host_q};
	$url_tilde_regex = qr{\~|\%7e}i;
	$url_comment_regex = qr{(article|comments|journal)\.pl};
	$url_mode_regex = qr{\bmode=(nocomment|thread|nested|flat)\b/};

	$mech = WWW::Mechanize->new( autocheck => 1, onerror => undef );
	$mech->get($absolutedir);
}

sub run {
	my $last_elapsed = undef;
	while (time <= $stop_time) {
		report($last_elapsed);
		load_images();
		sleep_short();
		$last_elapsed = do_random_action();
		back_if_error();
	}
}

sub sleep_short { # median 1 second
	Time::HiRes::sleep( (rand(1)*rand(1)*4) * $pause_factor );
}

sub sleep_medium { # median 6 seconds
	Time::HiRes::sleep( (rand(1)*rand(1)*16 + 2) * $pause_factor );
}

sub sleep_long { # median 60 seconds
	Time::HiRes::sleep( (rand(1)*rand(1)*160 + 20) * $pause_factor );
}

sub report {
	my($last_elapsed) = @_;
	my $elapsed_str = defined($last_elapsed)
		? sprintf("%7.3f ", $last_elapsed)
		: "      - ";
	my $success = $mech->success();
	return if $success && $quiet;
	my $f = $success ? '' : 'FAILURE AT ';
	printf "%s %9d %s%s%s\n",
		scalar(localtime),
		length($mech->content()),
		$elapsed_str, $f, $mech->uri();
}

sub load_images {

	return unless $load_images;

#use LWP::Debug;
#use Data::Dumper;
#LWP::Debug::level("+trace");
#LWP::Debug::level("+debug");

	my $start_time = Time::HiRes::time;
	my @i = $mech->images();
	my $p = LWP::Parallel::UserAgent->new();
	$p->cookie_jar( $mech->cookie_jar );
	$p->in_order(1);	# try to fetch images in same order as found on webpage I guess
	$p->duplicates(0);	# if images are duplicated, load them only once
	$p->timeout(0.5);	# timeout for establishing the conn with the server, per request I think
	$p->redirect(1);	# do follow redirects
	$p->max_redirect(5);	# give up after 5 redirects in a row
	$p->max_req(8);		# max 8 parallel requests to any one server
	$p->max_hosts(10);	# "max parallel servers accessed," I'm not sure exactly what this means
	for my $i (@i) {
		my $url = URI->new_abs($i->url(), $mech->uri());
		my $req = HTTP::Request->new(GET => $url);
		$p->register($req);
	}
#print scalar(localtime) . " beginning p->wait for " . scalar(@i) . " images...\n";
	my $entries = $p->wait(10); # overall timeout for getting all responses
#print scalar(localtime) . " p->wait done.\n";
	my($image_bytes, $load_errors) = (0, 0);
	for my $e (sort keys %$entries) {
		my $response = $entries->{$e}->response;
#print "image length " . length($response->content()) . " for " . $response->request->url . "\n";
		if (!$response->is_success()) {
my $line = $response->status_line; chomp $line;
warn "image load failure '$line' for " . $response->request->url . "\n";
			++$load_errors;
		} else {
			$image_bytes += length($response->content());
		}
	}
	if ($load_errors) {
		my $elapsed = Time::HiRes::time-$start_time;
		printf "%s %9d loaded %d images in %.2f secs with %d errors\n",
			scalar(localtime), $image_bytes, scalar(@i), $elapsed, $load_errors;
	}
}

sub url_is_within_site {
	my($url) = @_;
	my $uri_abs = URI->new_abs($url, $absolutedir);
	die "no uri_abs from url '$url'" if !$uri_abs;
	my $uri_scheme = $uri_abs->scheme;
	return 0 if $uri_scheme ne 'http';
	my $uri_host = $uri_abs->host;
	return ($uri_host && $uri_host =~ $basedomain_regex) ? 1 : 0;
}

sub do_random_action {
	my $r = rand();
	my $start_time = Time::HiRes::time;
	my $slept = 0;
	   if ($r < 0.07) {	$mech->back()			}
	elsif ($r < 0.10) {	sleep_medium(); $slept = 1	}
	elsif ($r < 0.12) {	sleep_long(); $slept = 1	}
	elsif ($r < 0.15) {	edit_url_up()			}
	elsif ($r < 0.17) {	reload()			}
	elsif ($r < 0.35) {	search()			}
	elsif ($r < 0.40) {	go_home()			}
	elsif ($r < 0.43) {	pick_image_link()		}
	else              {	pick_any_link()			}
	if ($slept) { return undef }
	else { return Time::HiRes::time - $start_time }
}

sub back_if_error {
	my $uri = URI->new($mech->uri());
	return if $uri->host =~ $basedomain_regex && $mech->success;
	$mech->back();
}

sub edit_url_up {
	my $uri = URI->new($mech->uri());
	my $path = $uri->path();
	$path =~ s{[^/]+/?$}{};
	$uri->path($path);
	$mech->get($uri);
}

sub reload {
	$mech->reload();
}

sub search {
	my $form_num = find_search_form_number();
	return unless $form_num;
	my $dict_word = getRandomWordFromDictFile($dict_file,
		{ min_chars => 1, max_chars => 6 })
		|| 'foo';
	$mech->form_number($form_num);
	$mech->field(query => $dict_word);
	$mech->click();
}

sub find_search_form_number {
	my @forms = $mech->forms();
	for my $i (0..$#forms) {
		my $action = $forms[$i]->{action};
		next unless $action && ref $action;
		my $host = $action->host;
		next unless $host =~ $basedomain_regex;
		my $path = $action->path;
		next unless $path eq '/search.pl';
		return $i + 1; # WWW::Mechanize numbers forms one-based
	}
	return 0; # no search form on the current page
}

sub go_home {
	my $uri = URI->new($absolutedir);
	if (rand(1) < 0.50) {
		$uri->path('/index.pl');
	}
	$mech->get($uri->as_string);
}

sub pick_image_link {
	my @links =
		grep { url_is_within_site($_) }	# only local links please
		map { $_->url() }		# convert WWW::Mechanize::Link to url text
		$mech->find_all_links( tag => 'a',
			url_regex => qr{image}
		);
	return if !@links;
	my $link = @links[rand @links];
	$mech->get($link) if $link;
}

sub pick_any_link {
	my @links =
		grep { url_is_within_site($_) }	# only local links please
		map { $_->url() }		# convert WWW::Mechanize::Link to url text
		$mech->find_all_links( tag => 'a' );
	return if !@links;

	# Prefer certain types of link because they test the system better.
	my $tildes = scalar grep /$url_tilde_regex/, @links;
	my $comments = scalar grep /$url_comment_regex/, @links;
	my $r = rand(1);
	if ($tildes && $r < 0.30) {
		@links = grep /$url_tilde_regex/, @links;
	} elsif ($comments && $r < 0.80) {
		@links = grep /$url_comment_regex/, @links;
	}

	my $link = @links[rand @links];
	$link = massage_link($link);
	$mech->get($link) if $link;
}

sub massage_link {
	my($link) = @_;
	return $link if rand(1) < 0.50;
	my @modes = qw( nocomment thread nested flat );
	if ($link =~ $url_mode_regex) {
		# Switch around the mode for fun.
		my $newmode = $modes[rand @modes];
		$link =~ s/$url_mode_regex/mode=$newmode/;
	} elsif ($link =~ /(article|comments)\.pl\?.*\bsid=/ && $link !~ /\bmode=/) {
		# No mode specified, add one for fun.
		$link = "${link}&mode=" . $modes[rand @modes];
	}
	return $link;
}