htdig/Retriever.cc

//
// Retriever.cc
//
// Retriever: Crawl from a list of URLs and calls appropriate parsers. The
//            parser notifies the Retriever object that it got something
//            (got_* functions) and the Retriever object feed the databases
//            and statistics accordingly.
//
// Part of the ht://Dig package   <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: Retriever.cc,v 1.94 2004/05/28 13:15:15 lha Exp $
//

#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */

#ifdef _MSC_VER /* _WIN32 */
# include <sys/types.h>
# include <winsock2.h>
#endif


#include "Retriever.h"
#include "htdig.h"
#include "HtWordList.h"
#include "WordRecord.h"
#include "URLRef.h"
#include "Server.h"
#include "Parsable.h"
#include "Document.h"
#include "StringList.h"
#include "WordType.h"
#include "Transport.h"
#include "HtHTTP.h"			  // For HTTP statistics
#include "md5.h"
#include "defaults.h"

#ifndef _MSC_VER /* _WIN32 */
#include <pwd.h>
#endif

#include <signal.h>
#include <stdio.h>


static int noSignal;

// no_store_phrases:
// If true, only store first occurrence of each word in a document
static bool no_store_phrases;

//*****************************************************************************
// Retriever::Retriever()
//
Retriever::Retriever(RetrieverLog flags):
words(*(HtConfiguration::config())),
words_to_add (100, 0.75)
{
	HtConfiguration *config = HtConfiguration::config();
	FILE *urls_parsed;

	currenthopcount = 0;
	max_hop_count = config->Value("max_hop_count", 999999);

	no_store_phrases = !config->Boolean("store_phrases");

	//
	// Initialize the flags for the various HTML factors
	//

	// text_factor
	factor[0] = FLAG_TEXT;
	// title_factor
	factor[1] = FLAG_TITLE;
	// heading factor (now generic)
	factor[2] = FLAG_HEADING;
	factor[3] = FLAG_HEADING;
	factor[4] = FLAG_HEADING;
	factor[5] = FLAG_HEADING;
	factor[6] = FLAG_HEADING;
	factor[7] = FLAG_HEADING;
	// img alt text
	//factor[8] = FLAG_KEYWORDS;
	factor[8] = FLAG_TEXT;	  // treat alt text as plain text, until it has
	// its own FLAG and factor.
	// keywords factor
	factor[9] = FLAG_KEYWORDS;
	// META description factor
	factor[10] = FLAG_DESCRIPTION;
	factor[11] = FLAG_AUTHOR;

	doc = new Document();
	minimumWordLength = config->Value("minimum_word_length", 3);

	log = flags;
	// if in restart mode
	if (Retriever_noLog != log)
	{
		String filelog = config->Find("url_log");
		char buffer[1024];
		int l;

		urls_parsed = fopen((char *) filelog, "r");
		if (urls_parsed != 0)
		{
			// read all url discovered but not fetched before 
			while (fgets(buffer, sizeof(buffer), urls_parsed))
			{
				l = strlen(buffer);
				buffer[l - 1] = 0;
				Initial(buffer, 2);
			}
			fclose(urls_parsed);
		}
		unlink((char *) filelog);
	}

	check_unique_md5 = config->Boolean("check_unique_md5", 0);
	check_unique_date = config->Boolean("check_unique_date", 0);

	d_md5 = 0;
	if (check_unique_md5)
	{
		d_md5 = Database::getDatabaseInstance(DB_HASH);

		if (d_md5->OpenReadWrite(config->Find("md5_db"), 0666) != OK)
		{
			cerr << "DocumentDB::Open: " << config->Find("md5_db") << " " << strerror(errno) << "\n";
		}
	}

}


//*****************************************************************************
// Retriever::~Retriever()
//
Retriever::~Retriever()
{
	if (d_md5)
		d_md5->Close();
	delete doc;
}


//*****************************************************************************
// void Retriever::setUsernamePassword(char *credentials)
//
void Retriever::setUsernamePassword(const char *credentials)
{
	doc->setUsernamePassword(credentials);
}


//*****************************************************************************
// void Retriever::Initial(char *list, int from)
//   Add a single URL to the list of URLs to visit.
//   Since URLs are stored on a per server basis, we first need to find the
//   the correct server to add the URL's path to.
//
//   from == 0 urls in db.docs and no db.log
//   from == 1 urls in start_url add url only if not already in the list 
//   from == 2 add url from db.log 
//   from == 3 urls in db.docs and there was a db.log 
//
void Retriever::Initial(const String & list, int from)
{
	//
	// Split the list of urls up into individual urls.
	//
	StringList tokens(list, " \t");
	String sig;
	String url;
	Server *server;

	for (int i = 0; i < tokens.Count(); i++)
	{
		URL u(tokens[i]);
		url = u.get();	// get before  u.signature()  resolves aliases
		server = (Server *) servers[u.signature()];
		if (debug > 2)
			cout << "\t" << from << ":" << (int) log << ":" << url;
		if (!server)
		{
			String robotsURL = u.signature();
			robotsURL << "robots.txt";
			StringList *localRobotsFile = GetLocal(robotsURL);

			server = new Server(u, localRobotsFile);
			servers.Add(u.signature(), server);
			delete localRobotsFile;
		}

		if (from && visited.Exists(url))
		{
			if (debug > 2)
				cout << " skipped" << endl;
			continue;
		}
		else if (IsValidURL(url) != 1)
		{
			if (debug > 2)
				cout << endl;
			continue;
		}

		if (Retriever_noLog == log || from != 3)
		{
			if (debug > 2)
				cout << " pushed";
			server->push(u.get(), 0, 0, IsLocalURL(url.get()));
		}
		if (debug > 2)
			cout << endl;
		visited.Add(url, 0);
	}
}


//*****************************************************************************
// void Retriever::Initial(List &list, int from)
//
void Retriever::Initial(List & list, int from)
{
	list.Start_Get();
	String *str;

	// from == 0 is an optimisation for pushing url in update mode
	//  assuming that 
	// 1) there's many more urls in docdb 
	// 2) they're pushed first
	// 3) there's no duplicate url in docdb
	// then they don't need to be check against already pushed urls
	// But 2) can be false with -l option
	//
	// FIXME it's nasty, what have to be test is :
	// we have urls to push from db.docs but do we already have them in
	// db.log? For this it's using a side effect with 'visited' and that
	// urls in db.docs are only pushed via this method, and that db.log are pushed
	// first, db.docs second, start_urls third!
	//  
	if (!from && visited.Count())
	{
		from = 3;
	}
	while ((str = (String *) list.Get_Next()))
	{
		Initial(str->get(), from);
	}
}

//*****************************************************************************
//
static void sigexit(int)
{
	noSignal = 0;   //don't exit here.. just set the flag.
}

static void sigpipe(int)
{
}

//*****************************************************************************
// static void sig_handlers
//   initialise signal handlers
//
static void sig_handlers(void)
{
#ifndef _MSC_VER /* _WIN32 */
    //POSIX SIGNALS
	struct sigaction action;

	/* SIGINT, SIGQUIT, SIGTERM */
	action.sa_handler = sigexit;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;
	if (sigaction(SIGINT, &action, NULL) != 0)
		reportError("Cannot install SIGINT handler\n");
	if (sigaction(SIGQUIT, &action, NULL) != 0)
		reportError("Cannot install SIGQUIT handler\n");
	if (sigaction(SIGTERM, &action, NULL) != 0)
		reportError("Cannot install SIGTERM handler\n");
	if (sigaction(SIGHUP, &action, NULL) != 0)
		reportError("Cannot install SIGHUP handler\n");
#else
    //ANSI C signal handling - Limited to supported Windows signals.
    signal(SIGINT, sigexit); 
    signal(SIGTERM, sigexit); 
#endif //_MSC_VER /* _WIN32 */
}


static void sig_phandler(void)
{
#ifndef _MSC_VER /* _WIN32 */
	struct sigaction action;

	sigemptyset(&action.sa_mask);
	action.sa_handler = sigpipe;
	action.sa_flags = SA_RESTART;
	if (sigaction(SIGPIPE, &action, NULL) != 0)
		reportError("Cannot install SIGPIPE handler\n");
#endif //_MSC_VER /* _WIN32 */
}


//*****************************************************************************
// static void win32_check_messages
//   Check WIN32 messages!
//
#ifdef _MSC_VER /* _WIN32 */
static void win32_check_messages(void)
{
// NEAL - NEEDS FINISHING/TESTING
#if 0
    MSG msg = {0, 0, 0, 0};
    int cDown = 0;
    int controlDown = 0;

    if( GetMessage(&msg, 0, 0, 0) )
    {

        switch(msg.message)
        {
            case WM_KEYDOWN:
                {
                    if(LOWORD(msg.message)== 17)
                        controlDown = 1;
                    else if(LOWORD(msg.message) == 67)
                    {
                        cDown = 1;
                    }
                }
                break;
            case WM_KEYUP:
                {
                    if(LOWORD(msg.message) == 17)
                        controlDown = 0;
                    else if(LOWORD(msg.message) == 67)
                        cDown = 0;
                }
                break;
        }
    }

    DispatchMessage(&msg);
#endif
}
#endif //_MSC_VER /* _WIN32 */


//*****************************************************************************
// void Retriever::Start()
//   This is the main loop of the retriever.  We will go through the
//   list of paths stored for each server.  While parsing the
//   retrieved documents, new paths will be added to the servers.  We
//   return if no more paths need to be retrieved.
//
void Retriever::Start()
{
	//
	// Main digger loop.  The todo list should initialy have the start
	// URL and all the URLs which were seen in a previous dig.  The
	// loop will continue as long as there are more URLs to visit.
	//
	int more = 1;
	Server *server;
	URLRef *ref;

	HtConfiguration *config = HtConfiguration::config();

	//  
	// Always sig . The delay bother me but a bad db is worst
	// 
	if (Retriever_noLog != log)
	{
		sig_handlers();
	}
	sig_phandler();
	noSignal = 1;


///////
	//    Main loop. We keep on retrieving until a signal is received
	//    or all the servers' queues are empty.
///////

#ifdef _MSC_VER /* _WIN32 */
    win32_check_messages();
#endif

	while (more && noSignal)
	{
		more = 0;

		//
		// Go through all the current servers in sequence.
		// If they support persistent connections, we keep on popping
		// from the same server queue until it's empty or we reach a maximum
		// number of consecutive requests ("max_connection_requests").
		// Or the loop may also continue for the infinite,
		// if we set the "max_connection_requests" to -1.
		// If the server doesn't support persistent connection, we take
		// only an URL from it, then we skip to the next server.
		//
		// Since 15.05.02: even when persistent connections are activated
		// we should wait for a 'server_wait_time' number of seconds
		// after the 'max_connection_requests' value has been reached.
		//

		// Let's position at the beginning
		servers.Start_Get();

		int count;

		// Maximum number of repeated requests with the same
		// TCP connection (so on the same Server:Port).

		int max_connection_requests;

#ifdef _MSC_VER /* _WIN32 */
        win32_check_messages();
#endif

		while ((server = (Server *) servers.Get_NextElement()) && noSignal)
		{
			if (debug > 1)
				cout << "pick: " << server->host() << ", # servers = " << servers.Count() << endl;

			// We already know if a server supports HTTP pers. connections,
			// because we asked it for the robots.txt file (constructor of
			// the class).

			// If the Server doesn't support persistent connections
			// we turn it down to 1.

			if (server->IsPersistentConnectionAllowed())
			{

				// Let's check for a '0' value (out of range)
				// If set, we change it to 1.

				if (config->Value("server", server->host(), "max_connection_requests") == 0)
					max_connection_requests = 1;
				else
					max_connection_requests =
						config->Value("server", server->host(), "max_connection_requests");

				if (debug > 2)
				{

					cout << "> " << server->host() << " supports HTTP persistent connections";

					if (max_connection_requests == -1)
						cout << " (" << "infinite" << ")" << endl;
					else
						cout << " (" << max_connection_requests << ")" << endl;

				}

			}
			else
			{

				// No HTTP persistent connections. So we request only 1 document.

				max_connection_requests = 1;

				if (debug > 2)
					cout << "> " << server->host() << " with a traditional HTTP connection" << endl;

			}


			count = 0;

#ifdef _MSC_VER /* _WIN32 */
            win32_check_messages();
#endif


			while (((max_connection_requests == -1) ||
				   (count < max_connection_requests)) && (ref = server->pop()) && noSignal)
			{
				count++;

				//
				// We have a URL to index, now.  We need to register the
				// fact that we are not done yet by setting the 'more'
				// variable. So, we have to restart scanning the queue.
				//

				more = 1;

				//
				// Deal with the actual URL.
				// We'll check with the server to see if we need to sleep()
				// before parsing it.
				//

				parse_url(*ref);
				delete ref;

				// We reached the maximum number of connections (either with
				// or without persistent connections) and we must pause and
				// respect the 'net ethic'.
				if ((max_connection_requests - count) == 0)
					server->delay();	// This will pause if needed
				// and reset the time

#ifdef _MSC_VER /* _WIN32 */
                win32_check_messages();
#endif

			}

#ifdef _MSC_VER /* _WIN32 */
            win32_check_messages();
#endif

		}
	}

#ifdef _MSC_VER /* _WIN32 */
    win32_check_messages();
#endif


	// if we exited on signal 
	if (Retriever_noLog != log && !noSignal)
	{
		FILE *urls_parsed;
		String filelog = config->Find("url_log");
		// save url seen but not fetched
		urls_parsed = fopen((char *) filelog, "w");
		if (0 == urls_parsed)
		{
			reportError(form("Unable to create URL log file '%s'", filelog.get()));
		}
		else
		{
			servers.Start_Get();
			while ((server = (Server *) servers.Get_NextElement()))
			{
				while (NULL != (ref = server->pop()))
				{
					fprintf(urls_parsed, "%s\n", (const char *) ref->GetURL().get());
					delete ref;
				}
			}
			fclose(urls_parsed);
		}
	}
	words.Close();
}


//*****************************************************************************
// void Retriever::parse_url(URLRef &urlRef)
//
void Retriever::parse_url(URLRef & urlRef)
{
	HtConfiguration *config = HtConfiguration::config();
	URL url;
	DocumentRef *ref;
	int old_document;
	time_t date;
	static int index = 0;
	static int local_urls_only = config->Boolean("local_urls_only");
	static int mark_dead_servers = config->Boolean("ignore_dead_servers");
	Server *server;

	url.parse(urlRef.GetURL().get());

	currenthopcount = urlRef.GetHopCount();

	ref = docs[url.get()];	  // It might be nice to have just an Exists() here
	if (ref)
	{
		//
		// We already have an entry for this document in our database.
		// This means we can get the document ID and last modification
		// time from there.
		//
		current_id = ref->DocID();
		date = ref->DocTime();
		if (ref->DocAccessed())
			old_document = 1;
		else				  // we haven't retrieved it yet, so we only have the first link
			old_document = 0;
		ref->DocBackLinks(ref->DocBackLinks() + 1);	// we had a new link
		ref->DocAccessed(time(0));
		ref->DocState(Reference_normal);
		currenthopcount = ref->DocHopCount();
	}
	else
	{
		//
		// Never seen this document before.  We need to create an
		// entry for it.  This implies that it gets a new document ID.
		//
		date = 0;
		current_id = docs.NextDocID();
		ref = new DocumentRef;
		ref->DocID(current_id);
		ref->DocURL(url.get());
		ref->DocState(Reference_normal);
		ref->DocAccessed(time(0));
		ref->DocHopCount(currenthopcount);
		ref->DocBackLinks(1); // We had to have a link to get here!
		old_document = 0;
	}

	word_context.DocID(ref->DocID());

	if (debug > 0)
	{
		//
		// Display progress
		//
		cout << index++ << ':' << current_id << ':' << currenthopcount << ':' << url.get() << ": ";
		cout.flush();
	}

	// Reset the document to clean out any old data
	doc->Reset();
	doc->Url(url.get());
	doc->Referer(urlRef.GetReferer().get());

	base = doc->Url();

	// Retrieve document, first trying local file access if possible.
	Transport::DocStatus status;
	server = (Server *) servers[url.signature()];
	StringList *local_filenames = GetLocal(url.get());
	if (local_filenames)
	{
		if (debug > 1)
			cout << "Trying local files" << endl;
		status = doc->RetrieveLocal(date, local_filenames);
		if (status == Transport::Document_not_local)
		{
			if (debug > 1)
				cout << "Local retrieval failed, trying HTTP" << endl;
			if (server && !server->IsDead() && !local_urls_only)
				status = doc->Retrieve(server, date);
			else
				status = Transport::Document_no_host;
		}
		delete local_filenames;
	}
	else if (server && !server->IsDead() && !local_urls_only)
		status = doc->Retrieve(server, date);
	else
		status = Transport::Document_no_host;

	current_ref = ref;

	//
	// Determine what to do by looking at the status code returned by
	// the Document retrieval process.
	//

	String shash;
	String sx;
	char bhash[16];
	time_t ddate;

	switch (status)
	{

	case Transport::Document_ok:
		trackWords = 1;

		if (check_unique_md5)
		{
			if (doc->StoredLength() > 0)
			{
				if (check_unique_date)
				{
					ddate = doc->ModTime();
					if (ddate < time(NULL) - 10)
					{	  // Unknown date was set to current time
						md5(bhash, doc->Contents(), doc->StoredLength(), &ddate, debug);
					}
					else
					{
						md5(bhash, doc->Contents(), doc->StoredLength(), 0, debug);
					}
				}
				else
					md5(bhash, doc->Contents(), doc->StoredLength(), 0, debug);

				shash.append(bhash, MD5_LENGTH);
				d_md5->Get(shash, sx);

				if (!sx.empty())
				{
					if (debug > 1)
					{
						cout << " Detected duplicate by md5 hash" << endl;
					}
					words.Skip();
					break; // Duplicate - don't index
				}
				else
				{
					d_md5->Put(shash, "x");
				}

			}
		}

		if (old_document)
		{
			if (doc->ModTime() == ref->DocTime())
			{
				words.Skip();
				if (debug)
					cout << " retrieved but not changed" << endl;
				words.Skip();
				break;
			}
			//
			// Since we already had a record of this document and
			// we were able to retrieve it, it must have changed
			// since the last time we scanned it.  This means that
			// we need to assign a new document ID to it and mark
			// the old one as obsolete.
			//
			words.Skip();
			int backlinks = ref->DocBackLinks();
			ref->DocState(Reference_obsolete);
			docs.Add(*ref);
			delete ref;

			current_id = docs.NextDocID();
			word_context.DocID(current_id);
			ref = new DocumentRef;
			ref->DocID(current_id);
			ref->DocURL(url.get());
			ref->DocState(Reference_normal);
			ref->DocAccessed(time(0));
			ref->DocHopCount(currenthopcount);
			ref->DocBackLinks(backlinks);
			if (debug)
				cout << " (changed) ";
		}
		RetrievedDocument(*doc, url.get(), ref);
		// Hey! If this document is marked noindex, don't even bother
		// adding new words. Mark this as gone and get rid of it!
		if (ref->DocState() == Reference_noindex)
		{
			if (debug > 1)
				cout << " ( " << ref->DocURL() << " ignored)";
			words.Skip();
		}
		else
			words.Flush();
		if (debug)
			cout << " size = " << doc->Length() << endl;

		if (urls_seen)
		{
			fprintf(urls_seen, "%s|%d|%s|%d|%d|1\n",
				   (const char *) url.get(), doc->Length(), doc->ContentType(),
				   (int) doc->ModTime(), currenthopcount);
		}
		break;

	case Transport::Document_not_changed:
		if (debug)
			cout << " not changed" << endl;
		words.Skip();
		break;

	case Transport::Document_not_found:
		ref->DocState(Reference_not_found);
		if (debug)
			cout << " not found" << endl;
		recordNotFound(url.get(), urlRef.GetReferer().get(), Transport::Document_not_found);
		words.Skip();
		break;

	case Transport::Document_no_host:
		ref->DocState(Reference_not_found);
		if (debug)
			cout << " host not found" << endl;
		recordNotFound(url.get(), urlRef.GetReferer().get(), Transport::Document_no_host);
		words.Skip();

		// Mark the server as being down
		if (server && mark_dead_servers)
			server->IsDead(1);
		break;

	case Transport::Document_no_port:
		ref->DocState(Reference_not_found);
		if (debug)
			cout << " host not found (port)" << endl;
		recordNotFound(url.get(), urlRef.GetReferer().get(), Transport::Document_no_port);
		words.Skip();

		// Mark the server as being down
		if (server && mark_dead_servers)
			server->IsDead(1);
		break;

	case Transport::Document_not_parsable:
		ref->DocState(Reference_noindex);
		if (debug)
			cout << " not Parsable" << endl;
		words.Skip();
		break;

	case Transport::Document_redirect:
		if (debug)
			cout << " redirect" << endl;
		ref->DocState(Reference_obsolete);
		words.Skip();
		got_redirect(doc->Redirected(), ref, (urlRef.GetReferer()).get());
		break;

	case Transport::Document_not_authorized:
		ref->DocState(Reference_not_found);
		if (debug)
			cout << " not authorized" << endl;
		words.Skip();
		break;

	case Transport::Document_not_local:
		ref->DocState(Reference_not_found);
		if (debug)
			cout << " not local" << endl;
		words.Skip();
		break;

	case Transport::Document_no_header:
		ref->DocState(Reference_not_found);
		if (debug)
			cout << " no header" << endl;
		words.Skip();
		break;

	case Transport::Document_connection_down:
		ref->DocState(Reference_not_found);
		if (debug)
			cout << " connection down" << endl;
		words.Skip();
		break;

	case Transport::Document_no_connection:
		ref->DocState(Reference_not_found);
		if (debug)
			cout << " no connection" << endl;
		words.Skip();
		break;

	case Transport::Document_not_recognized_service:
		ref->DocState(Reference_not_found);
		if (debug)
			cout << " service not recognized" << endl;

		// Mark the server as being down
		if (server && mark_dead_servers)
			server->IsDead(1);
		words.Skip();
		break;

	case Transport::Document_other_error:
		ref->DocState(Reference_not_found);
		if (debug)
			cout << " other error" << endl;
		words.Skip();
		break;
	}
	docs.Add(*ref);
	delete ref;
}


//*****************************************************************************
// void Retriever::RetrievedDocument(Document &doc, const String &url, DocumentRef *ref)
//   We found a document that needs to be parsed.  Since we don't know the
//   document type, we'll let the Document itself return an appropriate
//   Parsable object which we can call upon to parse the document contents.
//
void Retriever::RetrievedDocument(Document & doc, const String & url, DocumentRef * ref)
{
	n_links = 0;
	current_ref = ref;
	current_title = 0;
	word_context.Anchor(0);
	current_time = 0;
	current_head = 0;
	current_meta_dsc = 0;

	//
	// Create a parser object and let it have a go at the document.
	// We will pass ourselves as a callback object for all the got_*()
	// routines.
	// This will generate the Parsable object as a specific parser
	//
	Parsable *parsable = doc.getParsable();
	if (parsable)
		parsable->parse(*this, *base);
	else
	{					  // If we didn't get a parser, then we should get rid of this!
		ref->DocState(Reference_noindex);
		return;
	}

	// If just storing the first occurrence of each word in a document,
	// we must now flush the words we saw in that document
	if (no_store_phrases)
	{
	    DictionaryCursor cursor;
	    char *key;
	    HtWordReference wordRef;
	    for (words_to_add.Start_Get (cursor);
		    (key = words_to_add.Get_Next(cursor)); )
	    {
		word_entry *entry = (word_entry*) (words_to_add [key]);

		wordRef.Location(entry->location);
		wordRef.Flags(entry->flags);
		wordRef.Word(key);
		words.Replace(WordReference::Merge(wordRef, entry->context));
		// How do I clean up properly?
		delete entry;
	    }
	    words_to_add.Release ();
	}

	//
	// We don't need to dispose of the parsable object since it will
	// automatically be reused.
	//

	//
	// Update the document reference
	//
	ref->DocHead((char *) current_head);
	ref->DocMetaDsc((char *) current_meta_dsc);
	if (current_time == 0)
		ref->DocTime(doc.ModTime());
	else
		ref->DocTime(current_time);
	ref->DocTitle((char *) current_title);
	ref->DocSize(doc.Length());
	ref->DocAccessed(time(0));
	ref->DocLinks(n_links);
}


//*****************************************************************************
// int Retriever::Need2Get(const String &u)
//   Return TRUE if we need to retrieve the given url.  This will
//   check the list of urls we have already visited.
//
int Retriever::Need2Get(const String & u)
{
	static String url;
	url = u;

	return !visited.Exists(url);
}


//*****************************************************************************
// int Retriever::IsValidURL(const String &u)
//   Return TRUE if we need to retrieve the given url.  We will check
//   for limits here.
//
int Retriever::IsValidURL(const String & u)
{
	HtConfiguration *config = HtConfiguration::config();
	Dictionary invalids;
	Dictionary valids;
	URL aUrl(u);
	StringList tmpList;

	// A list of bad extensions, separated by spaces or tabs
	String t = config->Find(&aUrl, "bad_extensions");
	String lowerp;
	char *p = strtok(t, " \t");
	while (p)
	{
		// Extensions are case insensitive
		lowerp = p;
		lowerp.lowercase();
		invalids.Add(lowerp, 0);
		p = strtok(0, " \t");
	}

	//
	// Valid extensions are performed similarly 
	//
	// A list of valid extensions, separated by spaces or tabs

	t = config->Find(&aUrl, "valid_extensions");
	p = strtok(t, " \t");
	while (p)
	{
		// Extensions are case insensitive
		lowerp = p;
		lowerp.lowercase();
		valids.Add(lowerp, 0);
		p = strtok(0, " \t");
	}

	static String url;
	url = u;

	//
	// If the URL contains any of the patterns in the exclude list,
	// mark it as invalid
	//
	String exclude_urls = config->Find(&aUrl, "exclude_urls");
	static String *prevexcludes = 0;
	static HtRegexList *excludes = 0;
	if (!excludes || !prevexcludes || prevexcludes->compare(exclude_urls) != 0)
	{
		if (!excludes)
			excludes = new HtRegexList;
		if (prevexcludes)
			delete prevexcludes;
		prevexcludes = new String(exclude_urls);
		tmpList.Create(exclude_urls, " \t");
		excludes->setEscaped(tmpList, config->Boolean("case_sensitive"));
		tmpList.Destroy();
	}
	if (excludes->match(url, 0, 0) != 0)
	{
		if (debug > 2)
			cout << endl << "   Rejected: item in exclude list ";
		return (HTDIG_ERROR_TESTURL_EXCLUDE);
	}

	//
	// If the URL has a query string and it is in the bad query list
	// mark it as invalid
	//
	String bad_querystr = config->Find(&aUrl, "bad_querystr");
	static String *prevbadquerystr = 0;
	static HtRegexList *badquerystr = 0;
	if (!badquerystr || !prevbadquerystr || prevbadquerystr->compare(bad_querystr) != 0)
	{
		if (!badquerystr)
			badquerystr = new HtRegexList;
		if (prevbadquerystr)
			delete prevbadquerystr;
		prevbadquerystr = new String(bad_querystr);
		tmpList.Create(bad_querystr, " \t");
		badquerystr->setEscaped(tmpList, config->Boolean("case_sensitive"));
		tmpList.Destroy();
	}
	char *ext = strrchr((char *) url, '?');
	if (ext && badquerystr->match(ext, 0, 0) != 0)
	{
		if (debug > 2)
			cout << endl << "   Rejected: item in bad query list ";
		return (HTDIG_ERROR_TESTURL_BADQUERY);
	}

	//
	// See if the file extension is in the list of invalid ones
	//
	String urlpath = url.get();
	int parm = urlpath.indexOf('?');	// chop off URL parameter
	if (parm >= 0)
		urlpath.chop(urlpath.length() - parm);
	ext = strrchr((char *) urlpath.get(), '.');
	String lowerext;
	if (ext && strchr(ext, '/'))	// Ignore a dot if it's not in the
		ext = NULL;		  // final component of the path.
	if (ext)
	{
		lowerext.set(ext);
		lowerext.lowercase();
		if (invalids.Exists(lowerext))
		{
			if (debug > 2)
				cout << endl << "   Rejected: Extension is invalid!";
			return (HTDIG_ERROR_TESTURL_EXTENSION);
		}
	}
	//
	// Or NOT in the list of valid ones
	//
	if (ext && valids.Count() > 0 && !valids.Exists(lowerext))
	{
		if (debug > 2)
			cout << endl << "   Rejected: Extension is not valid!";
		return (HTDIG_ERROR_TESTURL_EXTENSION2);
	}

	//
	// If none of the limits is met, we disallow the URL
	//
	if (limits.match(url, 1, 0) == 0)
	{
		if (debug > 1)
			cout << endl << "   Rejected: URL not in the limits! ";
		return (HTDIG_ERROR_TESTURL_LIMITS);
	}
	//
	// Likewise if not in list of normalized urls
	//
	// Warning!
	// should be last in checks because of aUrl normalization
	//
		// signature()  implicitly normalizes the URL.  Be efficient...
	Server *server = (Server *) servers[aUrl.signature()];
//	aUrl.normalize();
	if (limitsn.match(aUrl.get(), 1, 0) == 0)
	{
		if (debug > 2)
			cout << endl << "   Rejected: not in \"limit_normalized\" list!";
		return (HTDIG_ERROR_TESTURL_LIMITSNORM);
	}

	//
	// After that gauntlet, check to see if the server allows it
	// (robots.txt)
	//
	if (server && server->IsDisallowed(url) != 0)
	{
		if (debug > 2)
			cout << endl << "   Rejected: forbidden by server robots.txt!";
		return (HTDIG_ERROR_TESTURL_ROBOT_FORBID);
	}

	return (1);
}


//*****************************************************************************
// StringList* Retriever::GetLocal(const String &url)
//   Returns a list of strings containing the (possible) local filenames
//   of the given url, or 0 if it's definitely not local.
//   THE CALLER MUST FREE THE STRINGLIST AFTER USE!
//   Returned strings are not hex encoded.
//
StringList *Retriever::GetLocal(const String & strurl)
{
	HtConfiguration *config = HtConfiguration::config();
	static StringList *prefixes = 0;
	String url = strurl;

	static StringList *paths = 0;
	StringList *defaultdocs = 0;
	URL aUrl(url);
	url = aUrl.get();		  // make sure we look at a parsed URL

	//
	// Initialize prefix/path list if this is the first time.
	// The list is given in format "prefix1=path1 prefix2=path2 ..."
	//
	if (!prefixes)
	{
		prefixes = new StringList();
		paths = new StringList();

		String t = config->Find("local_urls");
		char *p = strtok(t, " \t");
		while (p)
		{
			char *path = strchr(p, '=');
			if (!path)
			{
				p = strtok(0, " \t");
				continue;
			}
			*path++ = '\0';
			String *pre = new String(p);
			decodeURL(*pre);
			prefixes->Add(pre);
			String *pat = new String(path);
			decodeURL(*pat);
			paths->Add(pat);
			p = strtok(0, " \t");
		}
	}
	if (!config->Find(&aUrl, "local_default_doc").empty())
	{
		defaultdocs = new StringList();
		String t = config->Find(&aUrl, "local_default_doc");
		char *p = strtok(t, " \t");
		while (p)
		{
			String *def = new String(p);
			decodeURL(*def);
			defaultdocs->Add(def);
			p = strtok(0, " \t");
		}
		if (defaultdocs->Count() == 0)
		{
			delete defaultdocs;
			defaultdocs = 0;
		}
	}

	// Begin by hex-decoding URL...
	String hexurl = url;
	decodeURL(hexurl);
	url = hexurl.get();

	// Check first for local user...
	if (strchr(url.get(), '~'))
	{
		StringList *local = GetLocalUser(url, defaultdocs);
		if (local)
		{
			if (defaultdocs)
				delete defaultdocs;
			return local;
		}
	}

	// This shouldn't happen, but check anyway...
	if (strstr(url.get(), ".."))
		return 0;

	String *prefix, *path;
	String *defaultdoc;
	StringList *local_names = new StringList();
	prefixes->Start_Get();
	paths->Start_Get();
	while ((prefix = (String *) prefixes->Get_Next()))
	{
		path = (String *) paths->Get_Next();
		if (mystrncasecmp((char *) *prefix, (char *) url, prefix->length()) == 0)
		{
			int l = strlen(url.get()) - prefix->length() + path->length() + 4;
			String *local = new String(*path, l);
			*local += &url[prefix->length()];
			if (local->last() == '/' && defaultdocs)
			{
				defaultdocs->Start_Get();
				while ((defaultdoc = (String *) defaultdocs->Get_Next()))
				{
					String *localdefault =
						new String(*local, local->length() + defaultdoc->length() + 1);
					localdefault->append(*defaultdoc);
					local_names->Add(localdefault);
				}
				delete local;
			}
			else
				local_names->Add(local);
		}
	}
	if (local_names->Count() > 0)
	{
		if (defaultdocs)
			delete defaultdocs;
		return local_names;
	}

	if (defaultdocs)
		delete defaultdocs;
	delete local_names;
	return 0;
}


//*****************************************************************************
// StringList* Retriever::GetLocalUser(const String &url, StringList *defaultdocs)
//   If the URL has ~user part, return a list of strings containing the
//   (possible) local filenames of the given url, or 0 if it's
//   definitely not local.
//   THE CALLER MUST FREE THE STRINGLIST AFTER USE!
//
StringList *Retriever::GetLocalUser(const String & url, StringList * defaultdocs)
{
//  NOTE:  Native Windows does not have this contruct for the user Web files
#ifndef _MSC_VER /* _WIN32 */
	HtConfiguration *config = HtConfiguration::config();
	static StringList *prefixes = 0, *paths = 0, *dirs = 0;
	static Dictionary home_cache;
	URL aUrl(url);

	//
	// Initialize prefix/path list if this is the first time.
	// The list is given in format "prefix1=path1,dir1 ..."
	// If path is zero-length, user's home directory is looked up. 
	//
	if (!prefixes)
	{
		prefixes = new StringList();
		paths = new StringList();
		dirs = new StringList();
		String t = config->Find("local_user_urls");
		char *p = strtok(t, " \t");
		while (p)
		{
			char *path = strchr(p, '=');
			if (!path)
			{
				p = strtok(0, " \t");
				continue;
			}
			*path++ = '\0';
			char *dir = strchr(path, ',');
			if (!dir)
			{
				p = strtok(0, " \t");
				continue;
			}
			*dir++ = '\0';
			String *pre = new String(p);
			decodeURL(*pre);
			prefixes->Add(pre);
			String *pat = new String(path);
			decodeURL(*pat);
			paths->Add(pat);
			String *ptd = new String(dir);
			decodeURL(*ptd);
			dirs->Add(ptd);
			p = strtok(0, " \t");
		}
	}

	// Can we do anything about this?
	if (!strchr(url, '~') || !prefixes->Count() || strstr(url, ".."))
		return 0;

	// Split the URL to components
	String tmp = url;
	char *name = strchr((char *) tmp, '~');
	*name++ = '\0';
	char *rest = strchr(name, '/');
	if (!rest || (rest - name <= 1) || (rest - name > 32))
		return 0;
	*rest++ = '\0';

	// Look it up in the prefix/path/dir table
	prefixes->Start_Get();
	paths->Start_Get();
	dirs->Start_Get();
	String *prefix, *path, *dir;
	String *defaultdoc;
	StringList *local_names = new StringList();
	while ((prefix = (String *) prefixes->Get_Next()))
	{
		path = (String *) paths->Get_Next();
		dir = (String *) dirs->Get_Next();
		if (mystrcasecmp((char *) *prefix, (char *) tmp) != 0)
			continue;

		String *local = new String;
		// No path, look up home directory
		if (path->length() == 0)
		{
			String *home = (String *) home_cache[name];
			if (!home)
			{
				struct passwd *passwd = getpwnam(name);
				if (passwd)
				{
					home = new String(passwd->pw_dir);
					home_cache.Add(name, home);
				}
			}
			if (home)
				*local += *home;
			else
				continue;
		}
		else
		{
			*local += *path;
			*local += name;
		}
		*local += *dir;
		*local += rest;
		if (local->last() == '/' && defaultdocs)
		{
			defaultdocs->Start_Get();
			while ((defaultdoc = (String *) defaultdocs->Get_Next()))
			{
				String *localdefault = new String(*local, local->length() + defaultdoc->length() + 1);
				localdefault->append(*defaultdoc);
				local_names->Add(localdefault);
			}
			delete local;
		}
		else
			local_names->Add(local);
	}

	if (local_names->Count() > 0)
		return local_names;

	delete local_names;
#endif //_MSC_VER /* _WIN32 */

    return 0;
}


//*****************************************************************************
// int Retriever::IsLocalURL(const String &url)
//   Returns 1 if the given url has a (possible) local filename
//   or 0 if it's definitely not local.
//
int Retriever::IsLocalURL(const String & url)
{
	int ret;

	StringList *local_filename = GetLocal(url);
	ret = (local_filename != 0);
	if (local_filename)
		delete local_filename;

	return ret;
}


//*****************************************************************************
// void Retriever::got_word(char *word, int location, int heading)
//   The location is normalized to be in the range 0 - 1000.
//
void Retriever::got_word(const char *word, int location, int heading)
{
	if (debug > 3)
		cout << "word: " << word << '@' << location << endl;
	if (heading >= (int) (sizeof(factor) / sizeof(factor[0])) || heading < 0)
		heading = 0;		  // Assume it's just normal text
	if (trackWords && strlen(word) >= (unsigned int) minimumWordLength)
	{
		String w = word;
		HtWordReference wordRef;

		if (no_store_phrases)
		{
		    // Add new word, or mark existing word as also being at
		    // this heading level
		    word_entry *entry;
		    if ((entry = (word_entry*)words_to_add.Find (w)) == NULL)
		    {
			words_to_add.Add(w, new word_entry (location, factor[heading], word_context));
		    } else
		    {
			entry->flags |= factor[heading];
		    }
		} else
		{
		    wordRef.Location(location);
		    wordRef.Flags(factor[heading]);
		    wordRef.Word(w);
		    words.Replace(WordReference::Merge(wordRef, word_context));
		}

		// Check for compound words...
		String parts = word;
		int added;
		int nparts = 1;
		do
		{
			added = 0;
			char *start = parts.get();
			char *punctp = 0, *nextp = 0, *p;
			char punct;
			int n;
			while (*start)
			{
				p = start;
				for (n = 0; n < nparts; n++)
				{
					while (HtIsStrictWordChar((unsigned char) *p))
						p++;
					punctp = p;
					if (!*punctp && n + 1 < nparts)
						break;
					while (*p && !HtIsStrictWordChar((unsigned char) *p))
						p++;
					if (n == 0)
						nextp = p;
				}
				if (n < nparts)
					break;
				punct = *punctp;
				*punctp = '\0';
				if (*start && (*p || start > parts.get()))
				{
					w = start;
					HtStripPunctuation(w);
					if (w.length() >= minimumWordLength)
					{
					        if (no_store_phrases)
						{
						    // Add new word, or mark existing word as also being at
						    // this heading level
						    word_entry *entry;
						    if ((entry = (word_entry*)words_to_add.Find (w)) == NULL)
						    {
							words_to_add.Add(w, new word_entry (location, factor[heading], word_context));
						    } else
						    {
							entry->flags |= factor[heading];
						    }
						} else
						{
						    wordRef.Word(w);
						    words.Replace(WordReference::Merge(wordRef, word_context));
						}
						if (debug > 3)
							cout << "word part: " << start << '@' << location << endl;
					}
					added++;
				}
				start = nextp;
				*punctp = punct;
			}
			nparts++;
		}
		while (added > 2);
	}
}


//*****************************************************************************
// void Retriever::got_title(const char *title)
//
void Retriever::got_title(const char *title)
{
	if (debug > 1)
		cout << "\ntitle: " << title << endl;
	current_title = title;
}


//*****************************************************************************
// void Retriever::got_author(const char *e)
//
void Retriever::got_author(const char *author)
{
	if (debug > 1)
		cout << "\nauthor: " << author << endl;
	current_ref->DocAuthor(author);
}


//*****************************************************************************
// void Retriever::got_time(const char *time)
//
void Retriever::got_time(const char *time)
{
	HtDateTime new_time(current_time);

	if (debug > 1)
		cout << "\ntime: " << time << endl;

	//
	// As defined by the Dublin Core, this should be YYYY-MM-DD
	// In the future, we'll need to deal with the scheme portion
	//  in case someone picks a different format.
	//
	new_time.SetFTime(time, "%Y-%m-%d");
	current_time = new_time.GetTime_t();

	// If we can't convert it, current_time stays the same and we get
	// the default--the date returned by the server...
}

//*****************************************************************************
// void Retriever::got_anchor(const char *anchor)
//
void Retriever::got_anchor(const char *anchor)
{
	if (debug > 2)
		cout << "anchor: " << anchor << endl;
	current_ref->AddAnchor(anchor);
	word_context.Anchor(word_context.Anchor() + 1);
}


//*****************************************************************************
// void Retriever::got_image(const char *src)
//
void Retriever::got_image(const char *src)
{
	URL url(src, *base);
	const char *image = (const char *) url.get();

	if (debug > 2)
		cout << "image: " << image << endl;

	if (images_seen)
		fprintf(images_seen, "%s\n", image);
}


//*****************************************************************************
//
void Retriever::got_href(URL & url, const char *description, int hops)
{
	DocumentRef *ref = 0;
	Server *server = 0;
	int valid_url_code = 0;

	// Rewrite the URL (if need be) before we do anything to it.
	url.rewrite();

	if (debug > 2)
		cout << "href: " << url.get() << " (" << description << ')' << endl;

	n_links++;

	if (urls_seen)
		fprintf(urls_seen, "%s\n", (const char *) url.get());

	//
	// Check if this URL falls within the valid range of URLs.
	//
	valid_url_code = IsValidURL(url.get());
	if (valid_url_code > 0)
	{
		//
		// It is valid.  Normalize it (resolve cnames for the server)
		// and check again...
		//
		if (debug > 2)
		{
			cout << "resolving '" << url.get() << "'\n";
			cout.flush();
		}

		url.normalize();

		// If it is a backlink from the current document,
		// just update that field.  Writing to the database
		// is meaningless, as it will be overwritten.
		// Adding it as a new document may even be harmful, as
		// that will be a duplicate.  This can happen if the
		// current document is never referenced before, as in a
		// start_url.

		if (strcmp(url.get(), current_ref->DocURL()) == 0)
		{
			current_ref->DocBackLinks(current_ref->DocBackLinks() + 1);
			current_ref->AddDescription(description, words);
		}
		else
		{

			//
			// First add it to the document database
			//
			ref = docs[url.get()];
			// if ref exists we have to call AddDescription even
			// if max_hop_count is reached
			if (!ref && currenthopcount + hops > max_hop_count)
				return;

			if (!ref)
			{
				//
				// Didn't see this one, yet.  Create a new reference
				// for it with a unique document ID
				//
				ref = new DocumentRef;
				ref->DocID(docs.NextDocID());
				ref->DocHopCount(currenthopcount + hops);
				ref->DocURL(url.get());
			}
			ref->DocBackLinks(ref->DocBackLinks() + 1);	// This one!
			ref->AddDescription(description, words);

			//
			// If the dig is restricting by hop count, perform the check here 
			// too
			if (currenthopcount + hops > max_hop_count)
			{
				delete ref;
				return;
			}

			if (ref->DocHopCount() > currenthopcount + hops)
				ref->DocHopCount(currenthopcount + hops);

			docs.Add(*ref);

			//
			// Now put it in the list of URLs to still visit.
			//
			if (Need2Get(url.get()))
			{
				if (debug > 1)
					cout << "\n   pushing " << url.get() << endl;
				server = (Server *) servers[url.signature()];
				if (!server)
				{
					//
					// Hadn't seen this server, yet.  Register it
					//
					String robotsURL = url.signature();
					robotsURL << "robots.txt";
					StringList *localRobotsFile = GetLocal(robotsURL.get());

					server = new Server(url, localRobotsFile);
					servers.Add(url.signature(), server);
					delete localRobotsFile;
				}
				//
				// Let's just be sure we're not pushing an empty URL
				//
				if (strlen(url.get()))
					server->push(url.get(), ref->DocHopCount(), base->get(), IsLocalURL(url.get()));

				String temp = url.get();
				visited.Add(temp, 0);
				if (debug)
					cout << '+';
			}
			else if (debug)
				cout << '*';
			delete ref;
		}
	}
	else
	{
		//
		// Not a valid URL
		//
		if (debug > 1)
			cout << "\nurl rejected: (level 1)" << url.get() << endl;
		if (debug == 1)
			cout << '-';

		if (urls_seen)
		{
			fprintf(urls_seen, "%s|||||%d\n", (const char *) url.get(), valid_url_code);
		}

	}
	if (debug)
		cout.flush();
}


//*****************************************************************************
// void Retriever::got_redirect(const char *new_url, DocumentRef *old_ref)
//
void Retriever::got_redirect(const char *new_url, DocumentRef * old_ref, const char *referer)
{
	// First we must piece together the new URL, which may be relative
	URL parent(old_ref->DocURL());
	URL url(new_url, parent);

	// Rewrite the URL (if need be) before we do anything to it.
	url.rewrite();

	if (debug > 2)
		cout << "redirect: " << url.get() << endl;

	n_links++;

	if (urls_seen)
		fprintf(urls_seen, "%s\n", (const char *) url.get());

	//
	// Check if this URL falls within the valid range of URLs.
	//
	if (IsValidURL(url.get()) > 0)
	{
		//
		// It is valid.  Normalize it (resolve cnames for the server)
		// and check again...
		//
		if (debug > 2)
		{
			cout << "resolving '" << url.get() << "'\n";
			cout.flush();
		}

		url.normalize();
		//
		// First add it to the document database
		//
		DocumentRef *ref = docs[url.get()];
		if (!ref)
		{
			//
			// Didn't see this one, yet.  Create a new reference
			// for it with a unique document ID
			//
			ref = new DocumentRef;
			ref->DocID(docs.NextDocID());
			ref->DocHopCount(currenthopcount);
		}
		ref->DocURL(url.get());

		//
		// Copy the descriptions of the old DocRef to this one
		//
		List *d = old_ref->Descriptions();
		if (d)
		{
			d->Start_Get();
			String *str;
			while ((str = (String *) d->Get_Next()))
			{
				ref->AddDescription(str->get(), words);
			}
		}
		if (ref->DocHopCount() > old_ref->DocHopCount())
			ref->DocHopCount(old_ref->DocHopCount());

		// Copy the number of backlinks
		ref->DocBackLinks(old_ref->DocBackLinks());

		docs.Add(*ref);

		//
		// Now put it in the list of URLs to still visit.
		//
		if (Need2Get(url.get()))
		{
			if (debug > 1)
				cout << "   pushing " << url.get() << endl;
			Server *server = (Server *) servers[url.signature()];
			if (!server)
			{
				//
				// Hadn't seen this server, yet.  Register it
				//
				String robotsURL = url.signature();
				robotsURL << "robots.txt";
				StringList *localRobotsFile = GetLocal(robotsURL.get());

				server = new Server(url, localRobotsFile);
				servers.Add(url.signature(), server);
				delete localRobotsFile;
			}
			if (!referer || strlen(referer) == 0)
				server->push(url.get(), ref->DocHopCount(), base->get(), IsLocalURL(url.get()), 0);
			else
				server->push(url.get(), ref->DocHopCount(), referer, IsLocalURL(url.get()), 0);

			String temp = url.get();
			visited.Add(temp, 0);
		}

		delete ref;
	}
}


//*****************************************************************************
// void Retriever::got_head(const char *head)
//
void Retriever::got_head(const char *head)
{
	if (debug > 4)
		cout << "head: " << head << endl;
	current_head = head;
}

//*****************************************************************************
// void Retriever::got_meta_dsc(const char *md)
//
void Retriever::got_meta_dsc(const char *md)
{
	if (debug > 4)
		cout << "meta description: " << md << endl;
	current_meta_dsc = md;
}


//*****************************************************************************
// void Retriever::got_meta_email(const char *e)
//
void Retriever::got_meta_email(const char *e)
{
	if (debug > 1)
		cout << "\nmeta email: " << e << endl;
	current_ref->DocEmail(e);
}


//*****************************************************************************
// void Retriever::got_meta_notification(const char *e)
//
void Retriever::got_meta_notification(const char *e)
{
	if (debug > 1)
		cout << "\nmeta notification date: " << e << endl;
	current_ref->DocNotification(e);
}


//*****************************************************************************
// void Retriever::got_meta_subject(const char *e)
//
void Retriever::got_meta_subject(const char *e)
{
	if (debug > 1)
		cout << "\nmeta subect: " << e << endl;
	current_ref->DocSubject(e);
}


//*****************************************************************************
// void Retriever::got_noindex()
//
void Retriever::got_noindex()
{
	if (debug > 1)
		cout << "\nMETA ROBOT: Noindex " << current_ref->DocURL() << endl;
	current_ref->DocState(Reference_noindex);
}


//*****************************************************************************
//
void Retriever::recordNotFound(const String & url, const String & referer, int reason)
{
	char *message = "";

	switch (reason)
	{
	case Transport::Document_not_found:
		message = "Not found";
		break;

	case Transport::Document_no_host:
		message = "Unknown host or unable to contact server";
		break;

	case Transport::Document_no_port:
		message = "Unknown host or unable to contact server (port)";
		break;

	default:
		break;

	}

	notFound << message << ": " << url << " Ref: " << referer << '\n';
}

//*****************************************************************************
// void Retriever::ReportStatistics(char *name)
//
void Retriever::ReportStatistics(const String & name)
{
	HtConfiguration *config = HtConfiguration::config();
	cout << name << ": Run complete\n";
	cout << name << ": " << servers.Count() << " server";
	if (servers.Count() > 1)
		cout << "s";
	cout << " seen:\n";

	Server *server;
	String buffer;
	StringList results;
	String newname = name;

	newname << ":    ";

	servers.Start_Get();
	while ((server = (Server *) servers.Get_NextElement()))
	{
		buffer = 0;
		server->reportStatistics(buffer, newname);
		results.Add(buffer);
	}
	results.Sort();

	for (int i = 0; i < results.Count(); i++)
	{
		cout << results[i] << "\n";
	}

	if (notFound.length() > 0)
	{
		cout << "\n" << name << ": Errors to take note of:\n";
		cout << notFound;
	}

	cout << endl;

	// Report HTTP connections stats
	cout << "HTTP statistics" << endl;
	cout << "===============" << endl;

	if (config->Boolean("persistent_connections"))
	{
		cout << " Persistent connections    : Yes" << endl;

		if (config->Boolean("head_before_get"))
			cout << " HEAD call before GET      : Yes" << endl;
		else
			cout << " HEAD call before GET      : No" << endl;
	}
	else
	{
		cout << " Persistent connections    : No" << endl;
	}

	HtHTTP::ShowStatistics(cout) << endl;

}