From 75eb47d55c0c5d9bd78508332d752359db75a131 Mon Sep 17 00:00:00 2001 From: Nicolas Niclausse Date: Tue, 19 Oct 2004 11:15:11 +0000 Subject: [PATCH] add a script to convert apache combined log files to tsunami XML config file SVN Revision: 424 --- Makefile | 19 +-- src/log2tsunami.pl.src | 276 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 287 insertions(+), 8 deletions(-) create mode 100755 src/log2tsunami.pl.src diff --git a/Makefile b/Makefile index 9b3052657..f42b7cae0 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ else ifeq ($(TYPE),test) OPT:=+export_all else - OPT = + OPT = endif endif INC = ./include @@ -56,6 +56,8 @@ DTD = idx-tsunami-1.0.dtd USERMANUAL = doc/user_manual.html doc/IDXDOC.css USERMANUAL_IMG = $(wildcard doc/images/*.png) USERMANUAL_SRC = doc/user_manual.tex +PERL_SCRIPTS_SRC = $(wildcard $(ESRC)/*.pl.src) +PERL_SCRIPTS = $(notdir $(basename $(PERL_SCRIPTS_SRC))) TARGET = $(addsuffix .beam, $(basename \ $(addprefix $(EBIN)/, $(notdir $(SRC))))) @@ -113,11 +115,11 @@ deb: clean: -cd priv && rm -f $(shell ls priv | grep -v builder\.erl) && cd .. -rm -f $(TARGET) $(TMP) $(BUILD_OPTIONS_FILE) builder.beam - -rm -f $(TGT_APPFILES) idx-tsunami.sh analyse_msg.pl + -rm -f $(TGT_APPFILES) idx-tsunami.sh $(PERL_SCRIPTS) -rm -f ebin/*.beam # -make -C doc clean -install: doc boot idx-tsunami.sh analyse_msg.pl install_recorder install_controller $(CONFFILE) +install: doc boot idx-tsunami.sh $(PERL_SCRIPTS) install_recorder install_controller $(CONFFILE) -rm -f $(TMP) install -d $(TARGETDIR)/priv @@ -144,7 +146,7 @@ install: doc boot idx-tsunami.sh analyse_msg.pl install_recorder install_contro # create startup script cp idx-tsunami.sh $(SCRIPT) - install analyse_msg.pl $(LIBDIR)/analyse_msg.pl + install $(PERL_SCRIPTS) $(LIBDIR)/ chmod +x $(SCRIPT) # @@ -246,9 +248,9 @@ release: $(USERMANUAL) $(USERMANUAL_SRC) $(USERMANUAL_IMG) $(DTD) \ COPYING README LISEZMOI TODO $(CONFFILE_SRC) Makefile \ priv/builder.erl idx-tsunami.sh.in vsn.mk \ - $(DEBIAN) src/analyse_msg.pl.src CONTRIBUTORS CHANGES \ + $(DEBIAN) $(PERL_SCRIPTS_SRC) CONTRIBUTORS CHANGES \ configure configure.in config.guess config.sub include.mk.in \ - install-sh idx-tsunami.spec + install-sh idx-tsunami.spec tar -C $(PACKAGE)-$(VERSION) -zxf tmp.tgz mkdir $(PACKAGE)-$(VERSION)/ebin tar zvcf $(PACKAGE)-$(VERSION).tar.gz $(PACKAGE)-$(VERSION) @@ -268,8 +270,9 @@ ebin/%.beam: src/$(RECORDER_APPLICATION)/%.erl $(INC_FILES) ebin/%.beam: src/$(CONTROLLER_APPLICATION)/%.erl $(INC_FILES) $(CC) $(OPT) -I $(INC) -o ebin $< -analyse_msg.pl: src/analyse_msg.pl.src Makefile - $(SED) -e 's;%VERSION%;$(VERSION);g' < $< > $@ +%.pl: src/%.pl.src Makefile + @$(SED) -e 's;%VERSION%;$(VERSION);g' \ + -e 's;%DTD%;$(SHARE_DIR)/$(DTD);g' < $< > $@ idx-tsunami.sh: idx-tsunami.sh.in include.mk Makefile @$(SED) \ diff --git a/src/log2tsunami.pl.src b/src/log2tsunami.pl.src new file mode 100755 index 000000000..a95007b20 --- /dev/null +++ b/src/log2tsunami.pl.src @@ -0,0 +1,276 @@ +#!/usr/bin/env perl +# -*- Mode: CPerl -*- +# +# Copyright (C) 2004 Nicolas Niclausse +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. + +# Auteur: Nicolas Niclausse (Nicolas.Niclausse@sophia.inria.fr) +# Version: $Id$ + +# purpose: create a config file for IDX-Tsunami from a Combined Log file + +use strict; +use Getopt::Long; +use Time::Local; + +use vars qw ($help *verbose $version $thinktime_threshold $visit_timeout + $session_threshold $max_pages $max_duration); +my %Months=('Jan','0', 'Feb','1', 'Mar','2', 'Apr','3', 'May','4', 'Jun','5', + 'Jul','6', 'Aug','7', 'Sep','8', 'Oct','9', 'Nov','10', 'Dec','11'); + +my $tagvsn = '%VERSION%'; + +GetOptions( "help",\$help, + "verbose",\$verbose, + "tt=i",\$thinktime_threshold, + "st=i",\$session_threshold, + "visit_timeout=i",\$visit_timeout, + "max_pages=i",\$max_pages, + "max_duration=i",\$max_duration, + "version",\$version + ); + +my $dtd ="%DTD%"; + +# remove thinktime less than 1 sec +$thinktime_threshold ="1" unless $thinktime_threshold; +# remove session with less than 2 requests +$session_threshold ="2"unless $session_threshold; + +my $ims = "Fri, 14 Nov 2003 02:43:31 GMT"; # if modified since ... for 304 + +# if thinktime is more than $visit_timeout, it's a new session +$visit_timeout=600 unless $visit_timeout; + +$max_pages = 100 unless $max_pages ; # 100 pages max per session +$max_duration = 3600 unless $max_duration; # 1hour max session duration + +my %hit; +my %http; +my $visite; +my ($time,$sec,$min,$hour,$mday,$mon,$year); +my $total; +my $bad = 0; +my $user; +my $id; +my $visit_tot=0; + +&usage if $help or $Getopt::Long::error; +&version if $version; + +while (<>) { + if (m@^([\w\.]+) \S+ \S+ \[(\w+/\w+/\w+:\d+:\d+:\d+)([^\]]+)\] \"(\w+) ([^\"]+)\" (\d+) (\S+) \"([^\"]*)\" \"([^\"]*)\"$@) { + my $ip = $1; + my $date = $2; + my $code = $6; + my $referer = $8; + my $method = $4; + my $user_agent = $9; + my $req = $5; + my ($url, $protocole) = split(/\s+/,$req); + $url = &replace_entities($url); + my $version; + if ($protocole =~ /HTTP\/(\d\.\d)/) { + $version=$1; + } else { + $version="1.0"; + } + + $date =~ m'(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+)'; + $mday = $1; + $mon = $Months{$2}; + $year = $3 - 1900; + $hour = $4; + $min = $5; + $sec = $6; + $time = timelocal($sec,$min,$hour,$mday,$mon,$year); + $user = "$ip-$user_agent"; + if ($visite->{$user}) { + if ($time - $visite->{$user}->{'last_visit'} > $visit_timeout) { + # new visit + $visit_tot ++; + $visite->{$user}->{'id'}++; + $id = $visite->{$user}->{'id'}; + $visite->{$user}->{'last_visit'}=$time; + $visite->{$user}->{'last_referer'}=$referer; + $visite->{$user}->{$id}->{'started'}=$time; + $visite->{$user}->{$id}->{'last_request'}=$time; + $visite->{$user}->{$id}->{'page'}=1; + $visite->{$user}->{$id}->{'hit'}=1; + $visite->{$user}->{$id}->{'duration'}=0; + $visite->{$user}->{$id}->{'tsunami'} = ''."\n"; + $visite->{$user}->{$id}->{'tsunami'} .= "\t".'{$user}->{$id}->{'tsunami'} .= ' if_modified_since="'.$ims.'">'; + } else { + $visite->{$user}->{$id}->{'tsunami'} .= '>'; + } + $visite->{$user}->{$id}->{'tsunami'} .= "\n"; + } else { + # same visit + $id = $visite->{$user}->{'id'}; + $visite->{$user}->{$id}->{'hit'}++; + my $thinktime = $time - $visite->{$user}->{$id}->{'last_request'}; + $visite->{$user}->{'last_visit'}=$time; + $visite->{$user}->{$id}->{'last_request'}=$time; + $visite->{$user}->{$id}->{'tsunami'} .= "\t".''."\n\n" if $thinktime > $thinktime_threshold; + $visite->{$user}->{$id}->{'tsunami'} .= "\t".''."\n"; + # update duration + $visite->{$user}->{$id}->{'duration'} = $time - $visite->{$user}->{$id}->{'started'} ; + if ($visite->{$user}->{'last_referer'} eq $referer) { + # same page/frame + } else { + # new frame/page + $visite->{$user}->{$id}->{'page'}++; + $visite->{$user}->{'last_referer'}=$referer; + } + + } + + } else {# new visitor + $visit_tot ++; + $visite->{$user}->{'id'}=1; + $id = 1; + $visite->{$user}->{'last_visit'}=$time; + $visite->{$user}->{'last_referer'}=$referer; + $visite->{$user}->{$id}->{'started'}=$time; + $visite->{$user}->{$id}->{'last_request'}=$time; + $visite->{$user}->{$id}->{'hit'}=1; + $visite->{$user}->{$id}->{'page'}=1; + $visite->{$user}->{$id}->{'duration'}=0; + $visite->{$user}->{$id}->{'tsunami'} = ''."\n"; + $visite->{$user}->{$id}->{'tsunami'} .= "\t".''."\n"; + } + $total ++; + } else { +# print STDERR "$_\n"; + $bad ++; + } +} +my $users_tot=scalar %{$visite}; +my $page_tot=0; +my $hit_tot=0; +my $bad_visit =0; +my $bad_pages =0; +print STDERR "number of unique users is $users_tot\n" if $verbose; +print ' + +'; +print ' + + + + + + + + + + + + +'; +my $real_visit = 0; +foreach my $key (keys %$visite) { + foreach my $id (1..$visite->{$key}->{'id'}) { + my $page = $visite->{$key}->{$id}->{'page'}; + my $hit = $visite->{$key}->{$id}->{'hit'}; + $real_visit ++ if $hit > $session_threshold; + } +} +foreach my $key (sort {$visite->{$a}->{'id'} cmp $visite->{$b}->{'id'}} keys %$visite) { + my $tot_id = $visite->{$key}->{'id'}; + print STDERR "number of visit for $key is $tot_id\n" if $verbose; + foreach my $id (1..$tot_id) { + my $page = $visite->{$key}->{$id}->{'page'}; + my $hit = $visite->{$key}->{$id}->{'hit'}; + my $duration = $visite->{$key}->{$id}->{'duration'}; + if ($page < $max_pages and $duration < $max_duration) { + $page_tot += $page; + $hit_tot += $hit; + print STDERR " page=$page hit=$hit duration=$duration\n" if $verbose; + } else { + $bad_visit++; + $bad_pages +=$page; + + print STDERR "# page=$page hit=$hit duration=$duration\n" if $verbose; + + } + next unless $hit > $session_threshold; + my $pop=sprintf "%.3f",100/$real_visit; + my $tsunami = $visite->{$key}->{$id}->{'tsunami'}; + $tsunami =~ s/\\n"; + } +} +print ''; +print STDERR "real_visit = $real_visit\n" if $verbose; +print STDERR "total_visit = $visit_tot , bad visit = $bad_visit " if $verbose; +printf STDERR "page/visit = %.2f\n",($page_tot/($visit_tot-$bad_visit)) if $verbose; +print STDERR "good_pages = $page_tot , bad pages = $bad_pages " if $verbose; +printf STDERR "hit/page = %.2f\n",($hit_tot/$page_tot) if $verbose; +print STDERR "bad = $bad\n" if $verbose; + +sub replace_entities { + my $str = shift; + $str =~ s/\&/\&/g; + $str =~ s/\'/\'/g; + $str =~ s/\"/\"/g; + $str =~ s/>/\>/g; + $str =~ s/] \n","Available options:\n\t", + "[--help] (this help text)\n\t", + "[--version] (print version)\n\t", + "[--tt ] (thinktime threshold: min thinktime (def=2))\n\t", + "[--st ] (session threshold : min number of requests (def=2))\n\t", + "[--max_duration ] (maximum session duration in sec. (3600))\n\t", + "[--max_pages ] (maximum number of pages winthin a session. (100))\n\t"; + exit; + } + +sub version { +print "this script is part of IDX-TSUNAMI version $tagvsn + +Written by Nicolas Niclausse + +Copyright (C) 2004 Nicolas Niclausse + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program (see COPYING); if not, write to the +Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA."; +exit; +}