Skip to content

Commit

Permalink
Merge pull request #258 from sisimai/regexp-with-i-modifier-is-slow
Browse files Browse the repository at this point in the history
Regexp with i modifier is slow
  • Loading branch information
azumakuniyuki committed Feb 4, 2018
2 parents d256815 + f387d27 commit 2b6578c
Show file tree
Hide file tree
Showing 33 changed files with 347 additions and 428 deletions.
85 changes: 42 additions & 43 deletions Developers.mk
Expand Up @@ -15,20 +15,25 @@ LS := ls -1
CP := cp

BH_LATESTVER := 2.7.13p3
MBOXPARSERV0 := /usr/local/bouncehammer/bin/mailboxparser -T
MBOXPARSERV6 := /usr/local/bouncehammer/bin/mailboxparser -Tvvvvvv
BOUNCEHAMMER := /usr/local/bouncehammer
MBOXPARSERV0 := $(BOUNCEHAMMER)/bin/mailboxparser -T
MBOXPARSERV6 := $(BOUNCEHAMMER)/bin/mailboxparser -Tvvvvvv
PRECISIONTAB := ANALYTICAL-PRECISION
PARSERLOGDIR := var/log
MAILCLASSDIR := lib/$(NAME)/Bite/Email
JSONCLASSDIR := lib/$(NAME)/Bite/JSON
MTARELATIVES := ARF RFC3464 RFC3834
EMAIL_PARSER := sbin/emparser --delivered
BENCHMARKEMP := sbin/mp

BENCHMARKDIR := tmp/benchmark
BENCHMARKSET := tmp/sample
VELOCITYTEST := tmp/emails-for-velocity-measurement
SPEEDTESTDIR := tmp/emails-for-velocity-measurement
SAMPLEPREFIX := eml

PARSERSCRIPT := $(PERL) sbin/emparser --delivered
RELEASEVERMP := $(PERL) -MSisimai
DEVELOPVERMP := $(PERL) -I./lib -MSisimai
HOWMANYMAILS := $(DEVELOPVERMP) -lE 'print scalar @{ Sisimai->make(shift, delivered => 1) }' $(SPEEDTESTDIR)

SET_OF_EMAIL := set-of-emails
PRIVATEMAILS := $(SET_OF_EMAIL)/private
PUBLICEMAILS := $(SET_OF_EMAIL)/maildir/bsd
Expand All @@ -46,11 +51,12 @@ BH_CAN_PARSE := courier exim messagingserver postfix sendmail surfcontrol x5 \

private-sample:
@test -n "$(E)" || ( echo 'Usage: make -f Developers.mk $@ E=/path/to/email' && exit 1 )
@test -x sbin/emparser
test -f $(E)
$(EMAIL_PARSER) $(E)
$(PARSERSCRIPT) $(E)
@echo
@while true; do \
d=`$(EMAIL_PARSER) -Fjson $(E) | jq -M '.[].smtpagent' | head -1 \
d=`$(PARSERSCRIPT) -Fjson $(E) | jq -M '.[].smtpagent' | head -1 \
| tr '[A-Z]' '[a-z]' | tr -d '-' | sed -e 's/"//g' -e 's/::/-/g'`; \
if [ -d "$(PRIVATEMAILS)/$$d" ]; then \
latestfile=`ls -1 $(PRIVATEMAILS)/$$d/*.$(SAMPLEPREFIX) | tail -1`; \
Expand Down Expand Up @@ -84,7 +90,7 @@ precision-table:
l=`expr $$l + 1` ;\
done ;\
printf "%s" ' ' ;\
n0=`$(EMAIL_PARSER) --count-only $(BENCHMARKSET)/email-$$d` ;\
n0=`$(PARSERSCRIPT) --count-only $(BENCHMARKSET)/email-$$d` ;\
r0=`$(MBOXPARSERV6) $(BENCHMARKSET)/email-$$d 2>&1 | grep 'debug0:' \
| sed 's/^.*debug0:/0 /g' | cut -d' ' -f9,10` ;\
rn="`echo $$r0 | cut -d/ -f1`" ;\
Expand All @@ -106,7 +112,7 @@ precision-table:
l=`expr $$l + 1` ;\
done ;\
printf "%s" ' ' ;\
n0=`$(EMAIL_PARSER) --count-only $(BENCHMARKSET)/email-$$d` ;\
n0=`$(PARSERSCRIPT) --count-only $(BENCHMARKSET)/email-$$d` ;\
r0=`$(MBOXPARSERV6) $(BENCHMARKSET)/email-$$d 2>&1 | grep 'debug0:' \
| sed 's/^.*debug0:/0 /g' | cut -d' ' -f9,10` ;\
rn="`echo $$r0 | cut -d/ -f1`" ;\
Expand All @@ -128,7 +134,7 @@ precision-table:
l=`expr $$l + 1` ;\
done ;\
printf "%s" ' ' ;\
n0=`$(EMAIL_PARSER) --count-only $(BENCHMARKSET)/$$d` ;\
n0=`$(PARSERSCRIPT) --count-only $(BENCHMARKSET)/$$d` ;\
r0=`$(MBOXPARSERV6) $(BENCHMARKSET)/$$d 2>&1 | grep 'debug0:' \
| sed 's/^.*debug0:/0 /g' | cut -d' ' -f9,10` ;\
rn="`echo $$r0 | cut -d/ -f1`" ;\
Expand Down Expand Up @@ -247,48 +253,50 @@ parser-log:
for r in `find $(PRIVATEMAILS)/$$v -type f -name '*.eml'`; do \
echo $$r; \
echo $$r >> $(PARSERLOGDIR)/$$v.log; \
$(EMAIL_PARSER) -Fddp $$r | grep -E 'reason|diagnosticcode|deliverystatus' >> $(PARSERLOGDIR)/$$v.log; \
$(PARSERSCRIPT) -Fddp $$r | grep -E 'reason|diagnosticcode|deliverystatus' >> $(PARSERLOGDIR)/$$v.log; \
echo >> $(PARSERLOGDIR)/$$v.log; \
done; \
done

profile: benchmark-mbox
$(PERL) -d:NYTProf $(EMAIL_PARSER) -Fjson $(BENCHMARKDIR) > /dev/null
nytprofhtml

velocity-measurement:
@ $(MKDIR) $(VELOCITYTEST)
# Rebuild $(SPEEDTESTDIR) from scratch with the sample emails of every MTA
# listed in $(BH_CAN_PARSE). Public samples are always copied; private samples
# are copied only when the per-MTA directory exists under $(PRIVATEMAILS).
# An explicit if/fi is used instead of `test -d ... && cp ...` so that a
# missing private directory does not leave the loop (and therefore the recipe)
# with a non-zero exit status.
samples-for-velocity:
	@ rm -fr ./$(SPEEDTESTDIR)
	@ $(MKDIR) $(SPEEDTESTDIR)
	@ for v in $(BH_CAN_PARSE); do \
		$(CP) $(PUBLICEMAILS)/email-$$v-*.eml $(SPEEDTESTDIR)/; \
		if [ -d "$(PRIVATEMAILS)/email-$$v" ]; then \
			$(CP) $(PRIVATEMAILS)/email-$$v/*.eml $(SPEEDTESTDIR)/; \
		fi; \
	done

# Measure the parsing speed over the emails collected in $(SPEEDTESTDIR).
# Each parser is run five times under /usr/bin/time with its output discarded:
#   1. bounceHammer's mailboxparser, only when it is installed and executable
#   2. the installed (release) Sisimai      -> $(RELEASEVERMP)
#   3. the Sisimai in ./lib (development)   -> $(DEVELOPVERMP)
# Both Sisimai runs pass "delivered => 1", the same option $(HOWMANYMAILS) uses
# to count the emails, so the count and the timings cover the same workload.
velocity-measurement: samples-for-velocity
	@ echo -------------------------------------------------------------------
	@ n=`$(HOWMANYMAILS)`; \
	echo "$$n emails in $(SPEEDTESTDIR)"; \
	if [ "$${n:-0}" -gt 0 ]; then \
		printf '%s' 'Calculating the velocity of parsing 1000 mails: multiply by '; \
		echo "scale=6; 1000 / $$n" | bc; \
	fi
	@ echo -------------------------------------------------------------------
	@ if [ -x "$(BOUNCEHAMMER)/bin/mailboxparser" ]; then \
		echo bounceHammer $(BH_LATESTVER); \
		n=1; while [ $$n -le 5 ]; do \
			/usr/bin/time $(MBOXPARSERV0) -Fjson $(SPEEDTESTDIR) > /dev/null ;\
			sleep 1; \
			n=`expr $$n + 1`; \
		done; \
		echo -------------------------------------------------------------------; \
	fi
	@ echo 'Sisimai' `$(RELEASEVERMP) -le 'print Sisimai->version'` $(RELEASEVERMP)
	@ n=1; while [ $$n -le 5 ]; do \
		/usr/bin/time $(RELEASEVERMP) -lE 'Sisimai->make(shift, "delivered" => 1)' $(SPEEDTESTDIR) > /dev/null ;\
		sleep 1; \
		n=`expr $$n + 1`; \
	done
	@ echo -------------------------------------------------------------------
	@ echo 'Sisimai' `$(DEVELOPVERMP) -le 'print Sisimai->version'` $(DEVELOPVERMP)
	@ n=1; while [ $$n -le 5 ]; do \
		/usr/bin/time $(DEVELOPVERMP) -lE 'Sisimai->make(shift, "delivered" => 1)' $(SPEEDTESTDIR) > /dev/null ;\
		sleep 1; \
		n=`expr $$n + 1`; \
	done
	@ echo -------------------------------------------------------------------

benchmark-mbox: sample
$(MKDIR) -p $(BENCHMARKDIR)
Expand All @@ -307,16 +315,7 @@ header-content-list: sample
cat senders-list | sort | uniq > tmp/senders-list
rm ./subject-list ./senders-list

loc:
@ for v in `find lib -type f -name '*.pm'`; do \
x=`wc -l $$v | awk '{ print $$1 }'`; \
y=`cat -n $$v | grep '\t1;' | tail -n 1 | awk '{ print $$1 }'`; \
z=`grep -E '^\s*#|^$$' $$v | wc -l | awk '{ print $$1 }'`; \
echo "$$x - ( $$x - $$y ) - $$z" | bc ;\
done | awk '{ s += $$1 } END { print s }'

clean:
$(RM) -r nytprof*
$(RM) -r cover_db
$(RM) -r ./build
$(RM) -r ./$(BENCHMARKSET)
Expand Down
6 changes: 5 additions & 1 deletion Makefile
Expand Up @@ -22,7 +22,8 @@ RM := rm -f
.DEFAULT_GOAL = git-status
REPOS_TARGETS = git-status git-push git-commit-amend git-tag-list git-diff \
git-reset-soft git-rm-cached git-branch
DEVEL_TARGETS = profile private-sample update-analytical-precision-table loc
DEVEL_TARGETS = private-sample update-analytical-precision-table
BENCH_TARGETS = profile speed-test loc

# -----------------------------------------------------------------------------
.PHONY: clean
Expand Down Expand Up @@ -81,6 +82,9 @@ $(REPOS_TARGETS):
$(DEVEL_TARGETS):
$(MAKE) -f Developers.mk $@

$(BENCH_TARGETS):
$(MAKE) -f Benchmarks.mk $@

diff push branch:
@$(MAKE) git-$@
fix-commit-message: git-commit-amend
Expand Down
2 changes: 1 addition & 1 deletion lib/Sisimai.pm
Expand Up @@ -187,7 +187,7 @@ sub match {
my $argvs = shift || return undef;

require Sisimai::Reason;
return Sisimai::Reason->match($argvs);
return Sisimai::Reason->match(lc $argvs);
}

1;
Expand Down
2 changes: 1 addition & 1 deletion lib/Sisimai/Address.pm
Expand Up @@ -336,7 +336,7 @@ sub find {
# except a domain part is an IP address like neko@[192.0.2.222]
$e->{'address'} =~ s/\A[\[<{('`]//;
$e->{'address'} =~ s/['`>})]\z//;
$e->{'address'} =~ s/\]\z// unless $e->{'address'} =~ /[@]\[[0-9A-Z:\.]+\]\z/i;
$e->{'address'} =~ s/\]\z// unless $e->{'address'} =~ /[@]\[[0-9A-Za-z:\.]+\]\z/;

unless( $e->{'address'} =~ /\A["].+["][@]/ ) {
# Remove double-quotations
Expand Down
52 changes: 26 additions & 26 deletions lib/Sisimai/MDA.pm
Expand Up @@ -25,57 +25,57 @@ my $MarkingsOf = {
# dovecot/src/deliver/mail-send.c:94
# Error message patterns of each MDA, keyed by the MDA name and then by the
# bounce reason the message indicates. Every pattern is written in lower case
# and carries no /i modifier, so the text has to be lower-cased before it is
# matched (scan() does this with lc() on each line).
my $ReFailures = {
    'dovecot' => {
        'userunknown' => qr/\Amailbox doesn't exist: /,
        'mailboxfull' => qr{\A(?:
             quota[ ]exceeded   # Dovecot 1.2 dovecot/src/plugins/quota/quota.c
            |quota[ ]exceeded[ ][(]mailbox[ ]for[ ]user[ ]is[ ]full[)] # dovecot/src/plugins/quota/quota.c
            |not[ ]enough[ ]disk[ ]space
            )
        }x,
    },
    'mail.local' => {
        'userunknown' => qr{[:][ ](?:
             unknown[ ]user[:]
            |user[ ]unknown
            |invalid[ ]mailbox[ ]path
            |user[ ]missing[ ]home[ ]directory
            )
        }x,
        'mailboxfull' => qr{(?:
             disc[ ]quota[ ]exceeded
            |mailbox[ ]full[ ]or[ ]quota[ ]exceeded
            )
        }x,
        'systemerror' => qr/temporary file write error/,
    },
    'procmail' => {
        'mailboxfull' => qr/quota exceeded while writing/,
        'systemfull'  => qr/no space left to finish writing/,
    },
    'maildrop' => {
        'userunknown' => qr{(?:
             invalid[ ]user[ ]specified[.]
            |cannot[ ]find[ ]system[ ]user
            )
        }x,
        'mailboxfull' => qr/maildir over quota[.]\z/,
    },
    'vpopmail' => {
        'userunknown' => qr/sorry, no mailbox here by that name[.]/,
        'filtered'    => qr{(?:
             account[ ]is[ ]locked[ ]email[ ]bounced
            |user[ ]does[ ]not[ ]exist,[ ]but[ ]will[ ]deliver[ ]to[ ]
            )
        }x,
        'mailboxfull' => qr/(?:domain|user) is over quota/,
    },
    'vmailmgr' => {
        # The second alternative used to end with a stray "/", which is a
        # literal character inside qr{...}x and kept the message from matching.
        'userunknown' => qr{(?>
             invalid[ ]or[ ]unknown[ ](?:base[ ]user[ ]or[ ]domain|virtual[ ]user)
            |user[ ]name[ ]does[ ]not[ ]refer[ ]to[ ]a[ ]virtual[ ]user
            )
        }x,
        'mailboxfull' => qr/delivery failed due to system quota violation/,
    },
};

Expand Down Expand Up @@ -132,7 +132,7 @@ sub scan {
# Detect an error reason from message patterns of the MDA.
for my $f ( @linebuffer ) {
# Try to match with each regular expression
next unless $f =~ $ReFailures->{ $agentname0 }->{ $e };
next unless lc($f) =~ $ReFailures->{ $agentname0 }->{ $e };
$reasonname = $e;
$bouncemesg = $f;
last;
Expand Down
22 changes: 12 additions & 10 deletions lib/Sisimai/MIME.pm
Expand Up @@ -7,12 +7,12 @@ use MIME::QuotedPrint ();
use Sisimai::String;

# Patterns for the header fields of a MIME part. Each pattern is written in
# lower case without the /i modifier, so callers lower-case the text with lc()
# before matching it against these.
my $ReE = {
    # Content-Transfer-Encoding: field
    '7bit-encoded' => qr/^content-transfer-encoding:[ ]*7bit$/m,
    'quoted-print' => qr/^content-transfer-encoding:[ ]*quoted-printable$/m,

    # Content-Type: field and its charset parameter; $1 is the charset name
    'html-message' => qr|^content-type:[ ]*text/html;|m,
    'some-iso2022' => qr/^content-type:[ ]*.+;[ ]*charset=["']?(iso-2022-[-a-z0-9]+)['"]?/m,
    'with-charset' => qr/^content[-]type:[ ]*.+[;][ ]*charset=['"]?([-0-9a-z]+)['"]?/,

    # charset parameter on a folded (continuation) line
    'only-charset' => qr/^[\s\t]+charset=['"]?([-0-9a-z]+)['"]?/,
};

sub patterns {
Expand Down Expand Up @@ -139,7 +139,7 @@ sub qprintd {

# Quoted-printable encoded part is the part of the text
my $boundary00 = __PACKAGE__->boundary($heads->{'content-type'}, 0);
if( length($boundary00) == 0 || $$argv1 !~ $ReE->{'quoted-print'} ) {
if( length($boundary00) == 0 || lc($$argv1) !~ $ReE->{'quoted-print'} ) {
# There is no boundary string or no
# Content-Transfer-Encoding: quoted-printable field.
$plain = MIME::QuotedPrint::decode($$argv1);
Expand All @@ -154,6 +154,7 @@ sub qprintd {
my $bodystring = '';
my $notdecoded = '';
my $getencoded = '';
my $lowercased = '';

my $encodename = undef;
my $ctencoding = undef;
Expand Down Expand Up @@ -185,6 +186,7 @@ sub qprintd {
}
} else {
# NOT Quoted-Printable encoded text block
$lowercased = lc $e;
if( $e =~ /\A[-]{2}[^\s]+[^-]\z/ ) {
# Start of the boundary block
# --=_gy7C4Gpes0RP4V5Bs9cK4o2Us2ZT57b-3OLnRN+4klS8dTmQ
Expand All @@ -197,12 +199,12 @@ sub qprintd {
'until' => qr/\Q$boundary01\E\z/,
};
}
} elsif( $e =~ $ReE->{'with-charset'} || $e =~ $ReE->{'only-charset'} ) {
} elsif( $lowercased =~ $ReE->{'with-charset'} || $lowercased =~ $ReE->{'only-charset'} ) {
# Content-Type: text/plain; charset=ISO-2022-JP
$encodename = $1;
$mimeinside = 1 if $ctencoding;

} elsif( $e =~ $ReE->{'quoted-print'} ){
} elsif( $lowercased =~ $ReE->{'quoted-print'} ){
# Content-Transfer-Encoding: quoted-printable
$ctencoding = $e;
$mimeinside = 1 if $encodename;
Expand Down Expand Up @@ -246,7 +248,7 @@ sub boundary {
my $start = shift // -1;
my $value = '';

# Match the parameter name case-insensitively and capture the value verbatim.
# `lc $argv1 =~ /.../` is parsed as lc($argv1 =~ /.../) because =~ binds
# tighter than a named unary operator, so the subject was never lower-cased
# and "Boundary=..." stopped matching. Lower-casing the subject would also
# lower-case the captured boundary string, which has to keep its original
# case to be found in the message body.
if( $argv1 =~ /\bboundary=([^ ]+)/i ) {
# Content-Type: multipart/mixed; boundary=Apple-Mail-5--931376066
# Content-Type: multipart/report; report-type=delivery-status;
# boundary="n6H9lKZh014511.1247824040/mx.example.jp"
Expand Down
7 changes: 4 additions & 3 deletions lib/Sisimai/Message/Email.pm
Expand Up @@ -410,13 +410,14 @@ sub parse {
$bodystring = Sisimai::String->to_plain($bodystring, 1) if $mesgformat =~ m|text/html;?|;
} else {
# NOT text/plain
if( $$bodystring =~ $ReEncoding->{'quoted-print'} ) {
my $lowercased = lc $$bodystring;
if( $lowercased =~ $ReEncoding->{'quoted-print'} ) {
# Content-Transfer-Encoding: quoted-printable
$bodystring = Sisimai::MIME->qprintd($bodystring, $mailheader);
}

if( $$bodystring =~ $ReEncoding->{'7bit-encoded'} &&
$$bodystring =~ $ReEncoding->{'some-iso2022'} ) {
if( $lowercased =~ $ReEncoding->{'7bit-encoded'} &&
$lowercased =~ $ReEncoding->{'some-iso2022'} ) {
# Content-Transfer-Encoding: 7bit
# Content-Type: text/plain; charset=ISO-2022-JP
unless( lc($1) =~ /(?:us-ascii|utf[-]?8)/ ) {
Expand Down

0 comments on commit 2b6578c

Please sign in to comment.