Skip to content

Commit

Permalink
Merge pull request #490 from sisimai/7e71-reduce-regular-expressions
Browse files Browse the repository at this point in the history
Reduce Regular Expressions
  • Loading branch information
azumakuniyuki committed May 8, 2023
2 parents dd2587a + fdf49df commit 0c60351
Show file tree
Hide file tree
Showing 138 changed files with 1,794 additions and 1,269 deletions.
118 changes: 63 additions & 55 deletions lib/Sisimai/ARF.pm
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,20 @@ sub is_arf {
my $heads = shift || return 0;
my $match = 0;

state $reportfrom = qr/(?:staff[@]hotmail[.]com|complaints[@]email-abuse[.]amazonses[.]com)\z/;

if( $heads->{'content-type'} =~ /report-type=["]?feedback-report["]?/ ) {
state $reportfrom = ['staff@hotmail.com', 'complaints@email-abuse.amazonses.com'];
if( Sisimai::String->aligned(\$heads->{'content-type'}, ['report-type=', 'feedback-report']) ) {
# Content-Type: multipart/report; report-type=feedback-report; ...
$match = 1;

} elsif( index($heads->{'content-type'}, 'multipart/mixed') > -1 ) {
# Microsoft (Hotmail, MSN, Live, Outlook) uses its own report format.
# Amazon SES Complaints bounces
my $p = Sisimai::Address->s3s4($heads->{'from'});
if( $p =~ $reportfrom && index($heads->{'subject'}, 'complaint about message from ') > -1 ) {
if( index($heads->{'subject'}, 'complaint about message from ') > -1 ) {
# From: staff@hotmail.com
# From: complaints@email-abuse.amazonses.com
# Subject: complaint about message from 192.0.2.1
$match = 1;
my $cv = Sisimai::Address->s3s4($heads->{'from'});
$match = 1 if grep { index($cv, $_) > -1 } @$reportfrom;
}
}
return $match;
Expand All @@ -54,21 +53,16 @@ sub inquire {
# OpenDMARC 1.3.0 uses: This is an authentication failure report for an email message received from IP
# Abusix ARF uses: this is an autogenerated email abuse complaint regarding your network.
state $startingof = {
'rfc822' => ['Content-Type: message/rfc822', 'Content-Type: text/rfc822-headers'],
'report' => ['Content-Type: message/feedback-report'],
};
state $markingsof = {
'message' => qr{\A(?>
[Tt]his[ ]is[ ]a[ ][^ ]+[ ](?:email[ ])?[Aa]buse[ ][Rr]eport
|[Tt]his[ ]is[ ]an[ ]email[ ]abuse[ ]report
|[Tt]his[ ]is[ ](?:
a[ ][^ ]+[ ]authentication[ -]failure[ ]report
|an[ ]authentication[ -]failure[ ]report
|an[ ]autogenerated[ ]email[ ]abuse[ ]complaint
|an?[ ][^ ]+[ ]report[ ]for
)
)
}x,
'rfc822' => ['Content-Type: message/rfc822', 'Content-Type: text/rfc822-headers'],
'report' => ['Content-Type: message/feedback-report'],
'message' => [
['this is a', 'abuse report'],
['this is a', 'authentication', 'failure report'],
['this is a', ' report for'],
['this is an authentication', 'failure report'],
['this is an autogenerated email abuse complaint'],
['this is an email abuse report'],
],
};
state $indicators = Sisimai::Lhost->INDICATORS;
state $longfields = Sisimai::RFC5322->LONGFIELDS;
Expand Down Expand Up @@ -121,7 +115,8 @@ sub inquire {
# message-id of 0000-000000000000000000000000000000000@mx
# received from IP address 192.0.2.1 on
# Thu, 29 Apr 2010 00:00:00 +0900 (JST)
$commondata->{'diagnosis'} ||= $e if $e =~ $markingsof->{'message'};
my $p = lc $e;
$commondata->{'diagnosis'} ||= $e if grep { Sisimai::String->aligned(\$p, $_) } $startingof->{'message'}->@*;

unless( $readcursor ) {
# Beginning of the bounce message or message/delivery-status part
Expand All @@ -139,19 +134,19 @@ sub inquire {

if( $readcursor & $indicators->{'message-rfc822'} ) {
# message/rfc822 OR text/rfc822-headers part
if( $e =~ /X-HmXmrOriginalRecipient:[ ]*(.+)\z/ ) {
if( index($e, 'X-HmXmrOriginalRecipient:') == 0 ) {
# Microsoft ARF: original recipient.
$dscontents->[-1]->{'recipient'} = Sisimai::Address->s3s4($1);
$dscontents->[-1]->{'recipient'} = Sisimai::Address->s3s4(substr($e, index($e, ':') + 1,));
$recipients++;

# The "X-HmXmrOriginalRecipient" header appears only once so we take this opportunity
# to hard-code ARF headers missing in Microsoft's implementation.
$arfheaders->{'feedbacktype'} = 'abuse';
$arfheaders->{'agent'} = 'Microsoft Junk Mail Reporting Program';

} elsif( $e =~ /\AFrom:[ ](.+)\z/ ) {
} elsif( index($e, 'From: ') == 0 ) {
# Microsoft ARF: original sender.
$commondata->{'from'} ||= Sisimai::Address->s3s4($1);
$commondata->{'from'} ||= Sisimai::Address->s3s4(substr($e, 6,));
$previousfn = 'from';

} elsif( index($e, ' ') == 0 ) {
Expand Down Expand Up @@ -193,8 +188,7 @@ sub inquire {
# Source-IP: 192.0.2.1
$v = $dscontents->[-1];

if( $e =~ /\AOriginal-Rcpt-To:[ ][<]?(.+)[>]?\z/ ||
$e =~ /\ARedacted-Address:[ ]([^ ].+[@])\z/ ) {
if( index($e, 'Original-Rcpt-To: ') == 0 || index($e, 'Redacted-Address: ') == 0 ) {
# Original-Rcpt-To header field is optional and may appear any
# number of times as appropriate:
# Original-Rcpt-To: <user@example.com>
Expand All @@ -204,47 +198,47 @@ sub inquire {
push @$dscontents, Sisimai::Lhost->DELIVERYSTATUS;
$v = $dscontents->[-1];
}
$v->{'recipient'} = Sisimai::Address->s3s4($1);
$v->{'recipient'} = Sisimai::Address->s3s4(substr($e, index($e, ' ') + 1,));
$recipients++;

} elsif( $e =~ /\AFeedback-Type:[ ]([^ ]+)\z/ ) {
} elsif( index($e, 'Feedback-Type: ') == 0 ) {
# The header field MUST appear exactly once.
# Feedback-Type: abuse
$arfheaders->{'feedbacktype'} = $1;
$arfheaders->{'feedbacktype'} = substr($e, index($e, ' ') + 1,);

} elsif( $e =~ /\AAuthentication-Results:[ ](.+)\z/ ) {
} elsif( index($e, 'Authentication-Results: ') == 0 ) {
# "Authentication-Results" indicates the result of one or more authentication checks
# run by the report generator.
#
# Authentication-Results: mail.example.com;
# spf=fail smtp.mail=somespammer@example.com
$arfheaders->{'authres'} = $1;
$arfheaders->{'authres'} = substr($e, index($e, ' ') + 1,);

} elsif( $e =~ /\AUser-Agent:[ ](.+)\z/ ) {
} elsif( index($e, 'User-Agent: ') == 0 ) {
# The header field MUST appear exactly once.
# User-Agent: SomeGenerator/1.0
$arfheaders->{'agent'} = $1;
$arfheaders->{'agent'} = substr($e, index($e, ' ') + 1,);

} elsif( $e =~ /\A(?:Received|Arrival)-Date:[ ](.+)\z/ ) {
} elsif( index($e, 'Received-Date: ') == 0 || index($e, 'Arrival-Date: ') == 0 ) {
# Arrival-Date header is optional and MUST NOT appear more than once.
# Received-Date: Thu, 29 Apr 2010 00:00:00 JST
# Arrival-Date: Thu, 29 Apr 2010 00:00:00 +0000
$arfheaders->{'date'} = $1;
$arfheaders->{'date'} = substr($e, index($e, ' ') + 1,);

} elsif( $e =~ /\AReporting-MTA:[ ]dns;[ ](.+)\z/ ) {
} elsif( index($e, 'Reporting-MTA: dns; ') == 0 ) {
# The header is optional and MUST NOT appear more than once.
# Reporting-MTA: dns; mx.example.jp
$commondata->{'rhost'} = $1;
$commondata->{'rhost'} = substr($e, index($e, ';') + 2,);

} elsif( $e =~ /\ASource-IP:[ ](.+)\z/ ) {
} elsif( index($e, 'Source-IP: ') == 0 ) {
# The header is optional and MUST NOT appear more than once.
# Source-IP: 192.0.2.45
$arfheaders->{'rhost'} = $1;
$arfheaders->{'rhost'} = substr($e, index($e, ' ') + 1,);

} elsif( $e =~ /\AOriginal-Mail-From:[ ](.+)\z/ ) {
} elsif( index($e, 'Original-Mail-From: ') == 0 ) {
# The header is optional and MUST NOT appear more than once.
# Original-Mail-From: <somespammer@example.net>
$commondata->{'from'} ||= Sisimai::Address->s3s4($1);
$commondata->{'from'} ||= Sisimai::Address->s3s4(substr($e, index($e, ' ') + 1,));
}
} # End of if: rfc822
}
Expand All @@ -256,34 +250,46 @@ sub inquire {

unless( $recipients ) {
# The original recipient address was not found
if( $rfc822part =~ /^To: (.+[@].+)$/m ) {
if( Sisimai::String->aligned(\$rfc822part, ["\nTo: ", '@']) ) {
# pick the address from To: header in message/rfc822 part.
$dscontents->[-1]->{'recipient'} = Sisimai::Address->s3s4($1);
my $p1 = index($rfc822part, "\nTo: ") + 5;
my $p2 = index($rfc822part, "\n", $p1 + 1);
my $cm = $p2 > 0 ? $p2 - $p1 : 255;
$dscontents->[-1]->{'recipient'} = Sisimai::Address->s3s4(substr($rfc822part, $p1, $cm));
$recipients = 1;
}

} else {
# Insert pseudo recipient address when there is no valid recipient address in the message.
$dscontents->[-1]->{'recipient'} = Sisimai::Address->undisclosed('r');
while(1) {
# Insert pseudo recipient address when there is no valid recipient address in the message
# for example,
# Date: Thu, 29 Apr 2015 23:34:45 +0000
# To: "undisclosed"
# Subject: Nyaan
# Message-ID: <ffffffffffffffffffffffff00000000@example.net>
last if index($dscontents->[-1]->{'recipient'}, '@') > 0;
$dscontents->[-1]->{'recipient'} = Sisimai::Address->undisclosed(1);
$recipients = 1;
last;
}
$recipients = 1;
}

unless( $rfc822part =~ /\bFrom: [^ ]+[@][^ ]+\b/ ) {
unless( Sisimai::String->aligned(\$rfc822part, ['From: ', '@']) ) {
# There is no "From:" header in the original message. Append the value of the
# "Original-Mail-From" header as the sender address.
$rfc822part .= 'From: '.$commondata->{'from'}."\n" if $commondata->{'from'};
}

if( $mhead->{'subject'} =~ /complaint about message from (\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3})/ ) {
if( index($mhead->{'subject'}, 'complaint about message from ') > -1 ) {
# Microsoft ARF: remote host address.
$arfheaders->{'rhost'} = $1;
$arfheaders->{'rhost'} = substr($mhead->{'subject'}, rindex($mhead->{'subject'}, ' ') + 1,);
$commondata->{'diagnosis'} = sprintf(
"This is a Microsoft email abuse report for an email message received from IP %s on %s",
$arfheaders->{'rhost'}, $mhead->{'date'});
}

for my $e ( @$dscontents ) {
# AOL = http://forums.cpanel.net/f43/aol-brutal-work-71473.html
$e->{'recipient'} = Sisimai::Address->s3s4($rcptintext) if $e->{'recipient'} =~ /\A[^ ]+[@]\z/;
$e->{'recipient'} = Sisimai::Address->s3s4($rcptintext) if substr($e->{'recipient'}, -1, 1) eq '@';
$e->{ $_ } ||= $arfheaders->{ $_ } for keys %$arfheaders;
delete $e->{'authres'};

Expand All @@ -301,10 +307,12 @@ sub inquire {
# The value of "Reporting-MTA" header
$e->{'rhost'} = $commondata->{'rhost'};

} elsif( $e->{'diagnosis'} =~ /\breceived from IP address ([^ ]+)/ ) {
} else {
# Try to get an IP address from the error message
# This is an email abuse report for an email message received from IP address 24.64.1.1
# on Thu, 29 Apr 2010 00:00:00 +0000
$e->{'rhost'} = $1;
my $ip = Sisimai::String->ipv4($e->{'diagnosis'}) || [];
$e->{'rhost'} = $ip->[0] if scalar @$ip;
}
}
return { 'ds' => $dscontents, 'rfc822' => $rfc822part };
Expand Down
63 changes: 34 additions & 29 deletions lib/Sisimai/Address.pm
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,12 @@ BUILD_REGULAR_EXPRESSIONS: {

sub undisclosed {
# Return pseudo recipient or sender address
# @param [String] atype Address type: 'r' or 's'
# @param [String] argv0 Address type: true = recipient, false = sender
# @return [String, undef] Pseudo recipient address or sender address or undef when the $atype
# is neither 'r' nor 's'
my $class = shift;
my $atype = shift || return undef;

return undef unless $atype =~ /\A(?:r|s)\z/;
my $local = $atype eq 'r' ? 'recipient' : 'sender';
my $argv0 = shift // 0;
my $local = $argv0 ? 'recipient' : 'sender';
return sprintf("undisclosed-%s-in-headers%slibsisimai.org.invalid", $local, '@');
}

Expand All @@ -71,19 +69,20 @@ sub is_emailaddress {

sub is_mailerdaemon {
# Check whether the argument is a mailer-daemon address or not
# @param [String] email Email address
# @param [String] argv0 Email address
# @return [Integer] 0: Not mailer-daemon
# 1: Mailer-daemon
my $class = shift;
my $email = shift // return 0;
state $match = qr{(?>
(?:mailer-daemon|postmaster)[@]
|[<(](?:mailer-daemon|postmaster)[)>]
|\A(?:mailer-daemon|postmaster)\z
|[ ]?mailer-daemon[ ]
)
}x;
return 1 if lc($email) =~ $match;
my $argv0 = shift // return 0;
my $email = lc $argv0;

state $postmaster = [
'mailer-daemon@', '<mailer-daemon>', '(mailer-daemon)', ' mailer-daemon ',
'postmaster@', '<postmaster>', '(postmaster)'
];
return 1 if grep { index($email, $_) > -1 } @$postmaster;
return 1 if $email eq 'mailer-daemon';
return 1 if $email eq 'postmaster';
return 0;
}

Expand All @@ -110,12 +109,12 @@ sub new {

my $heads = ['<'];
my $tails = ['>', ',', '.', ';'];
my $point = rindex($argvs->{'address'}, '@');

if( $argvs->{'address'} =~ /\A([^\s]+)[@]([^@]+)\z/ ||
$argvs->{'address'} =~ /\A(["].+?["])[@]([^@]+)\z/ ) {
if( $point > 0 ) {
# Get the local part and the domain part from the email address
my $lpart = $1; for my $e ( @$heads ) { $lpart =~ s/\A$e//g if substr($lpart, 0, 1) eq $e }
my $dpart = $2; for my $e ( @$tails ) { $dpart =~ s/$e\z//g if substr($dpart, -1, 1) eq $e }
my $lpart = substr($argvs->{'address'}, 0, $point);
my $dpart = substr($argvs->{'address'}, $point+1,);
my $email = __PACKAGE__->expand_verp($argvs->{'address'}) || '';
my $alias = 0;

Expand All @@ -125,7 +124,7 @@ sub new {
$alias = 1 if $email;
}

if( $email =~ /\A.+[@].+?\z/ ) {
if( index($email, '@') > 0 ) {
# The address is a VERP or an alias
if( $alias ) {
# The address is an alias: neko+nyaan@example.jp
Expand All @@ -136,8 +135,11 @@ sub new {
$thing->{'verp'} = $argvs->{'address'};
}
}
$thing->{'user'} = $lpart;
$thing->{'host'} = $dpart;

do { while( substr($lpart, 0, 1) eq $_ ) { substr($lpart, 0, 1, '') }} for @$heads;
do { while( substr($dpart, -1, 1) eq $_ ) { substr($dpart, -1, 1, '') }} for @$tails;
$thing->{'user'} = $lpart;
$thing->{'host'} = $dpart;
$thing->{'address'} = $lpart.'@'.$dpart;

} else {
Expand Down Expand Up @@ -167,6 +169,7 @@ sub find {
my $argv1 = shift // return undef; y/\r//d, y/\n//d for $argv1; # Remove CR, NL
my $addrs = shift // undef;

require Sisimai::String;
state $indicators = {
'email-address' => (1 << 0), # <neko@example.org>
'quoted-string' => (1 << 1), # "Neko, Nyaan"
Expand Down Expand Up @@ -321,7 +324,7 @@ sub find {
# Display name like "Neko, Nyaan"
$v->{'name'} .= $e;
next unless $readcursor & $indicators->{'quoted-string'};
next if $v->{'name'} =~ /\x5c["]\z/; # "Neko, Nyaan \"...
next if substr($v->{'name'}, -2, 2) eq qq|\x5c"|; # "Neko, Nyaan \"...
$readcursor &= ~$indicators->{'quoted-string'};
$p = '';
}
Expand Down Expand Up @@ -351,11 +354,13 @@ sub find {

if( $v->{'address'} ) {
# Remove the comment from the address
if( $v->{'address'} =~ /(.*)([(].+[)])(.*)/ ) {
if( Sisimai::String->aligned(\$v->{'address'}, ['(', ')']) ) {
# (nyaan)nekochan@example.org, nekochan(nyaan)cat@example.org or
# nekochan(nyaan)@example.org
$v->{'address'} = $1.$3;
$v->{'comment'} = $2;
my $p1 = index($v->{'address'}, '(');
my $p2 = index($v->{'address'}, ')');
$v->{'address'} = substr($v->{'address'}, 0, $p1).substr($v->{'address'}, $p2 + 1,);
$v->{'comment'} = substr($v->{'address'}, $p1, $p2 - $p1 - 1);
}
push @readbuffer, $v;
}
Expand All @@ -364,7 +369,7 @@ sub find {
for my $e ( @readbuffer ) {
# The element must not include any character except from 0x20 to 0x7e.
next if $e->{'address'} =~ /[^\x20-\x7e]/;
unless( $e->{'address'} =~ /\A.+[@].+\z/ ) {
if( index($e->{'address'}, '@') == -1 ) {
# Allow if the argument is MAILER-DAEMON
next unless __PACKAGE__->is_mailerdaemon($e->{'address'});
}
Expand All @@ -374,7 +379,7 @@ sub find {
s/\A[\[<{('`]//g, s/[.,'`>});]\z//g for $e->{'address'};
$e->{'address'} =~ s/[^A-Za-z]\z//g unless index($e->{'address'}, '@[') > 1;

unless( $e->{'address'} =~ /\A["].+["][@]/ ) {
if( index($e->{'address'}, '"@') < 0 ) {
# Remove double-quotations
substr($e->{'address'}, 0, 1, '') if substr($e->{'address'}, 0, 1) eq '"';
substr($e->{'address'}, -1, 1, '') if substr($e->{'address'}, -1, 1) eq '"';
Expand Down Expand Up @@ -609,7 +614,7 @@ azumakuniyuki
=head1 COPYRIGHT
Copyright (C) 2014-2022 azumakuniyuki, All rights reserved.
Copyright (C) 2014-2023 azumakuniyuki, All rights reserved.
=head1 LICENSE
Expand Down

0 comments on commit 0c60351

Please sign in to comment.