Permalink
Browse files

Slash::Utility::Data::approveCharref. In its current incarnation,

removes character references (both numeric and entity) which fiddle
with display stuff.  Who writes these standards?!
  • Loading branch information...
1 parent 225ddb0 commit eb857324a5be682d6e7ce131a4e76213f717dbfe @jamiemccarthy jamiemccarthy committed May 18, 2002
Showing with 87 additions and 4 deletions.
  1. +85 −4 Slash/Utility/Data/Data.pm
  2. +2 −0 themes/slashcode/htdocs/comments.pl
View
89 Slash/Utility/Data/Data.pm
@@ -44,6 +44,7 @@ use vars qw($VERSION @EXPORT);
($VERSION) = ' $Revision$ ' =~ /\$Revision:\s+([^\s]+)/;
@EXPORT = qw(
addDomainTags
+ approveCharref
parseDomainTags
balanceTags
changePassword
@@ -591,8 +592,9 @@ Private function. Strips out "bad" HTML by removing unbalanced HTML
tags and sending balanced tags through C<approveTag>. The "unbalanced"
checker is primitive; no "E<lt>" or "E<gt>" characters are allowed inside
tag attributes (such as E<lt>A NAME="E<gt>"E<gt>), that breaks the tag.
-Also, whitespace is inserted between adjacent tags, so "E<lt>BRE<gt>E<lt>BRE<gt>"
-becomes "E<lt>BRE<gt> E<lt>BRE<gt>".
+Whitespace is inserted between adjacent tags, so "E<lt>BRE<gt>E<lt>BRE<gt>"
+becomes "E<lt>BRE<gt> E<lt>BRE<gt>". And character references are routed
+through C<approveCharref>.
=over 4
@@ -612,7 +614,7 @@ Processed string.
=item Dependencies
-C<approveTag> function.
+C<approveTag> function, C<approveCharref> function.
=back
@@ -635,7 +637,6 @@ sub stripBadHtml {
> # close bracket
}{$1&gt;}gx;
-
# Encode stray <
1 while $str =~ s{
< # open bracket
@@ -646,6 +647,8 @@ sub stripBadHtml {
)
}{&lt;$1}gx;
+ $str =~ s/\&(.*?);/approveCharref($1)/sge;
+
return $str;
}
@@ -1040,6 +1043,84 @@ sub approveTag {
#========================================================================
+=head2 approveCharref(CHARREF)
+
+Private function. Checks to see if a character reference (minus the
+leading & and trailing ;) is OK. If so, returns the whole character
+reference (including & and ;), and if not, returns the empty string.
+See <http://www.w3.org/TR/html4/charset.html#h-5.3> for definitions and
+explanations of character references.
+
+=over 4
+
+=item Parameters
+
+=over 4
+
+=item CHARREF
+
+HTML character reference to check.
+
+=back
+
+=item Return value
+
+Character reference after processing.
+
+=item Dependencies
+
+None.
+
+=back
+
+=cut
+
sub approveCharref {
	# Decide whether one HTML character reference may be kept.
	#
	# CHARREF is the text between the leading "&" and the trailing ";"
	# (e.g. "amp", "#65", "#x41").  Returns the complete reference,
	# "&CHARREF;", exactly as it was written if it is acceptable, or the
	# empty string if it must be removed.
	my($charref) = @_;

	# Nothing was passed in, so there is nothing to approve.
	return "" unless defined $charref;

	# Everything not forbidden is permitted.  At the moment, only
	# characters that change the direction or joining of text are
	# forbidden.  For more information, see
	# <http://www.w3.org/TR/html4/struct/dirlang.html#bidirection>
	# and <http://www.htmlhelp.com/reference/html40/special/bdo.html>.
	#   8204-8207  ZWNJ, ZWJ, LRM, RLM
	#   8234-8238  LRE, RLE, PDF, LRO, RLO (embeddings and overrides)
	#   8294-8297  LRI, RLI, FSI, PDI (isolates)
	my %bad_numeric = map { $_, 1 } qw(
		8204 8205 8206 8207
		8234 8235 8236 8237 8238
		8294 8295 8296 8297
	);
	my %bad_entity = map { $_, 1 }
		qw( zwnj zwj lrm rlm );

	my $ok = 1;

	if ($charref =~ /^#/) {
		# Probably a numeric character reference.
		my $decimal = 0;
		if ($charref =~ /^#x([0-9a-f]+)\z/i) {
			# Hexadecimal encoding; hex() always returns a
			# non-negative number.
			$decimal = hex($1);
		} elsif ($charref =~ /^#([0-9]+)\z/) {
			# Decimal encoding.  Force numeric context so that
			# leading zeros ("08238") cannot slip past the
			# blacklist lookup below, which is keyed on the
			# canonical decimal form.
			$decimal = $1 + 0;
		} else {
			# Unknown, assume flawed.
			$ok = 0;
		}
		# Sanity check: reject zero and anything above 65534.
		$ok = 0 if $decimal <= 0 || $decimal > 65534;
		$ok = 0 if $bad_numeric{$decimal};
	} elsif ($charref =~ /^[a-z0-9]+\z/i) {
		# Character entity; names are compared case-insensitively.
		$ok = 0 if $bad_entity{lc $charref};
	} else {
		# Unknown character reference type, assume flawed.
		$ok = 0;
	}

	return $ok ? "&$charref;" : "";
}
+
+#========================================================================
+
=head2 fixparam(DATA)
Prepares data to be a parameter in a URL. Such as:
View
2 themes/slashcode/htdocs/comments.pl
@@ -721,6 +721,8 @@ sub validateComment {
$$subj =~ s/\(Score(.*)//i;
$$subj =~ s/Score:(.*)//i;
+ $$subj =~ s/\&(.*?);/approveCharref($1)/sge;
+
for ($$comm, $$subj) {
my $d = decode_entities($_);
$d =~ s/&#?[a-zA-Z0-9]+;//g; # remove entities we don't know

0 comments on commit eb85732

Please sign in to comment.