Skip to content

Commit

Permalink
Updates to:
Browse files Browse the repository at this point in the history
#2
#009
#017
#034
#054
#064
get_comments
Our @HTML_NAMED_ENTITIES
  • Loading branch information
Bryan committed Nov 2, 2016
1 parent 935926b commit da0f94b
Showing 1 changed file with 43 additions and 30 deletions.
73 changes: 43 additions & 30 deletions bin/checkwiki.pl
Expand Up @@ -11,7 +11,7 @@
## ##
## AUTHOR: Stefan Kühn, Bryan White ## AUTHOR: Stefan Kühn, Bryan White
## LICENCE: GPLv3 ## LICENCE: GPLv3
## VERSION: 2016/9/19 ## VERSION: 2016/11/02
## ##
########################################################################### ###########################################################################


Expand Down Expand Up @@ -143,39 +143,39 @@
wikivoyage ); wikivoyage );


# See http://turner.faculty.swau.edu/webstuff/htmlsymbols.html # See http://turner.faculty.swau.edu/webstuff/htmlsymbols.html
our @HTML_NAMED_ENTITIES = qw( aacute Aacute acirc Acirc aelig AElig our @HTML_NAMED_ENTITIES = qw( aacute Aacute acute acirc Acirc aelig AElig
agrave Agrave alpha Alpha aring Aring asymp atilde Atilde auml Auml beta Beta agrave Agrave alpha Alpha aring Aring asymp atilde Atilde auml Auml beta Beta
bdquo brvbar bull ccedil Ccedil cent chi Chi clubs copy crarr darr dArr deg bdquo brvbar bull ccedil Ccedil cent chi Chi clubs copy crarr darr dArr deg
delta Delta diams divide eacute Eacute ecirc Ecirc egrave Egrave delta Delta diams divide eacute Eacute ecirc Ecirc egrave Egrave
epsilon Epsilon equiv eta Eta eth ETH euml Euml euro fnof frac12 frac14 epsilon Epsilon equiv eta Eta eth ETH euml Euml euro fnof frac12 frac14
frac34 frasl gamma Gamma ge harr hArr hearts hellip iacute Iacute icirc Icirc frac34 frasl gamma Gamma ge harr hArr hearts hellip iacute Iacute icirc Icirc
iexcl igrave Igrave infin int iota Iota iquest iuml Iuml kappa Kappa iexcl igrave Igrave infin int iota Iota iquest iuml Iuml kappa Kappa
lambda Lambda laquo larr lArr ldquo le loz lsaquo lsquo micro middot lambda Lambda laquo larr lArr ldquo le loz lsaquo lsquo micro middot minus
mu Mu ne not ntilde Ntilde nu Nu oacute Oacute ocirc Ocirc oelig OElig mu Mu ne not ntilde Ntilde nu Nu oacute Oacute ocirc Ocirc oelig OElig
ograve Ograve oline omega Omega omicron Omicron ordf ordm oslash Oslash ograve Ograve oline omega Omega omicron Omicron ordf ordm oslash Oslash
otilde Otilde ouml Ouml para part permil phi Phi pi Pi piv plusm pound prod otilde Otilde ouml Ouml para part permil phi Phi pi Pi piv plusmn pound Prime prime prod
psi Psi quot radic raquo rarr rArr rdquo reg rho Rho raquo rsaquo rsquo psi Psi quot radic raquo rarr rArr rdquo reg rho Rho raquo rsaquo rsquo sbquo
scaron Scaron sect sigma Sigma sigmaf spades sum sup1 sup2 sup3 szlig scaron Scaron sect sigma Sigma sigmaf spades sum sup1 sup2 sup3 szlig
tau Tau theta Theta thetasym thorn THORN tilde trade uacute Uacute uarr uArr tau Tau theta Theta thetasym thorn THORN tilde times trade uacute Uacute uarr uArr
ucirc Ucirc ugrave Ugrave upsih upsilon Upsilon uuml Uuml xi Xi yacute Yacute ucirc Ucirc ugrave Ugrave upsih upsilon Upsilon uuml Uuml xi Xi yacute Yacute
yen yuml Yuml zeta Zeta ); yen yuml Yuml zeta Zeta );


# FOR #011. DO NOT CONVERT GREEK LETTERS THAT LOOK LIKE LATIN LETTERS. # FOR #011. DO NOT CONVERT GREEK LETTERS THAT LOOK LIKE LATIN LETTERS.
# Alpha (A), Beta (B), Epsilon (E), Zeta (Z), Eta (H), Kappa (K), kappa (k), Mu (M), Nu (N), nu (v), Omicron (O), omicron (o), Rho (P), Tau (T), Upsilon (Y), upsilon (u) and Chi (X). # Alpha (A), Beta (B), Epsilon (E), Zeta (Z), Eta (H), Kappa (K), kappa (k), Mu (M), Nu (N), nu (v), Omicron (O), omicron (o), Rho (P), Tau (T), Upsilon (Y), upsilon (u) and Chi (X).
our @HTML_NAMED_ENTITIES_011 = qw( aacute Aacute acirc Acirc aelig AElig our @HTML_NAMED_ENTITIES_011 = qw( aacute Aacute acute acirc Acirc aelig AElig
agrave Agrave alpha aring Aring asymp atilde Atilde auml Auml beta bdquo agrave Agrave alpha aring Aring asymp atilde Atilde auml Auml beta bdquo
brvbar bull ccedil Ccedil cent chi clubs copy crarr darr dArr deg brvbar bull ccedil Ccedil cent chi clubs copy crarr darr dArr deg
delta Delta diams divide eacute Eacute ecirc Ecirc egrave Egrave delta Delta diams divide eacute Eacute ecirc Ecirc egrave Egrave
epsilon equiv eta eth ETH euml Euml euro fnof frac12 frac14 epsilon equiv eta eth ETH euml Euml euro fnof frac12 frac14
frac34 frasl gamma Gamma ge harr hArr hearts hellip iacute Iacute icirc Icirc frac34 frasl gamma Gamma ge harr hArr hearts hellip iacute Iacute icirc Icirc
iexcl igrave Igrave infin int iota Iota iquest iuml Iuml iexcl igrave Igrave infin int iota Iota iquest iuml Iuml
lambda Lambda laquo larr lArr ldquo le loz lsaquo lsquo micro middot lambda Lambda laquo larr lArr ldquo le loz lsaquo lsquo micro middot minus
mu ne not ntilde Ntilde oacute Oacute ocirc Ocirc oelig OElig mu ne not ntilde Ntilde oacute Oacute ocirc Ocirc oelig OElig
ograve Ograve oline omega Omega ordf ordm oslash Oslash ograve Ograve oline omega Omega ordf ordm oslash Oslash
otilde Otilde ouml Ouml para part permil phi Phi pi Pi piv plusm pound prod otilde Otilde ouml Ouml para part permil phi Phi pi Pi piv plusmn pound Prime prime prod
psi Psi quot radic raquo rarr rArr rdquo reg rho raquo rsaquo rsquo psi Psi quot radic raquo rarr rArr rdquo reg rho raquo rsaquo rsquo sbquo
scaron Scaron sect sigma Sigma sigmaf spades sum sup1 sup2 sup3 szlig scaron Scaron sect sigma Sigma sigmaf spades sum sup1 sup2 sup3 szlig
tau theta Theta thetasym thorn THORN tilde trade uacute Uacute uarr uArr tau theta Theta thetasym thorn THORN tilde times trade uacute Uacute uarr uArr
ucirc Ucirc ugrave Ugrave upsih upsilon uuml Uuml xi Xi yacute Yacute ucirc Ucirc ugrave Ugrave upsih upsilon uuml Uuml xi Xi yacute Yacute
yen yuml Yuml zeta Zeta ); yen yuml Yuml zeta Zeta );


Expand Down Expand Up @@ -682,13 +682,14 @@ sub scan_pages {
if ( $title ne "" ) { if ( $title ne "" ) {
update_ui() if ++$artcount % 500 == 0; update_ui() if ++$artcount % 500 == 0;


if ( $artcount > 5790000 ) { #if ( $artcount > 5786150 ) {
$page_namespace = 0; $page_namespace = 0;
$title = case_fixer($title); $title = case_fixer($title);
$revision = $page->revision; $revision = $page->revision;
$text = $revision->text; $text = $revision->text;
check_article(); check_article();
}
#}


#$end_of_dump = 'yes' if ( $artcount > 10000 ); #$end_of_dump = 'yes' if ( $artcount > 10000 );
#$end_of_dump = 'yes' if ( $Error_counter > 40000 ) #$end_of_dump = 'yes' if ( $Error_counter > 40000 )
Expand Down Expand Up @@ -970,7 +971,7 @@ sub get_comments {
error_005_Comment_no_correct_end($snippet); error_005_Comment_no_correct_end($snippet);
} }


$text =~ s/<!--(.*?)-->/<!--CheckWiki-->/sg; $text =~ s/<!--(.*?)-->//sg;
} }


return (); return ();
Expand Down Expand Up @@ -1173,7 +1174,6 @@ sub get_score {
sub get_graph { sub get_graph {


$text =~ s/<graph(.*?)<\/graph>/<graph>CheckWiki<\/graph>/sg; $text =~ s/<graph(.*?)<\/graph>/<graph>CheckWiki<\/graph>/sg;
print "\n\n" . $text . "\n\n";
return (); return ();
} }


Expand Down Expand Up @@ -2131,10 +2131,11 @@ sub error_002_have_br {
<br\s*\/\s*[^ ]> # <br\/t> <br\s*\/\s*[^ ]> # <br\/t>
|<br[^ ]\/> # <brt \/> |<br[^ ]\/> # <brt \/>
|<br[^ \/]> # <brt> |<br[^ \/]> # <brt>
|<br\s*\/\s*[^ >] # <br |<br\s*\/\s*[^ >] # <br
|<br\s*[^ >\/] # <br |<br\s*[^ >\/] # <br
|<[^ w]br[^\/]*\s*> # <tbr> or < br> |<[^ w]br[^\/]*\s*> # <tbr> or < br>
|<br\h*[^ \v>\/] # <br t> \v is newline |<br\h*[^ \v>\/] # <br t> \v is newline
|<\/hr>
)/xi )/xi
) )
{ {
Expand All @@ -2143,6 +2144,7 @@ sub error_002_have_br {
error_register( $error_code, $test_line ); error_register( $error_code, $test_line );
} }


# CHECK FOR </center/> or <center/>
for my $temp (@TAG_LIST_002) { for my $temp (@TAG_LIST_002) {
if ( $text =~ /(<\s*\/?\s*$temp\s*\/\s*>)/i ) { if ( $text =~ /(<\s*\/?\s*$temp\s*\/\s*>)/i ) {
my $test_line = substr( $text, $-[0], 40 ); my $test_line = substr( $text, $-[0], 40 );
Expand Down Expand Up @@ -2374,7 +2376,7 @@ sub error_009_more_then_one_category_in_a_line {
if ( $page_namespace == 0 or $page_namespace == 104 ) { if ( $page_namespace == 0 or $page_namespace == 104 ) {


if ( $text =~ if ( $text =~
/\[\[($Cat_regex):(.*?)\]\]([ ]*)\[\[($Cat_regex):(.*?)\]\]/g ) /\[\[($Cat_regex):(.*?)\]\]([ ]*)\[\[($Cat_regex):(.*?)\]\]/ig )
{ {


my $error_text = my $error_text =
Expand Down Expand Up @@ -2570,7 +2572,7 @@ sub error_016_unicode_control_characters {
my $search = "\x{200E}|\x{FEFF}"; my $search = "\x{200E}|\x{FEFF}";
if ( $project eq 'enwiki' ) { if ( $project eq 'enwiki' ) {
$search = $search $search = $search
. "|\x{007F}|\x{200B}|\x{2028}|\x{202A}|\x{202C}|\x{202D}|\x{202E}|\x{00A0}|\x{00AD}|\x{202B}|\x{200F}|\x{2004}|\x{2005}|\x{2006}|\x{2007}|\x{2008}"; . "|\x{007F}|\x{200B}|\x{2028}|\x{202A}|\x{202C}|\x{202D}|\x{202E}|\x{00A0}|\x{00AD}|\x{202B}|\x{200F}|\x{2004}|\x{2005}|\x{2006}|\x{2007}|\x{2008}|\x{FFFC}";
} }


if ( $text =~ /($search)/ or $text =~ /(\p{Co})/ ) { if ( $text =~ /($search)/ or $text =~ /(\p{Co})/ ) {
Expand All @@ -2592,6 +2594,7 @@ sub error_016_unicode_control_characters {
$test_text =~ s/\x{202D}/\{202D\}/; $test_text =~ s/\x{202D}/\{202D\}/;
$test_text =~ s/\x{202E}/\{202E\}/; $test_text =~ s/\x{202E}/\{202E\}/;
$test_text =~ s/\x{FEFF}/\{FEFF\}/; $test_text =~ s/\x{FEFF}/\{FEFF\}/;
$test_text =~ s/\x{FFFC}/\{FFFC\}/;
$test_text =~ s/\x{00A0}/\{00A0\}/; $test_text =~ s/\x{00A0}/\{00A0\}/;


error_register( $error_code, $test_text ); error_register( $error_code, $test_text );
Expand Down Expand Up @@ -2624,7 +2627,7 @@ sub error_017_category_double {


foreach my $j ( $i + 1 .. $Category_counter ) { foreach my $j ( $i + 1 .. $Category_counter ) {
my $test2 = $Category[$j][2]; my $test2 = $Category[$j][2];
$test2 =~ s/_//g; $test2 =~ s/_/ /g;


if ( $test2 ne q{} ) { if ( $test2 ne q{} ) {
$test2 = $test2 =
Expand Down Expand Up @@ -3095,7 +3098,7 @@ sub error_034_template_programming_elements {
or $project eq "bewiki" ) or $project eq "bewiki" )
{ {
if ( $text =~ if ( $text =~
/(#if:|#ifeq:|#switch:|#ifexist:|{{fullpagename}}|{{sitename}}|{{namespace}}|{{basepagename}}|{{pagename}}|{{subpagename}}|{{namespacenumber}}|{{talkpagename}}|{{fullpagenamee}}|__noindex__|__index__|__forcetoc__|__nonewsectionlink__|{{subst:)/i /(#if:|#ifeq:|#switch:|#ifexist:|{{fullpagename}}|{{sitename}}|{{namespace}}|{{basepagename}}|{{pagename}}|{{subpagename}}|{{namespacenumber}}|{{fullpagenamee}}|__noindex__|__index__|__forcetoc__|__nonewsectionlink__|{{subst:)/i
) )
{ {
my $test_line = substr( $text, $-[0], 40 ); my $test_line = substr( $text, $-[0], 40 );
Expand All @@ -3105,7 +3108,7 @@ sub error_034_template_programming_elements {
} }
else { else {
if ( $text =~ if ( $text =~
/({{{|#if:|#ifeq:|#switch:|#ifexist:|{{fullpagename}}|{{sitename}}|{{namespace}}|{{basepagename}}|{{pagename}}|{{subpagename}}|{{namespacenumber}}|{{talkpagename}}|{{fullpagenamee}}|__noindex__|__index__|__nonewsectionlink__|{{subst:)/i /({{{|#if:|#ifeq:|#switch:|#ifexist:|{{fullpagename}}|{{sitename}}|{{namespace}}|{{basepagename}}|{{pagename}}|{{subpagename}}|{{namespacenumber}}|{{fullpagenamee}}|__noindex__|__index__|__nonewsectionlink__|{{subst:)/i
) )
{ {
my $test_line = substr( $text, $-[0], 40 ); my $test_line = substr( $text, $-[0], 40 );
Expand Down Expand Up @@ -3768,7 +3771,7 @@ sub error_054_break_in_list {
foreach (@Lines) { foreach (@Lines) {


if ( index( $_, q{*} ) == 0 ) { if ( index( $_, q{*} ) == 0 ) {
if ( $_ =~ /<br([ ]+)?(\/)?([ ]+)?>([ ]+)?$/i ) { if ( $_ =~ /<br([ ]+)?(\/)?([ ]+)?>([ \t]+)?$/i ) {
error_register( $error_code, substr( $_, 0, 40 ) ); error_register( $error_code, substr( $_, 0, 40 ) );
} }
} }
Expand Down Expand Up @@ -4087,7 +4090,11 @@ sub error_064_link_equal_linktext {


# Account for [[Foo|''Foo'']] and [[Foo|'''Foo''']] # Account for [[Foo|''Foo'']] and [[Foo|'''Foo''']]
$temp_text =~ $temp_text =~
s/\[\[\s*([^|:\]]*)\s*\|\s*('{2,})\s*(.)/\[\[$1\|$2\u$3/g; s/\[\[\s*([^|:\]]*)\s*\|\s*('+)\s*(.)/\[\[$1\|$2\u$3/g;

# Account for [[Foo|"Foo"]]
$temp_text =~
s/\[\[\s*([^|:\]]*)\s*\|\s*("|`|«|»|„|“)\s*(.)/\[\[$1\|$2\u$3/g;
} }


if ( $temp_text =~ /(\[\[\s*([^|:]*)\s*\|\2\s*[.,]?\s*\]\])/ ) { if ( $temp_text =~ /(\[\[\s*([^|:]*)\s*\|\2\s*[.,]?\s*\]\])/ ) {
Expand All @@ -4096,8 +4103,14 @@ sub error_064_link_equal_linktext {
} }


# Account for [[Foo|''Foo'']] and [[Foo|'''Foo''']] # Account for [[Foo|''Foo'']] and [[Foo|'''Foo''']]
elsif ( elsif ( $temp_text =~ /(\[\[\s*([^|:]*)\s*\|'+\2\s*'+\s*\]\])/ ) {
$temp_text =~ /(\[\[\s*([^|:]*)\s*\|'{2,}\2\s*'{2,}\s*\]\])/ ) my $found_text = $1;
error_register( $error_code, $found_text );
}

# Account for [[Foo|"Foo"]]
elsif ( $temp_text =~
/(\[\[\s*([^|:]*)\s*\|("|`|«|„)\2\s*(“|`|»|")\s*\]\])/ )
{ {
my $found_text = $1; my $found_text = $1;
error_register( $error_code, $found_text ); error_register( $error_code, $found_text );
Expand Down

0 comments on commit da0f94b

Please sign in to comment.