-
Notifications
You must be signed in to change notification settings - Fork 8
/
domainTagifyComments
executable file
·207 lines (180 loc) · 5.41 KB
/
domainTagifyComments
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#!/usr/bin/perl -w
# This code is a part of Slash, and is released under the GPL.
# Copyright 1997-2005 by Open Source Technology Group. See README
# and COPYING for more information, or see http://slashcode.com/.
# $Id$
use strict;
use URI;
use Getopt::Std;
use File::Basename;
use Digest::MD5 'md5_hex';
use Benchmark;
use Slash;
use Slash::Utility;
use Slash::DB;
use vars qw( $slashdb $constants $vars $comment_table );
(my $VERSION) = ' $Revision$ ' =~ /\$Revision:\s+([^\s]+)/;
my $PROGNAME = basename($0);
my %opts;
# Remember to doublecheck these match usage()!
usage('Options used incorrectly') unless getopts('hu:f:l:F:L:v', \%opts);
usage() if $opts{'h'};
version() if $opts{'v'};
$opts{'u'} ||= 'slash';
createEnvironment($opts{u});
$slashdb = getCurrentDB();
$constants = getCurrentStatic();
my $vars = $slashdb->sqlSelectAllHashref("name", "name, value", "vars");
$comment_table = 'comments';
my($min_cid, $max_cid) = (0, 0);
if ($opts{'f'} or $opts{'l'}) {
# if "-f" specified, do the last day by default
$opts{'f'} ||= time-(86400*7);
$opts{'l'} ||= time;
$min_cid = $slashdb->sqlSelect(
"MIN(cid)",
$comment_table,
"UNIX_TIMESTAMP(date) > " . $slashdb->sqlQuote($opts{'f'})
);
$max_cid = $slashdb->sqlSelect(
"MAX(cid)",
$comment_table,
"UNIX_TIMESTAMP(date) < " . $slashdb->sqlQuote($opts{'l'})
);
} else {
# do the last 100 comments by default
$max_cid = $opts{'L'} || $slashdb->sqlSelect("MAX(cid)", $comment_table);
$min_cid = $opts{'F'} || $max_cid-100;
$min_cid = 0 if $min_cid < 0;
}
# main program logic (in braces to offset nicely)
{
print "Adding domain tags to comments from $min_cid to $max_cid.\n";
print "Caution - this script is largely untested. If you don't know\n";
print "what you're doing, hit ^C in the next 5 seconds.\n";
sleep 5;
my $t0 = new Benchmark;
my $count_changed = 0;
for (my $cid = $min_cid; $cid <= $max_cid; ++$cid) {
my $text = $slashdb->sqlSelect(
"comment",
"comment_text",
"cid=$cid"
);
next unless $text and length($text) > 15;
my $new_text = myAddDomainTags($text);
next unless $new_text ne $text;
my $new_signature = md5_hex($new_text);
$slashdb->sqlUpdate("comment_text", { comment => $new_text }, "cid=$cid");
$slashdb->sqlUpdate($comment_table, { signature => $new_signature }, "cid=$cid");
$slashdb->sqlUpdate("comments", { signature => $new_signature }, "cid=$cid")
if $comment_table ne "comments";
++$count_changed;
printf "%5d. cid %7d: %5d -> %5d bytes\n",
$count_changed, $cid, length($text), length($new_text);
sleep 1 if ($count_changed % 100) == 0; # give DB a rest
}
my $t1 = new Benchmark;
print "$count_changed comments (of " . ($max_cid-$min_cid) . ")" .
" domaintagified in: " .
timestr(timediff($t1, $t0), 'noc') . "\n";
}
exit 0;
# subroutines
sub myAddDomainTags {
my($html) = @_;
# First step is to eliminate unclosed <A> tags.
my $in_a = 0;
$html =~ s
{
( < (/?) A \b[^>]* > )
}{
my $old_in_a = $in_a;
my $new_in_a = !$2;
$in_a = $new_in_a;
(($old_in_a && $new_in_a) ? "</A>" : "") . $1
}gixe;
$html .= "</A>" if $in_a;
# Now, since we know that every <A> has a </A>, this pattern will
# match and let the subroutine above do its magic properly.
$html =~ s
{
(<A[ ]HREF="
([^">]+)
">[\000-\377]+?)
</A\b[^>]*>
}{
$1 . _url_to_domain_tag($2)
}gixe;
return $html;
}
sub _url_to_domain_tag {
my($link) = @_;
my $absolutedir = $vars->{absolutedir}{value};
my $uri = URI->new_abs($link, $absolutedir);
my($info, $host, $scheme) = ("", "", "");
if ($uri->can("host") and $host = $uri->host) {
$info = lc $host;
if ($info =~ m/^([\d.]+)\.in-addr\.arpa$/) {
$info = join(".", reverse split /\./, $1);
}
if ($info =~ m/^(\d{1,3}\.){3}\d{1,3}$/) {
# leave a numeric IP address alone
} elsif ($info =~ m/([\w-]+\.[a-z]{3,4})$/) {
# a.b.c.d.com -> d.com
$info = $1;
} elsif ($info =~ m/([\w-]+\.[a-z]{2,4}\.[a-z]{2})$/) {
# a.b.c.d.co.uk -> d.co.uk
$info = $1;
} elsif ($info =~ m/([a-z]+\.[a-z]{2})$/) {
# a.b.c.realdomain.gr -> realdomain.gr
$info = $1;
} else {
# any other a.b.c.d.e -> c.d.e
my @info = split /\./, $info;
my $num_levels = scalar @info;
if ($num_levels >= 3) {
$info = join(".", @info[-3..-1]);
}
}
} elsif ($uri->can("scheme") and $scheme = $uri->scheme) {
# Most schemes, like ftp or http, have a host. Some,
# most notably mailto and news, do not. For those,
# at least give the user an idea of why not, by
# listing the scheme.
$info = lc $scheme;
} else {
$info = "?";
}
if (length($info) >= 25) {
$info = substr($info, 0, 10) . "..." . substr($info, -10);
}
return "</a $info>";
}
sub usage {
print "*** $_[0]\n" if $_[0];
# Remember to doublecheck these match getopts()!
print <<EOT;
Usage: $PROGNAME [OPTIONS] [#comments]
This utility creates test comments for a given Slash site. This program is for
testing purposes, only, particularly for those ambitious Slash users out there
who want to try their hand at modifying the comment or moderation systems.
Main options:
-h Help (this message)
-v Version
-u Virtual user (default is "slash")
-f First date (unix timestamp e.g. 987100000)
-l Last date (unix timestamp e.g. 987500000)
EOT
exit;
}
sub version {
print <<EOT;
$PROGNAME $VERSION
This code is a part of Slash, and is released under the GPL.
Copyright 1997-2005 by Open Source Technology Group. See README
and COPYING for more information, or see http://slashcode.com/.
EOT
exit;
}
__END__