Permalink
Browse files

Two major changes together (sorry) -- getSimilarStory and slashizedLi…

…nks.

"Similar story" code adds a task to plugins/Admin which generates a
list of "uncommon words" and URLs (currently stored in a var, which I
don't much like but it's there for now) from recent stories.  Each story
preview/edit uses that list and the list of words in the edited story to
find a list of similar stories, which can help admins find useful links
and duplicates.  And "slashizeLinks," part of Slash::Utility::Data and
referenced by admin.pl, Slash.pm and article.pl, converts all article
and comments links internal to a Slash site (including .shtml links),
in stories, to a standard format which is parsed at display time.  This
ensures that such links are in a standard format, and that static pages
link to static pages and dynamic pages to dynamic.  Also, a few minor
little bugs and spelling mistakes are fixed.
  • Loading branch information...
1 parent 3672774 commit 7736a750ad3632ca7807b8cdba5cfe96d95e7d23 @jamiemccarthy jamiemccarthy committed May 26, 2002
@@ -4772,6 +4772,128 @@ sub getStory {
}
########################################################
+sub getSimilarStories {
+ my($self, $story, $max_wanted) = @_;
+ $max_wanted ||= 100;
+ my $constants = getCurrentStatic();
+ my($title, $introtext, $bodytext) =
+ ($story->{title} || "",
+ $story->{introtext} || "",
+ $story->{bodytext} || "");
+ my $not_original_sid = $story->{sid} || "";
+ $not_original_sid = " AND stories.sid != "
+ . $self->sqlQuote($not_original_sid)
+ if $not_original_sid;
+
+ my $text = "$title $introtext $bodytext";
+ # Find a list of all the words in the current story.
+ my $text_words = findWords($text);
+ # Load up the list of words in recent stories (the only ones we
+ # need to concern ourselves with looking for).
+ my @recent_uncommon_words = split " ",
+ ($self->getVar("uncommonstorywords", "value") || "");
+ # If we don't (yet) know the list of uncommon words, return now.
+ return [ ] unless @recent_uncommon_words;
+ # Find the intersection of this story and recent stories.
+ my @text_uncommon_words =
+ sort {
+ $text_words->{$b}{weight} <=> $text_words->{$a}{weight}
+ ||
+ $a cmp $b
+ }
+ grep { $text_words->{$_}{count} }
+ grep { length($_) > 3 }
+ @recent_uncommon_words;
+#use Data::Dumper;
+#print STDERR "text_words: " . Dumper($text_words);
+#print STDERR "uncommon intersection: '@text_uncommon_words'\n";
+ # If there is no intersection, return now.
+ return [ ] unless @text_uncommon_words;
+ # Find previous stories which have used these words.
+ my $where = "";
+ my @where_clauses = ( );
+ for my $word (@text_uncommon_words) {
+ my $word_q = $self->sqlQuote('%' . $word . '%');
+ push @where_clauses, "stories.title LIKE $word_q";
+ push @where_clauses, "story_text.introtext LIKE $word_q";
+ push @where_clauses, "story_text.bodytext LIKE $word_q";
+ }
+ $where = join(" OR ", @where_clauses);
+ my $n_days = $constants->{similarstorydays} || 30;
+ my $stories = $self->sqlSelectAllHashref(
+ "sid",
+ "stories.sid AS sid, title, introtext, bodytext,
+ time, displaystatus",
+ "stories, story_text",
+ "stories.sid = story_text.sid $not_original_sid
+ AND stories.time >= DATE_SUB(NOW(), INTERVAL $n_days DAY)
+ AND ($where)"
+ );
+#print STDERR "similar stories: " . Dumper($stories);
+
+ for my $sid (keys %$stories) {
+ # Add up the weights of each story in turn, for how closely
+ # they match with the current story. Include a multiplier
+ # based on the length of the match.
+ my $s = $stories->{$sid};
+ $s->{weight} = 0;
+ for my $word (@text_uncommon_words) {
+ my $word_weight = 0;
+ my $wr = qr{(?i:\b\Q$word\E)};
+ my $m = log(length $word);
+ $word_weight += 2.0*$m * (() = $s->{title} =~ m{$wr}g);
+ $word_weight += 1.0*$m * (() = $s->{introtext} =~ m{$wr}g);
+ $word_weight += 0.5*$m * (() = $s->{bodytext} =~ m{$wr}g);
+ $s->{word_hr}{$word} = $word_weight if $word_weight > 0;
+ $s->{weight} += $word_weight;
+ }
+ # Round off weight to 0 decimal places (to an integer).
+ $s->{weight} = sprintf("%.0f", $s->{weight});
+ # Store (the top-scoring-ten of) the words that connected
+ # the original story to this story.
+ $s->{words} = [
+ sort { $s->{word_hr}{$b} <=> $s->{word_hr}{$a} }
+ keys %{$s->{word_hr}}
+ ];
+ $#{$s->{words}} = 9 if $#{$s->{words}} > 9;
+ }
+ # If any stories match and are above the threshold, return them.
+ # Pull out the top $max_wanted scorers. Then sort them by time.
+ my $minweight = $constants->{similarstoryminweight} || 4;
+ my @sids = sort {
+ $stories->{$b}{weight} <=> $stories->{$a}{weight}
+ ||
+ $stories->{$b}{'time'} cmp $stories->{$a}{'time'}
+ ||
+ $a cmp $b
+ } grep { $stories->{$_}{weight} >= $minweight } keys %$stories;
+#print STDERR "all sids @sids stories " . Dumper($stories);
+ return [ ] if !@sids;
+ $#sids = $max_wanted-1 if $#sids > $max_wanted-1;
+ # Now that we have only the ones we want, push them onto the
+ # return list sorted by time.
+ my $ret_ar = [ ];
+ for my $sid (sort {
+ $stories->{$b}{'time'} cmp $stories->{$a}{'time'}
+ ||
+ $stories->{$b}{weight} <=> $stories->{$a}{weight}
+ ||
+ $a cmp $b
+ } @sids) {
+ push @$ret_ar, {
+ sid => $sid,
+ weight => sprintf("%.0f", $stories->{$sid}{weight}),
+ title => $stories->{$sid}{title},
+ 'time' => $stories->{$sid}{'time'},
+ words => $stories->{$sid}{words},
+ displaystatus => $stories->{$sid}{displaystatus},
+ };
+ }
+#print STDERR "ret_ar " . Dumper($ret_ar);
+ return $ret_ar;
+}
+
+########################################################
sub getAuthor {
my($self) = @_;
_genericCacheRefresh($self, 'authors_cache', getCurrentStatic('block_expire'));
@@ -1029,6 +1029,58 @@ sub createAuthorCache {
}
########################################################
+# For plugins/Admin/refresh_uncommon.pl
+sub refreshUncommonStoryWords {
+ my($self) = @_;
+ my $constants = getCurrentStatic();
+ my $ignore_threshold = $constants->{uncommonstoryword_thresh} || 2;
+ my $n_days = $constants->{similarstorydays} || 30;
+ $ignore_threshold = int($n_days/$ignore_threshold+0.5);
+
+ # First, get a collection of all words posted in stories for the last
+ # however-many days.
+ my $arr = $self->sqlSelectAll(
+ "title, introtext, bodytext",
+ "story_text, stories",
+ "stories.sid = story_text.sid
+ AND stories.time >= DATE_SUB(NOW(), INTERVAL $n_days DAY)"
+ );
+ my $word_hr = { };
+ for my $ar (@$arr) {
+ findWords($ar->[0], 8 , $word_hr) if $ar->[0]; # title
+ findWords($ar->[1], 1 , $word_hr) if $ar->[1]; # introtext
+ findWords($ar->[2], 0.5, $word_hr) if $ar->[2]; # bodytext
+ }
+#use Data::Dumper; print STDERR Dumper($word_hr);
+
+ # The only words that count as uncommon are the ones that appear in
+ # stories less frequently than once every uncommonstoryword_thresh
+ # days. Everything else is, well, too common to bother with.
+ my @uncommon_words = ( );
+ my $maxlen = $constants->{uncommonstorywords_maxlen} || 65000;
+ my $minlen = $constants->{uncommonstoryword_minlen} || 3;
+ my $length = $maxlen+1;
+ @uncommon_words =
+ sort {
+ $word_hr->{$b}{weight} <=> $word_hr->{$a}{weight}
+ ||
+ length($b) <=> length($a)
+ ||
+ $a cmp $b
+ }
+ grep { $word_hr->{$_}{count} <= $ignore_threshold }
+ grep { length($_) > $minlen }
+ keys %$word_hr;
+#print STDERR "@uncommon_words\n";
+ my $uncommon_words = substr(join(" ", @uncommon_words), 0, $maxlen);
+ if (length($uncommon_words) == $maxlen) {
+ $uncommon_words =~ s/\s+\S+\Z//;
+ }
+
+ $self->setVar("uncommonstorywords", $uncommon_words);
+}
+
+########################################################
# For tasks/flush_formkeys.pl
sub deleteOldFormkeys {
my($self, $timeframe) = @_;
View
@@ -198,6 +198,8 @@ sub selectComments {
# If we are refreshing, PRINT the totals so freshenup/archive
# tasks can update without having to redo all of the work.
+ push @{$comments->{0}{totals}}, 0
+ while scalar(@{$comments->{0}{totals}}) < $max-$min;
my $hp = join ',', @{$comments->{0}{totals}};
print STDERR "count $count, hitparade $hp\n";
}
@@ -912,8 +914,13 @@ sub displayStory {
# well, also, dispStory needs a story reference, not an SID,
# though that could be changed -- pudge
+ # And now we're also calling parseSlashizedLinks. - 2002/05/24 Jamie
+
$story->{storytime} = timeCalc($story->{'time'});
+ $story->{introtext} = parseSlashizedLinks($story->{introtext});
+ $story->{bodytext} = parseSlashizedLinks($story->{bodytext});
+
# get extra data from section table for this story
# (if exists)
# this only needs to run for slashdot
Oops, something went wrong. Retry.

0 comments on commit 7736a75

Please sign in to comment.