Fish out interesting links from pages that don't have RSS feeds.

skington · Mar 24, 2012 · c91efe1 · c91efe1
1 parent db96c3d
commit c91efe1
Show file tree

Hide file tree

Showing 3 changed files with 104 additions and 29 deletions.
diff --git a/TODO b/TODO
@@ -1,26 +1,3 @@
-Working out the image (and therefore knowing whether the page has changed):
-
-1) Look at the largest image. This works most of the time.
-
-2) Look for a redirect to a canonical URL - e.g. explosm.net/comics to
-explosm.net/comics/2452, order of the stick, PvP
-
-3) Look for some sort of update - e.g. 2dgoggles.com
-
-4) Check the status of some other image - e.g. dresdencodak.com,
-abominable.cc, sisterclaire.com, somethingpositive.net, sluggy.com, wondermark
-
-5) Cope with HTTP::Async not coping with https URLs -
-e.g. ubersoft.net/comic/hd 
-
-6) Cope with webcomics requiring a click-through - e.g. oglaf
-
-7) Improve performance by not fetching paypal images - done?
-
-8) Let people choose whether to see news or not.
-
-Working out canonical URLs
-
 RSS feed issues:
 Doonesbury has a non-Doonesbury-related slate feed
 No feed for Dilbert (but there's a link to it).
@@ -44,4 +21,16 @@ Sinfest: image is /comikaze/comics/yyyy-mm-dd.gif. Link is to /archive_page.php?
 Gunnerkrigg Court: link to /archive_page.php?comicID=nnnn, contains img src prev_a.jpg, alt 'Previous'. Today's page exists.
 Sluggy: useless RSS feed. Link to /comics/archives/daily/yymmdd, contains image class "ui-icon-seek-prev", text "Prev.". Today's link redirects to front page.
 El Goonish Shive: finds correct image, /comics/yyyy/mm/yyyymmdd_....png. Link to /?date=yyyy-mm-dd, contains image src /templates/home/arrow_prev.gif. Today's link exists.
-Project Apollo: link to abnnn.html, contains image, alt 'Previous Page', src layout/button-back.png. Page doesn't exist. Site fucked.
+Project Apollo: link to abnnn.html, contains image, alt 'Previous Page', src layout/button-back.png. Page doesn't exist. Site fucked.
+
+So, interesting things:
+* link title Prev or Previous
+* class contains Prev
+* rel contains prev
+* image back.jpg or prev_a.gif or arrow_prev.gif
+* image alt Previous
+* link text contains PREVIOUS or Prev
+
+General TODO:
+
+* Get alt text
diff --git a/lib/webcomics.pm b/lib/webcomics.pm
@@ -123,14 +123,20 @@ sub addnew {
         return template 'addnew_response', \%template_params;
     }
 
+    # Extract all interesting-looking links.
+    $template_params{links}
+        = [identify_interesting_links(extract_links($url, $tree))];
+
     # Find the largest image on the page and find out how to identify
     # it.
-    my $largest_image = find_largest_image($url, $tree);
-    $template_params{image} = $largest_image;
+    if (q{Care about this} eq q{A lot}) {
+        my $largest_image = find_largest_image($url, $tree);
+        $template_params{image} = $largest_image;
 
-    # Work out how to identify this image.
-    $template_params{identifiers}
-        = [identifiers_from_element($tree, $largest_image->{element})];
+        # Work out how to identify this image.
+        $template_params{identifiers}
+            = [identifiers_from_element($tree, $largest_image->{element})];
+    }
 
     # We're done with our tree, so delete it to free up memory.
     $tree->delete;
@@ -541,6 +547,58 @@ sub identify_sequence_regexstr {
     return %sequence_regexstr_length;
 }
 
+# Supplied with a list of links, as returned by extract_links, returns
+# only those that look interesting: links where any value in the hash
+# (URL, link text, class, id, rel, image alt or image source) contains
+# "prev", "previous" or "back" as a separate word, case-insensitively.
+# An underscore counts as a word separator on either side, so that both
+# arrow_prev.gif and prev_a.jpg match.
+# Links are returned in their original order, at most once each.
+
+sub identify_interesting_links {
+    my (@links) = @_;
+
+    my $interesting_re = qr/
+        (?: \b | _ )
+        (?: prev (?: ious )? | back )
+        (?: \b | _ )
+    /xi;
+
+    my @interesting_links;
+    LINK:
+    for my $link (@links) {
+        for my $value (values %$link) {
+            # Optional fields may be present but undefined; matching
+            # against them would only produce warnings.
+            next if !defined $value;
+            if ($value =~ $interesting_re) {
+                push @interesting_links, $link;
+                next LINK;
+            }
+        }
+    }
+    return @interesting_links;
+}
+
+# Supplied with a URL and a HTML::TreeBuilder tree, returns a list of
+# hashrefs, one for each <a> element in the tree that has an href.
+# Each hashref has the following keys:
+#  url: the link's href, made absolute against the supplied URL
+#      (a URI object)
+#  class, id, rel: the attribute of that name; present only if the link
+#      has a non-empty one
+#  img_alt: the alt text of an image inside the link (the first one
+#      look_down finds); present only if that image has an alt attribute
+#  img_src: the absolute URL of that image (a URI object); present only
+#      if that image has a src attribute
+#  text: the text of the link; present only if non-empty
+# Optional keys are omitted rather than set to undef, so every value in
+# a returned hash is defined.
+
+sub extract_links {
+    my ($url, $tree) = @_;
+
+    my @link_data;
+    LINK:
+    for my $link ($tree->look_down(_tag => 'a')) {
+        # Anchors without an href (e.g. <a name="...">) don't link
+        # anywhere, and an undefined href cannot be resolved sensibly.
+        my $href = $link->attr('href');
+        next LINK if !defined $href;
+
+        my %link_data = (url => URI->new_abs($href, $url));
+        for my $attribute ('class', 'id', 'rel') {
+            my $value = $link->attr($attribute);
+            if (defined $value && length $value) {
+                $link_data{$attribute} = $value;
+            }
+        }
+        if (my $image = $link->look_down(_tag => 'img')) {
+            my $alt = $image->attr('alt');
+            my $src = $image->attr('src');
+            $link_data{img_alt} = $alt if defined $alt;
+            $link_data{img_src} = URI->new_abs($src, $url) if defined $src;
+        }
+        my $text = $link->as_text;
+        if (defined $text && length $text) {
+            $link_data{text} = $text;
+        }
+        push @link_data, \%link_data;
+    }
+    return @link_data;
+}
+
 # Supplied with a URL and a HTML::TreeBuilder tree, returns a hashref with the
 # following fields:
 #   element: HTML::Element object for the <img> tag

diff --git a/views/addnew_response.tt b/views/addnew_response.tt
@@ -50,6 +50,34 @@ value="<% entry.date_entry %>" />
 
 <% END %>
 
+<%# Candidate links found on the submitted page. Every value comes from that page, so it is HTML-escaped before output. %>
+<% IF links %>
+  <ul>
+  <% FOREACH link IN links %>
+    <li>
+    URL: <% link.url | html %>
+    <% IF link.text %>
+      <br/> Link text "<% link.text | html %>".
+    <% END %>
+    <% IF link.id %>
+      <br/> ID "<% link.id | html %>".
+    <% END %>
+    <% IF link.class %>
+      <br/> Class "<% link.class | html %>".
+    <% END %>
+    <% IF link.rel %>
+      <br/> Rel "<% link.rel | html %>".
+    <% END %>
+    <% IF link.img_alt %>
+      <br/> Image alt "<% link.img_alt | html %>".
+    <% END %>
+    <% IF link.img_src %>
+      <br/> Image link "<% link.img_src | html %>".
+    <% END %>
+    </li>
+  <% END %>
+  </ul>
+<% END %>
+
 <% IF image.url %>
 <p>Image URL: <% image.url %> </p>
 <ul>