Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

use Bundler for gem; require guess_html_encoding for Ruby 1.9.2

  • Loading branch information...
commit 975ce96b5e97b3c80163ed35b797b38fce57e676 1 parent 15cb43c
@cantino cantino authored
View
28 .gitignore
@@ -1,23 +1,7 @@
-pkg/*
-=======
-## MAC OS
.DS_Store
-
-## TEXTMATE
-*.tmproj
-tmtags
-
-## EMACS
-*~
-\#*
-.\#*
-
-## VIM
-*.swp
-
-## PROJECT::GENERAL
-coverage
-rdoc
-pkg
-
-## PROJECT::SPECIFIC
+.gem
+.bundle
+Gemfile.lock
+pkg/*
+.idea
+.rvmrc
View
4 Gemfile
@@ -0,0 +1,4 @@
+source "http://rubygems.org"
+
+# Specify your gem's dependencies in ruby-readability.gemspec
+gemspec
View
17 README
@@ -15,13 +15,28 @@ Example:
source = open('http://lab.arc90.com/experiments/readability/').read
puts Readability::Document.new(source).content
-There is also a command-line tool for testing readability in bin/readability.
+Options:
+
+ You may provide additional options to Readability::Document.new, including:
+
+ :tags - the base whitelist of tags to sanitize, defaults to %w[div p]
+ :remove_empty_nodes - remove <p> tags that have no text content; this will also remove p tags that contain only images
+ :attributes - whitelist of allowed attributes
+ :debug - provide debugging output, defaults false
+ :encoding - if this page is of a known encoding, you can specify it; if left unspecified, the encoding will be guessed (only in Ruby 1.9.2)
+ :html_headers - in Ruby 1.9.2 these will be passed to the guess_html_encoding gem to aid with encoding guessing
+
+Readability comes with a command-line tool for experimentation in bin/readability.
Usage: readability [options] URL
-d, --debug Show debug output
-i, --images Keep images and links
-h, --help Show this message
+Potential issues:
+
+* If you're on a Mac and are getting segmentation faults, see this discussion https://github.com/tenderlove/nokogiri/issues/404 and consider updating your version of libxml2.
+
===
This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
View
45 Rakefile
@@ -1,45 +1,6 @@
-require 'rubygems'
-require 'rake'
+require "bundler/gem_tasks"
+require 'rspec/core/rake_task'
-begin
- require 'jeweler'
- Jeweler::Tasks.new do |gem|
- gem.name = "ruby-readability"
- gem.summary = %Q{Port of arc90's readability project to ruby}
- gem.description = %Q{Port of arc90's readability project to ruby}
- gem.email = "andrew@iterationlabs.com"
- gem.homepage = "http://github.com/iterationlabs/ruby-readability"
- gem.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
- gem.add_development_dependency "rspec", ">= 1.2.9"
- gem.add_dependency 'nokogiri', '>= 1.4.2'
- end
- Jeweler::GemcutterTasks.new
-rescue LoadError
- puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
-end
-
-require 'spec/rake/spectask'
-Spec::Rake::SpecTask.new(:spec) do |spec|
- spec.libs << 'lib' << 'spec'
- spec.spec_files = FileList['spec/**/*_spec.rb']
-end
-
-Spec::Rake::SpecTask.new(:rcov) do |spec|
- spec.libs << 'lib' << 'spec'
- spec.pattern = 'spec/**/*_spec.rb'
- spec.rcov = true
-end
-
-task :spec => :check_dependencies
+RSpec::Core::RakeTask.new(:spec)
task :default => :spec
-
-require 'rake/rdoctask'
-Rake::RDocTask.new do |rdoc|
- version = File.exist?('VERSION') ? File.read('VERSION') : ""
-
- rdoc.rdoc_dir = 'rdoc'
- rdoc.title = "ruby-readability #{version}"
- rdoc.rdoc_files.include('README*')
- rdoc.rdoc_files.include('lib/**/*.rb')
-end
View
1  VERSION
@@ -1 +0,0 @@
-0.2.4
View
16 lib/readability.rb
@@ -1,5 +1,6 @@
require 'rubygems'
require 'nokogiri'
+require 'guess_html_encoding'
module Readability
class Document
@@ -9,24 +10,29 @@ class Document
:remove_unlikely_candidates => true,
:weight_classes => true,
:clean_conditionally => true,
- :remove_empty_nodes => true,
- :encoding => 'UTF-8'
+ :remove_empty_nodes => true
}.freeze
attr_accessor :options, :html
def initialize(input, options = {})
- @input = input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
@options = DEFAULT_OPTIONS.merge(options)
+ @input = input
+
+ if RUBY_VERSION == "1.9.2" && !@options[:encoding]
+ @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
+ @options[:encoding] = @input.encoding.to_s
+ end
+
+ @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
@remove_unlikely_candidates = @options[:remove_unlikely_candidates]
@weight_classes = @options[:weight_classes]
@clean_conditionally = @options[:clean_conditionally]
- @encoding = @options[:encoding]
make_html
end
def make_html
- @html = Nokogiri::HTML(@input, nil, @encoding)
+ @html = Nokogiri::HTML(@input, nil, @options[:encoding])
end
REGEXES = {
View
1  lib/ruby-readability.rb
@@ -0,0 +1 @@
+require 'readability'
View
73 ruby-readability.gemspec
@@ -1,64 +1,23 @@
-# Generated by jeweler
-# DO NOT EDIT THIS FILE DIRECTLY
-# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
Gem::Specification.new do |s|
- s.name = %q{ruby-readability}
- s.version = "0.2.4"
-
- s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
- s.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
- s.date = %q{2011-06-09}
- s.default_executable = %q{readability}
+ s.name = "ruby-readability"
+ s.version = '0.2.5'
+ s.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
+ s.email = ["andrew@iterationlabs.com"]
+ s.homepage = "http://github.com/iterationlabs/ruby-readability"
+ s.summary = %q{Port of arc90's readability project to ruby}
s.description = %q{Port of arc90's readability project to ruby}
- s.email = %q{andrew@iterationlabs.com}
- s.executables = ["readability"]
- s.extra_rdoc_files = [
- "README"
- ]
- s.files = [
- ".document",
- "README",
- "Rakefile",
- "VERSION",
- "bin/readability",
- "lib/readability.rb",
- "ruby-readability.gemspec",
- "spec/fixtures/cant_read.html",
- "spec/fixtures/sample.html",
- "spec/fixtures/samples/blogpost_with_links-fragments.rb",
- "spec/fixtures/samples/blogpost_with_links.html",
- "spec/fixtures/samples/channel4-1-fragments.rb",
- "spec/fixtures/samples/channel4-1.html",
- "spec/fixtures/samples/foxnews-india1-fragments.rb",
- "spec/fixtures/samples/foxnews-india1.html",
- "spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb",
- "spec/fixtures/samples/globemail-ottawa-cuts.html",
- "spec/fixtures/should_not_truncate.txt",
- "spec/readability_spec.rb",
- "spec/spec.opts",
- "spec/spec_helper.rb"
- ]
- s.homepage = %q{http://github.com/iterationlabs/ruby-readability}
- s.require_paths = ["lib"]
- s.rubygems_version = %q{1.3.7}
- s.summary = %q{Port of arc90's readability project to ruby}
- if s.respond_to? :specification_version then
- current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
- s.specification_version = 3
+ s.rubyforge_project = "ruby-readability"
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
- s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
- s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.2"])
- else
- s.add_dependency(%q<rspec>, [">= 1.2.9"])
- s.add_dependency(%q<nokogiri>, [">= 1.4.2"])
- end
- else
- s.add_dependency(%q<rspec>, [">= 1.2.9"])
- s.add_dependency(%q<nokogiri>, [">= 1.4.2"])
- end
-end
+ s.files = `git ls-files`.split("\n")
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+ s.require_paths = ["lib"]
+ s.add_development_dependency "rspec", ">= 2.6"
+ s.add_dependency 'nokogiri', '>= 1.4.2'
+ s.add_dependency 'guess_html_encoding', '> 0.0.0'
+end
View
1  spec/fixtures/samples/blogpost_with_links-fragments.rb
@@ -1,3 +1,4 @@
+# encoding: UTF-8
# This sample originally from http://softarhive.net
$required_fragments = [
View
2  spec/fixtures/samples/channel4-1-fragments.rb
@@ -1,4 +1,4 @@
-
+# encoding: UTF-8
# This sample originally from http://www.channel4.com/news/articles/world/judge+rules+briton+can+be+forcefed/3578372
$required_fragments = [
View
2  spec/fixtures/samples/foxnews-india1-fragments.rb
@@ -1,4 +1,4 @@
-
+# encoding: UTF-8
# This sample originally from http://www.foxnews.com/world/2010/05/14/police-killed-bus-touches-high-voltage-wire-central-india/?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed:+foxnews/latest+(Text+-+Latest+Headlines)
$required_fragments = [
View
2  spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
@@ -1,4 +1,4 @@
-
+# encoding: UTF-8
# Originally from http://www.theglobeandmail.com/news/national/ottawa-cuts-already-vacant-positions/article1494400/
$required_fragments = [
View
30 spec/readability_spec.rb
@@ -1,4 +1,4 @@
-require File.expand_path(File.join(File.dirname(__FILE__), "spec_helper"))
+require 'spec_helper'
describe Readability do
before do
@@ -115,11 +115,9 @@
b[:content_score] <=> a[:content_score]
}.first[:elem][:id].should == "body"
end
- end
- describe "score_paragraphs" do
context "when two consequent br tags are used instead of p" do
- before :each do
+ it "should assign the higher score to the first paragraph in this particular example" do
@doc = Readability::Document.new(<<-HTML)
<html>
<head>
@@ -140,9 +138,6 @@
</html>
HTML
@candidates = @doc.score_paragraphs(0)
- end
-
- it "should assign the higher score to the first paragraph in this particular example" do
@candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id].should == 'post1'
end
end
@@ -204,14 +199,13 @@
end
it "should output expected fragments of text" do
-
checks = 0
@samples.each do |sample|
html = File.read(File.dirname(__FILE__) + "/fixtures/samples/#{sample}.html")
doc = Readability::Document.new(html).content
load "fixtures/samples/#{sample}-fragments.rb"
- puts "testing #{sample}..."
+ #puts "testing #{sample}..."
$required_fragments.each do |required_text|
doc.should include(required_text)
@@ -223,7 +217,23 @@
checks += 1
end
end
- puts "Performed #{checks} checks."
+ #puts "Performed #{checks} checks."
+ end
+ end
+
+ describe "encoding guessing" do
+ context "with ruby 1.9.2" do
+ it "should correctly guess and enforce HTML encoding" do
+
+ end
+
+ it "should allow encoding guessing to be skipped" do
+
+ end
+
+ it "should allow encoding guessing to be overridden" do
+
+ end
end
end
end
View
11 spec/spec_helper.rb
@@ -1,10 +1,9 @@
-$LOAD_PATH.unshift(File.dirname(__FILE__))
-$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+#$LOAD_PATH.unshift(File.dirname(__FILE__))
+#$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
require 'rubygems'
require 'readability'
-require 'spec'
-require 'spec/autorun'
-
-Spec::Runner.configure do |config|
+#require 'spec'
+#require 'spec/autorun'
+RSpec.configure do |c|
end
Please sign in to comment.
Something went wrong with that request. Please try again.