Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Fetching contributors…

Cannot retrieve contributors at this time

355 lines (282 sloc) 12.262 kb
# -*- encoding: utf-8 -*-
require 'helper'
describe PostRank::URI do
let(:igvita) { 'http://igvita.com/' }
context "escaping" do
it "should escape PostRank::URI string" do
PostRank::URI.escape('id=1').should == 'id%3D1'
end
it "should escape spaces as %20's" do
PostRank::URI.escape('id= 1').should match('%20')
end
end
context "unescape" do
it "should unescape PostRank::URI" do
PostRank::URI.unescape(PostRank::URI.escape('id=1')).should == 'id=1'
end
it "should unescape PostRank::URI with spaces" do
PostRank::URI.unescape(PostRank::URI.escape('id= 1')).should == 'id= 1'
end
context "accept improperly escaped PostRank::URI strings" do
# See http://tools.ietf.org/html/rfc3986#section-2.3
it "should unescape PostRank::URI with spaces encoded as '+'" do
PostRank::URI.unescape('id=+1').should == 'id= 1'
end
it "should unescape PostRank::URI with spaces encoded as '+'" do
PostRank::URI.unescape('id%3D+1').should == 'id= 1'
end
it "should unescape PostRank::URI with spaces encoded as %20" do
PostRank::URI.unescape('id=%201').should == 'id= 1'
end
end
end
context "normalize" do
def n(uri)
PostRank::URI.normalize(uri).to_s
end
it "should normalize paths in PostRank::URIs" do
n('http://igvita.com/').should == igvita
n('http://igvita.com').to_s.should == igvita
n('http://igvita.com///').should == igvita
n('http://igvita.com/../').should == igvita
n('http://igvita.com/a/b/../../').should == igvita
n('http://igvita.com/a/b/../..').should == igvita
end
it "should normalize query strings in PostRank::URIs" do
n('http://igvita.com/?').should == igvita
n('http://igvita.com?').should == igvita
n('http://igvita.com/a/../?').should == igvita
end
it "should normalize anchors in PostRank::URIs" do
n('http://igvita.com#test').should == igvita
n('http://igvita.com#test#test').should == igvita
n('http://igvita.com/a/../?#test').should == igvita
end
it "should clean whitespace in PostRank::URIs" do
n('http://igvita.com/a/../? ').should == igvita
n('http://igvita.com/a/../? #test').should == igvita
n('http://igvita.com/ /../').should == igvita
end
it "should default to http scheme if missing" do
n('igvita.com').should == igvita
n('https://test.com/').to_s.should == 'https://test.com/'
end
it "should downcase hostname" do
n('IGVITA.COM').should == igvita
n('IGVITA.COM/ABC').should == (igvita + "ABC")
end
it "should remove trailing slash on paths" do
n('http://igvita.com/').should == 'http://igvita.com/'
n('http://igvita.com/a').should == 'http://igvita.com/a'
n('http://igvita.com/a/').should == 'http://igvita.com/a'
n('http://igvita.com/a/b').should == 'http://igvita.com/a/b'
n('http://igvita.com/a/b/').should == 'http://igvita.com/a/b'
end
end
context "canonicalization" do
def c(uri)
PostRank::URI.c18n(uri).to_s
end
context "query parameters" do
it "should handle nester parameters" do
c('igvita.com/?id=a&utm_source=a').should == 'http://igvita.com/?id=a'
end
it "should preserve order of parameters" do
url = 'http://a.com/?'+('a'..'z').to_a.shuffle.map {|e| "#{e}=#{e}"}.join("&")
c(url).should == url
end
it "should remove Google Analytics parameters" do
c('igvita.com/?id=a&utm_source=a').should == 'http://igvita.com/?id=a'
c('igvita.com/?id=a&utm_source=a&utm_valid').should == 'http://igvita.com/?id=a&utm_valid'
end
it "should remove awesm/sms parameters" do
c('igvita.com/?id=a&utm_source=a&awesm=b').should == 'http://igvita.com/?id=a'
c('igvita.com/?id=a&sms_ss=a').should == 'http://igvita.com/?id=a'
end
end
context "hashbang" do
it "should rewrite twitter links to crawlable versions" do
c('http://twitter.com/#!/igrigorik').should == 'http://twitter.com/igrigorik'
c('http://twitter.com/#!/a/statuses/1').should == 'http://twitter.com/a/statuses/1'
c('http://nontwitter.com/#!/a/statuses/1').should == 'http://nontwitter.com/#!/a/statuses/1'
end
end
context "tumblr" do
it "should strip slug" do
c('http://test.tumblr.com/post/4533459403/some-text').should == 'http://test.tumblr.com/post/4533459403/'
c('http://tumblr.com/xjl2evo3hh').should == 'http://tumblr.com/xjl2evo3hh'
end
end
context "embedded links" do
it "should extract embedded redirects from Google News" do
u = c('http://news.google.com/news/url?sa=t&fd=R&&url=http://www.ctv.ca/CTVNews/Politics/20110111/')
u.should == 'http://www.ctv.ca/CTVNews/Politics/20110111'
end
it "should extract embedded redirects from xfruits.com" do
u = c('http://xfruits.com/MrGroar/?url=http%3A%2F%2Faap.lesroyaumes.com%2Fdepeches%2Fdepeche351820908.html')
u.should == 'http://aap.lesroyaumes.com/depeches/depeche351820908.html'
end
it "should extract embedded redirects from MySpace" do
u = c('http://www.myspace.com/Modules/PostTo/Pages/?u=http%3A%2F%2Fghanaian-chronicle.com%2Fnews%2Fother-news%2Fcanadian-high-commissioner-urges-media%2F&t=Canadian%20High%20Commissioner%20urges%20media')
u.should == 'http://ghanaian-chronicle.com/news/other-news/canadian-high-commissioner-urges-media'
end
end
end
context "clean" do
def c(uri)
PostRank::URI.clean(uri)
end
it "should unescape, c18n and normalize" do
c('http://igvita.com/?id=1').should == 'http://igvita.com/?id=1'
c('igvita.com/?id=1').should == 'http://igvita.com/?id=1'
c('http://igvita.com/?id= 1').should == 'http://igvita.com/?id=%201'
c('http://igvita.com/?id=+1').should == 'http://igvita.com/?id=%201'
c('http://igvita.com/?id%3D%201').should == 'http://igvita.com/?id=%201'
c('igvita.com/a/..?id=1&utm_source=a&awesm=b#c').should == 'http://igvita.com/?id=1'
c('igvita.com?id=<>').should == 'http://igvita.com/?id=%3C%3E'
c('igvita.com?id="').should == 'http://igvita.com/?id=%22'
c('test.tumblr.com/post/23223/text-stub').should == 'http://test.tumblr.com/post/23223'
end
it "should clean host specific parameters" do
YAML.load_file('spec/c18n_hosts.yml').each do |orig, clean|
c(orig).should == clean
end
end
end
context "hash" do
def h(uri, opts = {})
PostRank::URI.hash(uri, opts)
end
it "should compute the MD5 hash without cleaning the URI" do
hash = '55fae8910d312b7878a3201ed653b881'
h('http://everburning.com/feed/post/1').should == hash
h('everburning.com/feed/post/1').should_not == hash
end
it "should normalize the URI if requested and compute MD5 hash" do
hash = '55fae8910d312b7878a3201ed653b881'
h('http://EverBurning.Com/feed/post/1', :clean => true).should == hash
h('Everburning.com/feed/post/1', :clean => true).should == hash
h('everburning.com/feed/post/1', :clean => true).should == hash
h('everburning.com/feed/post/1/', :clean => true).should == hash
end
end
context "extract" do
def e(text)
PostRank::URI.extract(text)
end
context "TLDs" do
it "should not pick up bad grammar as a domain name and think it has a link" do
e("yah.lets").should be_empty
end
it "should not pickup bad TLDS" do
e('stuff.zz a.b.c d.zq').should be_empty
end
end
it "should extract twitter links with hashbangs" do
e('test http://twitter.com/#!/igrigorik').should include('http://twitter.com/igrigorik')
end
it "should extract mobile twitter links with hashbangs" do
e('test http://mobile.twitter.com/#!/_mm6').should include('http://mobile.twitter.com/_mm6')
end
it "should handle a URL that comes after text without a space" do
e("text:http://spn.tw/tfnLT").should include("http://spn.tw/tfnLT")
e("text;http://spn.tw/tfnLT").should include("http://spn.tw/tfnLT")
e("text.http://spn.tw/tfnLT").should include("http://spn.tw/tfnLT")
e("text-http://spn.tw/tfnLT").should include("http://spn.tw/tfnLT")
end
it "should not pick up anything on or after the first . in the path of a URL with a shortener domain" do
e("http://bit.ly/9cJ2mz......if ur pickin up anythign here, u FAIL.").should == ["http://bit.ly/9cJ2mz"]
end
it "should pickup urls without protocol" do
u = e('abc.com abc.co')
u.should include('http://abc.com/')
u.should include('http://abc.co/')
end
it "should pickup urls inside tags" do
u = e("<a href='http://bit.ly/3fds3'>abc.com</a>")
u.should include('http://abc.com/')
end
context "multibyte characters" do
it "should stop extracting URLs at the full-width CJK space character" do
e("http://www.youtube.com/watch?v=w_j4Lda25jA  とんかつ定食").should == ["http://www.youtube.com/watch?v=w_j4Lda25jA"]
end
end
end
context "href extract" do
it "should extract links from html text" do
g,b = PostRank::URI.extract_href("<a href='google.com'>link to google</a> with text <a href='b.com'>stuff</a>")
g.first.should == 'http://google.com/'
b.first.should == 'http://b.com/'
g.last.should == 'link to google'
b.last.should == 'stuff'
end
it "should handle empty hrefs" do
lambda do
l = PostRank::URI.extract_href("<a>link to google</a> with text <a href=''>stuff</a>")
l.should be_empty
end.should_not raise_error
end
context "relative paths" do
it "should reject relative paths" do
l = PostRank::URI.extract_href("<a href='/stuff'>link to stuff</a>")
l.should be_empty
end
it "should resolve relative paths if host is provided" do
i = PostRank::URI.extract_href("<a href='/stuff'>link to stuff</a>", "igvita.com").first
i.first.should == 'http://igvita.com/stuff'
i.last.should == 'link to stuff'
end
end
context "domain extraction" do
url_list = {
"http://alex.pages.example.com" => "example.com",
"alex.pages.example.com" => "example.com",
"http://example.com/2011/04/01/blah" => "example.com",
"http://example.com" => "example.com",
"example.com" => "example.com",
"ExampLe.com" => "example.com",
"ExampLe.com:3000" => "example.com",
"http://alex.pages.example.COM" => "example.com",
"http://www.example.ag.it/2011/04/01/blah" => "example.ag.it",
"ftp://www.example.com/2011/04/01/blah" => 'example.com',
"http://com" => nil,
"http://alex.pages.examplecom" => nil,
"example" => nil,
"http://127.0.0.1" => nil,
"localhost" => nil,
"hello-there.com/you" => "hello-there.com"
}
url_list.each_pair do |url, expected_result|
it "should extract #{expected_result.inspect} from #{url}" do
u = PostRank::URI.clean(url, :raw => true)
u.domain.should == expected_result
end
end
end
end
context "parse" do
it 'should not fail on large host-part look-alikes' do
PostRank::URI.parse('a'*64+'.ca').host.should == nil
end
it 'should not pancake javascript scheme URIs' do
PostRank::URI.parse('javascript:void(0);').scheme.should == 'javascript'
end
it 'should not pancake mailto scheme URIs' do
PostRank::URI.parse('mailto:void(0);').scheme.should == 'mailto'
end
it 'should not pancake xmpp scheme URIs' do
PostRank::URI.parse('xmpp:void(0);').scheme.should == 'xmpp'
end
end
context 'valid?' do
it 'marks incomplete URI string as invalid' do
PostRank::URI.valid?('/path/page.html').should be_false
end
it 'marks www.test.c as invalid' do
PostRank::URI.valid?('http://www.test.c').should be_false
end
it 'marks www.test.com as valid' do
PostRank::URI.valid?('http://www.test.com').should be_true
end
end
end
Jump to Line
Something went wrong with that request. Please try again.