Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

first commit

  • Loading branch information...
commit 28f1ad7bef8d6010369aa1386859d095e9ee9e25 0 parents
@rinzi rinzi authored
25 LICENSE.rdoc
@@ -0,0 +1,25 @@
+= License
+
+(The MIT License)
+
+Copyright (c) 2009 Simone Rinzivillo <srinzivillo@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
8 Manifest
@@ -0,0 +1,8 @@
+LICENSE.rdoc
+Manifest
+README.rdoc
+Rakefile
+lib/robotstxt.rb
+lib/robotstxt/parser.rb
+test/parser_test.rb
+test/robotstxt_test.rb
45 README.rdoc
@@ -0,0 +1,45 @@
+= Robotstxt
+
+Robotstxt is a Ruby robots.txt file parser.
+
+Robotstxt Parser allows you to check the accessibility of URLs and get other data.
+
+Full support for the robots.txt RFC, wildcards and Sitemap: rules.
+
+
+== Features
+
+* Check if the URL is allowed to be crawled from your Robot
+* Analyze the robots.txt file to return an Array containing the list of XML Sitemaps URLs
+
+== Requirements
+
+* Ruby >= 1.8.7
+
+
+== Installation
+
+This library is intended to be installed via the
+RubyGems[http://rubyforge.org/projects/rubygems/] system.
+
+ $ gem install robotstxt
+
+You might need administrator privileges on your system to install it.
+
+
+
+== Author
+
+Author:: {Simone Rinzivillo}[http://www.simonerinzivillo.it/] <srinzivillo@gmail.com>
+
+
+== Resources
+
+* {Homepage}[http://www.simonerinzivillo.it/]
+
+
+
+== License
+
+Copyright (c) 2009 Simone Rinzivillo, Robotstxt is released under the MIT license.
+
57 Rakefile
@@ -0,0 +1,57 @@
+$:.unshift(File.dirname(__FILE__) + "/lib")
+
+require 'rubygems'
+require 'rake'
+require 'echoe'
+require 'robotstxt'
+
+
+# Common package properties
+PKG_NAME = 'robotstxt'
+PKG_VERSION = Robotstxt::VERSION
+RUBYFORGE_PROJECT = 'robotstxt'
+
+if ENV['SNAPSHOT'].to_i == 1
+ PKG_VERSION << "." << Time.now.utc.strftime("%Y%m%d%H%M%S")
+end
+
+
+Echoe.new(PKG_NAME, PKG_VERSION) do |p|
+ p.author = "Simone Rinzivillo"
+ p.email = "srinzivillo@gmail.com"
+ p.summary = "Robotstxt is an Ruby robots.txt file parser"
+ p.url = "http://www.simonerinzivillo.it"
+ p.project = RUBYFORGE_PROJECT
+ p.description = <<-EOD
+ Robotstxt Parser allows you to the check the accessibility of URLs and get other data. \
+ Full support for the robots.txt RFC, wildcards and Sitemap: rules.
+ EOD
+
+ p.need_zip = true
+
+ p.development_dependencies += ["rake ~>0.8",
+ "echoe ~>3.1"]
+
+ p.rcov_options = ["-Itest -x mocha,rcov,Rakefile"]
+end
+
+
+desc "Open an irb session preloaded with this library"
+task :console do
+ sh "irb -rubygems -I lib -r robotstxt.rb"
+end
+
+begin
+ require 'code_statistics'
+ desc "Show library's code statistics"
+ task :stats do
+ CodeStatistics.new(["Robotstxt", "lib"],
+ ["Tests", "test"]).to_s
+ end
+rescue LoadError
+ puts "CodeStatistics (Rails) is not available"
+end
+
+Dir["tasks/**/*.rake"].each do |file|
+ load(file)
+end
261 doc/classes/Robotstxt.html
@@ -0,0 +1,261 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+ <title>Module: Robotstxt [Robotstxt]</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
+ <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
+ <script type="text/javascript">
+ // <![CDATA[
+
+ function popupCode( url ) {
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
+ }
+
+ function toggleCode( id ) {
+ if ( document.getElementById )
+ elem = document.getElementById( id );
+ else if ( document.all )
+ elem = eval( "document.all." + id );
+ else
+ return false;
+
+ elemStyle = elem.style;
+
+ if ( elemStyle.display != "block" ) {
+ elemStyle.display = "block"
+ } else {
+ elemStyle.display = "none"
+ }
+
+ return true;
+ }
+
+ // Make codeblocks hidden by default
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
+
+ // ]]>
+ </script>
+
+</head>
+<body>
+
+
+ <div id="classHeader">
+ <table class="header-table">
+ <tr class="top-aligned-row">
+ <td><strong>Module</strong></td>
+ <td class="class-name-in-header">Robotstxt</td>
+ </tr>
+ <tr class="top-aligned-row">
+ <td><strong>In:</strong></td>
+ <td>
+
+
+ <a href="../files/lib/robotstxt_rb.html">
+
+ lib/robotstxt.rb
+
+ </a>
+
+
+ <br />
+
+
+ <a href="../files/lib/robotstxt/parser_rb.html">
+
+ lib/robotstxt/parser.rb
+
+ </a>
+
+
+ <br />
+
+ </td>
+ </tr>
+
+
+ </table>
+ </div>
+ <!-- banner header -->
+
+ <div id="bodyContent">
+
+ <div id="contextContent">
+
+ </div>
+
+
+ <div id="method-list">
+ <h3 class="section-bar">Methods</h3>
+
+ <div class="name-list">
+
+ <a href="#M000001">allowed?</a>&nbsp;&nbsp;
+
+ <a href="#M000002">sitemaps</a>&nbsp;&nbsp;
+
+ </div>
+ </div>
+
+ </div>
+
+ <!-- if includes -->
+
+ <div id="section">
+
+ <div id="class-list">
+ <h3 class="section-bar">Classes and Modules</h3>
+
+ Class <a href="Robotstxt/Parser.html" class="link">Robotstxt::Parser</a><br />
+
+ </div>
+
+ <div id="constants-list">
+ <h3 class="section-bar">Constants</h3>
+
+ <div class="name-list">
+ <table summary="Constants">
+
+ <tr class="top-aligned-row context-row">
+ <td class="context-item-name">NAME</td>
+ <td>=</td>
+ <td class="context-item-value">'Robotstxt'</td>
+
+ </tr>
+
+ <tr class="top-aligned-row context-row">
+ <td class="context-item-name">GEM</td>
+ <td>=</td>
+ <td class="context-item-value">'robotstxt'</td>
+
+ </tr>
+
+ <tr class="top-aligned-row context-row">
+ <td class="context-item-name">AUTHORS</td>
+ <td>=</td>
+ <td class="context-item-value">['Simone Rinzivillo &lt;srinzivillo@gmail.com&gt;']</td>
+
+ </tr>
+
+ <tr class="top-aligned-row context-row">
+ <td class="context-item-name">VERSION</td>
+ <td>=</td>
+ <td class="context-item-value">'0.5.2'</td>
+
+ </tr>
+
+ </table>
+ </div>
+ </div>
+
+
+
+
+ <!-- if method_list -->
+
+ <div id="methods">
+
+ <h3 class="section-bar">Public Class methods</h3>
+
+
+ <div id="method-M000001" class="method-detail">
+ <a name="M000001"></a>
+
+ <div class="method-heading">
+
+ <a href="#M000001" class="method-signature">
+
+ <span class="method-name">allowed?</span><span class="method-args">(url, robot_id)</span>
+
+ </a>
+
+ </div>
+
+ <div class="method-description">
+
+ <p>
+Check if the <tt>URL</tt> is allowed to be crawled from the current
+<tt>Robot_id</tt>. Robots:Allowed? returns <tt>true</tt> if the robots.txt
+file does not block the access to the URL.
+</p>
+<pre>
+ Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')
+</pre>
+
+ <p><a class="source-toggle" href="#"
+ onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
+ <div class="method-source-code" id="M000001-source">
+<pre>
+ <span class="ruby-comment cmt"># File lib/robotstxt.rb, line 35</span>
+35: <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">allowed?</span>(<span class="ruby-identifier">url</span>, <span class="ruby-identifier">robot_id</span>)
+36:
+37: <span class="ruby-identifier">u</span> = <span class="ruby-constant">URI</span>.<span class="ruby-identifier">parse</span>(<span class="ruby-identifier">url</span>)
+38: <span class="ruby-identifier">r</span> = <span class="ruby-constant">Robotstxt</span><span class="ruby-operator">::</span><span class="ruby-constant">Parser</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">robot_id</span>)
+39: <span class="ruby-identifier">r</span>.<span class="ruby-identifier">allowed?</span>(<span class="ruby-identifier">url</span>) <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">r</span>.<span class="ruby-identifier">get</span>(<span class="ruby-identifier">u</span>.<span class="ruby-identifier">scheme</span> <span class="ruby-operator">+</span> <span class="ruby-value str">'://'</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">u</span>.<span class="ruby-identifier">host</span>)
+40:
+41: <span class="ruby-keyword kw">end</span>
+</pre>
+ </div>
+
+ </div>
+ </div>
+
+
+ <div id="method-M000002" class="method-detail">
+ <a name="M000002"></a>
+
+ <div class="method-heading">
+
+ <a href="#M000002" class="method-signature">
+
+ <span class="method-name">sitemaps</span><span class="method-args">(url, robot_id)</span>
+
+ </a>
+
+ </div>
+
+ <div class="method-description">
+
+ <p>
+Analyze the robots.txt file to return an <tt>Array</tt> containing the list
+of XML Sitemaps URLs.
+</p>
+<pre>
+ Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest')
+</pre>
+
+ <p><a class="source-toggle" href="#"
+ onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
+ <div class="method-source-code" id="M000002-source">
+<pre>
+ <span class="ruby-comment cmt"># File lib/robotstxt.rb, line 47</span>
+47: <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">sitemaps</span>(<span class="ruby-identifier">url</span>, <span class="ruby-identifier">robot_id</span>)
+48:
+49: <span class="ruby-identifier">u</span> = <span class="ruby-constant">URI</span>.<span class="ruby-identifier">parse</span>(<span class="ruby-identifier">url</span>)
+50: <span class="ruby-identifier">r</span> = <span class="ruby-constant">Robotstxt</span><span class="ruby-operator">::</span><span class="ruby-constant">Parser</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">robot_id</span>)
+51: <span class="ruby-identifier">r</span>.<span class="ruby-identifier">sitemaps</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">r</span>.<span class="ruby-identifier">get</span>(<span class="ruby-identifier">u</span>.<span class="ruby-identifier">scheme</span> <span class="ruby-operator">+</span> <span class="ruby-value str">'://'</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">u</span>.<span class="ruby-identifier">host</span>)
+52:
+53: <span class="ruby-keyword kw">end</span>
+</pre>
+ </div>
+
+ </div>
+ </div>
+
+
+
+ </div>
+
+
+
+
+ </div>
+
+<div id="validator-badges">
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
+</div>
+
+</body>
+</html>
445 doc/classes/Robotstxt/Parser.html
@@ -0,0 +1,445 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+ <title>Class: Robotstxt::Parser [Robotstxt]</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
+ <script type="text/javascript">
+ // <![CDATA[
+
+ function popupCode( url ) {
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
+ }
+
+ function toggleCode( id ) {
+ if ( document.getElementById )
+ elem = document.getElementById( id );
+ else if ( document.all )
+ elem = eval( "document.all." + id );
+ else
+ return false;
+
+ elemStyle = elem.style;
+
+ if ( elemStyle.display != "block" ) {
+ elemStyle.display = "block"
+ } else {
+ elemStyle.display = "none"
+ }
+
+ return true;
+ }
+
+ // Make codeblocks hidden by default
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
+
+ // ]]>
+ </script>
+
+</head>
+<body>
+
+
+ <div id="classHeader">
+ <table class="header-table">
+ <tr class="top-aligned-row">
+ <td><strong>Class</strong></td>
+ <td class="class-name-in-header">Robotstxt::Parser</td>
+ </tr>
+ <tr class="top-aligned-row">
+ <td><strong>In:</strong></td>
+ <td>
+
+
+ <a href="../../files/lib/robotstxt/parser_rb.html">
+
+ lib/robotstxt/parser.rb
+
+ </a>
+
+
+ <br />
+
+ </td>
+ </tr>
+
+
+ <tr class="top-aligned-row">
+ <td><strong>Parent:</strong></td>
+ <td>
+
+ Object
+
+ </td>
+ </tr>
+
+ </table>
+ </div>
+ <!-- banner header -->
+
+ <div id="bodyContent">
+
+ <div id="contextContent">
+
+ </div>
+
+
+ <div id="method-list">
+ <h3 class="section-bar">Methods</h3>
+
+ <div class="name-list">
+
+ <a href="#M000005">allowed?</a>&nbsp;&nbsp;
+
+ <a href="#M000007">found?</a>&nbsp;&nbsp;
+
+ <a href="#M000004">get</a>&nbsp;&nbsp;
+
+ <a href="#M000003">new</a>&nbsp;&nbsp;
+
+ <a href="#M000006">sitemaps</a>&nbsp;&nbsp;
+
+ </div>
+ </div>
+
+ </div>
+
+ <!-- if includes -->
+
+ <div id="section">
+
+
+
+ <div id="attribute-list">
+ <h3 class="section-bar">Attributes</h3>
+
+ <div class="name-list">
+ <table>
+
+ <tr class="top-aligned-row context-row">
+ <td class="context-item-name">body</td>
+
+ <td class="context-item-value">&nbsp;[R]&nbsp;</td>
+
+ <td class="context-item-desc"></td>
+ </tr>
+
+ <tr class="top-aligned-row context-row">
+ <td class="context-item-name">found</td>
+
+ <td class="context-item-value">&nbsp;[R]&nbsp;</td>
+
+ <td class="context-item-desc"></td>
+ </tr>
+
+ <tr class="top-aligned-row context-row">
+ <td class="context-item-name">robot_id</td>
+
+ <td class="context-item-value">&nbsp;[RW]&nbsp;</td>
+
+ <td class="context-item-desc"></td>
+ </tr>
+
+ <tr class="top-aligned-row context-row">
+ <td class="context-item-name">rules</td>
+
+ <td class="context-item-value">&nbsp;[R]&nbsp;</td>
+
+ <td class="context-item-desc"></td>
+ </tr>
+
+ <tr class="top-aligned-row context-row">
+ <td class="context-item-name">sitemaps</td>
+
+ <td class="context-item-value">&nbsp;[R]&nbsp;</td>
+
+ <td class="context-item-desc"></td>
+ </tr>
+
+ </table>
+ </div>
+ </div>
+
+
+ <!-- if method_list -->
+
+ <div id="methods">
+
+ <h3 class="section-bar">Public Class methods</h3>
+
+
+ <div id="method-M000003" class="method-detail">
+ <a name="M000003"></a>
+
+ <div class="method-heading">
+
+ <a href="#M000003" class="method-signature">
+
+ <span class="method-name">new</span><span class="method-args">(robot_id = nil)</span>
+
+ </a>
+
+ </div>
+
+ <div class="method-description">
+
+ <p>
+Initializes a new Robots::Robotstxtistance with <tt>robot_id</tt> option.
+</p>
+<p>
+<tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
+</p>
+
+ <p><a class="source-toggle" href="#"
+ onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
+ <div class="method-source-code" id="M000003-source">
+<pre>
+ <span class="ruby-comment cmt"># File lib/robotstxt/parser.rb, line 29</span>
+29: <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">robot_id</span> = <span class="ruby-keyword kw">nil</span>)
+30:
+31: <span class="ruby-ivar">@robot_id</span> = <span class="ruby-value str">'*'</span>
+32: <span class="ruby-ivar">@rules</span> = []
+33: <span class="ruby-ivar">@sitemaps</span> = []
+34: <span class="ruby-ivar">@robot_id</span> = <span class="ruby-identifier">robot_id</span>.<span class="ruby-identifier">downcase</span> <span class="ruby-keyword kw">if</span> <span class="ruby-operator">!</span><span class="ruby-identifier">robot_id</span>.<span class="ruby-identifier">nil?</span>
+35:
+36: <span class="ruby-keyword kw">end</span>
+</pre>
+ </div>
+
+ </div>
+ </div>
+
+
+ <h3 class="section-bar">Public Instance methods</h3>
+
+
+ <div id="method-M000005" class="method-detail">
+ <a name="M000005"></a>
+
+ <div class="method-heading">
+
+ <a href="#M000005" class="method-signature">
+
+ <span class="method-name">allowed?</span><span class="method-args">(var)</span>
+
+ </a>
+
+ </div>
+
+ <div class="method-description">
+
+ <p>
+Check if the <tt>URL</tt> is allowed to be crawled from the current
+Robot_id.
+</p>
+<pre>
+ client = Robotstxt::Robotstxtistance.new('my_robot_id')
+ if client.get('http://www.simonerinzivillo.it')
+ client.allowed?('http://www.simonerinzivillo.it/no-dir/')
+ end
+</pre>
+<p>
+This method returns <tt>true</tt> if the robots.txt file does not block the
+access to the URL.
+</p>
+
+ <p><a class="source-toggle" href="#"
+ onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
+ <div class="method-source-code" id="M000005-source">
+<pre>
+ <span class="ruby-comment cmt"># File lib/robotstxt/parser.rb, line 94</span>
+ 94: <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">allowed?</span>(<span class="ruby-identifier">var</span>)
+ 95: <span class="ruby-identifier">is_allow</span> = <span class="ruby-keyword kw">true</span>
+ 96: <span class="ruby-identifier">url</span> = <span class="ruby-constant">URI</span>.<span class="ruby-identifier">parse</span>(<span class="ruby-identifier">var</span>)
+ 97: <span class="ruby-identifier">querystring</span> = (<span class="ruby-operator">!</span><span class="ruby-identifier">url</span>.<span class="ruby-identifier">query</span>.<span class="ruby-identifier">nil?</span>) <span class="ruby-operator">?</span> <span class="ruby-value str">'?'</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">url</span>.<span class="ruby-identifier">query</span> <span class="ruby-operator">:</span> <span class="ruby-value str">''</span>
+ 98: <span class="ruby-identifier">url_path</span> = <span class="ruby-identifier">url</span>.<span class="ruby-identifier">path</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">querystring</span>
+ 99:
+100: <span class="ruby-ivar">@rules</span>.<span class="ruby-identifier">each</span> {<span class="ruby-operator">|</span><span class="ruby-identifier">ua</span><span class="ruby-operator">|</span>
+101:
+102: <span class="ruby-keyword kw">if</span> <span class="ruby-ivar">@robot_id</span> <span class="ruby-operator">==</span> <span class="ruby-identifier">ua</span>[<span class="ruby-value">0</span>] <span class="ruby-operator">||</span> <span class="ruby-identifier">ua</span>[<span class="ruby-value">0</span>] <span class="ruby-operator">==</span> <span class="ruby-value str">'*'</span>
+103:
+104: <span class="ruby-identifier">ua</span>[<span class="ruby-value">1</span>].<span class="ruby-identifier">each</span> {<span class="ruby-operator">|</span><span class="ruby-identifier">d</span><span class="ruby-operator">|</span>
+105:
+106: <span class="ruby-identifier">is_allow</span> = <span class="ruby-keyword kw">false</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">url_path</span>.<span class="ruby-identifier">match</span>(<span class="ruby-value str">'^'</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">d</span> ) <span class="ruby-operator">||</span> <span class="ruby-identifier">d</span> <span class="ruby-operator">==</span> <span class="ruby-value str">'/'</span>
+107:
+108: }
+109:
+110: <span class="ruby-keyword kw">end</span>
+111:
+112: }
+113: <span class="ruby-identifier">is_allow</span>
+114: <span class="ruby-keyword kw">end</span>
+</pre>
+ </div>
+
+ </div>
+ </div>
+
+
+ <div id="method-M000007" class="method-detail">
+ <a name="M000007"></a>
+
+ <div class="method-heading">
+
+ <a href="#M000007" class="method-signature">
+
+ <span class="method-name">found?</span><span class="method-args">()</span>
+
+ </a>
+
+ </div>
+
+ <div class="method-description">
+
+ <p>
+This method returns <tt>true</tt> if the robots.txt file was successfully fetched and parsed.
+</p>
+
+ <p><a class="source-toggle" href="#"
+ onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
+ <div class="method-source-code" id="M000007-source">
+<pre>
+ <span class="ruby-comment cmt"># File lib/robotstxt/parser.rb, line 131</span>
+131: <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">found?</span>
+132: <span class="ruby-operator">!</span><span class="ruby-operator">!</span><span class="ruby-ivar">@found</span>
+133: <span class="ruby-keyword kw">end</span>
+</pre>
+ </div>
+
+ </div>
+ </div>
+
+
+ <div id="method-M000004" class="method-detail">
+ <a name="M000004"></a>
+
+ <div class="method-heading">
+
+ <a href="#M000004" class="method-signature">
+
+ <span class="method-name">get</span><span class="method-args">(hostname)</span>
+
+ </a>
+
+ </div>
+
+ <div class="method-description">
+
+ <p>
+Requires and parses the Robots.txt file for the <tt>hostname</tt>.
+</p>
+<pre>
+ client = Robotstxt::Robotstxtistance.new('my_robot_id')
+ client.get('http://www.simonerinzivillo.it')
+</pre>
+<p>
+This method returns <tt>true</tt> if the fetch and parse completed successfully.
+</p>
+
+ <p><a class="source-toggle" href="#"
+ onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
+ <div class="method-source-code" id="M000004-source">
+<pre>
+ <span class="ruby-comment cmt"># File lib/robotstxt/parser.rb, line 47</span>
+47: <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get</span>(<span class="ruby-identifier">hostname</span>)
+48:
+49: <span class="ruby-ivar">@ehttp</span> = <span class="ruby-keyword kw">true</span>
+50: <span class="ruby-identifier">url</span> = <span class="ruby-constant">URI</span>.<span class="ruby-identifier">parse</span>(<span class="ruby-identifier">hostname</span>)
+51:
+52: <span class="ruby-keyword kw">begin</span>
+53: <span class="ruby-identifier">http</span> = <span class="ruby-constant">Net</span><span class="ruby-operator">::</span><span class="ruby-constant">HTTP</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">url</span>.<span class="ruby-identifier">host</span>, <span class="ruby-identifier">url</span>.<span class="ruby-identifier">port</span>)
+54: <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">url</span>.<span class="ruby-identifier">scheme</span> <span class="ruby-operator">==</span> <span class="ruby-value str">'https'</span>
+55: <span class="ruby-identifier">http</span>.<span class="ruby-identifier">verify_mode</span> = <span class="ruby-constant">OpenSSL</span><span class="ruby-operator">::</span><span class="ruby-constant">SSL</span><span class="ruby-operator">::</span><span class="ruby-constant">VERIFY_NONE</span>
+56: <span class="ruby-identifier">http</span>.<span class="ruby-identifier">use_ssl</span> = <span class="ruby-keyword kw">true</span>
+57: <span class="ruby-keyword kw">end</span>
+58:
+59: <span class="ruby-identifier">response</span> = <span class="ruby-identifier">http</span>.<span class="ruby-identifier">request</span>(<span class="ruby-constant">Net</span><span class="ruby-operator">::</span><span class="ruby-constant">HTTP</span><span class="ruby-operator">::</span><span class="ruby-constant">Get</span>.<span class="ruby-identifier">new</span>(<span class="ruby-value str">'/robots.txt'</span>))
+60:
+61: <span class="ruby-keyword kw">case</span> <span class="ruby-identifier">response</span>
+62: <span class="ruby-keyword kw">when</span> <span class="ruby-constant">Net</span><span class="ruby-operator">::</span><span class="ruby-constant">HTTPSuccess</span> <span class="ruby-keyword kw">then</span>
+63: <span class="ruby-ivar">@found</span> = <span class="ruby-keyword kw">true</span>
+64: <span class="ruby-ivar">@body</span> = <span class="ruby-identifier">response</span>.<span class="ruby-identifier">body</span>
+65: <span class="ruby-identifier">parse</span>()
+66:
+67: <span class="ruby-keyword kw">else</span>
+68: <span class="ruby-ivar">@found</span> = <span class="ruby-keyword kw">false</span>
+69: <span class="ruby-keyword kw">end</span>
+70:
+71: <span class="ruby-keyword kw">return</span> <span class="ruby-ivar">@found</span>
+72:
+73: <span class="ruby-keyword kw">rescue</span> <span class="ruby-constant">Timeout</span><span class="ruby-operator">::</span><span class="ruby-constant">Error</span>, <span class="ruby-constant">Errno</span><span class="ruby-operator">::</span><span class="ruby-constant">EINVAL</span>, <span class="ruby-constant">Errno</span><span class="ruby-operator">::</span><span class="ruby-constant">ECONNRESET</span> =<span class="ruby-operator">&gt;</span> <span class="ruby-identifier">e</span>
+74: <span class="ruby-keyword kw">if</span> <span class="ruby-ivar">@ehttp</span>
+75: <span class="ruby-ivar">@ettp</span> = <span class="ruby-keyword kw">false</span>
+76: <span class="ruby-keyword kw">retry</span>
+77: <span class="ruby-keyword kw">else</span>
+78: <span class="ruby-keyword kw">return</span> <span class="ruby-keyword kw">nil</span>
+79: <span class="ruby-keyword kw">end</span>
+80: <span class="ruby-keyword kw">end</span>
+81:
+82: <span class="ruby-keyword kw">end</span>
+</pre>
+ </div>
+
+ </div>
+ </div>
+
+
+ <div id="method-M000006" class="method-detail">
+ <a name="M000006"></a>
+
+ <div class="method-heading">
+
+ <a href="#M000006" class="method-signature">
+
+ <span class="method-name">sitemaps</span><span class="method-args">()</span>
+
+ </a>
+
+ </div>
+
+ <div class="method-description">
+
+ <p>
+Analyze the robots.txt file to return an <tt>Array</tt> containing the list
+of XML Sitemaps URLs.
+</p>
+<pre>
+ client = Robotstxt::Robotstxtistance.new('my_robot_id')
+ if client.get('http://www.simonerinzivillo.it')
+ client.sitemaps.each{ |url|
+ puts url
+ }
+ end
+</pre>
+
+ <p><a class="source-toggle" href="#"
+ onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
+ <div class="method-source-code" id="M000006-source">
+<pre>
+ <span class="ruby-comment cmt"># File lib/robotstxt/parser.rb, line 125</span>
+125: <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">sitemaps</span>
+126: <span class="ruby-ivar">@sitemaps</span>
+127: <span class="ruby-keyword kw">end</span>
+</pre>
+ </div>
+
+ </div>
+ </div>
+
+
+
+ </div>
+
+
+
+
+ </div>
+
+<div id="validator-badges">
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
+</div>
+
+</body>
+</html>
1  doc/created.rid
@@ -0,0 +1 @@
+Tue, 08 Dec 2009 15:03:24 +0100
123 doc/files/LICENSE_rdoc.html
@@ -0,0 +1,123 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+ <title>File: LICENSE.rdoc [Robotstxt]</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
+ <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
+ <script type="text/javascript">
+ // <![CDATA[
+
+ function popupCode( url ) {
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
+ }
+
+ function toggleCode( id ) {
+ if ( document.getElementById )
+ elem = document.getElementById( id );
+ else if ( document.all )
+ elem = eval( "document.all." + id );
+ else
+ return false;
+
+ elemStyle = elem.style;
+
+ if ( elemStyle.display != "block" ) {
+ elemStyle.display = "block"
+ } else {
+ elemStyle.display = "none"
+ }
+
+ return true;
+ }
+
+ // Make codeblocks hidden by default
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
+
+ // ]]>
+ </script>
+
+</head>
+<body>
+
+
+ <div id="fileHeader">
+ <h1>LICENSE.rdoc</h1>
+ <table class="header-table">
+ <tr class="top-aligned-row">
+ <td><strong>Path:</strong></td>
+ <td>LICENSE.rdoc
+
+ </td>
+ </tr>
+ <tr class="top-aligned-row">
+ <td><strong>Last Update:</strong></td>
+ <td>2009-12-08 13:48:26 +0100</td>
+ </tr>
+ </table>
+ </div>
+ <!-- banner header -->
+
+ <div id="bodyContent">
+
+ <div id="contextContent">
+
+ <div id="description">
+ <h1>License</h1>
+<p>
+(The MIT License)
+</p>
+<p>
+Copyright &#169; 2009 Simone Rinzivillo <srinzivillo@gmail.com>
+</p>
+<p>
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+&#8220;Software&#8221;), to deal in the Software without restriction,
+including without limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to the
+following conditions:
+</p>
+<p>
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+</p>
+<p>
+THE SOFTWARE IS PROVIDED &#8220;AS IS&#8221;, WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
+NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+</p>
+
+ </div>
+
+ </div>
+
+
+ </div>
+
+ <!-- if includes -->
+
+ <div id="section">
+
+
+
+
+ <!-- if method_list -->
+
+
+
+
+ </div>
+
+<div id="validator-badges">
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
+</div>
+
+</body>
+</html>
152 doc/files/README_rdoc.html
@@ -0,0 +1,152 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+ <title>File: README.rdoc [Robotstxt]</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
+ <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
+ <script type="text/javascript">
+ // <![CDATA[
+
+ function popupCode( url ) {
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
+ }
+
+ function toggleCode( id ) {
+ if ( document.getElementById )
+ elem = document.getElementById( id );
+ else if ( document.all )
+ elem = eval( "document.all." + id );
+ else
+ return false;
+
+ elemStyle = elem.style;
+
+ if ( elemStyle.display != "block" ) {
+ elemStyle.display = "block"
+ } else {
+ elemStyle.display = "none"
+ }
+
+ return true;
+ }
+
+ // Make codeblocks hidden by default
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
+
+ // ]]>
+ </script>
+
+</head>
+<body>
+
+
+ <div id="fileHeader">
+ <h1>README.rdoc</h1>
+ <table class="header-table">
+ <tr class="top-aligned-row">
+ <td><strong>Path:</strong></td>
+ <td>README.rdoc
+
+ </td>
+ </tr>
+ <tr class="top-aligned-row">
+ <td><strong>Last Update:</strong></td>
+ <td>2009-12-06 16:03:41 +0100</td>
+ </tr>
+ </table>
+ </div>
+ <!-- banner header -->
+
+ <div id="bodyContent">
+
+ <div id="contextContent">
+
+ <div id="description">
+ <h1><a href="../classes/Robotstxt.html">Robotstxt</a></h1>
+<p>
+<a href="../classes/Robotstxt.html">Robotstxt</a> is a Ruby robots.txt
+file parser.
+</p>
+<p>
+<a href="../classes/Robotstxt.html">Robotstxt</a> Parser allows you to
+check the accessibility of URLs and get other data.
+</p>
+<p>
+Full support for the robots.txt RFC, wildcards and Sitemap: rules.
+</p>
+<h2>Features</h2>
+<ul>
+<li>Check if the URL is allowed to be crawled from your Robot
+
+</li>
+<li>Analyze the robots.txt file to return an Array containing the list of XML
+Sitemaps URLs
+
+</li>
+</ul>
+<h2>Requirements</h2>
+<ul>
+<li>Ruby >= 1.8.7
+
+</li>
+</ul>
+<h2>Installation</h2>
+<p>
+This library is intended to be installed via the <a
+href="http://rubyforge.org/projects/rubygems/">RubyGems</a> system.
+</p>
+<pre>
+ $ gem install robotstxt
+</pre>
+<p>
+You might need administrator privileges on your system to install it.
+</p>
+<h2>Author</h2>
+<table>
+<tr><td valign="top">Author:</td><td><a href="http://www.simonerinzivillo.it/">Simone Rinzivillo</a>
+<srinzivillo@gmail.com>
+
+</td></tr>
+</table>
+<h2>Resources</h2>
+<ul>
+<li><a href="http://www.simonerinzivillo.it/">Homepage</a>
+
+</li>
+</ul>
+<h2>License</h2>
+<p>
+Copyright &#169; 2009 Simone Rinzivillo, <a
+href="../classes/Robotstxt.html">Robotstxt</a> is released under the MIT
+license.
+</p>
+
+ </div>
+
+ </div>
+
+
+ </div>
+
+ <!-- if includes -->
+
+ <div id="section">
+
+
+
+
+ <!-- if method_list -->
+
+
+
+
+ </div>
+
+<div id="validator-badges">
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
+</div>
+
+</body>
+</html>
124 doc/files/lib/robotstxt/parser_rb.html
@@ -0,0 +1,124 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+ <title>File: parser.rb [Robotstxt]</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
+ <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
+ <script type="text/javascript">
+ // <![CDATA[
+
+ function popupCode( url ) {
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
+ }
+
+ function toggleCode( id ) {
+ if ( document.getElementById )
+ elem = document.getElementById( id );
+ else if ( document.all )
+ elem = eval( "document.all." + id );
+ else
+ return false;
+
+ elemStyle = elem.style;
+
+ if ( elemStyle.display != "block" ) {
+ elemStyle.display = "block"
+ } else {
+ elemStyle.display = "none"
+ }
+
+ return true;
+ }
+
+ // Make codeblocks hidden by default
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
+
+ // ]]>
+ </script>
+
+</head>
+<body>
+
+
+ <div id="fileHeader">
+ <h1>parser.rb</h1>
+ <table class="header-table">
+ <tr class="top-aligned-row">
+ <td><strong>Path:</strong></td>
+ <td>lib/robotstxt/parser.rb
+
+ </td>
+ </tr>
+ <tr class="top-aligned-row">
+ <td><strong>Last Update:</strong></td>
+ <td>2009-12-07 21:46:26 +0100</td>
+ </tr>
+ </table>
+ </div>
+ <!-- banner header -->
+
+ <div id="bodyContent">
+
+ <div id="contextContent">
+
+ <div id="description">
+ <h1>Ruby <a href="../../../classes/Robotstxt.html">Robotstxt</a></h1>
+<p>
+A Ruby robots.txt parser.
+</p>
+<table>
+<tr><td valign="top">Category:</td><td>Net
+
+</td></tr>
+<tr><td valign="top">Package:</td><td><a href="../../../classes/Robotstxt.html">Robotstxt</a>
+
+</td></tr>
+<tr><td valign="top">Author:</td><td>Simone Rinzivillo <srinzivillo@gmail.com>
+
+</td></tr>
+<tr><td valign="top">License:</td><td>MIT License
+
+</td></tr>
+</table>
+
+ </div>
+
+ <div id="requires-list">
+ <h3 class="section-bar">Required files</h3>
+
+ <div class="name-list">
+
+ net/http&nbsp;&nbsp;
+
+ uri&nbsp;&nbsp;
+
+ </div>
+ </div>
+
+ </div>
+
+
+ </div>
+
+ <!-- if includes -->
+
+ <div id="section">
+
+
+
+
+ <!-- if method_list -->
+
+
+
+
+ </div>
+
+<div id="validator-badges">
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
+</div>
+
+</body>
+</html>
124 doc/files/lib/robotstxt_rb.html
@@ -0,0 +1,124 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+ <title>File: robotstxt.rb [Robotstxt]</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
+ <script type="text/javascript">
+ // <![CDATA[
+
+ function popupCode( url ) {
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
+ }
+
+ function toggleCode( id ) {
+ if ( document.getElementById )
+ elem = document.getElementById( id );
+ else if ( document.all )
+ elem = eval( "document.all." + id );
+ else
+ return false;
+
+ elemStyle = elem.style;
+
+ if ( elemStyle.display != "block" ) {
+ elemStyle.display = "block"
+ } else {
+ elemStyle.display = "none"
+ }
+
+ return true;
+ }
+
+ // Make codeblocks hidden by default
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
+
+ // ]]>
+ </script>
+
+</head>
+<body>
+
+
+ <div id="fileHeader">
+ <h1>robotstxt.rb</h1>
+ <table class="header-table">
+ <tr class="top-aligned-row">
+ <td><strong>Path:</strong></td>
+ <td>lib/robotstxt.rb
+
+ </td>
+ </tr>
+ <tr class="top-aligned-row">
+ <td><strong>Last Update:</strong></td>
+ <td>2009-12-08 14:03:20 +0100</td>
+ </tr>
+ </table>
+ </div>
+ <!-- banner header -->
+
+ <div id="bodyContent">
+
+ <div id="contextContent">
+
+ <div id="description">
+ <h1>Ruby <a href="../../classes/Robotstxt.html">Robotstxt</a></h1>
+<p>
+A Ruby robots.txt parser.
+</p>
+<table>
+<tr><td valign="top">Category:</td><td>Net
+
+</td></tr>
+<tr><td valign="top">Package:</td><td><a href="../../classes/Robotstxt.html">Robotstxt</a>
+
+</td></tr>
+<tr><td valign="top">Author:</td><td>Simone Rinzivillo <srinzivillo@gmail.com>
+
+</td></tr>
+<tr><td valign="top">License:</td><td>MIT License
+
+</td></tr>
+</table>
+
+ </div>
+
+ <div id="requires-list">
+ <h3 class="section-bar">Required files</h3>
+
+ <div class="name-list">
+
+ robotstxt/parser&nbsp;&nbsp;
+
+ uri&nbsp;&nbsp;
+
+ </div>
+ </div>
+
+ </div>
+
+
+ </div>
+
+ <!-- if includes -->
+
+ <div id="section">
+
+
+
+
+ <!-- if method_list -->
+
+
+
+
+ </div>
+
+<div id="validator-badges">
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
+</div>
+
+</body>
+</html>
27 doc/fr_class_index.html
@@ -0,0 +1,27 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<!--
+
+ Classes [Robotstxt]
+
+ -->
+<head>
+ <title>Classes [Robotstxt]</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+ <link rel="stylesheet" href="rdoc-style.css" type="text/css" />
+ <base target="docwin" />
+</head>
+<body>
+<div class="index">
+ <h1 class="section-bar">Classes</h1>
+ <div id="index-entries">
+
+ <a href="classes/Robotstxt.html">Robotstxt</a><br />
+
+ <a href="classes/Robotstxt/Parser.html">Robotstxt::Parser</a><br />
+
+ </div>
+</div>
+</body>
+</html>
31 doc/fr_file_index.html
@@ -0,0 +1,31 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<!--
+
+ Files [Robotstxt]
+
+ -->
+<head>
+ <title>Files [Robotstxt]</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+ <link rel="stylesheet" href="rdoc-style.css" type="text/css" />
+ <base target="docwin" />
+</head>
+<body>
+<div class="index">
+ <h1 class="section-bar">Files</h1>
+ <div id="index-entries">
+
+ <a href="files/LICENSE_rdoc.html">LICENSE.rdoc</a><br />
+
+ <a href="files/README_rdoc.html">README.rdoc</a><br />
+
+ <a href="files/lib/robotstxt_rb.html">lib/robotstxt.rb</a><br />
+
+ <a href="files/lib/robotstxt/parser_rb.html">lib/robotstxt/parser.rb</a><br />
+
+ </div>
+</div>
+</body>
+</html>
37 doc/fr_method_index.html
@@ -0,0 +1,37 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<!--
+
+ Methods [Robotstxt]
+
+ -->
+<head>
+ <title>Methods [Robotstxt]</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+ <link rel="stylesheet" href="rdoc-style.css" type="text/css" />
+ <base target="docwin" />
+</head>
+<body>
+<div class="index">
+ <h1 class="section-bar">Methods</h1>
+ <div id="index-entries">
+
+ <a href="classes/Robotstxt.html#M000001">allowed? (Robotstxt)</a><br />
+
+ <a href="classes/Robotstxt/Parser.html#M000005">allowed? (Robotstxt::Parser)</a><br />
+
+ <a href="classes/Robotstxt/Parser.html#M000007">found? (Robotstxt::Parser)</a><br />
+
+ <a href="classes/Robotstxt/Parser.html#M000004">get (Robotstxt::Parser)</a><br />
+
+ <a href="classes/Robotstxt/Parser.html#M000003">new (Robotstxt::Parser)</a><br />
+
+ <a href="classes/Robotstxt.html#M000002">sitemaps (Robotstxt)</a><br />
+
+ <a href="classes/Robotstxt/Parser.html#M000006">sitemaps (Robotstxt::Parser)</a><br />
+
+ </div>
+</div>
+</body>
+</html>
21 doc/index.html
@@ -0,0 +1,21 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<!--
+
+ Robotstxt
+
+ -->
+<head>
+ <title>Robotstxt</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+</head>
+<frameset rows="20%, 80%">
+ <frameset cols="25%,35%,45%">
+ <frame src="fr_file_index.html" title="Files" name="Files" />
+ <frame src="fr_class_index.html" name="Classes" />
+ <frame src="fr_method_index.html" name="Methods" />
+ </frameset>
+ <frame src="files/README_rdoc.html" name="docwin" />
+</frameset>
+</html>
299 doc/rdoc-style.css
@@ -0,0 +1,299 @@
+body {
+ font-family: Verdana,Arial,Helvetica,sans-serif;
+ font-size: 90%;
+ margin: 0;
+ margin-left: 40px;
+ padding: 0;
+ background: white;
+ color: black;
+}
+
+h1, h2, h3, h4 {
+ margin: 0;
+ background: transparent;
+}
+
+h1 {
+ font-size: 150%;
+}
+
+h2,h3,h4 {
+ margin-top: 1em;
+}
+
+:link, :visited {
+ background: #eef;
+ color: #039;
+ text-decoration: none;
+}
+
+:link:hover, :visited:hover {
+ background: #039;
+ color: #eef;
+}
+
+/* Override the base stylesheet's Anchor inside a table cell */
+td > :link, td > :visited {
+ background: transparent;
+ color: #039;
+ text-decoration: none;
+}
+
+/* and inside a section title */
+.section-title > :link, .section-title > :visited {
+ background: transparent;
+ color: #eee;
+ text-decoration: none;
+}
+
+/* === Structural elements =================================== */
+
+.index {
+ margin: 0;
+ margin-left: -40px;
+ padding: 0;
+ font-size: 90%;
+}
+
+.index :link, .index :visited {
+ margin-left: 0.7em;
+}
+
+.index .section-bar {
+ margin-left: 0px;
+ padding-left: 0.7em;
+ background: #ccc;
+ font-size: small;
+}
+
+#classHeader, #fileHeader {
+ width: auto;
+ color: white;
+ padding: 0.5em 1.5em 0.5em 1.5em;
+ margin: 0;
+ margin-left: -40px;
+ border-bottom: 3px solid #006;
+}
+
+#classHeader :link, #fileHeader :link,
+#classHeader :visited, #fileHeader :visited {
+ background: inherit;
+ color: white;
+}
+
+#classHeader td, #fileHeader td {
+ background: inherit;
+ color: white;
+}
+
+#fileHeader {
+ background: #057;
+}
+
+#classHeader {
+ background: #048;
+}
+
+.class-name-in-header {
+ font-size: 180%;
+ font-weight: bold;
+}
+
+#bodyContent {
+ padding: 0 1.5em 0 1.5em;
+}
+
+#description {
+ padding: 0.5em 1.5em;
+ background: #efefef;
+ border: 1px dotted #999;
+}
+
+#description h1, #description h2, #description h3,
+#description h4, #description h5, #description h6 {
+ color: #125;
+ background: transparent;
+}
+
+#validator-badges {
+ text-align: center;
+}
+
+#validator-badges img {
+ border: 0;
+}
+
+#copyright {
+ color: #333;
+ background: #efefef;
+ font: 0.75em sans-serif;
+ margin-top: 5em;
+ margin-bottom: 0;
+ padding: 0.5em 2em;
+}
+
+/* === Classes =================================== */
+
+table.header-table {
+ color: white;
+ font-size: small;
+}
+
+.type-note {
+ font-size: small;
+ color: #dedede;
+}
+
+.section-bar {
+ color: #333;
+ border-bottom: 1px solid #999;
+ margin-left: -20px;
+}
+
+.section-title {
+ background: #79a;
+ color: #eee;
+ padding: 3px;
+ margin-top: 2em;
+ margin-left: -30px;
+ border: 1px solid #999;
+}
+
+.top-aligned-row {
+ vertical-align: top
+}
+
+.bottom-aligned-row {
+ vertical-align: bottom
+}
+
+#diagram img {
+ border: 0;
+}
+
+/* --- Context section classes ----------------------- */
+
+.context-row { }
+
+.context-item-name {
+ font-family: monospace;
+ font-weight: bold;
+ color: black;
+}
+
+.context-item-value {
+ font-size: small;
+ color: #448;
+}
+
+.context-item-desc {
+ color: #333;
+ padding-left: 2em;
+}
+
+/* --- Method classes -------------------------- */
+
+.method-detail {
+ background: #efefef;
+ padding: 0;
+ margin-top: 0.5em;
+ margin-bottom: 1em;
+ border: 1px dotted #ccc;
+}
+
+.method-heading {
+ color: black;
+ background: #ccc;
+ border-bottom: 1px solid #666;
+ padding: 0.2em 0.5em 0 0.5em;
+}
+
+.method-signature {
+ color: black;
+ background: inherit;
+}
+
+.method-name {
+ font-weight: bold;
+}
+
+.method-args {
+ font-style: italic;
+}
+
+.method-description {
+ padding: 0 0.5em 0 0.5em;
+}
+
+/* --- Source code sections -------------------- */
+
+:link.source-toggle, :visited.source-toggle {
+ font-size: 90%;
+}
+
+div.method-source-code {
+ background: #262626;
+ color: #ffdead;
+ margin: 1em;
+ padding: 0.5em;
+ border: 1px dashed #999;
+ overflow: auto;
+}
+
+div.method-source-code pre {
+ color: #ffdead;
+}
+
+/* --- Ruby keyword styles --------------------- */
+
+.standalone-code {
+ background: #221111;
+ color: #ffdead;
+ overflow: auto;
+}
+
+.ruby-constant {
+ color: #7fffd4;
+ background: transparent;
+}
+
+.ruby-keyword {
+ color: #00ffff;
+ background: transparent;
+}
+
+.ruby-ivar {
+ color: #eedd82;
+ background: transparent;
+}
+
+.ruby-operator {
+ color: #00ffee;
+ background: transparent;
+}
+
+.ruby-identifier {
+ color: #ffdead;
+ background: transparent;
+}
+
+.ruby-node {
+ color: #ffa07a;
+ background: transparent;
+}
+
+.ruby-comment {
+ color: #b22222;
+ font-weight: bold;
+ background: transparent;
+}
+
+.ruby-regexp {
+ color: #ffa07a;
+ background: transparent;
+}
+
+.ruby-value {
+ color: #7fffd4;
+ background: transparent;
+}
55 lib/robotstxt.rb
@@ -0,0 +1,55 @@
+#
+# = Ruby Robotstxt
+#
+# A Ruby robots.txt parser.
+#
+#
+# Category:: Net
+# Package:: Robotstxt
+# Author:: Simone Rinzivillo <srinzivillo@gmail.com>
+# License:: MIT License
+#
+#--
+#
+#++
+
+
+require 'robotstxt/parser'
+require 'uri'
+
+
+
module Robotstxt

  NAME     = 'Robotstxt'
  GEM      = 'robotstxt'
  AUTHORS  = ['Simone Rinzivillo <srinzivillo@gmail.com>']
  VERSION  = '0.5.2'

  # Convenience wrapper: fetches and parses the robots.txt of the host in
  # +url+ and reports whether +robot_id+ may crawl that URL.
  # Returns +true+ when access is not blocked, +false+ when it is, and
  # +nil+ when the robots.txt could not be retrieved.
  #
  #   Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')
  #
  def self.allowed?(url, robot_id)
    uri    = URI.parse(url)
    client = Robotstxt::Parser.new(robot_id)
    client.allowed?(url) if client.get("#{uri.scheme}://#{uri.host}")
  end

  # Convenience wrapper: fetches and parses the robots.txt of the host in
  # +url+ and returns the Array of XML Sitemap URLs it declares.
  # Returns +nil+ when the robots.txt could not be retrieved.
  #
  #   Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest')
  #
  def self.sitemaps(url, robot_id)
    uri    = URI.parse(url)
    client = Robotstxt::Parser.new(robot_id)
    client.sitemaps if client.get("#{uri.scheme}://#{uri.host}")
  end

end
169 lib/robotstxt/parser.rb
@@ -0,0 +1,169 @@
+#
+# = Ruby Robotstxt
+#
+# A Ruby robots.txt parser.
+#
+#
+# Category:: Net
+# Package:: Robotstxt
+# Author:: Simone Rinzivillo <srinzivillo@gmail.com>
+# License:: MIT License
+#
+#--
+#
+#++
+
+require 'net/http'
+require 'uri'
+
+
module Robotstxt
  # Downloads and parses a site's robots.txt and answers access-control
  # queries for a specific user-agent (+robot_id+).
  class Parser
    attr_accessor :robot_id
    attr_reader :found, :body, :sitemaps, :rules

    # Initializes a new Robotstxt::Parser for the given +robot_id+.
    # The id is matched case-insensitively; +nil+ means "any robot" ('*').
    #
    #   client = Robotstxt::Parser.new('my_robot_id')
    #
    def initialize(robot_id = nil)
      @robot_id = '*'
      @rules    = []
      @sitemaps = []
      @robot_id = robot_id.downcase unless robot_id.nil?
    end

    # Fetches and parses the robots.txt file for +hostname+
    # (e.g. 'http://www.simonerinzivillo.it').
    #
    # Returns +true+ when the file was found and parsed, +false+ when the
    # server answered without success (e.g. 404), and +nil+ when the
    # connection failed after one retry.
    def get(hostname)
      @ehttp = true
      url = URI.parse(hostname)

      begin
        http = Net::HTTP.new(url.host, url.port)
        if url.scheme == 'https'
          http.verify_mode = OpenSSL::SSL::VERIFY_NONE
          http.use_ssl = true
        end

        response = http.request(Net::HTTP::Get.new('/robots.txt'))

        case response
        when Net::HTTPSuccess then
          @found = true
          @body = response.body
          parse()
        else
          @found = false
        end

        return @found

      rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET => e
        if @ehttp
          # BUG FIX: was "@ettp = false" (typo), which never cleared @ehttp
          # and made this rescue retry forever on persistent errors.
          @ehttp = false
          retry
        else
          return nil
        end
      end
    end

    # Checks whether +var+ (a URL) may be crawled by the current robot_id.
    #
    #   client = Robotstxt::Parser.new('my_robot_id')
    #   if client.get('http://www.simonerinzivillo.it')
    #     client.allowed?('http://www.simonerinzivillo.it/no-dir/')
    #   end
    #
    # Returns +true+ if no Disallow rule for this robot (or '*') matches
    # the URL's path+query. NOTE: Allow rules are collected by #parse but
    # are not consulted here — only Disallow rules block access.
    def allowed?(var)
      is_allow = true
      url = URI.parse(var)
      querystring = (!url.query.nil?) ? '?' + url.query : ''
      url_path = url.path + querystring

      @rules.each {|ua|

        if @robot_id == ua[0] || ua[0] == '*'

          ua[1].each {|d|
            # A rule matches when it is a prefix of the path, or is the
            # bare '/' (block everything).
            is_allow = false if url_path.match('^' + d ) || d == '/'
          }

        end

      }
      is_allow
    end

    # Returns the Array of XML Sitemap URLs declared in the robots.txt.
    #
    #   client = Robotstxt::Parser.new('my_robot_id')
    #   if client.get('http://www.simonerinzivillo.it')
    #     client.sitemaps.each{ |url|
    #       puts url
    #     }
    #   end
    #
    def sitemaps
      @sitemaps
    end

    # Returns +true+ if a robots.txt file was successfully retrieved.
    def found?
      !!@found
    end


    private

    # Parses @body line by line, filling @rules with
    # [user_agent, [disallow...], [allow...]] triples and @sitemaps with
    # sitemap URLs. The whole body is downcased first, so matching is
    # case-insensitive (this also lowercases stored paths and URLs).
    def parse()
      @body = @body.downcase

      @body.each_line {|r|

        case r
        when /^#.+$/
          # comment line — ignored

        when /^\s*user-agent\s*:.+$/
          @rules << [ r.split(':')[1].strip, [], []]

        when /^\s*disallow\s*:.+$/
          r = r.split(':')[1].strip
          # '*' wildcards become '.+' so rules can be used as regexps
          @rules.last[1]<< r.gsub(/\*/,'.+') if r.length > 0

        when /^\s*allow\s*:.+$/
          r = r.split(':')[1].strip
          @rules.last[2]<< r.gsub(/\*/,'.+') if r.length > 0

        when /^\s*sitemap\s*:.+$/
          # BUG FIX: was split(':')[1] + split(':')[2], which dropped the
          # ':' from the URL scheme ("http//..."). split(':', 2) keeps
          # everything after the first colon intact.
          @sitemaps<< r.split(':', 2)[1].strip if r.length > 0

        end

      }

    end

  end
end
43 test/parser_test.rb
@@ -0,0 +1,43 @@
+$:.unshift(File.dirname(__FILE__) + '/../lib')
+
+require 'test/unit'
+require 'robotstxt'
+
# Live integration tests for Robotstxt::Parser — these hit
# http://www.simonerinzivillo.it and therefore require network access.
class TestParser < Test::Unit::TestCase

  SITE = 'http://www.simonerinzivillo.it'

  def setup
    @client = Robotstxt::Parser.new('rubytest')
    @client.get(SITE)
  end

  def test_initialize
    parser = Robotstxt::Parser.new('*')
    assert_instance_of Robotstxt::Parser, parser
  end

  def test_get_file_robotstxt
    assert @client.get(SITE)
  end

  def test_robotstxt_isfound
    assert @client.found?
  end

  def test_url_allowed
    allowed_urls = [
      "#{SITE}/",
      "#{SITE}/blog/",
      "#{SITE}/blog/page.php"
    ]
    blocked_urls = [
      "#{SITE}/no-dir/",
      "#{SITE}/foo-no-dir/",
      "#{SITE}/foo-no-dir/page.html",
      "#{SITE}/dir/page.php",
      "#{SITE}/page.php?var=0",
      "#{SITE}/dir/page.php?var=0",
      "#{SITE}/blog/page.php?var=0"
    ]
    allowed_urls.each { |url| assert_equal true, @client.allowed?(url) }
    blocked_urls.each { |url| assert_equal false, @client.allowed?(url) }
  end

  def test_sitemaps
    assert @client.sitemaps.length > 0
  end

end
19 test/robotstxt_test.rb
@@ -0,0 +1,19 @@
+$:.unshift(File.dirname(__FILE__) + '/../lib')
+
+require 'test/unit'
+require 'robotstxt'
+
# Live integration tests for the Robotstxt module-level helpers — these
# hit http://www.simonerinzivillo.it and therefore require network access.
class TestRobotstxt < Test::Unit::TestCase

  def test_allowed
    assert_equal true,  Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')
    assert_equal false, Robotstxt.allowed?('http://www.simonerinzivillo.it/no-dir/', 'rubytest')
  end

  def test_sitemaps
    assert Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest').length > 0
  end

end
Please sign in to comment.
Something went wrong with that request. Please try again.