Skip to content
Browse files

update

  • Loading branch information...
1 parent 486d6ef commit 20ec22ffc15a310544015705815cdbe0288ec453 @sahib sahib committed Jun 22, 2012
Showing with 1,322 additions and 1,252 deletions.
  1. +42 −42 _modules/{ → archive}/cli/cmdparser.html
  2. +28 −34 _modules/{ → archive}/cmanager/crawlmanager.html
  3. +44 −61 _modules/{ → archive}/cmanager/intervalmanager.html
  4. +22 −22 _modules/{ → archive}/config/reader.html
  5. +30 −30 _modules/{ → archive}/crawler/cleaner.html
  6. +42 −43 _modules/{ → archive}/crawler/crawljob.html
  7. +32 −33 _modules/{ → archive}/crawler/dbgen.html
  8. +28 −26 _modules/{ → archive}/crawler/extractor.html
  9. +54 −52 _modules/{ → archive}/crawler/filter.html
  10. +34 −33 _modules/{ → archive}/crawler/git.html
  11. +21 −21 _modules/{ → archive}/crawler/rsync.html
  12. +28 −39 _modules/{ → archive}/crawler/wget.html
  13. +22 −22 _modules/{ → archive}/crawler/xmlgen.html
  14. +22 −28 _modules/{ → archive}/crawler/xmlreader.html
  15. +25 −25 _modules/{ → archive}/dbrecover/pickle_recover.html
  16. +27 −37 _modules/{ → archive}/dbrecover/recover.html
  17. +26 −26 _modules/{ → archive}/dbrecover/repair.html
  18. +28 −36 _modules/{ → archive}/dbrecover/xml_recover.html
  19. +30 −32 _modules/{ → archive}/javadapter/server.html
  20. +28 −24 _modules/{ → archive}/util/filelock.html
  21. +23 −20 _modules/{ → archive}/util/files.html
  22. +39 −27 _modules/{ → archive}/util/paths.html
  23. +26 −40 _modules/{ → archive}/util/times.html
  24. +23 −23 _modules/index.html
  25. +1 −1 _sources/cleaner.txt
  26. +1 −1 _sources/cli.txt
  27. +2 −2 _sources/config.txt
  28. +2 −2 _sources/crawlstuff.txt
  29. +1 −1 _sources/dbgen.txt
  30. +1 −1 _sources/extractor.txt
  31. +1 −1 _sources/filter.txt
  32. +1 −1 _sources/git.txt
  33. +6 −3 _sources/index.txt
  34. +2 −2 _sources/intervalmanager.txt
  35. +46 −3 _sources/intro.txt
  36. +16 −15 _sources/javadapter.txt
  37. +6 −6 _sources/recover.txt
  38. +2 −2 _sources/rsync.txt
  39. +4 −4 _sources/utils.txt
  40. +1 −1 _sources/wget.txt
  41. +2 −2 _sources/xml2metadata.txt
  42. +8 −8 cleaner.html
  43. +16 −16 cli.html
  44. +4 −4 config.html
  45. +24 −24 crawlstuff.html
  46. +19 −19 dbgen.html
  47. +7 −7 extractor.html
  48. +8 −8 filter.html
  49. +142 −140 genindex.html
  50. +25 −25 git.html
  51. +23 −6 index.html
  52. +36 −36 intervalmanager.html
  53. +48 −3 intro.html
  54. +29 −28 javadapter.html
  55. BIN objects.inv
  56. +15 −25 py-modindex.html
  57. +23 −23 recover.html
  58. +3 −3 rsync.html
  59. +1 −1 searchindex.js
  60. +56 −36 utils.html
  61. +8 −8 wget.html
  62. +8 −8 xml2metadata.html
View
84 _modules/cli/cmdparser.html → _modules/archive/cli/cmdparser.html
@@ -7,38 +7,38 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
- <title>cli.cmdparser &mdash; Webarchiv 1.0 documentation</title>
+ <title>archive.cli.cmdparser &mdash; Webarchiv 1.0 documentation</title>
- <link rel="stylesheet" href="../../_static/nature.css" type="text/css" />
- <link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
+ <link rel="stylesheet" href="../../../_static/nature.css" type="text/css" />
+ <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<script type="text/javascript">
var DOCUMENTATION_OPTIONS = {
- URL_ROOT: '../../',
+ URL_ROOT: '../../../',
VERSION: '1.0',
COLLAPSE_INDEX: false,
FILE_SUFFIX: '.html',
HAS_SOURCE: true
};
</script>
- <script type="text/javascript" src="../../_static/jquery.js"></script>
- <script type="text/javascript" src="../../_static/underscore.js"></script>
- <script type="text/javascript" src="../../_static/doctools.js"></script>
- <link rel="top" title="Webarchiv 1.0 documentation" href="../../index.html" />
- <link rel="up" title="Module code" href="../index.html" />
+ <script type="text/javascript" src="../../../_static/jquery.js"></script>
+ <script type="text/javascript" src="../../../_static/underscore.js"></script>
+ <script type="text/javascript" src="../../../_static/doctools.js"></script>
+ <link rel="top" title="Webarchiv 1.0 documentation" href="../../../index.html" />
+ <link rel="up" title="Module code" href="../../index.html" />
</head>
<body>
<div class="related">
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
- <a href="../../genindex.html" title="General Index"
+ <a href="../../../genindex.html" title="General Index"
accesskey="I">index</a></li>
<li class="right" >
- <a href="../../py-modindex.html" title="Python Module Index"
+ <a href="../../../py-modindex.html" title="Python Module Index"
>modules</a> |</li>
- <li><a href="../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
- <li><a href="../index.html" accesskey="U">Module code</a> &raquo;</li>
+ <li><a href="../../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
+ <li><a href="../../index.html" accesskey="U">Module code</a> &raquo;</li>
</ul>
</div>
@@ -47,7 +47,7 @@
<div class="bodywrapper">
<div class="body">
- <h1>Source code for cli.cmdparser</h1><div class="highlight"><pre>
+ <h1>Source code for archive.cli.cmdparser</h1><div class="highlight"><pre>
<span class="c">#!/usr/bin/env python</span>
<span class="c"># encoding: utf-8</span>
@@ -94,21 +94,21 @@
<span class="c"># pip install docopt</span>
<span class="kn">from</span> <span class="nn">docopt</span> <span class="kn">import</span> <span class="n">docopt</span>
-<span class="kn">from</span> <span class="nn">init.init</span> <span class="kn">import</span> <span class="n">init_archive</span>
-<span class="kn">from</span> <span class="nn">dbrecover.recover</span> <span class="kn">import</span> <span class="n">rebuild</span><span class="p">,</span> <span class="n">remove</span>
-<span class="kn">from</span> <span class="nn">dbrecover.repair</span> <span class="kn">import</span> <span class="n">repair</span>
+<span class="kn">from</span> <span class="nn">archive.init.init</span> <span class="kn">import</span> <span class="n">init_archive</span>
+<span class="kn">from</span> <span class="nn">archive.dbrecover.recover</span> <span class="kn">import</span> <span class="n">rebuild</span><span class="p">,</span> <span class="n">remove</span>
+<span class="kn">from</span> <span class="nn">archive.dbrecover.repair</span> <span class="kn">import</span> <span class="n">repair</span>
-<span class="kn">import</span> <span class="nn">cmanager.intervalmanager</span> <span class="kn">as</span> <span class="nn">imgur</span>
-<span class="kn">import</span> <span class="nn">javadapter.server</span> <span class="kn">as</span> <span class="nn">javadapter</span>
-<span class="kn">import</span> <span class="nn">config.reader</span> <span class="kn">as</span> <span class="nn">config</span>
-<span class="kn">import</span> <span class="nn">util.filelock</span> <span class="kn">as</span> <span class="nn">lock</span>
-<span class="kn">import</span> <span class="nn">util.paths</span> <span class="kn">as</span> <span class="nn">paths</span>
+<span class="kn">import</span> <span class="nn">archive.cmanager.intervalmanager</span> <span class="kn">as</span> <span class="nn">imgur</span>
+<span class="kn">import</span> <span class="nn">archive.javadapter.server</span> <span class="kn">as</span> <span class="nn">javadapter</span>
+<span class="kn">import</span> <span class="nn">archive.config.reader</span> <span class="kn">as</span> <span class="nn">config</span>
+<span class="kn">import</span> <span class="nn">archive.util.filelock</span> <span class="kn">as</span> <span class="nn">lock</span>
+<span class="kn">import</span> <span class="nn">archive.util.paths</span> <span class="kn">as</span> <span class="nn">paths</span>
<span class="n">LOCKFILE</span> <span class="o">=</span> <span class="s">&#39;global&#39;</span>
-<div class="viewcode-block" id="Cli"><a class="viewcode-back" href="../../cli.html#cli.cmdparser.Cli">[docs]</a><span class="k">class</span> <span class="nc">Cli</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span>
+<div class="viewcode-block" id="Cli"><a class="viewcode-back" href="../../../cli.html#archive.cli.cmdparser.Cli">[docs]</a><span class="k">class</span> <span class="nc">Cli</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Archive commandline intepreter</span>
<span class="sd"> &quot;&quot;&quot;</span>
@@ -142,9 +142,9 @@
<span class="n">filename</span><span class="o">=</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">paths</span><span class="o">.</span><span class="n">get_log_dir</span><span class="p">(),</span> <span class="s">&#39;archive.log&#39;</span><span class="p">),</span>
<span class="n">format</span><span class="o">=</span><span class="s">&#39;</span><span class="si">%(asctime)s</span><span class="s"> - </span><span class="si">%(levelname)s</span><span class="s"> - </span><span class="si">%(message)s</span><span class="s">&#39;</span><span class="p">)</span>
<span class="k">except</span> <span class="ne">IOError</span> <span class="k">as</span> <span class="n">err</span><span class="p">:</span>
- <span class="k">print</span><span class="p">(</span><span class="s">&#39;Cannot open log - file structure probably does not exist yet:&#39;</span><span class="p">,</span> <span class="n">err</span><span class="p">)</span>
- <span class="k">else</span><span class="p">:</span>
- <span class="k">print</span><span class="p">(</span><span class="s">&#39;Logging will be printed to logfile only.&#39;</span><span class="p">)</span>
+ <span class="c"># Disable warning for initialization</span>
+ <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">__arguments</span><span class="p">[</span><span class="s">&#39;init&#39;</span><span class="p">]</span> <span class="ow">is</span> <span class="bp">False</span><span class="p">:</span>
+ <span class="k">print</span><span class="p">(</span><span class="s">&#39;Cannot open log - file structure probably does not exist yet:&#39;</span><span class="p">,</span> <span class="n">err</span><span class="p">)</span>
<span class="c"># iterating through arguments</span>
<span class="k">for</span> <span class="n">module</span><span class="p">,</span> <span class="n">handler</span> <span class="ow">in</span> <span class="n">submodules</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
@@ -155,24 +155,24 @@
<span class="k">print</span><span class="p">(</span><span class="s">&quot;archive is currently locked with global.lock.&quot;</span><span class="p">)</span>
<span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
-<div class="viewcode-block" id="Cli.handle_init"><a class="viewcode-back" href="../../cli.html#cli.cmdparser.Cli.handle_init">[docs]</a> <span class="k">def</span> <span class="nf">handle_init</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="Cli.handle_init"><a class="viewcode-back" href="../../../cli.html#archive.cli.cmdparser.Cli.handle_init">[docs]</a> <span class="k">def</span> <span class="nf">handle_init</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Initializes archive paths</span>
<span class="sd"> &quot;&quot;&quot;</span>
- <span class="k">try</span><span class="p">:</span>
- <span class="n">path</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__arguments</span><span class="p">[</span><span class="s">&#39;&lt;path&gt;&#39;</span><span class="p">]</span>
+ <span class="n">path</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__arguments</span><span class="p">[</span><span class="s">&#39;&lt;path&gt;&#39;</span><span class="p">]</span>
+ <span class="k">if</span> <span class="n">path</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">init_archive</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
- <span class="k">except</span> <span class="ne">KeyError</span><span class="p">:</span>
+ <span class="k">else</span><span class="p">:</span>
<span class="n">init_archive</span><span class="p">()</span>
</div>
-<div class="viewcode-block" id="Cli.cmd_loop"><a class="viewcode-back" href="../../cli.html#cli.cmdparser.Cli.cmd_loop">[docs]</a> <span class="k">def</span> <span class="nf">cmd_loop</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">shell</span><span class="p">,</span> <span class="n">i</span><span class="p">,</span> <span class="n">cv</span><span class="p">):</span>
+<div class="viewcode-block" id="Cli.cmd_loop"><a class="viewcode-back" href="../../../cli.html#archive.cli.cmdparser.Cli.cmd_loop">[docs]</a> <span class="k">def</span> <span class="nf">cmd_loop</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">shell</span><span class="p">,</span> <span class="n">i</span><span class="p">,</span> <span class="n">cv</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> The cmdloop runs in a seperate thread.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">shell</span><span class="o">.</span><span class="n">cmdloop</span><span class="p">()</span>
<span class="n">i</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
</div>
-<div class="viewcode-block" id="Cli.handle_crawler"><a class="viewcode-back" href="../../cli.html#cli.cmdparser.Cli.handle_crawler">[docs]</a> <span class="k">def</span> <span class="nf">handle_crawler</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="Cli.handle_crawler"><a class="viewcode-back" href="../../../cli.html#archive.cli.cmdparser.Cli.handle_crawler">[docs]</a> <span class="k">def</span> <span class="nf">handle_crawler</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Starts and controls crawler commandline</span>
<span class="sd"> &quot;&quot;&quot;</span>
@@ -199,7 +199,7 @@
<span class="n">cv</span><span class="o">.</span><span class="n">release</span><span class="p">()</span>
<span class="n">cmd_thread</span><span class="o">.</span><span class="n">join</span><span class="p">()</span>
</div>
-<div class="viewcode-block" id="Cli.handle_javadapter"><a class="viewcode-back" href="../../cli.html#cli.cmdparser.Cli.handle_javadapter">[docs]</a> <span class="k">def</span> <span class="nf">handle_javadapter</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="Cli.handle_javadapter"><a class="viewcode-back" href="../../../cli.html#archive.cli.cmdparser.Cli.handle_javadapter">[docs]</a> <span class="k">def</span> <span class="nf">handle_javadapter</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Starts javadapter commandline</span>
<span class="sd"> &quot;&quot;&quot;</span>
@@ -216,7 +216,7 @@
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">__arguments</span><span class="p">[</span><span class="s">&#39;--start&#39;</span><span class="p">]:</span>
<span class="n">server</span><span class="o">.</span><span class="n">shutdown</span><span class="p">()</span>
</div>
-<div class="viewcode-block" id="Cli.handle_db"><a class="viewcode-back" href="../../cli.html#cli.cmdparser.Cli.handle_db">[docs]</a> <span class="k">def</span> <span class="nf">handle_db</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="Cli.handle_db"><a class="viewcode-back" href="../../../cli.html#archive.cli.cmdparser.Cli.handle_db">[docs]</a> <span class="k">def</span> <span class="nf">handle_db</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="s">&#39;Handle &quot;db&quot; submodule&#39;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">__arguments</span><span class="p">[</span><span class="s">&#39;--rebuild&#39;</span><span class="p">]:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__filelock</span><span class="o">.</span><span class="n">acquire</span><span class="p">()</span>
@@ -229,17 +229,17 @@
<span class="k">except</span> <span class="ne">OSError</span> <span class="k">as</span> <span class="n">err</span><span class="p">:</span>
<span class="k">print</span><span class="p">(</span><span class="s">&#39;Unable to delete database:&#39;</span><span class="p">,</span> <span class="n">err</span><span class="p">)</span>
</div>
-<div class="viewcode-block" id="Cli.handle_config"><a class="viewcode-back" href="../../cli.html#cli.cmdparser.Cli.handle_config">[docs]</a> <span class="k">def</span> <span class="nf">handle_config</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="Cli.handle_config"><a class="viewcode-back" href="../../../cli.html#archive.cli.cmdparser.Cli.handle_config">[docs]</a> <span class="k">def</span> <span class="nf">handle_config</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Invokes Config Handler operations</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">__arguments</span><span class="p">[</span><span class="s">&#39;--get&#39;</span><span class="p">]:</span>
- <span class="k">print</span><span class="p">(</span><span class="n">config</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__arguments</span><span class="p">[</span><span class="s">&#39;&lt;confurl&gt;&#39;</span><span class="p">]))</span>
+ <span class="k">print</span><span class="p">(</span><span class="n">config</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__arguments</span><span class="p">[</span><span class="s">&#39;--get&#39;</span><span class="p">]))</span>
<span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">__arguments</span><span class="p">[</span><span class="s">&#39;--set&#39;</span><span class="p">]:</span>
<span class="k">pass</span>
<span class="c"># TODO: Wait for config implementation.</span>
</div>
-<div class="viewcode-block" id="Cli.handle_repair"><a class="viewcode-back" href="../../cli.html#cli.cmdparser.Cli.handle_repair">[docs]</a> <span class="k">def</span> <span class="nf">handle_repair</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="Cli.handle_repair"><a class="viewcode-back" href="../../../cli.html#archive.cli.cmdparser.Cli.handle_repair">[docs]</a> <span class="k">def</span> <span class="nf">handle_repair</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Invokes archive rapair tool</span>
<span class="sd"> &quot;&quot;&quot;</span>
@@ -257,7 +257,7 @@
<div class="sphinxsidebarwrapper">
<div id="searchbox" style="display: none">
<h3>Quick search</h3>
- <form class="search" action="../../search.html" method="get">
+ <form class="search" action="../../../search.html" method="get">
<input type="text" name="q" />
<input type="submit" value="Go" />
<input type="hidden" name="check_keywords" value="yes" />
@@ -276,13 +276,13 @@
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
- <a href="../../genindex.html" title="General Index"
+ <a href="../../../genindex.html" title="General Index"
>index</a></li>
<li class="right" >
- <a href="../../py-modindex.html" title="Python Module Index"
+ <a href="../../../py-modindex.html" title="Python Module Index"
>modules</a> |</li>
- <li><a href="../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
- <li><a href="../index.html" >Module code</a> &raquo;</li>
+ <li><a href="../../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
+ <li><a href="../../index.html" >Module code</a> &raquo;</li>
</ul>
</div>
<div class="footer">
View
62 _modules/cmanager/crawlmanager.html → _modules/archive/cmanager/crawlmanager.html
@@ -7,38 +7,38 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
- <title>cmanager.crawlmanager &mdash; Webarchiv 1.0 documentation</title>
+ <title>archive.cmanager.crawlmanager &mdash; Webarchiv 1.0 documentation</title>
- <link rel="stylesheet" href="../../_static/nature.css" type="text/css" />
- <link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
+ <link rel="stylesheet" href="../../../_static/nature.css" type="text/css" />
+ <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<script type="text/javascript">
var DOCUMENTATION_OPTIONS = {
- URL_ROOT: '../../',
+ URL_ROOT: '../../../',
VERSION: '1.0',
COLLAPSE_INDEX: false,
FILE_SUFFIX: '.html',
HAS_SOURCE: true
};
</script>
- <script type="text/javascript" src="../../_static/jquery.js"></script>
- <script type="text/javascript" src="../../_static/underscore.js"></script>
- <script type="text/javascript" src="../../_static/doctools.js"></script>
- <link rel="top" title="Webarchiv 1.0 documentation" href="../../index.html" />
- <link rel="up" title="Module code" href="../index.html" />
+ <script type="text/javascript" src="../../../_static/jquery.js"></script>
+ <script type="text/javascript" src="../../../_static/underscore.js"></script>
+ <script type="text/javascript" src="../../../_static/doctools.js"></script>
+ <link rel="top" title="Webarchiv 1.0 documentation" href="../../../index.html" />
+ <link rel="up" title="Module code" href="../../index.html" />
</head>
<body>
<div class="related">
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
- <a href="../../genindex.html" title="General Index"
+ <a href="../../../genindex.html" title="General Index"
accesskey="I">index</a></li>
<li class="right" >
- <a href="../../py-modindex.html" title="Python Module Index"
+ <a href="../../../py-modindex.html" title="Python Module Index"
>modules</a> |</li>
- <li><a href="../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
- <li><a href="../index.html" accesskey="U">Module code</a> &raquo;</li>
+ <li><a href="../../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
+ <li><a href="../../index.html" accesskey="U">Module code</a> &raquo;</li>
</ul>
</div>
@@ -47,7 +47,7 @@
<div class="bodywrapper">
<div class="body">
- <h1>Source code for cmanager.crawlmanager</h1><div class="highlight"><pre>
+ <h1>Source code for archive.cmanager.crawlmanager</h1><div class="highlight"><pre>
<span class="c">#!/usr/bin/env python</span>
<span class="c"># encoding: utf-8</span>
@@ -58,9 +58,10 @@
<span class="n">__author__</span> <span class="o">=</span> <span class="s">&#39;Christoph Piechula&#39;</span>
<span class="kn">import</span> <span class="nn">multiprocessing.pool</span> <span class="kn">as</span> <span class="nn">mpool</span>
-<span class="kn">import</span> <span class="nn">util.files</span> <span class="kn">as</span> <span class="nn">utl</span>
-<span class="kn">import</span> <span class="nn">config.reader</span> <span class="kn">as</span> <span class="nn">config</span>
-<span class="kn">import</span> <span class="nn">crawler.crawljob</span> <span class="kn">as</span> <span class="nn">job</span>
+<span class="kn">import</span> <span class="nn">archive.util.files</span> <span class="kn">as</span> <span class="nn">utl</span>
+<span class="kn">import</span> <span class="nn">archive.config.reader</span> <span class="kn">as</span> <span class="nn">config</span>
+<span class="kn">import</span> <span class="nn">archive.crawler.crawljob</span> <span class="kn">as</span> <span class="nn">job</span>
+
<span class="kn">import</span> <span class="nn">threading</span>
<span class="kn">import</span> <span class="nn">logging</span>
@@ -89,7 +90,7 @@
<span class="k">return</span> <span class="n">ident</span>
-<div class="viewcode-block" id="CrawlerManager"><a class="viewcode-back" href="../../crawlstuff.html#cmanager.crawlmanager.CrawlerManager">[docs]</a><span class="k">class</span> <span class="nc">CrawlerManager</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span>
+<div class="viewcode-block" id="CrawlerManager"><a class="viewcode-back" href="../../../crawlstuff.html#archive.cmanager.crawlmanager.CrawlerManager">[docs]</a><span class="k">class</span> <span class="nc">CrawlerManager</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Crawljob manager, threadpool which launches a pool of</span>
<span class="sd"> threads</span>
@@ -103,7 +104,7 @@
<span class="bp">self</span><span class="o">.</span><span class="n">__urls</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">urls</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__pool</span> <span class="o">=</span> <span class="n">mpool</span><span class="o">.</span><span class="n">ThreadPool</span><span class="p">(</span><span class="n">config</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s">&#39;crawler.maxInst&#39;</span><span class="p">))</span>
-<div class="viewcode-block" id="CrawlerManager.start"><a class="viewcode-back" href="../../crawlstuff.html#cmanager.crawlmanager.CrawlerManager.start">[docs]</a> <span class="k">def</span> <span class="nf">start</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="CrawlerManager.start"><a class="viewcode-back" href="../../../crawlstuff.html#archive.cmanager.crawlmanager.CrawlerManager.start">[docs]</a> <span class="k">def</span> <span class="nf">start</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Starts threadpool with max number of instances</span>
@@ -118,14 +119,14 @@
<span class="bp">self</span><span class="o">.</span><span class="n">__done_callback</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__done_callback</span> <span class="o">=</span> <span class="bp">None</span>
</div>
-<div class="viewcode-block" id="CrawlerManager.register_done"><a class="viewcode-back" href="../../crawlstuff.html#cmanager.crawlmanager.CrawlerManager.register_done">[docs]</a> <span class="k">def</span> <span class="nf">register_done</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">):</span>
+<div class="viewcode-block" id="CrawlerManager.register_done"><a class="viewcode-back" href="../../../crawlstuff.html#archive.cmanager.crawlmanager.CrawlerManager.register_done">[docs]</a> <span class="k">def</span> <span class="nf">register_done</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Register method for callback function which is</span>
<span class="sd"> triggered after work is done</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__done_callback</span> <span class="o">=</span> <span class="n">func</span>
</div>
-<div class="viewcode-block" id="CrawlerManager.shutdown"><a class="viewcode-back" href="../../crawlstuff.html#cmanager.crawlmanager.CrawlerManager.shutdown">[docs]</a> <span class="k">def</span> <span class="nf">shutdown</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="CrawlerManager.shutdown"><a class="viewcode-back" href="../../../crawlstuff.html#archive.cmanager.crawlmanager.CrawlerManager.shutdown">[docs]</a> <span class="k">def</span> <span class="nf">shutdown</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Shuts down all currently running crawljobs</span>
<span class="sd"> and joins/closes the pool</span>
@@ -134,14 +135,7 @@
<span class="n">job</span><span class="o">.</span><span class="n">shutdown</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__pool</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">__pool</span><span class="o">.</span><span class="n">join</span><span class="p">()</span>
-
-<span class="c">###########################################################################</span>
-<span class="c"># unittest #</span>
-<span class="c">###########################################################################</span>
-</div></div>
-<span class="k">if</span> <span class="n">__name__</span> <span class="o">==</span> <span class="s">&#39;__main__&#39;</span><span class="p">:</span>
- <span class="n">cm</span> <span class="o">=</span> <span class="n">CrawlerManager</span><span class="p">(</span><span class="n">utl</span><span class="o">.</span><span class="n">unique_items_from_file</span><span class="p">(</span><span class="s">&#39;url.txt&#39;</span><span class="p">))</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">__pool</span><span class="o">.</span><span class="n">join</span><span class="p">()</span></div></div>
</pre></div>
</div>
@@ -151,7 +145,7 @@
<div class="sphinxsidebarwrapper">
<div id="searchbox" style="display: none">
<h3>Quick search</h3>
- <form class="search" action="../../search.html" method="get">
+ <form class="search" action="../../../search.html" method="get">
<input type="text" name="q" />
<input type="submit" value="Go" />
<input type="hidden" name="check_keywords" value="yes" />
@@ -170,13 +164,13 @@
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
- <a href="../../genindex.html" title="General Index"
+ <a href="../../../genindex.html" title="General Index"
>index</a></li>
<li class="right" >
- <a href="../../py-modindex.html" title="Python Module Index"
+ <a href="../../../py-modindex.html" title="Python Module Index"
>modules</a> |</li>
- <li><a href="../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
- <li><a href="../index.html" >Module code</a> &raquo;</li>
+ <li><a href="../../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
+ <li><a href="../../index.html" >Module code</a> &raquo;</li>
</ul>
</div>
<div class="footer">
View
105 _modules/cmanager/intervalmanager.html → ...les/archive/cmanager/intervalmanager.html
@@ -7,38 +7,38 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
- <title>cmanager.intervalmanager &mdash; Webarchiv 1.0 documentation</title>
+ <title>archive.cmanager.intervalmanager &mdash; Webarchiv 1.0 documentation</title>
- <link rel="stylesheet" href="../../_static/nature.css" type="text/css" />
- <link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
+ <link rel="stylesheet" href="../../../_static/nature.css" type="text/css" />
+ <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<script type="text/javascript">
var DOCUMENTATION_OPTIONS = {
- URL_ROOT: '../../',
+ URL_ROOT: '../../../',
VERSION: '1.0',
COLLAPSE_INDEX: false,
FILE_SUFFIX: '.html',
HAS_SOURCE: true
};
</script>
- <script type="text/javascript" src="../../_static/jquery.js"></script>
- <script type="text/javascript" src="../../_static/underscore.js"></script>
- <script type="text/javascript" src="../../_static/doctools.js"></script>
- <link rel="top" title="Webarchiv 1.0 documentation" href="../../index.html" />
- <link rel="up" title="Module code" href="../index.html" />
+ <script type="text/javascript" src="../../../_static/jquery.js"></script>
+ <script type="text/javascript" src="../../../_static/underscore.js"></script>
+ <script type="text/javascript" src="../../../_static/doctools.js"></script>
+ <link rel="top" title="Webarchiv 1.0 documentation" href="../../../index.html" />
+ <link rel="up" title="Module code" href="../../index.html" />
</head>
<body>
<div class="related">
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
- <a href="../../genindex.html" title="General Index"
+ <a href="../../../genindex.html" title="General Index"
accesskey="I">index</a></li>
<li class="right" >
- <a href="../../py-modindex.html" title="Python Module Index"
+ <a href="../../../py-modindex.html" title="Python Module Index"
>modules</a> |</li>
- <li><a href="../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
- <li><a href="../index.html" accesskey="U">Module code</a> &raquo;</li>
+ <li><a href="../../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
+ <li><a href="../../index.html" accesskey="U">Module code</a> &raquo;</li>
</ul>
</div>
@@ -47,7 +47,7 @@
<div class="bodywrapper">
<div class="body">
- <h1>Source code for cmanager.intervalmanager</h1><div class="highlight"><pre>
+ <h1>Source code for archive.cmanager.intervalmanager</h1><div class="highlight"><pre>
<span class="c">#!/usr/bin/env python</span>
<span class="c"># encoding: utf-8</span>
@@ -61,13 +61,14 @@
<span class="kn">import</span> <span class="nn">logging</span>
<span class="kn">import</span> <span class="nn">threading</span>
<span class="kn">import</span> <span class="nn">cmd</span>
-<span class="kn">import</span> <span class="nn">cmanager.crawlmanager</span> <span class="kn">as</span> <span class="nn">c</span>
-<span class="kn">import</span> <span class="nn">util.files</span> <span class="kn">as</span> <span class="nn">utl</span>
-<span class="kn">import</span> <span class="nn">util.times</span>
-<span class="kn">import</span> <span class="nn">config.reader</span> <span class="kn">as</span> <span class="nn">config</span>
+<span class="kn">import</span> <span class="nn">archive.cmanager.crawlmanager</span> <span class="kn">as</span> <span class="nn">c</span>
+<span class="kn">import</span> <span class="nn">archive.util.files</span> <span class="kn">as</span> <span class="nn">utl</span>
+<span class="kn">import</span> <span class="nn">archive.util.times</span> <span class="kn">as</span> <span class="nn">times</span>
+<span class="kn">import</span> <span class="nn">archive.config.reader</span> <span class="kn">as</span> <span class="nn">config</span>
-<div class="viewcode-block" id="IntervalManager"><a class="viewcode-back" href="../../intervalmanager.html#cmanager.intervalmanager.IntervalManager">[docs]</a><span class="k">class</span> <span class="nc">IntervalManager</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span>
+
+<div class="viewcode-block" id="IntervalManager"><a class="viewcode-back" href="../../../intervalmanager.html#archive.cmanager.intervalmanager.IntervalManager">[docs]</a><span class="k">class</span> <span class="nc">IntervalManager</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> IntervalManager, manages crawling intervals including start, stop functionality</span>
<span class="sd"> &quot;&quot;&quot;</span>
@@ -92,7 +93,7 @@
<span class="bp">self</span><span class="o">.</span><span class="n">__status_lock</span><span class="o">.</span><span class="n">release</span><span class="p">()</span>
<span class="nd">@property</span>
-<div class="viewcode-block" id="IntervalManager.status"><a class="viewcode-back" href="../../intervalmanager.html#cmanager.intervalmanager.IntervalManager.status">[docs]</a> <span class="k">def</span> <span class="nf">status</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="IntervalManager.status"><a class="viewcode-back" href="../../../intervalmanager.html#archive.cmanager.intervalmanager.IntervalManager.status">[docs]</a> <span class="k">def</span> <span class="nf">status</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> :returns: current system status</span>
<span class="sd"> &quot;&quot;&quot;</span>
@@ -107,7 +108,7 @@
<span class="sd"> &quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">kill</span><span class="p">()</span>
-<div class="viewcode-block" id="IntervalManager.start"><a class="viewcode-back" href="../../intervalmanager.html#cmanager.intervalmanager.IntervalManager.start">[docs]</a> <span class="k">def</span> <span class="nf">start</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">delay_in_sec</span><span class="o">=</span><span class="mi">0</span><span class="p">):</span>
+<div class="viewcode-block" id="IntervalManager.start"><a class="viewcode-back" href="../../../intervalmanager.html#archive.cmanager.intervalmanager.IntervalManager.start">[docs]</a> <span class="k">def</span> <span class="nf">start</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">delay_in_sec</span><span class="o">=</span><span class="mi">0</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Starts the intervalmanager, which starts</span>
<span class="sd"> the crawlmanager procedure with a given delay</span>
@@ -125,26 +126,24 @@
<span class="n">time</span><span class="o">.</span><span class="n">sleep</span><span class="p">(</span><span class="n">delay_in_sec</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">status</span> <span class="o">!=</span> <span class="s">&#39;stop&#39;</span><span class="p">:</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">__cmanager</span> <span class="o">=</span> <span class="n">c</span><span class="o">.</span><span class="n">CrawlerManager</span><span class="p">(</span>
- <span class="n">utl</span><span class="o">.</span><span class="n">unique_items_from_file</span><span class="p">(</span>
- <span class="n">config</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s">&#39;crawler.urllistpath&#39;</span><span class="p">)))</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">__start_time</span> <span class="o">=</span> <span class="n">util</span><span class="o">.</span><span class="n">times</span><span class="o">.</span><span class="n">get_localtime_sec</span><span class="p">()</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">__cmanager</span> <span class="o">=</span> <span class="n">c</span><span class="o">.</span><span class="n">CrawlerManager</span><span class="p">(</span><span class="n">utl</span><span class="o">.</span><span class="n">unique_items_from_file</span><span class="p">())</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">__start_time</span> <span class="o">=</span> <span class="n">times</span><span class="o">.</span><span class="n">get_localtime_sec</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__cmanager</span><span class="o">.</span><span class="n">register_done</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">crawling_done_callback</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__set_status</span><span class="p">(</span><span class="s">&#39;active&#39;</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__cmanager</span><span class="o">.</span><span class="n">start</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__set_status</span><span class="p">(</span><span class="s">&#39;ready&#39;</span><span class="p">)</span>
<span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s">&#39;Interval Manager finished - current state set to ready.&#39;</span><span class="p">)</span>
</div>
-<div class="viewcode-block" id="IntervalManager.crawling_done_callback"><a class="viewcode-back" href="../../intervalmanager.html#cmanager.intervalmanager.IntervalManager.crawling_done_callback">[docs]</a> <span class="k">def</span> <span class="nf">crawling_done_callback</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="IntervalManager.crawling_done_callback"><a class="viewcode-back" href="../../../intervalmanager.html#archive.cmanager.intervalmanager.IntervalManager.crawling_done_callback">[docs]</a> <span class="k">def</span> <span class="nf">crawling_done_callback</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Registers end time of last crawl and calculates</span>
<span class="sd"> delay and starts next run</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">status</span> <span class="o">!=</span> <span class="s">&#39;stop&#39;</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__set_status</span><span class="p">(</span><span class="s">&#39;ready&#39;</span><span class="p">)</span>
- <span class="n">current_time</span> <span class="o">=</span> <span class="n">util</span><span class="o">.</span><span class="n">times</span><span class="o">.</span><span class="n">get_localtime_sec</span><span class="p">()</span>
+ <span class="n">current_time</span> <span class="o">=</span> <span class="n">times</span><span class="o">.</span><span class="n">get_localtime_sec</span><span class="p">()</span>
<span class="n">next_crawl_time</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__start_time</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">__interval</span>
<span class="k">while</span> <span class="n">next_crawl_time</span> <span class="o">&lt;</span> <span class="n">current_time</span><span class="p">:</span>
@@ -156,7 +155,7 @@
<span class="bp">self</span><span class="o">.</span><span class="n">__set_status</span><span class="p">(</span><span class="s">&#39;ready&#39;</span><span class="p">)</span>
<span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s">&#39;Interval Manager finished - current state set to ready.&#39;</span><span class="p">)</span>
</div>
-<div class="viewcode-block" id="IntervalManager.stop"><a class="viewcode-back" href="../../intervalmanager.html#cmanager.intervalmanager.IntervalManager.stop">[docs]</a> <span class="k">def</span> <span class="nf">stop</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="IntervalManager.stop"><a class="viewcode-back" href="../../../intervalmanager.html#archive.cmanager.intervalmanager.IntervalManager.stop">[docs]</a> <span class="k">def</span> <span class="nf">stop</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Stopps the interval manager</span>
<span class="sd"> &quot;&quot;&quot;</span>
@@ -165,7 +164,7 @@
<span class="bp">self</span><span class="o">.</span><span class="n">__set_status</span><span class="p">(</span><span class="s">&#39;stop&#39;</span><span class="p">)</span>
</div>
-<div class="viewcode-block" id="IntervalManager.kill"><a class="viewcode-back" href="../../intervalmanager.html#cmanager.intervalmanager.IntervalManager.kill">[docs]</a> <span class="k">def</span> <span class="nf">kill</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="IntervalManager.kill"><a class="viewcode-back" href="../../../intervalmanager.html#archive.cmanager.intervalmanager.IntervalManager.kill">[docs]</a> <span class="k">def</span> <span class="nf">kill</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Kills the system hard, like ctrl + c</span>
<span class="sd"> &quot;&quot;&quot;</span>
@@ -176,7 +175,7 @@
<span class="bp">self</span><span class="o">.</span><span class="n">__kill_mtx</span><span class="o">.</span><span class="n">release</span><span class="p">()</span>
</div></div>
-<div class="viewcode-block" id="CrawlerShell"><a class="viewcode-back" href="../../intervalmanager.html#cmanager.intervalmanager.CrawlerShell">[docs]</a><span class="k">class</span> <span class="nc">CrawlerShell</span><span class="p">(</span><span class="n">cmd</span><span class="o">.</span><span class="n">Cmd</span><span class="p">):</span>
+<div class="viewcode-block" id="CrawlerShell"><a class="viewcode-back" href="../../../intervalmanager.html#archive.cmanager.intervalmanager.CrawlerShell">[docs]</a><span class="k">class</span> <span class="nc">CrawlerShell</span><span class="p">(</span><span class="n">cmd</span><span class="o">.</span><span class="n">Cmd</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Interactive command shell to start, stop, kill and quit crawling procedure</span>
<span class="sd"> &quot;&quot;&quot;</span>
@@ -194,25 +193,25 @@
<span class="bp">self</span><span class="o">.</span><span class="n">__activeflag</span> <span class="o">=</span> <span class="bp">False</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__quitflag</span> <span class="o">=</span> <span class="bp">False</span>
-<div class="viewcode-block" id="CrawlerShell.set_quitflag"><a class="viewcode-back" href="../../intervalmanager.html#cmanager.intervalmanager.CrawlerShell.set_quitflag">[docs]</a> <span class="k">def</span> <span class="nf">set_quitflag</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span>
+<div class="viewcode-block" id="CrawlerShell.set_quitflag"><a class="viewcode-back" href="../../../intervalmanager.html#archive.cmanager.intervalmanager.CrawlerShell.set_quitflag">[docs]</a> <span class="k">def</span> <span class="nf">set_quitflag</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Setting &#39;quit&#39; flag</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__quitflag</span> <span class="o">=</span> <span class="n">state</span>
</div>
-<div class="viewcode-block" id="CrawlerShell.set_activeflag"><a class="viewcode-back" href="../../intervalmanager.html#cmanager.intervalmanager.CrawlerShell.set_activeflag">[docs]</a> <span class="k">def</span> <span class="nf">set_activeflag</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span>
+<div class="viewcode-block" id="CrawlerShell.set_activeflag"><a class="viewcode-back" href="../../../intervalmanager.html#archive.cmanager.intervalmanager.CrawlerShell.set_activeflag">[docs]</a> <span class="k">def</span> <span class="nf">set_activeflag</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Setting &#39;active&#39; flag</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__activeflag</span> <span class="o">=</span> <span class="n">state</span>
</div>
-<div class="viewcode-block" id="CrawlerShell.activeflag"><a class="viewcode-back" href="../../intervalmanager.html#cmanager.intervalmanager.CrawlerShell.activeflag">[docs]</a> <span class="k">def</span> <span class="nf">activeflag</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="CrawlerShell.activeflag"><a class="viewcode-back" href="../../../intervalmanager.html#archive.cmanager.intervalmanager.CrawlerShell.activeflag">[docs]</a> <span class="k">def</span> <span class="nf">activeflag</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Getter for &#39;active&#39; flag</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">__activeflag</span>
</div>
-<div class="viewcode-block" id="CrawlerShell.quitflag"><a class="viewcode-back" href="../../intervalmanager.html#cmanager.intervalmanager.CrawlerShell.quitflag">[docs]</a> <span class="k">def</span> <span class="nf">quitflag</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="CrawlerShell.quitflag"><a class="viewcode-back" href="../../../intervalmanager.html#archive.cmanager.intervalmanager.CrawlerShell.quitflag">[docs]</a> <span class="k">def</span> <span class="nf">quitflag</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Getter for &#39;quit&#39; flag</span>
<span class="sd"> &quot;&quot;&quot;</span>
@@ -229,10 +228,7 @@
<span class="c"># Commands:</span>
-<div class="viewcode-block" id="CrawlerShell.do_start"><a class="viewcode-back" href="../../intervalmanager.html#cmanager.intervalmanager.CrawlerShell.do_start">[docs]</a> <span class="k">def</span> <span class="nf">do_start</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">arg</span><span class="p">):</span>
- <span class="sd">&quot;&quot;&quot;</span>
-<span class="sd"> Invokes start command</span>
-<span class="sd"> &quot;&quot;&quot;</span>
+<div class="viewcode-block" id="CrawlerShell.do_start"><a class="viewcode-back" href="../../../intervalmanager.html#archive.cmanager.intervalmanager.CrawlerShell.do_start">[docs]</a> <span class="k">def</span> <span class="nf">do_start</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">arg</span><span class="p">):</span>
<span class="s">&#39;Starts crawljobs if stopped previously.&#39;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">__activeflag</span> <span class="o">==</span> <span class="bp">False</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__activeflag</span> <span class="o">=</span> <span class="bp">True</span>
@@ -241,42 +237,29 @@
<span class="bp">self</span><span class="o">.</span><span class="n">__cv</span><span class="o">.</span><span class="n">release</span><span class="p">()</span>
<span class="k">return</span> <span class="bp">False</span>
</div>
-<div class="viewcode-block" id="CrawlerShell.do_status"><a class="viewcode-back" href="../../intervalmanager.html#cmanager.intervalmanager.CrawlerShell.do_status">[docs]</a> <span class="k">def</span> <span class="nf">do_status</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">arg</span><span class="p">):</span>
- <span class="sd">&quot;&quot;&quot;</span>
-<span class="sd"> Invokes status command</span>
-<span class="sd"> &quot;&quot;&quot;</span>
+<div class="viewcode-block" id="CrawlerShell.do_status"><a class="viewcode-back" href="../../../intervalmanager.html#archive.cmanager.intervalmanager.CrawlerShell.do_status">[docs]</a> <span class="k">def</span> <span class="nf">do_status</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">arg</span><span class="p">):</span>
<span class="s">&#39;Status of crawler an intervalmanager.&#39;</span>
<span class="k">print</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__imanager</span><span class="o">.</span><span class="n">status</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">False</span>
</div>
-<div class="viewcode-block" id="CrawlerShell.do_stop"><a class="viewcode-back" href="../../intervalmanager.html#cmanager.intervalmanager.CrawlerShell.do_stop">[docs]</a> <span class="k">def</span> <span class="nf">do_stop</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">arg</span><span class="p">):</span>
- <span class="sd">&quot;&quot;&quot;</span>
-<span class="sd"> Invokes stop command</span>
-<span class="sd"> &quot;&quot;&quot;</span>
+<div class="viewcode-block" id="CrawlerShell.do_stop"><a class="viewcode-back" href="../../../intervalmanager.html#archive.cmanager.intervalmanager.CrawlerShell.do_stop">[docs]</a> <span class="k">def</span> <span class="nf">do_stop</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">arg</span><span class="p">):</span>
<span class="s">&#39;Stopps self.__imanager.&#39;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__imanager</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
<span class="k">return</span> <span class="bp">False</span>
</div>
-<div class="viewcode-block" id="CrawlerShell.do_quit"><a class="viewcode-back" href="../../intervalmanager.html#cmanager.intervalmanager.CrawlerShell.do_quit">[docs]</a> <span class="k">def</span> <span class="nf">do_quit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">arg</span><span class="p">):</span>
- <span class="sd">&quot;&quot;&quot;</span>
-<span class="sd"> Invokes quit command</span>
-<span class="sd"> &quot;&quot;&quot;</span>
+<div class="viewcode-block" id="CrawlerShell.do_quit"><a class="viewcode-back" href="../../../intervalmanager.html#archive.cmanager.intervalmanager.CrawlerShell.do_quit">[docs]</a> <span class="k">def</span> <span class="nf">do_quit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">arg</span><span class="p">):</span>
<span class="s">&#39;Quits Intervalmanager, Crawljobs will still run until finished.&#39;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__cv</span><span class="o">.</span><span class="n">acquire</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__quitflag</span> <span class="o">=</span> <span class="bp">True</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__cv</span><span class="o">.</span><span class="n">notify</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__cv</span><span class="o">.</span><span class="n">release</span><span class="p">()</span>
<span class="k">return</span> <span class="bp">True</span>
</div>
-<div class="viewcode-block" id="CrawlerShell.do_EOF"><a class="viewcode-back" href="../../intervalmanager.html#cmanager.intervalmanager.CrawlerShell.do_EOF">[docs]</a> <span class="k">def</span> <span class="nf">do_EOF</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">arg</span><span class="p">):</span>
+<div class="viewcode-block" id="CrawlerShell.do_EOF"><a class="viewcode-back" href="../../../intervalmanager.html#archive.cmanager.intervalmanager.CrawlerShell.do_EOF">[docs]</a> <span class="k">def</span> <span class="nf">do_EOF</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">arg</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Invokes quit on EOF</span>
<span class="sd"> &quot;&quot;&quot;</span>
- <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">do_quit</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span>
-
-<span class="c">###########################################################################</span>
-<span class="c"># unittest #</span>
-<span class="c">###########################################################################</span></div></div>
+ <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">do_quit</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span></div></div>
</pre></div>
</div>
@@ -286,7 +269,7 @@
<div class="sphinxsidebarwrapper">
<div id="searchbox" style="display: none">
<h3>Quick search</h3>
- <form class="search" action="../../search.html" method="get">
+ <form class="search" action="../../../search.html" method="get">
<input type="text" name="q" />
<input type="submit" value="Go" />
<input type="hidden" name="check_keywords" value="yes" />
@@ -305,13 +288,13 @@
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
- <a href="../../genindex.html" title="General Index"
+ <a href="../../../genindex.html" title="General Index"
>index</a></li>
<li class="right" >
- <a href="../../py-modindex.html" title="Python Module Index"
+ <a href="../../../py-modindex.html" title="Python Module Index"
>modules</a> |</li>
- <li><a href="../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
- <li><a href="../index.html" >Module code</a> &raquo;</li>
+ <li><a href="../../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
+ <li><a href="../../index.html" >Module code</a> &raquo;</li>
</ul>
</div>
<div class="footer">
View
44 _modules/config/reader.html → _modules/archive/config/reader.html
@@ -7,38 +7,38 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
- <title>config.reader &mdash; Webarchiv 1.0 documentation</title>
+ <title>archive.config.reader &mdash; Webarchiv 1.0 documentation</title>
- <link rel="stylesheet" href="../../_static/nature.css" type="text/css" />
- <link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
+ <link rel="stylesheet" href="../../../_static/nature.css" type="text/css" />
+ <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<script type="text/javascript">
var DOCUMENTATION_OPTIONS = {
- URL_ROOT: '../../',
+ URL_ROOT: '../../../',
VERSION: '1.0',
COLLAPSE_INDEX: false,
FILE_SUFFIX: '.html',
HAS_SOURCE: true
};
</script>
- <script type="text/javascript" src="../../_static/jquery.js"></script>
- <script type="text/javascript" src="../../_static/underscore.js"></script>
- <script type="text/javascript" src="../../_static/doctools.js"></script>
- <link rel="top" title="Webarchiv 1.0 documentation" href="../../index.html" />
- <link rel="up" title="Module code" href="../index.html" />
+ <script type="text/javascript" src="../../../_static/jquery.js"></script>
+ <script type="text/javascript" src="../../../_static/underscore.js"></script>
+ <script type="text/javascript" src="../../../_static/doctools.js"></script>
+ <link rel="top" title="Webarchiv 1.0 documentation" href="../../../index.html" />
+ <link rel="up" title="Module code" href="../../index.html" />
</head>
<body>
<div class="related">
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
- <a href="../../genindex.html" title="General Index"
+ <a href="../../../genindex.html" title="General Index"
accesskey="I">index</a></li>
<li class="right" >
- <a href="../../py-modindex.html" title="Python Module Index"
+ <a href="../../../py-modindex.html" title="Python Module Index"
>modules</a> |</li>
- <li><a href="../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
- <li><a href="../index.html" accesskey="U">Module code</a> &raquo;</li>
+ <li><a href="../../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
+ <li><a href="../../index.html" accesskey="U">Module code</a> &raquo;</li>
</ul>
</div>
@@ -47,24 +47,24 @@
<div class="bodywrapper">
<div class="body">
- <h1>Source code for config.reader</h1><div class="highlight"><pre>
+ <h1>Source code for archive.config.reader</h1><div class="highlight"><pre>
<span class="c">#!/usr/bin/env python</span>
<span class="c"># encoding: utf-8</span>
<span class="n">__author__</span> <span class="o">=</span> <span class="s">&#39;Florian Bauer&#39;</span>
-<span class="kn">import</span> <span class="nn">config.options</span> <span class="kn">as</span> <span class="nn">options</span>
+<span class="kn">import</span> <span class="nn">archive.config.options</span> <span class="kn">as</span> <span class="nn">options</span>
-<div class="viewcode-block" id="get_default"><a class="viewcode-back" href="../../config.html#config.reader.get_default">[docs]</a><span class="k">def</span> <span class="nf">get_default</span><span class="p">(</span><span class="n">value</span><span class="p">):</span>
+<div class="viewcode-block" id="get_default"><a class="viewcode-back" href="../../../config.html#archive.config.reader.get_default">[docs]</a><span class="k">def</span> <span class="nf">get_default</span><span class="p">(</span><span class="n">value</span><span class="p">):</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">return</span> <span class="n">options</span><span class="o">.</span><span class="n">default_options</span><span class="p">[</span><span class="n">value</span><span class="p">]</span>
<span class="k">except</span> <span class="ne">KeyError</span><span class="p">:</span>
<span class="k">return</span> <span class="s">&#39;&#39;</span>
</div>
-<div class="viewcode-block" id="get"><a class="viewcode-back" href="../../config.html#config.reader.get">[docs]</a><span class="k">def</span> <span class="nf">get</span><span class="p">(</span><span class="n">value</span><span class="p">):</span>
+<div class="viewcode-block" id="get"><a class="viewcode-back" href="../../../config.html#archive.config.reader.get">[docs]</a><span class="k">def</span> <span class="nf">get</span><span class="p">(</span><span class="n">value</span><span class="p">):</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">return</span> <span class="n">options</span><span class="o">.</span><span class="n">actual_options</span><span class="p">[</span><span class="n">value</span><span class="p">]</span>
<span class="k">except</span> <span class="ne">KeyError</span><span class="p">:</span>
@@ -85,7 +85,7 @@
<div class="sphinxsidebarwrapper">
<div id="searchbox" style="display: none">
<h3>Quick search</h3>
- <form class="search" action="../../search.html" method="get">
+ <form class="search" action="../../../search.html" method="get">
<input type="text" name="q" />
<input type="submit" value="Go" />
<input type="hidden" name="check_keywords" value="yes" />
@@ -104,13 +104,13 @@
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
- <a href="../../genindex.html" title="General Index"
+ <a href="../../../genindex.html" title="General Index"
>index</a></li>
<li class="right" >
- <a href="../../py-modindex.html" title="Python Module Index"
+ <a href="../../../py-modindex.html" title="Python Module Index"
>modules</a> |</li>
- <li><a href="../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
- <li><a href="../index.html" >Module code</a> &raquo;</li>
+ <li><a href="../../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
+ <li><a href="../../index.html" >Module code</a> &raquo;</li>
</ul>
</div>
<div class="footer">
View
60 _modules/crawler/cleaner.html → _modules/archive/crawler/cleaner.html
@@ -7,38 +7,38 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
- <title>crawler.cleaner &mdash; Webarchiv 1.0 documentation</title>
+ <title>archive.crawler.cleaner &mdash; Webarchiv 1.0 documentation</title>
- <link rel="stylesheet" href="../../_static/nature.css" type="text/css" />
- <link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
+ <link rel="stylesheet" href="../../../_static/nature.css" type="text/css" />
+ <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<script type="text/javascript">
var DOCUMENTATION_OPTIONS = {
- URL_ROOT: '../../',
+ URL_ROOT: '../../../',
VERSION: '1.0',
COLLAPSE_INDEX: false,
FILE_SUFFIX: '.html',
HAS_SOURCE: true
};
</script>
- <script type="text/javascript" src="../../_static/jquery.js"></script>
- <script type="text/javascript" src="../../_static/underscore.js"></script>
- <script type="text/javascript" src="../../_static/doctools.js"></script>
- <link rel="top" title="Webarchiv 1.0 documentation" href="../../index.html" />
- <link rel="up" title="Module code" href="../index.html" />
+ <script type="text/javascript" src="../../../_static/jquery.js"></script>
+ <script type="text/javascript" src="../../../_static/underscore.js"></script>
+ <script type="text/javascript" src="../../../_static/doctools.js"></script>
+ <link rel="top" title="Webarchiv 1.0 documentation" href="../../../index.html" />
+ <link rel="up" title="Module code" href="../../index.html" />
</head>
<body>
<div class="related">
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
- <a href="../../genindex.html" title="General Index"
+ <a href="../../../genindex.html" title="General Index"
accesskey="I">index</a></li>
<li class="right" >
- <a href="../../py-modindex.html" title="Python Module Index"
+ <a href="../../../py-modindex.html" title="Python Module Index"
>modules</a> |</li>
- <li><a href="../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
- <li><a href="../index.html" accesskey="U">Module code</a> &raquo;</li>
+ <li><a href="../../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
+ <li><a href="../../index.html" accesskey="U">Module code</a> &raquo;</li>
</ul>
</div>
@@ -47,7 +47,7 @@
<div class="bodywrapper">
<div class="body">
- <h1>Source code for crawler.cleaner</h1><div class="highlight"><pre>
+ <h1>Source code for archive.crawler.cleaner</h1><div class="highlight"><pre>
<span class="c">#!/usr/bin/env python</span>
<span class="c"># encoding: utf-8</span>
@@ -64,12 +64,12 @@
<span class="kn">import</span> <span class="nn">unittest</span>
<span class="kn">import</span> <span class="nn">itertools</span>
-<span class="kn">import</span> <span class="nn">util.times</span> <span class="kn">as</span> <span class="nn">times</span>
-<span class="kn">import</span> <span class="nn">crawler.filter</span> <span class="kn">as</span> <span class="nn">filter</span>
-<span class="kn">from</span> <span class="nn">crawler.metadata</span> <span class="kn">import</span> <span class="n">MetaData</span>
+<span class="kn">import</span> <span class="nn">archive.util.times</span> <span class="kn">as</span> <span class="nn">times</span>
+<span class="kn">import</span> <span class="nn">archive.crawler.filter</span> <span class="kn">as</span> <span class="nn">filter</span>
+<span class="kn">from</span> <span class="nn">archive.crawler.metadata</span> <span class="kn">import</span> <span class="n">MetaData</span>
-<div class="viewcode-block" id="Cleaner"><a class="viewcode-back" href="../../cleaner.html#crawler.cleaner.Cleaner">[docs]</a><span class="k">class</span> <span class="nc">Cleaner</span><span class="p">:</span>
+<div class="viewcode-block" id="Cleaner"><a class="viewcode-back" href="../../../cleaner.html#archive.crawler.cleaner.Cleaner">[docs]</a><span class="k">class</span> <span class="nc">Cleaner</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Cleaner submodule, walk dir tree, cleans and restructures</span>
<span class="sd"> tmp_crawler_folder hierarchy and calls filter subsystem</span>
@@ -89,7 +89,7 @@
<span class="sd"> :file_name: name to content file itself</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">src_file</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">tmp_crawler_folder</span><span class="p">,</span> <span class="n">file_name</span><span class="p">)</span>
- <span class="n">dest_file</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">tmp_crawler_folder</span><span class="p">,</span> <span class="s">&#39;____data&#39;</span><span class="p">)</span>
+ <span class="n">dest_file</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">tmp_crawler_folder</span><span class="p">,</span> <span class="s">&#39;__data__&#39;</span><span class="p">)</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">os</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">src_file</span><span class="p">,</span> <span class="n">dest_file</span><span class="p">)</span>
@@ -115,13 +115,13 @@
<span class="n">os</span><span class="o">.</span><span class="n">remove</span><span class="p">(</span><span class="n">dest_file</span><span class="p">)</span>
<span class="nd">@property</span>
-<div class="viewcode-block" id="Cleaner.meta_list"><a class="viewcode-back" href="../../cleaner.html#crawler.cleaner.Cleaner.meta_list">[docs]</a> <span class="k">def</span> <span class="nf">meta_list</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="Cleaner.meta_list"><a class="viewcode-back" href="../../../cleaner.html#archive.crawler.cleaner.Cleaner.meta_list">[docs]</a> <span class="k">def</span> <span class="nf">meta_list</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> :returns: metalist</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">__mdlist</span>
</div>
-<div class="viewcode-block" id="Cleaner.restructure"><a class="viewcode-back" href="../../cleaner.html#crawler.cleaner.Cleaner.restructure">[docs]</a> <span class="k">def</span> <span class="nf">restructure</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="Cleaner.restructure"><a class="viewcode-back" href="../../../cleaner.html#archive.crawler.cleaner.Cleaner.restructure">[docs]</a> <span class="k">def</span> <span class="nf">restructure</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Walks through crawler temp folder hierarchy calling internal</span>
<span class="sd"> restructure method to &#39;normalize&#39; file hierarchy</span>
@@ -135,7 +135,7 @@
<span class="k">except</span> <span class="ne">OSError</span><span class="p">:</span>
<span class="n">logging</span><span class="o">.</span><span class="n">exception</span><span class="p">(</span><span class="s">&quot;cannot walk through dir structure.&quot;</span><span class="p">)</span>
</div>
-<div class="viewcode-block" id="Cleaner.clean_empty"><a class="viewcode-back" href="../../cleaner.html#crawler.cleaner.Cleaner.clean_empty">[docs]</a> <span class="k">def</span> <span class="nf">clean_empty</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="Cleaner.clean_empty"><a class="viewcode-back" href="../../../cleaner.html#archive.crawler.cleaner.Cleaner.clean_empty">[docs]</a> <span class="k">def</span> <span class="nf">clean_empty</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Removes empty files and folders</span>
<span class="sd"> &quot;&quot;&quot;</span>
@@ -167,19 +167,19 @@
<span class="n">shutil</span><span class="o">.</span><span class="n">copytree</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__raw_data</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">__restruct_test_data</span><span class="p">)</span>
<span class="c"># copying &#39;raw restructured&#39; data for clean empty testing</span>
<span class="n">shutil</span><span class="o">.</span><span class="n">copytree</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__clean_should_be</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">__clean_test_data</span><span class="p">)</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">__clean_test_dataer</span> <span class="o">=</span> <span class="n">Cleaner</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__restruct_test_data</span><span class="p">)</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">__clean_test_data</span> <span class="o">=</span> <span class="n">Cleaner</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__restruct_test_data</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">compare_filetree</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">should_be</span><span class="p">,</span> <span class="n">test_data</span><span class="p">):</span>
<span class="n">should_be</span> <span class="o">=</span> <span class="n">content_helper</span><span class="p">(</span><span class="n">should_be</span><span class="p">)</span>
<span class="n">really_is</span> <span class="o">=</span> <span class="n">content_helper</span><span class="p">(</span><span class="n">test_data</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">should_be</span><span class="p">,</span> <span class="n">really_is</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">test_restructure</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">__clean_test_dataer</span><span class="o">.</span><span class="n">restructure</span><span class="p">()</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">__clean_test_data</span><span class="o">.</span><span class="n">restructure</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">compare_filetree</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__restruct_should_be</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">__restruct_test_data</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">test_clean_empty</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
- <span class="bp">self</span><span class="o">.</span><span class="n">__clean_test_dataer</span><span class="o">.</span><span class="n">clean_empty</span><span class="p">()</span>
+ <span class="bp">self</span><span class="o">.</span><span class="n">__clean_test_data</span><span class="o">.</span><span class="n">clean_empty</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">compare_filetree</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__clean_should_be</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">__clean_test_data</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">tearDown</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
@@ -196,7 +196,7 @@
<div class="sphinxsidebarwrapper">
<div id="searchbox" style="display: none">
<h3>Quick search</h3>
- <form class="search" action="../../search.html" method="get">
+ <form class="search" action="../../../search.html" method="get">
<input type="text" name="q" />
<input type="submit" value="Go" />
<input type="hidden" name="check_keywords" value="yes" />
@@ -215,13 +215,13 @@
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
- <a href="../../genindex.html" title="General Index"
+ <a href="../../../genindex.html" title="General Index"
>index</a></li>
<li class="right" >
- <a href="../../py-modindex.html" title="Python Module Index"
+ <a href="../../../py-modindex.html" title="Python Module Index"
>modules</a> |</li>
- <li><a href="../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
- <li><a href="../index.html" >Module code</a> &raquo;</li>
+ <li><a href="../../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
+ <li><a href="../../index.html" >Module code</a> &raquo;</li>
</ul>
</div>
<div class="footer">
View
85 _modules/crawler/crawljob.html → _modules/archive/crawler/crawljob.html
@@ -7,38 +7,38 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
- <title>crawler.crawljob &mdash; Webarchiv 1.0 documentation</title>
+ <title>archive.crawler.crawljob &mdash; Webarchiv 1.0 documentation</title>
- <link rel="stylesheet" href="../../_static/nature.css" type="text/css" />
- <link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
+ <link rel="stylesheet" href="../../../_static/nature.css" type="text/css" />
+ <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<script type="text/javascript">
var DOCUMENTATION_OPTIONS = {
- URL_ROOT: '../../',
+ URL_ROOT: '../../../',
VERSION: '1.0',
COLLAPSE_INDEX: false,
FILE_SUFFIX: '.html',
HAS_SOURCE: true
};
</script>
- <script type="text/javascript" src="../../_static/jquery.js"></script>
- <script type="text/javascript" src="../../_static/underscore.js"></script>
- <script type="text/javascript" src="../../_static/doctools.js"></script>
- <link rel="top" title="Webarchiv 1.0 documentation" href="../../index.html" />
- <link rel="up" title="Module code" href="../index.html" />
+ <script type="text/javascript" src="../../../_static/jquery.js"></script>
+ <script type="text/javascript" src="../../../_static/underscore.js"></script>
+ <script type="text/javascript" src="../../../_static/doctools.js"></script>
+ <link rel="top" title="Webarchiv 1.0 documentation" href="../../../index.html" />
+ <link rel="up" title="Module code" href="../../index.html" />
</head>
<body>
<div class="related">
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
- <a href="../../genindex.html" title="General Index"
+ <a href="../../../genindex.html" title="General Index"
accesskey="I">index</a></li>
<li class="right" >
- <a href="../../py-modindex.html" title="Python Module Index"
+ <a href="../../../py-modindex.html" title="Python Module Index"
>modules</a> |</li>
- <li><a href="../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
- <li><a href="../index.html" accesskey="U">Module code</a> &raquo;</li>
+ <li><a href="../../../index.html">Webarchiv 1.0 documentation</a> &raquo;</li>
+ <li><a href="../../index.html" accesskey="U">Module code</a> &raquo;</li>
</ul>
</div>
@@ -47,7 +47,7 @@
<div class="bodywrapper">
<div class="body">
- <h1>Source code for crawler.crawljob</h1><div class="highlight"><pre>
+ <h1>Source code for archive.crawler.crawljob</h1><div class="highlight"><pre>
<span class="c">#!/usr/bin/env python</span>
<span class="c"># encoding: utf-8</span>
@@ -64,21 +64,20 @@
<span class="kn">import</span> <span class="nn">time</span>
<span class="kn">import</span> <span class="nn">shutil</span>
-<span class="kn">import</span> <span class="nn">config.reader</span> <span class="kn">as</span> <span class="nn">config</span>
-<span class="kn">import</span> <span class="nn">crawler.wget</span> <span class="kn">as</span> <span class="nn">wget</span>
-<span class="kn">import</span> <span class="nn">util.paths</span> <span class="kn">as</span> <span class="nn">paths</span>
-<span class="kn">import</span> <span class="nn">crawler.cleaner</span> <span class="kn">as</span> <span class="nn">cleaner</span>
-<span class="kn">import</span> <span class="nn">crawler.xmlgen</span> <span class="kn">as</span> <span class="nn">xmlgen</span>
-<span class="kn">import</span> <span class="nn">util.filelock</span> <span class="kn">as</span> <span class="nn">lock</span>
-<span class="kn">import</span> <span class="nn">crawler.git</span> <span class="kn">as</span> <span class="nn">git</span>
-<span class="kn">import</span> <span class="nn">crawler.exceptions</span>
-<span class="kn">import</span> <span class="nn">crawler.dbgen</span>
+<span class="kn">import</span> <span class="nn">archive.crawler.wget</span> <span class="kn">as</span> <span class="nn">wget</span>
+<span class="kn">import</span> <span class="nn">archive.util.paths</span> <span class="kn">as</span> <span class="nn">paths</span>
+<span class="kn">import</span> <span class="nn">archive.crawler.cleaner</span> <span class="kn">as</span> <span class="nn">cleaner</span>
+<span class="kn">import</span> <span class="nn">archive.crawler.xmlgen</span> <span class="kn">as</span> <span class="nn">xmlgen</span>
+<span class="kn">import</span> <span class="nn">archive.util.filelock</span> <span class="kn">as</span> <span class="nn">lock</span>
+<span class="kn">import</span> <span class="nn">archive.crawler.git</span> <span class="kn">as</span> <span class="nn">git</span>
+<span class="kn">import</span> <span class="nn">archive.crawler.exceptions</span> <span class="kn">as</span> <span class="nn">exceptions</span>
+<span class="kn">import</span> <span class="nn">archive.crawler.dbgen</span> <span class="kn">as</span> <span class="nn">dbgen</span>
-<span class="kn">from</span> <span class="nn">dbrecover.pickle_recover</span> <span class="kn">import</span> <span class="n">PickleDBRecover</span>
-<span class="kn">from</span> <span class="nn">crawler.rsync</span> <span class="kn">import</span> <span class="n">rsync</span>
+<span class="kn">from</span> <span class="nn">archive.dbrecover.pickle_recover</span> <span class="kn">import</span> <span class="n">PickleDBRecover</span>
+<span class="kn">from</span> <span class="nn">archive.crawler.rsync</span> <span class="kn">import</span> <span class="n">rsync</span>
-<div class="viewcode-block" id="CrawlJob"><a class="viewcode-back" href="../../crawlstuff.html#crawler.crawljob.CrawlJob">[docs]</a><span class="k">class</span> <span class="nc">CrawlJob</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span>
+<div class="viewcode-block" id="CrawlJob"><a class="viewcode-back" href="../../../crawlstuff.html#archive.crawler.crawljob.CrawlJob">[docs]</a><span class="k">class</span> <span class="nc">CrawlJob</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A CrawlJob Process that starts with crawling</span>
<span class="sd"> and ends with db commit.</span>
@@ -91,13 +90,13 @@
<span class="bp">self</span><span class="o">.</span><span class="n">__ident</span> <span class="o">=</span> <span class="n">ident</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__shutdown</span> <span class="o">=</span> <span class="bp">False</span>
-<div class="viewcode-block" id="CrawlJob.shutdown"><a class="viewcode-back" href="../../crawlstuff.html#crawler.crawljob.CrawlJob.shutdown">[docs]</a> <span class="k">def</span> <span class="nf">shutdown</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="CrawlJob.shutdown"><a class="viewcode-back" href="../../../crawlstuff.html#archive.crawler.crawljob.CrawlJob.shutdown">[docs]</a> <span class="k">def</span> <span class="nf">shutdown</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Shutdown flag for &#39;hard&#39;-way shutdown</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__shutdown</span> <span class="o">=</span> <span class="bp">True</span>
</div>
-<div class="viewcode-block" id="CrawlJob.run"><a class="viewcode-back" href="../../crawlstuff.html#crawler.crawljob.CrawlJob.run">[docs]</a> <span class="k">def</span> <span class="nf">run</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="CrawlJob.run"><a class="viewcode-back" href="../../../crawlstuff.html#archive.crawler.crawljob.CrawlJob.run">[docs]</a> <span class="k">def</span> <span class="nf">run</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Starts the Crawljob procedure</span>
<span class="sd"> &quot;&quot;&quot;</span>
@@ -121,15 +120,15 @@
<span class="n">rec</span> <span class="o">=</span> <span class="n">PickleDBRecover</span><span class="p">()</span>
<span class="n">rec</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__metalist</span><span class="p">)</span>