Browse files

Merged Flos commit & added file.xsd to archive

  • Loading branch information...
1 parent f1c210c commit 75cefd43eff29973c36d4fecf46efa5a5527db07 @qitta qitta committed Jul 5, 2012
View
21 src/python/archive/config/handler.py
@@ -8,6 +8,7 @@
import archive.config.xmlhandler as xmlhandler
import unittest
+
def _as_int(value):
'''
Converts value to int if string is numeric
@@ -17,25 +18,29 @@ def _as_int(value):
else:
return value
+
def get(url):
'''
Returns actual value for url.
If not found, it tries to return the default value.
'''
return _as_int(reader.get(url))
+
def get_default(url):
'''
- Returns defaulft value for url.
+ Returns default value for url.
'''
return _as_int(reader.get_default(url))
+
def set(url, value):
'''
Set value of url to given value.
'''
return writer.set_value(url, value)
+
# Not used
def set_default(url):
return writer.set_default(url)
@@ -46,10 +51,16 @@ def load(path):
'''
return xmlhandler.load(path)
+###########################################################################
+# unittest #
+###########################################################################
+
+CONFIG_FILE = 'archive/testdata/webarchive.conf.xml'
+
class TestHandler(unittest.TestCase):
def setUp(self):
- self.assertTrue(load('webarchive.conf.xml'))
-
+ self.assertTrue(load(CONFIG_FILE))
+
def test_load(self):
self.assertTrue(load(xmlhandler.gConfigPath))
self.assertFalse(load('notfound.xml'))
@@ -66,8 +77,8 @@ def test_set(self):
def test_getDefault(self):
self.assertNotEqual(get_default('general.root'), False)
self.assertFalse(get_default('not.found'))
-
+
if __name__ == '__main__':
unittest.main()
-
+
View
19 src/python/archive/config/reader.py
@@ -6,20 +6,26 @@
import archive.config.xmlhandler as xmlhandler
import archive.config.options as options
-# Try find default value for url.
-# If url is found, return value.
-# Else it returns ''
+
def get_default(url):
+ """
+ Try to find the default value for url.
+ If url is found, return its value.
+ Otherwise return False.
+ """
try:
value = options.default_options[url]
return value
except KeyError:
return False
-# Try find value for url.
-# If url is found, return value.
-# If url isn't found, try to get default value
+
def get(url):
+ """
+ Try to find the value for url.
+ If url is found, return its value.
+ If url isn't found, try to get the default value.
+ """
try:
ret = xmlhandler.get_element(url)
if ret == False:
@@ -28,4 +34,3 @@ def get(url):
return ret
except KeyError:
return False
-
View
17 src/python/archive/config/writer.py
@@ -6,15 +6,20 @@
import archive.config.reader as reader
import archive.config.xmlhandler as xmlhandler
-# Set url to value
+
def set_value(url, value):
+ """
+ Sets given value
+ """
return xmlhandler.set_element(url, value)
-# Set url back to default
-# Not used
-def set_default(value):
- default = reader.get_default(value)
+
+def set_default(url):
+ """
+ Sets given url to default value
+ """
+ default = reader.get_default(url)
if default == '':
return ''
else:
- set_value(value, default)
+ set_value(url, default)
View
34 src/python/archive/config/xmlhandler.py
@@ -5,12 +5,15 @@
from xml.etree.ElementTree import ElementTree
-tree = ElementTree()
+tree = ElementTree()
gConfigPath ='webarchive.conf.xml'
-# Load xml config file by the given path
+
def load(configPath):
+ """
+ Load xml config file by the given path
+ """
global gConfigPath
gConfigPath = configPath
try:
@@ -19,10 +22,13 @@ def load(configPath):
except IOError:
return False
-# Set url to value.
-# If url is found, it is set to value and writes file and return is true.
-# If not, return is False
+
def set_element(url, value):
+ """
+ Set url to value.
+ If url is found, it is set to value and writes file and return is true.
+ If not, return is False
+ """
tagname = url_to_xpath(url)
try:
tree.find(tagname).text = value
@@ -32,13 +38,19 @@ def set_element(url, value):
write_file()
return True
-# Writes the actual Element Tree to xml file.
+
def write_file():
+ """
+ Writes the actual Element Tree to xml file.
+ """
tree.write(gConfigPath)
-# Returns value for url
-# If not found, it returns ''
+
def get_element(url):
+ """
+ Returns value for url
+ If not found, it returns ''
+ """
xpath = url_to_xpath(url)
try:
value = tree.findtext(xpath)
@@ -48,7 +60,9 @@ def get_element(url):
except AttributeError:
return False
-# Converts the given url string
+
def url_to_xpath(url):
+ """
+ Converts the given url string to an xpath ('.' becomes '/').
+ """
return url.strip().replace('.', '/')
-
View
48 src/python/archive/init/init.py
@@ -9,6 +9,7 @@
import archive.init.default_cfg as default_cfg
import archive.config.handler as config
+import archive.init.xsdtemplate as xsd
__author__ = 'Christopher Pahl'
@@ -21,6 +22,7 @@
"""
+
def init_archive(init_path=os.getcwd()):
"""
Gets and sets parameters on the first start of the archive
@@ -30,34 +32,34 @@ def init_archive(init_path=os.getcwd()):
try:
# Create top directory
os.mkdir(base_path)
-
+ ACTUAL_CONFIG = default_cfg.CONFIG_TEMPLATE.format(
+ archive_path=base_path,
+ filter_path=config.get_default('general.filterpath'),
+ depth=config.get_default('crawler.depth'),
+ interval_in_min=config.get_default('crawler.interval'),
+ max_inst=config.get_default('crawler.maxInst'),
+ user_agent=config.get_default('crawler.userAgent'),
+ temp_dir=config.get_default('crawler.tempRoot'),
+ robots=config.get_default('crawler.ignoreRobots'),
+ url_path=config.get_default('crawler.urllistpath'),
+ custom_wget=config.get_default('crawler.customWgetParms'),
+ db_file=config.get_default('db.path'),
+ sql_source=config.get_default('db.sqlSource'),
+ server_port=config.get_default('server.port'),
+ notify_in_min=config.get_default('server.notify.interval'),
+ javadapter_port=config.get_default('javadapter.port'))
# Create base structure
- for folder in ['content', 'tmp', 'filter', 'logs', 'pickle_cache']:
+ for folder in ['content', 'tmp', 'filter', 'logs', 'pickle_cache', 'xml']:
os.mkdir(os.path.join(base_path, folder))
# Default url.txt
- with open(os.path.join(base_path, 'url.txt'), 'w') as urltxt:
- urltxt.write(DEFAULT_URLS)
+ files = [('url.txt', DEFAULT_URLS),
+ (os.path.join('xml', 'file.xsd'), xsd.XSD_TEMPLATE),
+ ('webarchive.conf.xml', ACTUAL_CONFIG)]
- # Write a default config template... that's a bit ugly
- with open(os.path.join(base_path, 'webarchive.conf.xml'), 'w') as cfg_handle:
- cfg_handle.write(default_cfg.CONFIG_TEMPLATE.format(
- archive_path=base_path,
- filter_path=config.get_default('general.filterpath'),
- depth=config.get_default('crawler.depth'),
- interval_in_min=config.get_default('crawler.interval'),
- max_inst=config.get_default('crawler.maxInst'),
- user_agent=config.get_default('crawler.userAgent'),
- temp_dir=config.get_default('crawler.tempRoot'),
- robots=config.get_default('crawler.ignoreRobots'),
- url_path=config.get_default('crawler.urllistpath'),
- custom_wget=config.get_default('crawler.customWgetParms'),
- db_file=config.get_default('db.path'),
- sql_source=config.get_default('db.sqlSource'),
- server_port=config.get_default('server.port'),
- notify_in_min=config.get_default('server.notify.interval'),
- javadapter_port=config.get_default('javadapter.port')
- ))
+ for item in files:
+ with open(os.path.join(base_path, item[0]), 'w') as item_handle:
+ item_handle.write(item[1])
print('Initialized new archive at', base_path)
except OSError as err:
View
4 src/python/setup.py
@@ -19,6 +19,6 @@
],
data_files=[('/usr/bin', ['archive.py'])],
url = "www.github.com/studentkittens/webarchive",
- author = "Christopher Pahl, Christoph Piechula",
- author_email = "cpahl@hof-university.de, cpiechula@hof-university.de"
+ author = "Christopher Pahl, Christoph Piechula, Florian Bauer",
+ author_email = "cpahl@hof-university.de, cpiechula@hof-university.de, florian.steffen.joerg.bauer@hof-university.de"
)

0 comments on commit 75cefd4

Please sign in to comment.