diff --git a/config.yaml b/config.yaml index e362bc6..56f53e9 100644 --- a/config.yaml +++ b/config.yaml @@ -5,7 +5,6 @@ wp_exports: wordpress-xml build_dir: build # Output format: primary choices are html or markdown. -#target_format: markdown target_format: markdown # The date format of the wikipedia export file. diff --git a/exitwp.py b/exitwp.py index 4184c8f..7f65738 100755 --- a/exitwp.py +++ b/exitwp.py @@ -17,8 +17,7 @@ ''' exitwp - Wordpress xml exports to Jekykll blog format conversion -Tested with Wordpress 3.3.1 and jekyll master branch from 2011-03-26 -pandoc is required to be installed if conversion from html will be done. +Tested with Wordpress 3.3.1 and jekyll 0.11.2 ''' ###################################################### @@ -41,33 +40,30 @@ def __init__(self): XMLTreeBuilder.__init__(self) self._parser.StartNamespaceDeclHandler=self._start_ns self.namespaces={} - + def _start_ns(self, prefix, ns): self.namespaces[prefix]='{' + ns + '}' - def html2fmt(html, target_format): -# html = html.replace("\n\n", '

') - # html = html.replace('
', '
', ']]>
') + # html = html.replace("\n\n", '

') + # html = html.replace('
', '
', ']]>
') if target_format=='html': return html else: - # This is like very stupid but I was having troubles with unicode encodings and process.POpen + # This is probably a stupid solution. + # but I was having troubles with character encodings + # and process.POpen. return html2text_file(html, None) def parse_wp_xml(file): - parser=ns_tracker_tree_builder() tree=ElementTree() - print "reading: " + wpe - root=tree.parse(file, parser) - ns=parser.namespaces ns['']='' - + c=root.find('channel') def parse_header(): @@ -87,7 +83,9 @@ def parse_items(): if not "domain" in tax.attrib: continue t_domain=unicode(tax.attrib['domain']) t_entry=unicode(tax.text) - if not (t_domain in taxonomy_filter) and not (t_domain in taxonomy_entry_filter and taxonomy_entry_filter[t_domain]==t_entry): + if (not (t_domain in taxonomy_filter) and + not (t_domain in taxonomy_entry_filter and + taxonomy_entry_filter[t_domain]==t_entry)): if not t_domain in export_taxanomies: export_taxanomies[t_domain]=[] export_taxanomies[t_domain].append(t_entry) @@ -136,7 +134,6 @@ def gi(q, unicode_wrap=True): 'items': parse_items(), } - def write_jekyll(data, target_format): sys.stdout.write("writing") @@ -233,7 +230,6 @@ def get_attachment_path(src, dir, dir_prefix='a'): #data['items']=[] for i in data['items']: - skip_item = False for field, value in item_field_filter.iteritems(): @@ -279,15 +275,12 @@ def get_attachment_path(src, dir, dir_prefix='a'): else: print "Unknown item type :: " + i['type'] - if download_images: for img in i['img_srcs']: try: - urlretrieve(urljoin(data['header']['link'],img.decode('utf-8')), get_attachment_path(img, i['uid'])) + urlretrieve(urljoin(data['header']['link'], img.decode('utf-8')), get_attachment_path(img, i['uid'])) except: - print "\n unable to download "+urljoin(data['header']['link'],img.decode('utf-8')) - - + print "\n unable to download " + urljoin(data['header']['link'], img.decode('utf-8')) if out is not None: def toyaml(data): @@ -314,7 +307,6 @@ def toyaml(data): out.close() print "\n" - wp_exports=glob(wp_exports+'/*.xml') for wpe in wp_exports: data=parse_wp_xml(wpe)