Skip to content

Commit

Permalink
Write preliminary meta file.
Browse files Browse the repository at this point in the history
Still needs user and page IDs and the ability to read back.
  • Loading branch information
Tim Weber committed Nov 6, 2009
1 parent 1e0b19f commit 95cb943
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
20* 20*
*.swp *.swp
repo repo
.import-*
33 changes: 29 additions & 4 deletions import.py
Expand Up @@ -2,16 +2,19 @@


import xml.dom.minidom import xml.dom.minidom
from xml.parsers.expat import ParserCreate from xml.parsers.expat import ParserCreate
from calendar import timegm
import codecs import codecs
import datetime import datetime
import os import os
import struct
import sys import sys
import bz2 import bz2


# FIXME: With smaller READ_SIZE this tends to crash on the final read? # FIXME: With smaller READ_SIZE this tends to crash on the final read?
READ_SIZE = 10240000 READ_SIZE = 10240000
ENCODING = 'UTF-8' ENCODING = 'UTF-8'
IMPORT_MAX = 10 IMPORT_MAX = 10
METAFILE = '.import-meta'


def singletext(node): def singletext(node):
if len(node.childNodes) == 0: if len(node.childNodes) == 0:
Expand All @@ -29,11 +32,30 @@ def progress(text):
out('progress ' + text + '\n') out('progress ' + text + '\n')
sys.stdout.flush() sys.stdout.flush()


class Meta:
    """Fixed-size binary record file holding per-revision metadata.

    Each record is packed with the struct format ``'llllB'``: revision ID,
    timestamp (seconds since the epoch, UTC), page ID, author ID and a flags
    byte. A record is stored at byte offset ``rev * record_size``, so the
    revision ID doubles as the index into the file.

    The instance can be used as a context manager; leaving the ``with``
    block closes the underlying file.
    """

    def __init__(self, file):
        """Open (and truncate) the meta file at path *file* for writing.

        The file is opened in ``'wb+'`` mode, so existing content is
        discarded.
        """
        # NOTE(review): 'llllB' has no byte-order prefix, so the struct module
        # uses the platform's native sizes and alignment. Record size and
        # layout can therefore differ between machines -- confirm whether the
        # meta file is meant to be portable before relying on it elsewhere.
        self.struct = struct.Struct('llllB')
        self.fh = open(file, 'wb+')

    def write(self, rev, time, page, author, minor):
        """Store the metadata record for revision *rev*.

        rev    -- revision ID; also selects the record slot, so it must be
                  non-negative to yield a valid seek offset.
        time   -- object providing ``utctimetuple()`` (e.g. a datetime).
        page   -- page ID.
        author -- author ID.
        minor  -- truthy if the revision is a minor edit (sets flag bit 0).
        """
        flags = 1 if minor else 0
        data = self.struct.pack(
            rev,
            timegm(time.utctimetuple()),
            page,
            author,
            flags
        )
        # Records are addressed by revision ID; seeking past the current end
        # of the file is fine, the gap is filled when the record is written.
        self.fh.seek(rev * self.struct.size)
        self.fh.write(data)

    def close(self):
        """Flush and close the meta file. Safe to call more than once."""
        if not self.fh.closed:
            self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

class Revision: class Revision:
def __init__(self, node): def __init__(self, node, writers):
self.id = -1 self.id = -1
self.minor = False self.minor = False
self.timestamp = self.text = None self.timestamp = self.text = None
self.writers = writers
self.dom = node self.dom = node
for lv1 in self.dom.childNodes: for lv1 in self.dom.childNodes:
if lv1.nodeType != lv1.ELEMENT_NODE: if lv1.nodeType != lv1.ELEMENT_NODE:
Expand All @@ -47,15 +69,17 @@ def __init__(self, node):
elif lv1.tagName == 'text': elif lv1.tagName == 'text':
self.text = singletext(lv1) self.text = singletext(lv1)
def dump(self, title): def dump(self, title):
self.writers['meta'].write(self.id, self.timestamp, 0, 0, self.minor)
mydata = self.text.encode(ENCODING) mydata = self.text.encode(ENCODING)
out('blob\nmark :%d\ndata %d\n' % (self.id, len(mydata))) out('blob\nmark :%d\ndata %d\n' % (self.id, len(mydata)))
out(mydata + '\n') out(mydata + '\n')


class Page: class Page:
def __init__(self, xmlstring): def __init__(self, xmlstring, writers):
self.revisions = [] self.revisions = []
self.id = -1 self.id = -1
self.title = '' self.title = ''
self.writers = writers
self.dom = xml.dom.minidom.parseString(xmlstring) self.dom = xml.dom.minidom.parseString(xmlstring)
for lv1 in self.dom.documentElement.childNodes: for lv1 in self.dom.documentElement.childNodes:
if lv1.nodeType != lv1.ELEMENT_NODE: if lv1.nodeType != lv1.ELEMENT_NODE:
Expand All @@ -65,7 +89,7 @@ def __init__(self, xmlstring):
elif lv1.tagName == 'id': elif lv1.tagName == 'id':
self.id = int(singletext(lv1)) self.id = int(singletext(lv1))
elif lv1.tagName == 'revision': elif lv1.tagName == 'revision':
self.revisions.append(Revision(lv1)) self.revisions.append(Revision(lv1, self.writers))
def dump(self): def dump(self):
progress(' ' + self.title.encode(ENCODING)) progress(' ' + self.title.encode(ENCODING))
for revision in self.revisions: for revision in self.revisions:
Expand All @@ -76,6 +100,7 @@ def __init__(self):
self.text = self.xml = None self.text = self.xml = None
self.inpage = False self.inpage = False
self.startbyte = self.readbytes = self.imported = 0 self.startbyte = self.readbytes = self.imported = 0
self.meta = Meta(METAFILE) # FIXME: Use a parameter.
self.fh = codecs.getreader(ENCODING)(sys.stdin) self.fh = codecs.getreader(ENCODING)(sys.stdin)
self.expat = ParserCreate(ENCODING) self.expat = ParserCreate(ENCODING)
self.expat.StartElementHandler = self.find_page self.expat.StartElementHandler = self.find_page
Expand Down Expand Up @@ -105,7 +130,7 @@ def find_pageend(self, name):
self.expat.StartElementHandler = self.find_page self.expat.StartElementHandler = self.find_page
self.expat.EndElementHandler = None self.expat.EndElementHandler = None
self.xml += self.text[self.startbyte:self.expat.CurrentByteIndex-self.readbytes] + '</' + name.encode(ENCODING) + '>' self.xml += self.text[self.startbyte:self.expat.CurrentByteIndex-self.readbytes] + '</' + name.encode(ENCODING) + '>'
Page(self.xml).dump() Page(self.xml, {'meta': self.meta}).dump()
self.imported += 1 self.imported += 1
if IMPORT_MAX > 0 and self.imported >= IMPORT_MAX: if IMPORT_MAX > 0 and self.imported >= IMPORT_MAX:
sys.exit(0) sys.exit(0)
Expand Down

0 comments on commit 95cb943

Please sign in to comment.