Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Write preliminary meta file.
Still needs user and page IDs and the ability to read back.
  • Loading branch information
Tim Weber committed Nov 6, 2009
1 parent 1e0b19f commit 95cb943
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
20*
*.swp
repo
.import-*
33 changes: 29 additions & 4 deletions import.py
Expand Up @@ -2,16 +2,19 @@

import xml.dom.minidom
from xml.parsers.expat import ParserCreate
from calendar import timegm
import codecs
import datetime
import os
import struct
import sys
import bz2

# FIXME: With smaller READ_SIZE this tends to crash on the final read?
# NOTE(review): presumably the chunk size used when reading the input; the
# code that consumes READ_SIZE is outside this view -- confirm.
READ_SIZE = 10240000
# Character encoding used for the stdin reader, the expat parser, and for
# encoding page titles and revision text on output.
ENCODING = 'UTF-8'
# Exit after this many pages have been imported; a value <= 0 disables the
# limit.
IMPORT_MAX = 10
# Path of the binary revision-metadata file that is opened via Meta(METAFILE).
METAFILE = '.import-meta'

def singletext(node):
if len(node.childNodes) == 0:
Expand All @@ -29,11 +32,30 @@ def progress(text):
out('progress ' + text + '\n')
sys.stdout.flush()

class Meta:
    """Fixed-width binary table of revision metadata, indexed by revision id.

    Every record has the same size and is stored at byte offset
    ``rev * record_size``, so the record for a revision can later be located
    with a single seek.  Revision ids that are never written leave gaps in
    the file.

    Record layout (little-endian, no padding, 33 bytes):
        int64  revision id
        int64  timestamp, seconds since the Unix epoch (UTC)
        int64  page id
        int64  author id
        uint8  flags (see FLAG_MINOR)

    Instances can be used as context managers; the file is closed on exit.
    """

    # Bit in the flags byte that marks a minor edit.
    FLAG_MINOR = 1

    def __init__(self, file):
        """Create the meta file at path *file*, truncating any existing one."""
        # Explicit byte order and standard sizes: the native 'l' code is only
        # 4 bytes on some platforms, which would make the file layout
        # machine-dependent and overflow on large ids or post-2038 timestamps.
        self.struct = struct.Struct('<qqqqB')
        self.fh = open(file, 'wb+')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Flush and close the underlying file; safe to call repeatedly."""
        if not self.fh.closed:
            self.fh.close()

    def write(self, rev, time, page, author, minor):
        """Store the record for revision *rev* at its slot in the file.

        rev    -- revision id; also determines the record's file offset
        time   -- datetime of the revision; its utctimetuple() is converted
                  to epoch seconds
        page   -- page id
        author -- author id
        minor  -- truthy if the revision is a minor edit
        """
        flags = 0
        if minor:
            flags |= self.FLAG_MINOR
        data = self.struct.pack(
            rev,
            timegm(time.utctimetuple()),
            page,
            author,
            flags
        )
        self.fh.seek(rev * self.struct.size)
        self.fh.write(data)

class Revision:
def __init__(self, node):
def __init__(self, node, writers):
self.id = -1
self.minor = False
self.timestamp = self.text = None
self.writers = writers
self.dom = node
for lv1 in self.dom.childNodes:
if lv1.nodeType != lv1.ELEMENT_NODE:
Expand All @@ -47,15 +69,17 @@ def __init__(self, node):
elif lv1.tagName == 'text':
self.text = singletext(lv1)
def dump(self, title):
    """Record this revision's metadata and emit its text as a blob.

    A meta record is written first (page and author ids are passed as 0
    for now), then a ``blob`` command marked with the revision id and
    carrying the encoded revision text is sent to ``out``.  The *title*
    argument is accepted but not used here.
    """
    meta_writer = self.writers['meta']
    meta_writer.write(self.id, self.timestamp, 0, 0, self.minor)
    payload = self.text.encode(ENCODING)
    header = 'blob\nmark :%d\ndata %d\n' % (self.id, len(payload))
    out(header)
    out(payload + '\n')

class Page:
def __init__(self, xmlstring):
def __init__(self, xmlstring, writers):
self.revisions = []
self.id = -1
self.title = ''
self.writers = writers
self.dom = xml.dom.minidom.parseString(xmlstring)
for lv1 in self.dom.documentElement.childNodes:
if lv1.nodeType != lv1.ELEMENT_NODE:
Expand All @@ -65,7 +89,7 @@ def __init__(self, xmlstring):
elif lv1.tagName == 'id':
self.id = int(singletext(lv1))
elif lv1.tagName == 'revision':
self.revisions.append(Revision(lv1))
self.revisions.append(Revision(lv1, self.writers))
def dump(self):
progress(' ' + self.title.encode(ENCODING))
for revision in self.revisions:
Expand All @@ -76,6 +100,7 @@ def __init__(self):
self.text = self.xml = None
self.inpage = False
self.startbyte = self.readbytes = self.imported = 0
self.meta = Meta(METAFILE) # FIXME: Use a parameter.
self.fh = codecs.getreader(ENCODING)(sys.stdin)
self.expat = ParserCreate(ENCODING)
self.expat.StartElementHandler = self.find_page
Expand Down Expand Up @@ -105,7 +130,7 @@ def find_pageend(self, name):
self.expat.StartElementHandler = self.find_page
self.expat.EndElementHandler = None
self.xml += self.text[self.startbyte:self.expat.CurrentByteIndex-self.readbytes] + '</' + name.encode(ENCODING) + '>'
Page(self.xml).dump()
Page(self.xml, {'meta': self.meta}).dump()
self.imported += 1
if IMPORT_MAX > 0 and self.imported >= IMPORT_MAX:
sys.exit(0)
Expand Down

0 comments on commit 95cb943

Please sign in to comment.