Skip to content

Commit

Permalink
Topaz metadata support; kindlet metadata support (NiLuJe)
Browse files Browse the repository at this point in the history
  • Loading branch information
Richard Peng committed Dec 3, 2010
1 parent 368a959 commit ef3d096
Show file tree
Hide file tree
Showing 3 changed files with 179 additions and 6 deletions.
6 changes: 3 additions & 3 deletions Kindelabra.py
Expand Up @@ -40,7 +40,7 @@ def __init__(self):
filechooserdiag.set_current_folder(os.path.join(self.root, 'system')) filechooserdiag.set_current_folder(os.path.join(self.root, 'system'))
self.filechooser = gtk.FileChooserButton(filechooserdiag) self.filechooser = gtk.FileChooserButton(filechooserdiag)
self.filechooser.connect("current-folder-changed", self.load) self.filechooser.connect("current-folder-changed", self.load)

file_toolbar = gtk.HBox() file_toolbar = gtk.HBox()
file_toolbar.pack_start(self.filechooser, True, True, 2) file_toolbar.pack_start(self.filechooser, True, True, 2)
file_toolbar.pack_start(self.get_button('gtk-refresh', 'Refresh files', self.refresh), False, True, 2) file_toolbar.pack_start(self.get_button('gtk-refresh', 'Refresh files', self.refresh), False, True, 2)
Expand Down Expand Up @@ -125,7 +125,7 @@ def get_collections(self):
namehash = book.hash namehash = book.hash
except: except:
namehash = None namehash = None
print "! ASIN %s is in a collection, but not on the device!" %( asin ) print "! ASIN %s belongs to collection %s but wasn't found on the device!" %( asin, collection )
if namehash in self.kindle.files: if namehash in self.kindle.files:
if self.kindle.files[namehash].title: if self.kindle.files[namehash].title:
filename = self.kindle.files[namehash].title filename = self.kindle.files[namehash].title
Expand Down Expand Up @@ -323,7 +323,7 @@ def del_file(self, widget):
path = gtkrow.get_path() path = gtkrow.get_path()
(filename, filehash, asin) = self.get_colpath_value(colstore, gtkrow) (filename, filehash, asin) = self.get_colpath_value(colstore, gtkrow)
collection = unicode(self.get_colpath_value(colstore, (path[0], ))[0]) collection = unicode(self.get_colpath_value(colstore, (path[0], ))[0])
if asin != '': if asin and asin != '':
book = self.kindle.searchAsin(asin) book = self.kindle.searchAsin(asin)
asin = "#%s^%s" % (book.asin, book.type) asin = "#%s^%s" % (book.asin, book.type)
if self.db[collection].has_hash(asin): if self.db[collection].has_hash(asin):
Expand Down
153 changes: 153 additions & 0 deletions ebook.py
Expand Up @@ -8,6 +8,9 @@


import struct import struct


import zipfile
import re

class Sectionizer: class Sectionizer:
def __init__(self, filename, perm): def __init__(self, filename, perm):
self.f = file(filename, perm) self.f = file(filename, perm)
Expand Down Expand Up @@ -43,3 +46,153 @@ def __init__(self, filename):
exth_records = exth_records[reclen:] exth_records = exth_records[reclen:]
except ValueError: except ValueError:
self.title = None self.title = None

'''Kindlet metadata parsing
'''
class Kindlet:
    '''Title and ASIN of a Kindlet application, read from its manifest.

    For official apps, the ASIN is stored in the Amazon-ASIN field of
    META-INF/MANIFEST.MF, and the title in the Implementation-Title field.

    Attributes:
        title: stripped value of the Implementation-Title field, or None
            when the field is absent or empty.
        asin: stripped value of the Amazon-ASIN field, or None when the
            field is absent or empty.
    '''

    def __init__(self, filename):
        '''Open the zip archive `filename` and parse its manifest.

        Raises whatever zipfile raises for an unreadable archive, and
        KeyError when the archive has no META-INF/MANIFEST.MF entry.
        '''
        kindlet = zipfile.ZipFile(filename, 'r')
        try:
            kdkmanifest = kindlet.read('META-INF/MANIFEST.MF')
        finally:
            # Close the archive even when the manifest entry is missing or
            # unreadable, so a bad file does not leak an open handle.
            kindlet.close()
        # Byte-string patterns: the manifest is matched as the raw bytes
        # read from the archive, without decoding.
        self.title = self._manifest_field(b'(^Implementation-Title: )(.*?$)', kdkmanifest)
        self.asin = self._manifest_field(b'(^Amazon-ASIN: )(.*?$)', kdkmanifest)

    def _manifest_field(self, pattern, manifest):
        '''Return the stripped second group of `pattern` in `manifest`, or None.'''
        match = re.search(pattern, manifest, re.MULTILINE)
        if match and match.group(2):
            # strip() also drops the trailing '\r' of CRLF-terminated lines,
            # which '$' leaves inside the captured group.
            return match.group(2).strip()
        return None

'''Topaz metadata parsing. Almost verbatim code by Greg Riker from Calibre
'''
class StreamSlicer(object):
    '''Index and slice a window of a seekable stream like a string.

    The window covers stream offsets [start, stop). Every access seeks the
    underlying stream and reads from it; nothing is cached.
    '''

    # `long` only exists on Python 2; resolve the accepted integer types
    # once so __getitem__ does not depend on that name being defined.
    try:
        _INTEGER_TYPES = (int, long)
    except NameError:
        _INTEGER_TYPES = (int,)

    def __init__(self, stream, start=0, stop=None):
        '''Wrap `stream`; `stop` defaults to the current end of the stream.'''
        self._stream = stream
        self.start = start
        if stop is None:
            # Seek to the end (whence=2) to learn the stream length.
            stream.seek(0, 2)
            stop = stream.tell()
        self.stop = stop
        self._len = stop - start

    def __getitem__(self, key):
        '''Return one byte for an integer key, or the bytes of a slice.

        Negative integers count back from the end of the window. An integer
        at or past the end is not range-checked: the result is whatever the
        stream returns for a one-byte read at that position.

        Raises:
            IndexError: a negative integer reaches before the window start.
            TypeError: `key` is neither an integer nor a slice.
        '''
        stream = self._stream
        base = self.start
        if isinstance(key, self._INTEGER_TYPES):
            if key < 0:
                # Without this, a negative key would seek before the window.
                key += self._len
                if key < 0:
                    raise IndexError("stream index out of range")
            stream.seek(base + key)
            return stream.read(1)
        if isinstance(key, slice):
            start, stop, stride = key.indices(self._len)
            if stride < 0:
                # A backwards slice visits start, start+stride, ... > stop,
                # i.e. the forward byte range [stop + 1, start + 1). Read
                # that range and let data[::stride] walk it from the end.
                start, stop = stop + 1, start + 1
            size = stop - start
            if size <= 0:
                return ""
            stream.seek(base + start)
            data = stream.read(size)
            if stride != 1:
                data = data[::stride]
            return data
        raise TypeError("stream indices must be integers")

class Topaz(object):
    '''Title, ASIN and CDE type read from the metadata record of a Topaz file.

    After construction the results are available as `title`, `asin` and
    `type`; each is None when the corresponding metadata record is absent.
    The parsed header table (`topaz_headers`), metadata header (`md_header`)
    and raw metadata records (`metadata`) are kept on the instance as well.
    '''

    def __init__(self, filename):
        '''Parse `filename`.

        Raises:
            ValueError: the file does not start with 'TPZ', has no
                'metadata' header record, or its metadata block does not
                carry the 'metadata' tag.
        '''
        self.stream = open(filename, 'rb')
        try:
            self.data = StreamSlicer(self.stream)
            name = getattr(self.stream, 'name', 'Unnamed stream')

            sig = self.data[:4]
            if not sig.startswith('TPZ'):
                raise ValueError("'%s': Not a Topaz file" % name)
            offset = 4

            self.header_records, consumed = self.decode_vwi(self.data[offset:offset+4])
            offset += consumed
            self.topaz_headers = self.get_headers(offset)

            # First integrity test - metadata header
            if 'metadata' not in self.topaz_headers:
                raise ValueError("'%s': Invalid Topaz format - no metadata record" % name)

            # Second integrity test - metadata body
            md_offset = self.topaz_headers['metadata']['blocks'][0]['offset']
            md_offset += self.base
            if self.data[md_offset+1:md_offset+9] != 'metadata':
                raise ValueError("'%s': Damaged metadata record" % name)

            # Get metadata, and store what we need
            self.title, self.asin, self.type = self.get_metadata()
        finally:
            # Always release the file handle, including on the error paths.
            # self.data wraps this stream and must not be read afterwards.
            self.stream.close()

    def decode_vwi(self, bytes):
        '''Decode one variable-width integer from the start of `bytes`.

        Each byte contributes its low 7 bits, most significant group first;
        a byte with the high bit clear ends the number.

        Returns:
            (value, consumed): the decoded integer and the number of bytes
            used. Empty input yields (0, 0).
        '''
        pos, val = 0, 0
        while pos < len(bytes):
            b = ord(bytes[pos])
            pos += 1
            val = (val << 7) | (b & 0x7F)
            if (b & 0x80) == 0:
                break
        return val, pos

    def get_headers(self, offset):
        '''Parse `self.header_records` header records starting at `offset`.

        Side effects: sets `self.eoth` to the single byte that follows the
        header table and `self.base` to the offset just past that byte.

        Returns:
            dict mapping each record tag to {'blocks': {index: {'offset',
            'len_uncomp', 'len_comp'}}}.
        '''
        topaz_headers = {}
        for x in range(self.header_records):
            # One byte precedes every header record; it is skipped without
            # being interpreted.
            offset += 1
            taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
            offset += consumed
            tag = self.data[offset:offset+taglen]
            offset += taglen
            num_vals, consumed = self.decode_vwi(self.data[offset:offset+4])
            offset += consumed
            blocks = {}
            for val in range(num_vals):
                hdr_offset, consumed = self.decode_vwi(self.data[offset:offset+4])
                offset += consumed
                len_uncomp, consumed = self.decode_vwi(self.data[offset:offset+4])
                offset += consumed
                len_comp, consumed = self.decode_vwi(self.data[offset:offset+4])
                offset += consumed
                blocks[val] = dict(offset=hdr_offset, len_uncomp=len_uncomp, len_comp=len_comp)
            topaz_headers[tag] = dict(blocks=blocks)
        self.eoth = self.data[offset]
        offset += 1
        self.base = offset
        return topaz_headers

    def get_metadata(self):
        '''Return the (Title, ASIN, CDEType) metadata values as a tuple.

        A record that is missing from the file is reported as None instead
        of raising KeyError.
        '''
        self.get_original_metadata()
        return (self.metadata.get('Title'),
                self.metadata.get('ASIN'),
                self.metadata.get('CDEType'))

    def get_original_metadata(self):
        '''Read the metadata block into `self.md_header` and `self.metadata`.

        `self.md_header` receives the block's 'tag', 'flags' and 'num_recs';
        `self.metadata` maps each record tag to its raw value.
        '''
        offset = self.base + self.topaz_headers['metadata']['blocks'][0]['offset']
        self.md_header = {}
        taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
        offset += consumed
        self.md_header['tag'] = self.data[offset:offset+taglen]
        offset += taglen
        self.md_header['flags'] = ord(self.data[offset])
        offset += 1
        self.md_header['num_recs'] = ord(self.data[offset])
        offset += 1

        self.metadata = {}
        for x in range(self.md_header['num_recs']):
            taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
            offset += consumed
            tag = self.data[offset:offset+taglen]
            offset += taglen
            md_len, consumed = self.decode_vwi(self.data[offset:offset+4])
            offset += consumed
            metadata = self.data[offset:offset + md_len]
            offset += md_len
            self.metadata[tag] = metadata
26 changes: 23 additions & 3 deletions kindle.py
Expand Up @@ -15,7 +15,7 @@
import ebook import ebook


KINDLEROOT = '/mnt/us' KINDLEROOT = '/mnt/us'
FILTER = ['pdf', 'mobi', 'prc', 'txt', 'tpz', 'azw', 'manga'] FILTER = ['pdf', 'mobi', 'prc', 'txt', 'tpz', 'azw1', 'azw', 'manga', 'azw2']
FOLDERS = ['documents', 'pictures'] FOLDERS = ['documents', 'pictures']


class Collection(dict): class Collection(dict):
Expand Down Expand Up @@ -73,7 +73,7 @@ def __init__(self, path):
self.meta = None self.meta = None
self.asin = None self.asin = None
self.type = None self.type = None
ext = os.path.splitext(path)[1][1:] ext = os.path.splitext(path)[1][1:].lower()
if ext in ['mobi', 'azw']: if ext in ['mobi', 'azw']:
self.meta = ebook.Mobi(path) self.meta = ebook.Mobi(path)
if self.meta.title: if self.meta.title:
Expand All @@ -86,6 +86,26 @@ def __init__(self, path):
self.title = self.meta.exth[503] self.title = self.meta.exth[503]
else: else:
print "\nMetadata read error:", path print "\nMetadata read error:", path
elif ext in ['tpz', 'azw1']:
self.meta = ebook.Topaz(path)
if self.meta.title:
self.title = self.meta.title
if self.meta.asin:
self.asin = self.meta.asin
if self.meta.type:
self.type = self.meta.type
else:
print "\nTopaz metadata read error:", path
elif ext in ['azw2']:
self.meta = ebook.Kindlet(path)
if self.meta.title:
self.title = self.meta.title
if self.meta.asin:
self.asin = self.meta.asin
self.type = 'AZW2'
else:
# Couldn't get an ASIN, developer app? We'll use the hash instead, which is what the Kindle itself does, so no harm done.
print "\nKindlet Metadata read error, assuming developper app:", path


class Kindle: class Kindle:
'''Access a Kindle filesystem '''Access a Kindle filesystem
Expand All @@ -108,7 +128,7 @@ def load_folder(self, path):
sys.stdout.write("Loading " + path) sys.stdout.write("Loading " + path)
for root, dirs, files in os.walk(os.path.join(self.root, path)): for root, dirs, files in os.walk(os.path.join(self.root, path)):
for filename in files: for filename in files:
if os.path.splitext(filename)[1][1:] in FILTER: if os.path.splitext(filename)[1][1:].lower() in FILTER:
fullpath = os.path.abspath(os.path.join(root, filename)) fullpath = os.path.abspath(os.path.join(root, filename))
book = Ebook(fullpath) book = Ebook(fullpath)
self.files[book.hash] = book self.files[book.hash] = book
Expand Down

0 comments on commit ef3d096

Please sign in to comment.