Permalink
Browse files

Topaz metadata support; kindlet metadata support (NiLuJe)

  • Loading branch information...
1 parent 368a959 commit ef3d0965a69effa0225b7762c73e764a741ee90d @richardpeng committed Dec 3, 2010
Showing with 179 additions and 6 deletions.
  1. +3 −3 Kindelabra.py
  2. +153 −0 ebook.py
  3. +23 −3 kindle.py
View
@@ -40,7 +40,7 @@ def __init__(self):
filechooserdiag.set_current_folder(os.path.join(self.root, 'system'))
self.filechooser = gtk.FileChooserButton(filechooserdiag)
self.filechooser.connect("current-folder-changed", self.load)
-
+
file_toolbar = gtk.HBox()
file_toolbar.pack_start(self.filechooser, True, True, 2)
file_toolbar.pack_start(self.get_button('gtk-refresh', 'Refresh files', self.refresh), False, True, 2)
@@ -125,7 +125,7 @@ def get_collections(self):
namehash = book.hash
except:
namehash = None
- print "! ASIN %s is in a collection, but not on the device!" %( asin )
+ print "! ASIN %s belongs to collection %s but wasn't found on the device!" %( asin, collection )
if namehash in self.kindle.files:
if self.kindle.files[namehash].title:
filename = self.kindle.files[namehash].title
@@ -323,7 +323,7 @@ def del_file(self, widget):
path = gtkrow.get_path()
(filename, filehash, asin) = self.get_colpath_value(colstore, gtkrow)
collection = unicode(self.get_colpath_value(colstore, (path[0], ))[0])
- if asin != '':
+ if asin and asin != '':
book = self.kindle.searchAsin(asin)
asin = "#%s^%s" % (book.asin, book.type)
if self.db[collection].has_hash(asin):
View
153 ebook.py
@@ -8,6 +8,9 @@
import struct
+import zipfile
+import re
+
class Sectionizer:
def __init__(self, filename, perm):
self.f = file(filename, perm)
@@ -43,3 +46,153 @@ def __init__(self, filename):
exth_records = exth_records[reclen:]
except ValueError:
self.title = None
+
+'''Kindlet metadata parsing
+'''
class Kindlet:
    """Read title and ASIN from a kindlet (.azw2) archive.

    A kindlet is a zip archive.  For official apps the ASIN is stored in
    the ``Amazon-ASIN`` field of ``META-INF/MANIFEST.MF`` and the title in
    the ``Implementation-Title`` field.

    Attributes set by the constructor:
        title -- value of ``Implementation-Title``, or None if absent/empty
        asin  -- value of ``Amazon-ASIN``, or None if absent/empty
    """

    def __init__(self, filename):
        """Parse the manifest of the archive at *filename*.

        Raises whatever ``zipfile`` raises for an unreadable archive, and
        ``KeyError`` if the archive has no ``META-INF/MANIFEST.MF`` member.
        """
        kindlet = zipfile.ZipFile(filename, 'r')
        try:
            kdkmanifest = kindlet.read('META-INF/MANIFEST.MF')
        finally:
            # Always release the archive handle, even when the manifest
            # member is missing or unreadable.
            kindlet.close()

        # ZipFile.read() may hand back bytes; the patterns below are text.
        if not isinstance(kdkmanifest, str):
            kdkmanifest = kdkmanifest.decode('utf-8', 'replace')

        # Manifest values longer than 72 bytes are folded onto continuation
        # lines that start with a single space; unfold them so that long
        # titles are not cut off at the first physical line.
        kdkmanifest = re.sub(r'(?:\r\n|\r|\n) ', '', kdkmanifest)

        self.title = self._manifest_field(kdkmanifest, 'Implementation-Title')
        self.asin = self._manifest_field(kdkmanifest, 'Amazon-ASIN')

    def _manifest_field(self, manifest, name):
        """Return the stripped value of manifest header *name*, or None."""
        found = re.search('^' + re.escape(name) + ': (.*?)$', manifest,
                          re.MULTILINE)
        if not found:
            return None
        # An empty value is reported as None, like a missing header.
        return found.group(1).strip() or None
+
+'''Topaz metadata parsing. Almost verbatim code by Greg Riker from Calibre
+'''
class StreamSlicer(object):
    """Index and slice a seekable stream as if it were a string.

    The slicer exposes the window ``[start, stop)`` of *stream*; indices
    passed to ``__getitem__`` are relative to ``start``.
    """

    # Integer types accepted as a single index ("long" only exists on
    # Python 2).
    try:
        _INT_TYPES = (int, long)  # noqa: F821
    except NameError:
        _INT_TYPES = (int,)

    def __init__(self, stream, start=0, stop=None):
        """Wrap *stream*; when *stop* is None the window runs to end of stream."""
        self._stream = stream
        self.start = start
        if stop is None:
            # Seek to the end to discover the stream length.
            stream.seek(0, 2)
            stop = stream.tell()
        self.stop = stop
        self._len = stop - start

    def __getitem__(self, key):
        """Return one byte for an integer key, or the bytes of a slice.

        Raises IndexError for a negative index that reaches before the
        window, and TypeError for keys that are neither integers nor slices.
        """
        stream = self._stream
        base = self.start
        if isinstance(key, self._INT_TYPES):
            if key < 0:
                # Negative indices count back from the end of the window.
                key += self._len
                if key < 0:
                    raise IndexError("stream index out of range")
            stream.seek(base + key)
            return stream.read(1)
        if isinstance(key, slice):
            start, stop, stride = key.indices(self._len)
            if stride < 0:
                # A reversed slice visits start, start+stride, ... > stop,
                # i.e. the forward byte range [stop + 1, start + 1).
                start, stop = stop + 1, start + 1
            size = stop - start
            if size <= 0:
                # Zero-length read: yields the stream's own empty value.
                return stream.read(0)
            stream.seek(base + start)
            data = stream.read(size)
            if stride != 1:
                data = data[::stride]
            return data
        raise TypeError("stream indices must be integers")
+
class Topaz(object):
    """Read title, ASIN and CDE type from a Topaz (.tpz / .azw1) file.

    Adapted from Greg Riker's Topaz metadata reader in Calibre.

    Attributes set by the constructor:
        title, asin, type -- values of the ``Title``, ``ASIN`` and
            ``CDEType`` metadata records as stored in the file (byte
            strings), or None when the record is absent
        topaz_headers     -- dict mapping header tag to ``{'blocks': {...}}``
        metadata          -- dict mapping metadata tag to its raw value
    """

    def __init__(self, filename):
        """Parse the Topaz file at *filename*.

        Raises ValueError when the file lacks the ``TPZ`` signature, has no
        ``metadata`` header, or its metadata record is damaged.  The file is
        closed before returning, whether or not parsing succeeded.
        """
        self.stream = open(filename, 'rb')
        try:
            self.data = StreamSlicer(self.stream)
            name = getattr(self.stream, 'name', 'Unnamed stream')

            sig = self.data[:4]
            if not sig.startswith(b'TPZ'):
                raise ValueError("'%s': Not a Topaz file" % name)
            offset = 4

            self.header_records, consumed = self.decode_vwi(self.data[offset:offset+4])
            offset += consumed
            self.topaz_headers = self.get_headers(offset)

            # First integrity test - metadata header
            if b'metadata' not in self.topaz_headers:
                raise ValueError("'%s': Invalid Topaz format - no metadata record" % name)

            # Second integrity test - metadata body
            md_blocks = self.topaz_headers[b'metadata']['blocks']
            if 0 not in md_blocks:
                raise ValueError("'%s': Damaged metadata record" % name)
            md_offset = md_blocks[0]['offset'] + self.base
            # The record starts with a one-byte tag length, then the tag.
            if self.data[md_offset+1:md_offset+9] != b'metadata':
                raise ValueError("'%s': Damaged metadata record" % name)

            # Get metadata, and store what we need
            self.title, self.asin, self.type = self.get_metadata()
        finally:
            self.stream.close()

    def decode_vwi(self, bytes):
        """Decode a variable-width integer from the start of *bytes*.

        Each byte contributes its low 7 bits, most significant group first;
        a clear high bit marks the last byte.  Returns ``(value, consumed)``
        where *consumed* is the number of bytes used (0 for empty input).
        """
        val = 0
        pos = 0
        # bytearray() yields integers on both Python 2 and Python 3.
        for b in bytearray(bytes):
            pos += 1
            val = (val << 7) | (b & 0x7F)
            if not b & 0x80:
                break
        return val, pos

    def get_headers(self, offset):
        """Parse ``self.header_records`` header records starting at *offset*.

        Returns a dict mapping each tag to ``{'blocks': {index: {'offset',
        'len_uncomp', 'len_comp'}}}``.  Also stores the byte following the
        records in ``self.eoth`` and the offset just past it in
        ``self.base``; block offsets are relative to ``self.base``.
        """
        topaz_headers = {}
        for _ in range(self.header_records):
            # Skip the one-byte marker that precedes each record.
            offset += 1
            taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
            offset += consumed
            tag = self.data[offset:offset+taglen]
            offset += taglen
            num_vals, consumed = self.decode_vwi(self.data[offset:offset+4])
            offset += consumed
            blocks = {}
            for val in range(num_vals):
                fields = []
                # Three consecutive integers: offset, uncompressed length,
                # compressed length.
                for _field in range(3):
                    number, consumed = self.decode_vwi(self.data[offset:offset+4])
                    offset += consumed
                    fields.append(number)
                blocks[val] = dict(offset=fields[0], len_uncomp=fields[1],
                                   len_comp=fields[2])
            topaz_headers[tag] = dict(blocks=blocks)
        self.eoth = self.data[offset]
        offset += 1
        self.base = offset
        return topaz_headers

    def get_metadata(self):
        """Return ``(title, asin, cde_type)``; a missing record yields None."""
        self.get_original_metadata()
        md = self.metadata
        return md.get(b'Title'), md.get(b'ASIN'), md.get(b'CDEType')

    def get_original_metadata(self):
        """Parse the metadata record into ``self.md_header`` and ``self.metadata``."""
        offset = self.base + self.topaz_headers[b'metadata']['blocks'][0]['offset']
        self.md_header = {}
        taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
        offset += consumed
        self.md_header['tag'] = self.data[offset:offset+taglen]
        offset += taglen
        self.md_header['flags'] = ord(self.data[offset])
        offset += 1
        self.md_header['num_recs'] = ord(self.data[offset])
        offset += 1

        self.metadata = {}
        for _ in range(self.md_header['num_recs']):
            # Each record: tag length, tag, value length, value.
            taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
            offset += consumed
            tag = self.data[offset:offset+taglen]
            offset += taglen
            md_len, consumed = self.decode_vwi(self.data[offset:offset+4])
            offset += consumed
            self.metadata[tag] = self.data[offset:offset + md_len]
            offset += md_len
View
@@ -15,7 +15,7 @@
import ebook
KINDLEROOT = '/mnt/us'
-FILTER = ['pdf', 'mobi', 'prc', 'txt', 'tpz', 'azw', 'manga']
+FILTER = ['pdf', 'mobi', 'prc', 'txt', 'tpz', 'azw1', 'azw', 'manga', 'azw2']
FOLDERS = ['documents', 'pictures']
class Collection(dict):
@@ -73,7 +73,7 @@ def __init__(self, path):
self.meta = None
self.asin = None
self.type = None
- ext = os.path.splitext(path)[1][1:]
+ ext = os.path.splitext(path)[1][1:].lower()
if ext in ['mobi', 'azw']:
self.meta = ebook.Mobi(path)
if self.meta.title:
@@ -86,6 +86,26 @@ def __init__(self, path):
self.title = self.meta.exth[503]
else:
print "\nMetadata read error:", path
+ elif ext in ['tpz', 'azw1']:
+ self.meta = ebook.Topaz(path)
+ if self.meta.title:
+ self.title = self.meta.title
+ if self.meta.asin:
+ self.asin = self.meta.asin
+ if self.meta.type:
+ self.type = self.meta.type
+ else:
+ print "\nTopaz metadata read error:", path
+ elif ext in ['azw2']:
+ self.meta = ebook.Kindlet(path)
+ if self.meta.title:
+ self.title = self.meta.title
+ if self.meta.asin:
+ self.asin = self.meta.asin
+ self.type = 'AZW2'
+ else:
+ # Couldn't get an ASIN, developer app? We'll use the hash instead, which is what the Kindle itself does, so no harm done.
+ print "\nKindlet Metadata read error, assuming developper app:", path
class Kindle:
'''Access a Kindle filesystem
@@ -108,7 +128,7 @@ def load_folder(self, path):
sys.stdout.write("Loading " + path)
for root, dirs, files in os.walk(os.path.join(self.root, path)):
for filename in files:
- if os.path.splitext(filename)[1][1:] in FILTER:
+ if os.path.splitext(filename)[1][1:].lower() in FILTER:
fullpath = os.path.abspath(os.path.join(root, filename))
book = Ebook(fullpath)
self.files[book.hash] = book

0 comments on commit ef3d096

Please sign in to comment.