-
Notifications
You must be signed in to change notification settings - Fork 0
/
kobo.py
111 lines (81 loc) · 3.44 KB
/
kobo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
'''
Created on Aug 7, 2013
@author: richard
'''
from lxml import etree
import html5lib
import time
import email
import receipt_scraper
# e.g. for calling with get_emails_from_withsubject
SEARCH_FROM = "Kobo"
SEARCH_SUBJECT = "Your Kobo Order Receipt"
# n.b. Kobo has "Purchase History",
# https://secure.kobobooks.com/profile/purchasehistory
# if all you need is a glance/refresher of what you
# purchased. Can't download-as-CSV, nor will it show
# more than 100 orders per page.
def parse_email_html(html_data):
etree_document = html5lib.parse(html_data, treebuilder="lxml", namespaceHTMLElements=False)
root = etree_document.getroot()
# By inspecting HTML payloads (saved/dumped elsewhere),
# (samples taken at points which the scraping threw an exception!),
#
# It's clear that the format in the emails is close-enough that it's easier
# to write a flexible scraper, than to scrape scrictly for slight variations.
#
# Emails after 2014-Aug (ish) change from:
# <td>BookTitle <span>By AuthorName</span></td>
# to:
# <td><a>$BookTitle</a> <span>By $AuthorName</span></td>
# Additionally, emails after 2014-Aug (ish) no longer include
# a <td>$DateOfPurchase</td>, so, this changes xpath of $Price <td/>
#
# Emails after 2015-Aug (ish) change the specific xpath to the items table.
#
# Edge case in my emails is an email (before 2014-Aug) with no <span/>,
# & so no author. Okay.
# General formula was
# some_xpath = "/path/to/el"
# some_el = root.xpath(some_xpath) # n.b. this is a list.
# some_str = f(some_el) # some_el[0].text, etc.
# TBH, most of the rest is "magic"/hard-coded enough (by nature)
# that it's not particularly maintainable.
# Scrapers should be fragile.
# items_table contains all the <tr/> with order items.
# items_table_xpath = "/html/body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td[2]/table[3]/tbody/tr/td[2]/table[3]/tbody/tr/td/table[4]"
items_table_xpath = "/html/body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td[2]/table/tbody/tr/td[2]/table[3]/tbody/tr/td/table[4]"
items_table = root.xpath(items_table_xpath)[0]
# "/tbody..." vs "tbody..."?
item_rows = items_table.xpath("tbody/tr")
# print "DEBUG Num item rows: ", len(item_rows)
# For individual <tr/>, return { title, author, price }
def item_from_row(tr):
# Because it's email, the <tr/> has a table or two inside it. Cool.
title_author_td = tr.xpath("td/table/tbody/tr/td/table/tbody/tr/td[2]")
# print "DEBUG Title Author TD len", len(title_author_td)
# How to do things like ".getElementsByTag"? :S
# Prefer BeautifulSoup for some things?
a = title_author_td[0].xpath("a")
if len(a) == 0:
title = title_author_td[0].text
else:
title = a[0].text
# print "DEBUG Title", title
span = title_author_td[0].xpath("span")
if len(span) > 0:
# Get rid of the "By.."
author = " ".join(span[0].text.split()[1:])
else:
author = None
# print "DEBUG author ", author
# Price <td/> is the last one.
price_td = tr.xpath("td/table/tbody/tr/td/table/tbody/tr/td")[-1]
price = price_td.text
print "DEBUG Kobo found '%s' by '%s' @ '%s'" % (title, author, price)
return {
"title": title,
"author": author,
"price": price
}
return [item_from_row(r) for r in item_rows]