Permalink
Browse files

Enhancements to Exs. 6-3 & 6-13 for Google Takeout

Enhanced Example 6-3 so that it "just works" with Google Takeout data
containing Unicode where previously an exception was being raised
due to a string being mistakenly decoded twice.

Enhanced Example 6-13 so that the email address that serves as the basis of
the query is matched as a substring of the From field instead of as an exact
match, making it easier to query for known addresses.

See also these gists for enhancements with debugging data that may also
be useful.

* https://gist.github.com/ptwobrussell/8791064
* https://gist.github.com/ptwobrussell/8875470
  • Loading branch information...
1 parent c2b6788 commit 29c3476c8d42c3c9b44c3118e24452474a07a171 @ptwobrussell committed Feb 14, 2014
Showing with 21 additions and 14 deletions.
  1. +21 −14 ipynb/Chapter 6 - Mining Mailboxes.ipynb
@@ -244,13 +244,14 @@
"from dateutil.parser import parse\n",
"\n",
"MBOX = 'resources/ch06-mailboxes/data/enron.mbox'\n",
- "OUT_FILE = 'resources/ch06-mailboxes/data/enron.mbox.json'\n",
- "\n",
+ "OUT_FILE = MBOX + '.json'\n",
+ " \n",
"def cleanContent(msg):\n",
"\n",
- " # Decode message from \"quoted printable\" format\n",
- " msg = quopri.decodestring(msg)\n",
- " \n",
+ " # Decode message from \"quoted printable\" format, but first\n",
+ " # re-encode, since decodestring will try to do a decode of its own\n",
+ " msg = quopri.decodestring(msg.encode('utf-8'))\n",
+ "\n",
" # Strip out HTML tags, if any are present.\n",
" # Bail on unknown encodings if errors happen in BeautifulSoup.\n",
" try:\n",
@@ -273,8 +274,9 @@
" msg = mb.next()\n",
" if msg is None:\n",
" break\n",
+ "\n",
" yield jsonifyMessage(msg)\n",
- " \n",
+ "\n",
"def jsonifyMessage(msg):\n",
" json_msg = {'parts': []}\n",
" for (k, v) in msg.items():\n",
@@ -291,15 +293,16 @@
"\n",
" for part in msg.walk():\n",
" json_part = {}\n",
- " if part.get_content_maintype() == 'multipart':\n",
+ "\n",
+ " if part.get_content_maintype() != 'text':\n",
+ " print >> sys.stderr, \"Skipping MIME content in JSONification ({0})\".format(part.get_content_maintype())\n",
" continue\n",
- " \n",
+ "\n",
" json_part['contentType'] = part.get_content_type()\n",
" content = part.get_payload(decode=False).decode('utf-8', 'ignore')\n",
" json_part['content'] = cleanContent(content)\n",
- " \n",
" json_msg['parts'].append(json_part)\n",
- " \n",
+ "\n",
" # Finally, convert date from asctime to milliseconds since epoch using the\n",
" # $date descriptor so it imports \"natively\" as an ISODate object in MongoDB\n",
" then = parse(json_msg['Date'])\n",
@@ -317,7 +320,9 @@
"for msg in gen_json_msgs(mbox):\n",
" if msg != None:\n",
" f.write(json.dumps(msg, cls=Encoder) + '\\n')\n",
- "f.close()"
+ "f.close()\n",
+ "\n",
+ "print \"All done\""
],
"language": "python",
"metadata": {},
@@ -708,6 +713,7 @@
"import json\n",
"import pymongo # pip install pymongo\n",
"from bson import json_util # Comes with pymongo\n",
+ "import re\n",
"\n",
"# The basis of our query\n",
"FROM = \"kenneth.lay@enron.com\"\n",
@@ -719,7 +725,7 @@
"# Get the recipient lists for each message\n",
"\n",
"recipients_per_message = db.mbox.aggregate([\n",
- " {\"$match\" : {\"From\" : FROM} }, \n",
+ " {\"$match\" : {\"From\" : re.compile(r\".*{0}.*\".format(FROM), re.IGNORECASE)}}, \n",
" {\"$project\" : {\"From\" : 1, \"To\" : 1} }, \n",
" {\"$group\" : {\"_id\" : \"$From\", \"recipients\" : {\"$addToSet\" : \"$To\" } } } \n",
"])['result'][0]['recipients']\n",
@@ -739,14 +745,15 @@
"# Demonstrate how to use $unwind followed by $group to collapse\n",
"# the recipient lists into a single list (with no duplicates\n",
"# per the $addToSet operator)\n",
- " \n",
+ "\n",
"unique_recipients = db.mbox.aggregate([\n",
- " {\"$match\" : {\"From\" : FROM} }, \n",
+ " {\"$match\" : {\"From\" : re.compile(r\".*{0}.*\".format(FROM), re.IGNORECASE)}}, \n",
" {\"$project\" : {\"From\" : 1, \"To\" : 1} }, \n",
" {\"$unwind\" : \"$To\"}, \n",
" {\"$group\" : {\"_id\" : \"From\", \"recipients\" : {\"$addToSet\" : \"$To\"}} }\n",
"])['result'][0]['recipients']\n",
"\n",
+ "print all_recipients\n",
"print \"Num total recipients on all messages:\", len(all_recipients)\n",
"print \"Num recipients for each message:\", recipients_per_message_totals\n",
"print \"Num unique recipients\", len(unique_recipients)"

0 comments on commit 29c3476

Please sign in to comment.