1. Trial - filtering the relevant key/value pairs from the email dataset

In [9]:

import mailbox
import pandas as pd

MBOX = 'resources/ch06-mailboxes/data/Allmail.mbox'

mb = mailbox.mbox(MBOX)

keys = ['Date', 'X-Gmail-Labels', 'X-GM-THRID', 'From', 'Subject', 'Content-Type']
message_list = []

for message in mb.itervalues():
    dmessage = dict(message.items())
    message_list.append({key:dmessage[key] if key in dmessage.keys() else '' for key in keys})

print len(message_list), 'messages'
print '**'*50
message_list[:5]

# print dmessage

9588 messages
****************************************************************************************************


[{'Content-Type': 'multipart/alternative; \r\n\tboundary="----=_Part_299026_603284481.1538247620955"',
  'Date': 'Sat, 29 Sep 2018 19:00:21 +0000',
  'From': 'Imprivata Recruiting Team <no-reply@jobvite.com>',
  'Subject': 'Your application for Software Engineer I (Identity Governance) at\r\n Imprivata',
  'X-GM-THRID': '1612969538082251512',
  'X-Gmail-Labels': 'Inbox,Unread'},
 {'Content-Type': 'multipart/alternative;\r\n\tboundary="----=_NextPart_000_31C2C_01D457F7.7EE62E60"',
  'Date': 'Sat, 29 Sep 2018 13:22:39 +0530',
  'From': '"ICICIdirect.com" <service@icicisecurities.com>',
  'Subject': "Presenting the 'Mutual Fund Pulse'- A Monthly E-Magazine for mutual fund investment guidance",
  'X-GM-THRID': '1612927718462238570',
  'X-Gmail-Labels': 'Inbox,Unread'},
 {'Content-Type': 'multipart/alternative; boundary="0000000000002275800576cb5c85"',
  'Date': 'Wed, 26 Sep 2018 19:37:59 +0000',
  'From': 'Google Cloud Platform <CloudPlatform-noreply@google.com>',
  'Subject': 'Google Clou

In [None]:
Gmail mbox data cleaned, filtered, and converted into JSON

In [4]:
import calendar
import email
import json
import mailbox
import quopri
import sys
import time

from BeautifulSoup import BeautifulSoup
from dateutil.parser import parse

# Mailbox to convert; the JSON output is written next to it with '.json' appended.
MBOX = 'resources/ch06-mailboxes/data/Starred.mbox'
OUT_FILE = ''.join((MBOX, '.json'))
 
def cleanContent(msg):
    """Return the text of a message part with quoted-printable and HTML removed.

    msg -- the part's content as text; it is encoded to UTF-8 and then
           quoted-printable decoded before being handed to BeautifulSoup.

    Returns every text node BeautifulSoup finds, joined into one string, or
    '' when BeautifulSoup raises an ordinary exception while parsing.
    """
    # Decode message from "quoted printable" format, but first
    # re-encode, since decodestring will try to do a decode of its own
    msg = quopri.decodestring(msg.encode('utf-8'))

    # Strip out HTML tags, if any are present.
    # Bail on unknown encodings if errors happen in BeautifulSoup.
    # Only Exception subclasses are swallowed: a bare ``except:`` would also
    # trap KeyboardInterrupt/SystemExit and make a long run uninterruptible.
    try:
        soup = BeautifulSoup(msg)
    except Exception:
        return ''
    return ''.join(soup.findAll(text=True))

# There's a lot of data to process, and the Pythonic way to do it is with a 
# generator. See http://wiki.python.org/moin/Generators.
# Using a generator requires a trivial encoder to be passed to json for object 
# serialization.

class Encoder(json.JSONEncoder):
    """JSON encoder that serializes otherwise-unsupported objects as lists."""

    def default(self, o):
        """Turn ``o`` into a list by iterating over it."""
        return list(o)

# The generator itself...
def gen_json_msgs(mb):
    """Lazily yield the jsonifyMessage() result for each message in ``mb``.

    Messages are fetched one at a time with ``mb.next()``; iteration ends at
    the first call that returns None.
    """
    msg = mb.next()
    while msg is not None:
        yield jsonifyMessage(msg)
        msg = mb.next()

def jsonifyMessage(msg):
    """Convert an email message into a JSON-serializable dict.

    The result holds each header under its own name (value decoded as UTF-8,
    undecodable bytes dropped), plus:

    * 'To', 'Cc', 'Bcc' -- when present and non-empty, a list made by
      removing all whitespace from the header and splitting it on commas.
    * 'parts' -- one {'contentType', 'content'} dict per part whose main
      type is 'text', with the content run through cleanContent. Every
      other part is skipped and reported on stderr.
    * 'Date' -- when the header is present, {'$date': <ms since the epoch>}.
      A message without a Date header is returned without a 'Date' key.

    Raises whatever dateutil's parse() raises for an unparseable Date header.
    """
    json_msg = {'parts': []}
    for (k, v) in msg.items():
        json_msg[k] = v.decode('utf-8', 'ignore')

    # The To, Cc, and Bcc fields, if present, could have multiple items.
    # Note that not all of these fields are necessarily defined.

    for k in ['To', 'Cc', 'Bcc']:
        if not json_msg.get(k):
            continue
        # The value is already unicode at this point, so it is not decoded a
        # second time: in Python 2 that would implicitly ASCII-encode it first
        # and raise UnicodeEncodeError on any non-ASCII address.
        json_msg[k] = json_msg[k].replace('\n', '').replace('\t', '').replace('\r', '')\
                                 .replace(' ', '').split(',')

    for part in msg.walk():
        json_part = {}

        if part.get_content_maintype() != 'text':
            print >> sys.stderr, "Skipping MIME content in JSONification ({0})".format(part.get_content_maintype())
            continue

        json_part['contentType'] = part.get_content_type()
        # NOTE(review): the payload is taken undecoded and cleanContent always
        # applies quoted-printable decoding, so a base64-encoded text part
        # presumably ends up stored still encoded -- confirm.
        content = part.get_payload(decode=False).decode('utf-8', 'ignore')
        json_part['content'] = cleanContent(content)
        json_msg['parts'].append(json_part)

    # Finally, convert date from asctime to milliseconds since epoch using the
    # $date descriptor so it imports "natively" as an ISODate object in MongoDB
    if 'Date' in json_msg:
        then = parse(json_msg['Date'])
        if then.utcoffset() is None:
            # No zone in the header: keep interpreting it as local time.
            seconds = time.mktime(then.timetuple())
        else:
            # Honour the header's UTC offset. time.mktime() would treat the
            # sender's wall-clock time as this machine's local time and shift
            # the instant by the difference between the two zones.
            seconds = calendar.timegm(then.utctimetuple())
        millis = int(seconds*1000 + then.microsecond/1000)
        json_msg['Date'] = {'$date' : millis}

    return json_msg

mbox = mailbox.UnixMailbox(open(MBOX, 'rb'), email.message_from_file)

# Write each message out as a JSON object on a separate line
# for easy import into MongoDB via mongoimport

f = open(OUT_FILE, 'w')
for msg in gen_json_msgs(mbox):
    if msg != None:
        f.write(json.dumps(msg, cls=Encoder) + '\n')
f.close()

print "All done"

Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (application)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Sk

Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME conten

Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (application)
Skipping MIME content in JSONification (application)
Skipping MIME content in JSONification (application)
Skipping MIME content in JSONification (application)
Skipping MIME content in JSONification (application)
Skipping MIME content in JSONification (application)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification 

Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (application)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (image)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME cont

Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (application)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (application)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JSONification (multipart)
Skipping MIME content in JS

All done


Skipping MIME content in JSONification (multipart)


In [5]:
import os
import sys
import envoy

data_file = os.path.join(os.getcwd(), 'resources/ch06-mailboxes/data/Starred.mbox.json')

print data_file

# Run a command just as you would in a terminal on the virtual machine to import the data file into MongoDB.
r = envoy.run('mongoimport --db gmail --collection mbox ' + \
              '--file  ' + data_file)


# Print its standard output
print r.std_out
print sys.stderr.write(r.std_err)

/home/resham/Documents/fall2018/cs483/Project/trial/ipynb/resources/ch06-mailboxes/data/Starred.mbox.json

None


2018-10-14T01:12:59.390-0700	connected to: localhost
2018-10-14T01:12:59.509-0700	imported 327 documents
