Skip to content

Commit

Permalink
Merge pull request #24 from sammyrulez/master
Browse files Browse the repository at this point in the history
Better memory management (Thanks sammyrulez - will also merge in your mongodb commit as part of this pull request but back it out of master since MongoDB isn't used elsewhere in Mining the Social Web)
  • Loading branch information
Matthew A. Russell committed Jun 12, 2012
2 parents 3af23dc + 517c069 commit bc6a5d8
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 15 deletions.
29 changes: 19 additions & 10 deletions python_code/mailboxes__jsonify_mbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
import json

MBOX = sys.argv[1]
# Optional second command-line argument: the path of the file the JSON is
# written to. Stays None when the argument is absent, in which case the
# JSON is printed instead. An explicit length check replaces the former
# catch-all "except Exception" so unrelated errors are no longer hidden.
OUT_FILE = sys.argv[2] if len(sys.argv) > 2 else None

def cleanContent(msg):

Expand Down Expand Up @@ -54,13 +59,17 @@ def jsonifyMessage(msg):
finally:
return json_msg

# Note: opening in binary mode is recommended
mbox = mailbox.UnixMailbox(open(MBOX, 'rb'), email.message_from_file)

# Convert every message up front, then emit the whole list as one JSON array.
json_msgs = []
msg = mbox.next()
while msg is not None:
    json_msgs.append(jsonifyMessage(msg))
    msg = mbox.next()

print(json.dumps(json_msgs, indent=4))
#Note: opening in binary mode is recommended

# Open the mailbox file named by MBOX. email.message_from_file is handed to
# UnixMailbox as its second argument -- presumably the factory used to build
# each message object; confirm against the mailbox module documentation.
mbox = mailbox.UnixMailbox(open(MBOX, 'rb'), email.message_from_file)
def gen_json_msgs(m_box):
    """Lazily yield jsonifyMessage(msg) for each message in m_box.

    m_box must expose a next() method that returns the next message, or
    None once there are no more messages. Messages are pulled one at a
    time, only when the consumer asks for the next item.
    """
    current = m_box.next()
    while current is not None:
        yield jsonifyMessage(current)
        current = m_box.next()

# The json module cannot serialize a generator (json.dump / json.dumps raise
# TypeError), so the JSON array is written by hand, one message at a time.
# This keeps the streaming behaviour: messages are never collected in a list.
out = open(OUT_FILE, 'wb') if OUT_FILE else sys.stdout
try:
    out.write('[')
    for position, json_msg in enumerate(gen_json_msgs(mbox)):
        # Elements are comma-separated; no comma before the first one.
        out.write(',\n' if position else '\n')
        out.write(json.dumps(json_msg, indent=4))
    out.write('\n]\n')
finally:
    # Close only the file opened here; stdout must stay open.
    if out is not sys.stdout:
        out.close()
17 changes: 12 additions & 5 deletions python_code/mailboxes__load_json_mbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,23 @@

import sys
import os
import couchdb
import pymongo
from pymongo import Connection

try:
import jsonlib2 as json
except ImportError:
import json

JSON_MBOX = sys.argv[1] # i.e. enron.mbox.json
# Database name: the part of the file's base name before the first '.'.
# NOTE(review): DB and DB_NAME are computed identically; they look like the
# removed and added sides of the diff shown together -- confirm which one the
# real file keeps.
DB = os.path.basename(JSON_MBOX).split('.')[0]
DB_NAME = os.path.basename(JSON_MBOX).split('.')[0]

# pymongo connection to localhost, port 27017; the database is selected by name.
connection = Connection('localhost', 27017)
db = connection[DB_NAME]

# couchdb server at localhost:5984; a database named DB is created on it.
# NOTE(review): as written here, this rebinds `db`, so the later
# db['messages'] lookup would use the couchdb database rather than the
# pymongo one -- confirm which set of lines belongs in the actual file.
server = couchdb.Server('http://localhost:5984')
db = server.create(DB)
# Read the whole JSON file into memory and parse it in one call.
docs = json.loads(open(JSON_MBOX).read())
db.update(docs, all_or_nothing=True)

# Print each parsed document and insert it into the 'messages' collection.
collection = db['messages']
for doc in docs:
print str(doc)
collection.insert(doc)

0 comments on commit bc6a5d8

Please sign in to comment.