In [2]:
import tarfile
import re
from datetime import datetime
from collections import namedtuple, Counter

import pandas as pd
import altair as alt

In [3]:
# Open the Enron mail archive read-only and list every member it contains.
# The handle is deliberately left open: later cells extract members from it.
tar = tarfile.open(name="enron_mail_20150507.tar.gz", mode="r")
items = tar.getmembers()


In [30]:
# Peek at the first few members only; displaying the whole list would dump
# hundreds of thousands of TarInfo reprs into the notebook output.
items[:5]

<TarInfo 'maildir/blair-l/personnel___promotions' at 0x19aef0cdb38>

In [4]:
# Record type for one parsed message; every field is filled in by get_message.
Email = namedtuple(
	typename='Email',
	field_names='Date, From, To, Subject, Cc, Bcc, Message',
)

In [17]:
# Figure out structure of each file
# A new header line looks like "Name: value"; seeing one ends a folded
# (multi-line) recipient list.
_HEADER_LINE = re.compile(r'[A-Za-z][A-Za-z0-9-]*:(\s|$)')


def _split_recipients(text):
	"""Split a To/Cc/Bcc value into tokens, dropping ',' and ';' separators."""
	return text.replace(',','').replace(';','').split()


def get_message(item_number):
	"""Parse archive member ``items[item_number]`` into an ``Email`` tuple.

	The header section is scanned line by line for Date, From, To, Subject,
	Cc and Bcc (first occurrence of each wins). Everything after the
	``X-FileName`` header line is treated as the message body and is never
	interpreted as a header, so quoted "To:" / "Cc:" lines inside forwarded
	text stay in the body.

	Parameters
	----------
	item_number : int
		Index into the module-level ``items`` list of archive members.

	Returns
	-------
	Email or Exception
		On success, an ``Email`` whose ``Date`` is a naive ``datetime`` (the
		trailing UTC offset and zone name are discarded) or ``''`` when no
		Date header was found; ``To``/``Cc``/``Bcc`` are '; '-joined strings;
		``Message`` is the body with lines joined by single spaces.
		On failure (member is not a regular file, malformed Date header, ...)
		the exception object is *returned*, not raised, so callers must
		check ``isinstance(result, tuple)`` before using the result.
	"""
	f = tar.extractfile(items[item_number])

	try:
		# extractfile() yields None for members that are not regular files.
		if f is None:
			raise ValueError('archive member %d is not a regular file' % item_number)
		with f:
			# errors='replace' keeps messages containing stray non-UTF-8
			# bytes instead of discarding the whole message.
			item = f.read().decode('utf-8', errors='replace')
		item = item.replace('\r','').replace('\t','')
		lines = item.split('\n')

		date = from_ = subject = ''
		to, cc, bcc, message = [], [], [], []
		folded = None  # recipient list that unlabelled header lines continue
		in_message = False

		for line in lines:
			if in_message:
				# Body text: must be checked first so that header-looking
				# lines in quoted mail are not swallowed.
				message.append(line)
			elif line.startswith('X-FileName'):
				in_message = True
			elif line.startswith('Date:') and not date:
				folded = None
				# Drop the last two tokens (UTC offset and zone name) before parsing.
				date = datetime.strptime(' '.join(line.split('Date: ')[1].split()[:-2]), '%a, %d %b %Y %H:%M:%S')
			elif line.startswith('From:') and not from_:
				folded = None
				from_ = line[len('From:'):].strip()
			elif line.startswith('To:') and not to:
				to = folded = _split_recipients(line[len('To:'):])
			elif line.startswith('Subject:') and not subject:
				folded = None
				subject = line[len('Subject:'):].strip()
			elif line.startswith('Cc:') and not cc:
				cc = folded = _split_recipients(line[len('Cc:'):])
			elif line.startswith('Bcc:') and not bcc:
				bcc = folded = _split_recipients(line[len('Bcc:'):])
			elif _HEADER_LINE.match(line):
				# Any other header terminates a folded recipient list.
				folded = None
			elif folded is not None:
				# Continuation line of the most recent To/Cc/Bcc header.
				folded.extend(_split_recipients(line))

		to = '; '.join(to).strip()
		cc = '; '.join(cc).strip()
		bcc = '; '.join(bcc).strip()

		message = ' '.join(message).strip()
		email = Email(date, from_, to, subject, cc, bcc, message)
		return email

	except Exception as e:
		# Best-effort contract relied on by callers: hand the error back
		# instead of aborting a scan over the whole archive.
		return e

In [18]:
# Spot-check the parser on a single archive member.
msg = get_message(item_number=2002)

In [19]:
# Display the body text of the message parsed above.
msg.Message

"FYI. Thanks. Lynn   -----Original Message----- From: January, Steven   Sent:Monday, July 09, 2001 4:44 PM To:Spraggins, Gary; Blair, Lynn Subject:FW: Guardian OBA Agreement  FYI. sj   -----Original Message----- From: Bowers, Janet   Sent:Monday, July 09, 2001 4:43 PM To:January, Steven Subject:RE: Guardian OBA Agreement  Steve,  Our first thought on this was to make it a volumetric agreement.  But now, we are re-thinking that we would like to have a dollar-valued OBA with daily cash out.  Guardian wanted a copy of our agreement today, but since we plan to re-draft, I've told them that it will be a couple of weeks before they get a draft from us.  So, please disregard this and I'll be sending you a new OBA draft sometime within the next week or two.  Thanks!   Janet  -----Original Message----- From: January, Steven   Sent:Monday, July 09, 2001 2:17 PM To:Bowers, Janet Subject:RE: Guardian OBA Agreement  How come this isn't cashed out at each month's end? sj   -----Original Message-----

In [8]:
# Number of members (TarInfo entries) listed in the archive.
len(items)

520901

In [9]:
# Creating a list of dates that the emails were sent

dates = []
for member_index in range(len(items)):
	parsed = get_message(member_index)
	# get_message hands back the exception object (not an Email tuple)
	# for members it could not parse; skip those.
	if not isinstance(parsed, tuple):
		continue
	# Date is '' when the member had no Date header; a str in this list
	# would break the later per-row .date() conversion.
	if not parsed.Date:
		continue
	dates.append(parsed.Date)

In [10]:
# Counting the dates: tally how many messages share each exact Date value
dates_count = Counter(dates)

In [11]:
# One row per distinct send timestamp. from_dict(orient='index') puts the
# Counter keys in the index and the counts in column 0; reset_index() then
# exposes the keys as a column named 'index'.
df = (
	pd.DataFrame.from_dict(dates_count, orient='index')
	.reset_index()
	.rename(columns={'index': 'datetime', 0: 'count'})
)
# Calendar day of each timestamp, used for the daily roll-up.
df['date'] = df['datetime'].map(lambda stamp: stamp.date())
df.head()

Unnamed: 0,datetime,count,date
0,2001-09-14 14:05:43,1,2001-09-14
1,2001-09-10 10:33:15,1,2001-09-10
2,2001-08-20 12:34:19,1,2001-08-20
3,2001-07-05 08:35:29,2,2001-07-05
4,2001-07-26 08:16:14,1,2001-07-26


In [12]:
# Creating a pivot table to visualize the data

# Plotting window (both bounds exclusive). Built with the stdlib datetime
# because the pd.datetime alias was deprecated in pandas 1.0 and removed in 2.0.
window_start = datetime(2000, 1, 1).date()
window_end = datetime(2002, 6, 30).date()

# Sum the per-timestamp counts into one total per calendar day.
daily_totals = df.pivot_table(index='date', values='count', aggfunc='sum').reset_index()
in_window = (daily_totals['date'] > window_start) & (daily_totals['date'] < window_end)

# Filter first, then convert to datetime64; .assign returns a new frame, so
# no column is set on a slice (avoids SettingWithCopyWarning).
pivot = daily_totals.loc[in_window].assign(
	date=lambda frame: pd.to_datetime(frame['date'])
)

  


In [13]:
# Bar chart of the daily totals in `pivot`; the tooltip shows each bar's
# date and count.
(
	alt.Chart(pivot)
	.mark_bar()
	.encode(x='date', y='count', tooltip=['date', 'count'])
	.properties(width=800)
)