In [2]:
# This pulls the data down from my email

import imaplib, email, getpass
from email.utils import getaddresses

# Email settings
imap_server = 'imap.gmail.com'
imap_user = 'scott.houde@gmail.com'
imap_password = getpass.getpass()
# imap_password = '' #removed and revoked password
# Connection
conn = imaplib.IMAP4_SSL(imap_server)
(retcode, capabilities) = conn.login(imap_user, imap_password)

conn.list() # this makes the actual connection, run this command alone to see a list of folders

conn.select("[Gmail]/All Mail", readonly=True) # This saves out all the mail, in and out
# result, data = conn.uid('search', None, 'ALL')
result, data = conn.uid('search', None, '(SINCE "20-Apr-2015" BEFORE "20-Apr-2016")') # Adjust the dates here

uids = data[0].split()
 
# Download headers
result, data = conn.uid('fetch', ','.join(uids), '(BODY[HEADER.FIELDS (MESSAGE-ID IN-REPLY-TO FROM TO CC DATE)])')

In [3]:
# This outputs all of my email data to a text file so I can read it in easier later
# import datetime
import time

# Where data will be stored
raw_file = open('raw-email-rec.tsv', 'w')
# Header for TSV file
raw_file.write("Message-ID\tDate\tReplyTo\tFrom\tTo\tCc\n")
# Parse data and spit out info
for i in range(0, len(data)):
     
    # If the current item is _not_ an email header
    if len(data[i]) != 2:
        continue
     
    # Okay, it's an email header. Parse it.
    msg = email.message_from_string(data[i][1])
    mids = msg.get_all('message-id', None)
    mdates = msg.get_all('date', None)
    reply_to = msg.get_all('in-reply-To', None)
    senders = msg.get_all('from', [])
    receivers = msg.get_all('to', [])
    ccs = msg.get_all('cc', [])
    
    email_sent = str(time.mktime(email.utils.parsedate(mdates[0])))

    row = "\t" if not mids else mids[0] + "\t"
    row += "\t" if not email_sent else email_sent + "\t"
    row += "\t" if not reply_to else reply_to[0] + "\t"
    
    # Only one person sends an email, but just in case
    for name, addr in getaddresses(senders):
        row += addr + " "
    row += "\t"
     
    # Space-delimited list of those the email was addressed to
    for name, addr in getaddresses(receivers):
        row += addr + " "
    row += "\t"
        
    # Space-delimited list of those who were CC'd
    for name, addr in getaddresses(ccs):
        row += addr + " "
     
    row += "\n"
     
    # Just going to output tab-delimited, raw data.
    raw_file.write(row)

# Done with file, so close it
raw_file.close()



So when I pull the last year from all of my email I find that I received and sent roughly 3500 emails from 257 unique from_to addresses.  However out of those 257 only 38 are from actual people.  So I made of list and use that to start a dictionary that I will use to filter down my raw data and to store my results

In [31]:
# This reads my list of important contacts from a file and stores them in a dictionary that I can reuse later
import csv

contacts = {}
contact_index = []

with open('contacts.txt', 'rb') as f:
    reader = csv.reader(f)
    for row in reader:
        contacts[row[0]] = [0,0,0] # The list will be emails received from, emails sent to, total response time
        contact_index.append(row[0].strip()) # to save me time later, when machine learnin'

In [28]:
# Okay, now lets reduce down my email data to just the address I care about
# I'm going to read back in my email file and only save the rows I care about
# while I do that I'm going to split the email file into two, one for emails I've received and one for emails I've sent

temp = []
inbox = []
sentbox = []

with open('raw-email-rec.tsv', 'rb') as f:
    reader = csv.reader(f, delimiter='\t') # yup csv reader for tsv files, who knew
    reader.next() # skips header row
    for row in reader:
        temp.append(row)
        if row[3] in contacts: # is this from someone I care about
            if row[3].strip() == 'scott.houde@gmail.com': # this is from me aka sent mail
                sentbox.append(row)
            else:
                inbox.append(row)


So now I've reduced down from 3500 emails to just over 900, with 454 emails received from contacts I care about and 449 being sent by me

In [49]:
# Print statements that I used for testing
# print len(inbox)
# print inbox[0]
# print len(sentbox)
# print sentbox[0]

# print data[0]

# print temp[5]

# for i in range(len(inbox)):
#     if inbox[i][0] == '<CAGFRn0GvMX=c-rOu56ipgx=ZT36ifrg+Cxou4-cjHYNKV+MPRQ@mail.gmail.com>':
#         print inbox[i]

In [366]:
# Now I'm going to iterate through the inbox array and count how many times I've received an email from an address
# then count the number of times I've responded and the total response time
# the message ID in inbox should match up to a in_reply_to message ID in sentbox
# [messageID, date/time, in-reply-to, from, to,cc]
# contacts is [nbr received from, nbr sent to, [list of response times], mean response, median response, var, % response] 

details = [] # this stores message details that I can split and use for machine learning

# an important thing to note is that this doesn't take into account if I initiated the email and they responded to me

for key in contacts: # since I keep re-running this I need to clear it each time
    contacts[key] = [0,0,[],0,0,0,0]

for email in inbox:
    temp = []
    responses = []
    mID = email[0]
    sender = email[3]
    contacts[sender][0] +=1
    temp.append(mID)
    temp.append(sender.strip())
    temp.append(contact_index.index(sender.strip()))
    temp.append(int(float(email[1])))
    responseTime = 0 # This means no response, the machine learning doesn't like None
    for response in sentbox: #This is O(n^2) but still pretty fast since I've filtered down
        rID = response[2]
        if rID == mID:
            contacts[sender][1] +=1
            responseTime = (float(response[1]) - float(email[1])) / 60 # convert to minutes
            if responseTime < 0:
                responseTime = 0 # if I have a negative response time set back to 0 - this is caused by a mailing list
            responses.append(responseTime)    
            contacts[sender][2].append(responseTime)
    
    temp.append(responseTime)
    details.append(temp)
    
overall = [0,0,0,0,0,[]] # I'm going to use this to store some over all statistics
# [total received, total replied to, % responded, mean response, median response, [list of response times],]

        
for key in contacts: # precalc some statistics for this contact
    overall[0] += contacts[key][0] # add to emails received
    if contacts[key][1] != 0:
        overall[1] += contacts[key][1] # add to emails replied
        overall[5] += contacts[key][2] # append all the response times
        meanTime = np.mean(contacts[key][2])
        medianTime = np.median(contacts[key][2])
        stdTime = np.std(contacts[key][2])
        aveResp = contacts[key][1]/float(contacts[key][0])
        contacts[key][3] = meanTime
        contacts[key][4] = medianTime
        contacts[key][5] = stdTime
        contacts[key][6] = aveResp 
        
overall[2] = float(overall[1])/float(overall[0]) # % responded
overall[3] = np.mean(overall[5])
overall[4] = np.median(overall[5])

In [369]:
# This block is for testing stuff

# print contacts['producejb@hotmail.com ']
# print contacts['sue.houde@gmail.com ']
# print len(contacts['sue.houde@gmail.com '][2])
# print len(details)
# # print len(details[1])
# for i in range(len(details)):
#     if details[i][1] == 'joefa@comcast.net':
#         print details[i]
        
# print inbox[9]

# print overall[0:5]
# print np.max(overall[5])/60

So now I have a detailed list of messages I've received with my response time if any (I only responded to about half of them) as well as some statistics on my normal response rate to a given sender.  I also have some overall statistics that I can use for people I don't normally respond to and/or to weight my predicted response.  I respond to about 53% of incoming messages over the last year with a median response time of 15 minutes, but a mean of 652 (and a max of 650 hours)

Now I'm going to train a linear regression on some features and see what I get

In [370]:
# details has [mID, from_email, email_sent date/time, response_time]
# X = [from_email, email_sent_date/time]
# y = [response_time]

from datetime import datetime

#I'm going to try splitting the epoch date in WeekDay and Hour and see if that's better

X = []
y = []

for row in details:
#     print row
    X.append(row[2:4])
    y.append(row[4])
#     if row[4] != 0: # only train on the responded messages
#         X.append(row[2])
#         X.append(row[2:4])
#         temp = []
#         temp.append(row[2]) # sender ID
#         temp.append(datetime.fromtimestamp(row[3]).weekday()) # Day of Week as int
#         temp.append(int(time.strftime('%H', time.localtime(row[3])))) # hour of day
#         X.append(temp)
#         time.strftime('%H:%M:%S', time.localtime(1461718287)) # this gives me hh:mm:ss
#         y.append(row[4])

# Time to do some machine learnin'

from sklearn import linear_model

regr = linear_model.LinearRegression()
regr.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

Although it hurt me to leave stubs of code in, you can see that I tried several variations of how I was looking at the time the email was received.  None of them worked as well as just plain using the epoch time from unix.  I found this rather surprising because I thought hour of day and day of week would be pretty good indicators.  I also found the results better if I trained on the entire detail dataset, not just on the emails that I responded to.  It's very possible I just have no where near enough data.

In [371]:
# print details[1]
# print X[0]
# print y[1]
# 
print regr.predict([17, 1461718287])
# regr.predict([17, 15, 1])
# regr.predict([17])
# print regr.predict([17, 1])
# to predict I feed it an index to the contact and the unix time the 
# email was received, it returns the predicted time in minutes to my response
print regr.coef_

42.5008587484
[12.888 -0.000]


In [373]:
# Getting my test data

import imaplib, email, getpass
from email.utils import getaddresses

# Email settings
imap_server = 'imap.gmail.com'
imap_user = 'scott.houde@gmail.com'
imap_password = getpass.getpass()
# imap_password = '' # revoked and removed password
# Connection
conn = imaplib.IMAP4_SSL(imap_server)
(retcode, capabilities) = conn.login(imap_user, imap_password)

conn.list() # this makes the actual connection, run this command alone to see a list of folders

conn.select("[Gmail]/All Mail", readonly=True) # This saves out all the mail, in and out
# result, data = conn.uid('search', None, 'ALL')
result, data = conn.uid('search', None, '(SINCE "21-Apr-2016" BEFORE "28-Apr-2016")') # Adjust the dates here

uids = data[0].split()
 
# Download headers
result, data = conn.uid('fetch', ','.join(uids), '(BODY[HEADER.FIELDS (MESSAGE-ID IN-REPLY-TO FROM TO CC DATE)])')

In [374]:
# This outputs all of my tesing email data to a text file so I can read it in easier later
# import datetime
import time

# Where data will be stored
raw_file = open('raw-train-email.tsv', 'w')
# Header for TSV file
raw_file.write("Message-ID\tDate\tReplyTo\tFrom\tTo\tCc\n")
# Parse data and spit out info
for i in range(0, len(data)):
     
    # If the current item is _not_ an email header
    if len(data[i]) != 2:
        continue
    # Okay, it's an email header. Parse it.
    msg = email.message_from_string(data[i][1])
    mids = msg.get_all('message-id', None)
    mdates = msg.get_all('date', None)
    reply_to = msg.get_all('in-reply-To', None)
    senders = msg.get_all('from', [])
    receivers = msg.get_all('to', [])
    ccs = msg.get_all('cc', [])
    
    email_sent = str(time.mktime(email.utils.parsedate(mdates[0])))

    row = "\t" if not mids else mids[0] + "\t"
    row += "\t" if not email_sent else email_sent + "\t"
    row += "\t" if not reply_to else reply_to[0] + "\t"
    
    # Only one person sends an email, but just in case
    for name, addr in getaddresses(senders):
        row += addr + " "
    row += "\t"
     
    # Space-delimited list of those the email was addressed to
    for name, addr in getaddresses(receivers):
        row += addr + " "
    row += "\t"
        
    # Space-delimited list of those who were CC'd
    for name, addr in getaddresses(ccs):
        row += addr + " "
     
    row += "\n"
     
    # Just going to output tab-delimited, raw data.
    raw_file.write(row)

# Done with file, so close it
raw_file.close()

In [375]:
# So same as before - except just for my training data
# Okay, now lets reduce down my email data to just the address I care about
# I'm going to read back in my email file and only save the rows I care about
# while I do that I'm going to split the email file into two, one for emails I've received and one for emails I've sent

temp = []
inbox_test = []
sentbox_test = []

with open('raw-train-email.tsv', 'rb') as f:
    reader = csv.reader(f, delimiter='\t') # yup csv reader for tsv files, who knew
    reader.next() # skips header row
    for row in reader:
        temp.append(row)
        if row[3] in contacts: # is this from someone I care about
            if row[3].strip() == 'scott.houde@gmail.com': # this is from me aka sent mail
                sentbox_test.append(row)
            else:
                inbox_test.append(row)


In [377]:
# then count the number of times I've responded and the total response time
# the message ID in inbox should match up to a in_reply_to message ID in sentbox
# [messageID, date/time, in-reply-to, from, to,cc]
# contacts is [nbr received from, nbr sent to, [list of response times], mean response, median response, var, % response] 

details_test = [] # this stores message details that I can split and use for machine learning

# an important thing to note is that this doesn't take into account if I initiated the email and they responded to me

for email in inbox_test:
    temp = []
    responses = []
    mID = email[0]
    sender = email[3]
    contacts[sender][0] +=1
    temp.append(mID)
    temp.append(sender.strip())
    temp.append(contact_index.index(sender.strip()))
    temp.append(int(float(email[1])))
    responseTime = 0 # This means no response, the machine learning doesn't like None
    for response in sentbox_test: #This is O(n^2) but still pretty fast since I've filtered down
        rID = response[2]
        if rID == mID:
            contacts[sender][1] +=1
            responseTime = (float(response[1]) - float(email[1])) / 60 # convert to minutes
            responses.append(responseTime)
            contacts[sender][2].append(responseTime)
    
    temp.append(responseTime)
    details_test.append(temp)

In [385]:
# These will print out the my test data if needed
#print len(details_test[1])
# for i in range(len(details_test)):
#     print details_test[i][1],'--',details_test[i][3],'--',details_test[i][4]
# how does my predictor do against what really happened
# print regr.predict([37, 1461294717])

In [440]:
# This function takes a sender email and email sent date/time and generates a predicted response
# I also included some statistics about my response times

def predictResponse(sender, send_date):
    email = ''
    #First lookup the senderID given the sender
    senderID = contact_index.index(sender.strip())
    #Adjust the date if needed
    
    #predicted response time
    pred = regr.predict([senderID, send_date])
    #Consult the contacts dict to determine normal response statistics
    if sender in contacts:
        email += "I normally respond to you "
        email += "%.2g" % (contacts[sender][6] * 100)
        email += "% of the time and my median response time is "
        email += "%.4g" % contacts[sender][4]
        email += " minutes.  "
        email += "My predicted response time to you right now is: "
        email += str("%.6g" % pred)
        email += " minutes"
    else: # If no normal responses I revert to my overall stuff
        email += "I usually respond to emails "
        email += "%.4g" % (overall[2] * 100)
        email += "% of the time and my median response time is "
        email += "%.4g" % overall[4]
        email += " minutes.  However I can take as long as "
        email += str("%.6g" % np.max(overall[5]))
        email += " minutes and my mean is "
        email += "%.4g" % overall[3]
        email += " minutes. So please be patient if I don't respond quickly."
    
    return email
    

In [442]:
print "Unknown or not important sender:"
print predictResponse("xenophon13@gmail.com", 1461323299),"\n"
print "Known sender:"
print predictResponse("xenophon13@gmail.com ", 1461323299),"\n"
print "Known sender:"
print predictResponse("josie.houde@gmail.com ", 1461323299),"\n"
print "Known sender:"
print predictResponse('brilliantjosh@gmail.com ', 1461323299),"\n"

Unknown or not important sender:
I usually respond to emails 53.3% of the time and my median response time is 14.93 minutes.  However I can take as long as 39033.1 minutes and my mean is 665.2 minutes. So please be patient if I don't respond quickly. 

Known sender:
I normally respond to you 67% of the time and my median response time is 14.17 minutes.  My predicted response time to you right now is: 307.139 minutes 

Known sender:
I normally respond to you 60% of the time and my median response time is 17.93 minutes.  My predicted response time to you right now is: 49.3884 minutes 

Known sender:
I normally respond to you 86% of the time and my median response time is 17.35 minutes.  My predicted response time to you right now is: -143.925 minutes 



In [433]:
# This prints out my contact and overall statistics
print "Total Emails\tTotal Replies\tPercent Responded\tMean Response Time\tMedian Response Time"
print overall[0],'\t\t',overall[1],'\t\t',"%.4g" % overall[2],'\t\t\t',"%.4g" % overall[3],'\t\t\t', "%.4g" % overall[4]
print '\n'
print "Major Contact Statisics"
print "Email\t\t\t\t From\tTo\t% Responded\tMean Time\tMedian Time"

for k in contacts:
    print k.strip(), '\t\t',contacts[k][0],'\t', contacts[k][1],'\t',"%.2g" % contacts[k][6],'\t\t',"%.4g" % contacts[k][3],'\t\t', "%.4g" % contacts[k][4]
    


Total Emails	Total Replies	Percent Responded	Mean Response Time	Median Response Time
454 		242 		0.533 			665.2 			14.93


Major Contact Statisics
Email				 From	To	% Responded	Mean Time	Median Time
lgibeau@gmail.com 		14 	6 	0.46 		42.74 		19.59
shawnafav@yahoo.com 		1 	1 	1 		0 		0
coachmikew@gmail.com 		4 	1 	0.5 		39.57 		39.57
cmh1972@gmail.com 		3 	3 	1 		440.5 		502.4
diro@comcast.net 		11 	3 	0.27 		837.1 		36.68
josie.houde@gmail.com 		99 	59 	0.6 		595.4 		17.93
producejb@hotmail.com 		36 	18 	0.5 		1472 		12.11
marvy.uchida@gmail.com 		5 	3 	0.6 		246.4 		73.05
acco1234@gmail.com 		1 	0 	0 		0 		0
sue.houde@gmail.com 		33 	10 	0.3 		3943 		19.09
scott_houde@brown.edu 		5 	0 	0 		0 		0
peter_favaloro@hotmail.com 		2 	0 	0 		0 		0
grimgoroth@gmail.com 		1 	1 	1 		19.62 		19.62
cabourassa1972@gmail.com 		6 	1 	0 		0 		0
lucien1974@aol.com 		1 	0 	0 		0 		0
mmcginleycrowe@gmail.com 		1 	1 	1 		43.88 		43.88
kristifav@yahoo.com 		9 	5 	0.56 		157.7 		0
scott.houde@gmail.com 		0 	

Because my immediate family uses a mailing list to communicate and I was filtering that out I was sometimes getting a negative response time, I am just manually setting that to 0 to ignore them.  This sometimes leads to the wierd situation where I responded to them, but the response time is 0.

You'll also notice that one of my predicted response times is negative.  I'm taking that to mean that if that person emailed me at the date/time I would respond immediately.  Which is probably a good guess actually.