# Email info extraction

This script will access your inbox and download all the 1st layer information in you inbox, such as body, sender information, and email attachments

### Import packages

In [1]:
import datetime
import pandas as pd
import os
import win32com.client #connects to your MS outlook  

### User inputs

In [5]:
# Create output folder 
output_dir = '/put_your_directory/emails/'
user_email = "sarkis.kassounian@outlook.com" #replace with yours

In [7]:
#connect to inbox
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")

### Extract messages from inbox 

In [8]:
inbox = outlook.Folders(user_email).Folders("Inbox")
#You can also connect to the inbox via folder numbers
# Check https://docs.microsoft.com/en-us/office/vba/api/outlook.oldefaultfolders
inbox = outlook.GetDefaultFolder(6)
messages = inbox.Items

### Fill database

In [15]:
#prepare arrays
names = []
email_address = [] 
message_class = [] 
subject_arr = [] 
attachments_arr = [] 
body_arr = [] 
date_arr = [] 
time_arr = [] 

### Iterate over message objects to extract information pieces

In [16]:
counter = 1
for message in messages:
    if counter%100 == 0 or counter==len(messages): print(f"[INFO]: Extracted messages: {counter}/{len(messages)}")
    counter+=1
    subject_arr.append(message.Subject)
    attachmets = message.Attachments
    attachments_arr.append(attachmets)
    body_arr.append(message.body)
    recieved = message.ReceivedTime
    date_arr.append(recieved.date().strftime("%d-%b-%Y"))
    time_arr.append(str(recieved.time()))
    
    
    #get sender name     
    sender_name = None
    try:
        sender_name = message.Sender.Name
    except:
        sender_name = message.SenderName    
    names.append(sender_name)

    #get message class
    message_class.append(message.Class)

    #get_email ... Due to the class of the email, the nature of the email address can change
    email = None
    if message.Class == 43 :        
        if message.SenderEmailType =='EX':
            if message.Sender.GetExchangeUser()!=None:
                if type(message.Sender.GetExchangeUSer())==str:
                    email = message.Sender.GetExchangeUSer()
                else:    
                    email = message.Sender.GetExchangeUSer().PrimarySmtpAddress            
        else:         
            email = message.SenderEmailAddress

    email_address.append(email)

[INFO]: Extracted messages: 100/895
[INFO]: Extracted messages: 200/895
[INFO]: Extracted messages: 300/895
[INFO]: Extracted messages: 400/895
[INFO]: Extracted messages: 500/895
[INFO]: Extracted messages: 600/895
[INFO]: Extracted messages: 700/895
[INFO]: Extracted messages: 800/895
[INFO]: Extracted messages: 895/895


### Save data in Pandas dataframe for visalualization

Not that the body and the attachments list are not included since we will use them to extract information

In [22]:
data = {}
data['date'] = date_arr
data['time'] = time_arr
data['names'] = names
data['message_class'] = message_class
data['email'] = email_address
data['subject'] = subject_arr
data['N_attach'] = [len(i) for i in attachments_arr]
data = pd.DataFrame(data)

### Later you can do any filtering process

In [23]:
data['names'] = 'my friends'
data['message_class'] = message_class
data['email'] = 'non_of_your_business@outlook.com'
data['subject'] = 'Need to know basis'
data['N_attach'] = [len(i) for i in attachments_arr]
data = data.query("N_attach>0")
data

Unnamed: 0,date,time,names,message_class,email,subject,N_attach
0,04-Apr-2023,08:41:52.783000,my friends,43,non_of_your_business@outlook.com,Need to know basis,5
2,04-Apr-2023,05:33:48.067000,my friends,43,non_of_your_business@outlook.com,Need to know basis,10
3,03-Apr-2023,16:41:38.914000,my friends,43,non_of_your_business@outlook.com,Need to know basis,4
18,28-Mar-2023,08:18:16.262000,my friends,43,non_of_your_business@outlook.com,Need to know basis,4
19,26-Mar-2023,19:44:02.540000,my friends,43,non_of_your_business@outlook.com,Need to know basis,3
...,...,...,...,...,...,...,...
888,14-Jul-2023,14:05:38.561000,my friends,43,non_of_your_business@outlook.com,Need to know basis,1
889,14-Jul-2023,15:38:15.302000,my friends,43,non_of_your_business@outlook.com,Need to know basis,1
890,14-Jul-2023,15:41:00.028000,my friends,43,non_of_your_business@outlook.com,Need to know basis,1
892,14-Jul-2023,20:38:12.381000,my friends,43,non_of_your_business@outlook.com,Need to know basis,1


### Downloading attachments

Some types of attachments are not downloadable and there are many files per email. But to keep track of what 

In [27]:
#extract attachments
counter = 1 
attach_count = 0 
for idx,attch in enumerate(data.index):

    for file in attachments_arr[idx]:
        try:
            file.SaveAsFile(output_dir+f"{idx}_{file.FileName}")
            attach_count += 1  
        except: 
            pass   
    if counter%50 == 0 or counter==len(data): print(f"[INFO]: Extracted messages: {counter}/{len(data)} --> Total attachments = {attach_count}")
    counter+=1    
        

[INFO]: Extracted messages: 50/343 --> Total attachments = 60
[INFO]: Extracted messages: 100/343 --> Total attachments = 105
[INFO]: Extracted messages: 150/343 --> Total attachments = 134
[INFO]: Extracted messages: 200/343 --> Total attachments = 181
[INFO]: Extracted messages: 250/343 --> Total attachments = 224
[INFO]: Extracted messages: 300/343 --> Total attachments = 264
[INFO]: Extracted messages: 343/343 --> Total attachments = 286
