# 解析邮件

In [198]:
import os
import re
import base64

from email.parser import Parser
from email.header import decode_header
from email.utils import parseaddr


# Aliyun mailbox sample.
# The encoding is passed explicitly: open() otherwise falls back to the
# platform default, which is not UTF-8 on every system, and the second sample
# below is sent 8BIT with charset=utf-8 (raw non-ASCII bytes in the file).
with open('sample/raw_aliyun.eml', 'r', encoding='utf-8') as f:
    _raw = f.read()
_msg = Parser().parsestr(_raw)
print(_msg.get('From'))


# Second sample. NOTE(review): the original note labelled this one "Sina
# mailbox", but the file name says 163 — confirm which provider it came from.
with open('sample/raw_163.eml', 'r', encoding='utf-8') as f:
    _raw = f.read()
_msg2 = Parser().parsestr(_raw)
print(_msg2.get('From'))

Solomon Xie <solomonxie@outlook.com>
=?utf-8?B?5rWm5Y+R6YKA57qm?= <webmaster@faa.vsonke.com>


In [139]:
# Headers worth reading the same way: 'From', 'TO', 'Date', 'Subject'.
# Message.get() matches header names case-insensitively, which is why the
# lowercase 'from' here finds the header that the first cell read as 'From'.
_msg.get('from')

'"solomonxie" <solomonxie@aliyun.com>'

In [150]:
# Content-Type of each sample; subscripting a Message returns None for a
# missing header, exactly like .get().
print(_msg['Content-Type'], '\n\n')

print(_msg2['Content-Type'])

multipart/mixed;
  boundary="----=ALIBOUNDARY_49951_4b7b7940_5b5afe97_aae72" 


text/html; charset=utf-8


In [144]:
# Transfer encoding declared on the top-level message of each sample
# (None when the header is absent).
print(_msg['Content-transfer-encoding'], '\n\n')

print(_msg2['Content-transfer-encoding'])

None 


8BIT


In [146]:
# Message-ID of each sample (None when the header is absent).
print(_msg['Message-ID'])

print(_msg2['Message-ID'])

<9e79be3e-9236-4f72-9016-28584981c6a2.solomonxie@aliyun.com>
None


In [44]:
# Demonstration of a failure: the object returned by Parser().parsestr() is a
# legacy email.message.Message, which has no is_attachment() method, so this
# call raises the AttributeError recorded below. The cells further down use
# get_content_disposition() == 'attachment' instead.
_msg.is_attachment()

AttributeError: 'Message' object has no attribute 'is_attachment'

## 了解邮件结构

In [199]:
# Print the MIME tree of the message, one line pair per part.
#
# The traversal is pre-order (a container first, then its children in
# document order), i.e. the same order Message.walk() produces, but each part
# is paired with its real nesting depth. A single counter that is bumped on
# every multipart container and never decremented would mislabel any part
# that follows a nested container (e.g. an attachment after a
# multipart/alternative body).
_pending = [(_msg, 0)]  # stack of (part, nesting depth); the root is depth 0
while _pending:
    _part, _depth = _pending.pop()
    print('\n', f'[嵌套深度：{_depth}]', f'[内容类型：{_part.get_content_type()}]', '\n')
    print('    [', _part.get_content_disposition(), '] ====== ', _part.get('Content-Disposition'))
    if _part.is_multipart():
        # For a multipart part get_payload() is the list of sub-parts.
        # Push them reversed so they are popped in document order.
        _pending.extend((_child, _depth + 1) for _child in reversed(_part.get_payload()))


 [嵌套深度：0] [内容类型：multipart/alternative] 


 [嵌套深度：1] [内容类型：text/plain] 


 [嵌套深度：1] [内容类型：text/html] 



## 打印实体文字型内容

In [200]:
# 这一步最麻烦的是文字解码的问题
# 每个服务商的传输方式和解码方式都不一样，所以要分析判断再解码
# 一般顺序是：
# 传输方式解码 -> Unicode语言解码 > 得到原本的文字
# 这里声明一个函数专门来处理这几个步骤

import quopri

def __decode_text(part):
    """
    Decode the body of a text/* message part into a str.

    Two layers are undone, in this order: the Content-Transfer-Encoding
    (base64 / quoted-printable) and then the character set named in the
    Content-Type header.

    :param part: an email Message, or one sub-part of a multipart message.
        It must be a leaf part (not a multipart container) carrying text.
    :return: the decoded text. Payloads whose transfer encoding is an identity
        encoding (7bit / 8bit / binary), missing, or unrecognised are
        returned exactly as stored in the message.
    :raises UnicodeDecodeError: if the payload bytes do not match the charset.
    :raises LookupError: if the declared charset is unknown to Python.
    """

    # 1. Transfer encoding. The header is optional (RFC 2045 defaults to
    #    7bit) and its value is case-insensitive, so normalise before testing.
    _transfer = part.get('Content-Transfer-Encoding')
    print(_transfer)
    _encoding = str(_transfer or '7bit').strip().lower()

    # 2. Charset. It may appear anywhere in the parameter list
    #    (e.g. 'text/plain; charset=gbk; format=flowed'), quoted or not,
    #    in any letter case; fall back to utf-8 when it is not declared.
    _ctype = str(part.get('Content-Type') or '')
    _result = re.findall(r'charset\s*=\s*"?([\w-]+)"?', _ctype, flags=re.IGNORECASE)
    _charset = _result[0] if _result else 'utf-8'

    print(f'[CType: {_ctype}] [Charset: {_charset}] [Result: {_result}]')

    _raw_text = part.get_payload()

    # 3. Undo the transfer encoding, then decode the bytes with the charset.
    if 'base64' in _encoding:
        return base64.b64decode(_raw_text).decode(_charset)
    if 'quoted-printable' in _encoding:
        return quopri.decodestring(_raw_text).decode(_charset)

    # Identity encodings: the parser already handed the payload over as text.
    return _raw_text

In [207]:
# Collect the readable body of the message, keeping the plain-text and HTML
# renditions apart (they are often two versions of the same content).
#
# Parts are visited in the same pre-order as Message.walk(), but each one is
# paired with its real nesting depth; a counter that only ever increments on
# multipart containers would mislabel parts that follow a nested container.
_text = ''
_html = ''
_pending = [(_msg, 0)]  # stack of (part, nesting depth); the root is depth 0
while _pending:
    _part, _depth = _pending.pop()
    _maintype = _part.get_content_maintype()
    _subtype = _part.get_content_subtype()
    print('\n\n', f'[嵌套深度：{_depth}]', f'[内容类型：{_maintype}/{_subtype}]', '\n\n')

    # Containers carry no content of their own: queue their children
    # (reversed, so they are popped in document order) and move on.
    if _part.is_multipart():
        _pending.extend((_child, _depth + 1) for _child in reversed(_part.get_payload()))
        continue

    # Only text parts (text/plain or text/html) are decoded and stored.
    if _maintype == 'text':
        _content = __decode_text(_part)
        print(_content)

        if _subtype == 'plain':
            _text += _content
        elif _subtype == 'html':
            _html += _content



 [嵌套深度：0] [内容类型：multipart/alternative] 




 [嵌套深度：1] [内容类型：text/plain] 


base64
[CType: text/plain; charset="utf-8"] [Charset: utf-8] [Result: ['utf-8']]
第一次转发

Begin forwarded message:

From: "Coursera" <no-reply@m.mail.coursera.org<mailto:no-reply@m.mail.coursera.org>>
Subject: New courses & degrees! Penn MCIT, IBM Applied Data Science & more
Date: 26 July 2018 at 11:40:44 am GMT+8
To: "Solomon Xie" <solomonxiewise@gmail.com<mailto:solomonxiewise@gmail.com>>




[New courses & degrees. Get a CS Degree from an Ivy League School, no technical background required ...]<https://eventing.coursera.org/redirectSigned/eyJrZXkiOiJlbWFpbC5saW5rLm9wZW4iLCJ2YWx1ZSI6eyJ1cmwiOiJodHRwczovL3d3dy5jb3Vyc2VyYS5vcmc_dXRtX21lZGl1bT1lbWFpbCZ1dG1fc291cmNlPW1hcmtldGluZyZ1dG1fY2FtcGFpZ249X05ZNVlKQmpFZWlsR01GMlF5UWdUdyIsInRyYWNraW5nIjp7InVzZXJJZCI6MzMyMzYxMDQsInVzZXJFbWFpbCI6InNvbG9tb254aWV3aXNlQGdtYWlsLmNvbSIsIm5vdGlmaWNhdGlvblR5cGUiOiJSVUN5TUpCakVlaWxHTUYyUXlRZ1R3IiwiY2FtcGFpZ25JZCI6Im1hcmtld

## 下载所有附件（忽略嵌套）

In [193]:
# 先定义个文件头解码函数 要不然出来都是编码过的文字
def __decode_header(raw):
    """
    Decode an RFC 2047 encoded header value (e.g. '=?utf-8?B?...?=') to str.

    decode_header() splits the value into fragments, each with its own
    charset; every fragment is decoded and the results are concatenated, so
    values that mix encoded words with plain text (an encoded file name
    followed by '.jpg', or a plain prefix before an encoded word) come out
    complete.

    :param raw: the raw header value, or None when the header is absent.
    :return: the decoded text; '' when raw is None.
    """
    if raw is None:
        return ''

    pieces = []
    for content, charset in decode_header(raw):
        # A value without any encoded word comes back as a single str fragment.
        if isinstance(content, str):
            pieces.append(content)
            continue
        try:
            # Unencoded fragments are returned as bytes with charset None.
            pieces.append(content.decode(charset or 'ascii'))
        except (LookupError, UnicodeDecodeError):
            # Unknown or wrong charset label: keep what is readable rather
            # than failing on the whole header.
            pieces.append(content.decode('utf-8', errors='replace'))
    return ''.join(pieces)

In [194]:
# Save every attachment of the message into the sample/ directory.
for _part in _msg.walk():

    # Skip container parts; they hold other parts, not content.
    if _part.is_multipart():
        continue

    # Keep only parts explicitly marked as attachments.
    _disposition = _part.get_content_disposition()
    if _disposition != 'attachment':
        continue

    # The file name comes from the sender, so reduce it to its last path
    # component: a name such as '../../x' must not escape sample/.
    _filename = __decode_header(_part.get_filename())
    _filename = os.path.basename(_filename.replace('\\', '/'))
    if not _filename:
        print('跳过没有文件名的附件')
        continue

    # decode=True undoes whatever Content-Transfer-Encoding the part declares
    # (base64, quoted-printable, ...) instead of assuming base64.
    _payload = _part.get_payload(decode=True)
    with open(f'sample/{_filename}', 'wb') as f:
        f.write(_payload)

    print(f'已下载附件：{_filename}')

已下载附件：(地心历险记2_BD.rmvb)[00.04.18.569].jpg
已下载附件：[G.I.A.N.T][darli-fra][13][720p][CHS]-0001.jpg


## Prettified Original Mail Structure

In [128]:
# Show the hand-prettified Sina sample. The file contains non-ASCII text, so
# the encoding is given explicitly instead of relying on the platform default.
with open('sample/raw_prettify_sina.eml', 'r', encoding='utf-8') as f:
    print(f.read())

X-Mda-Received: from <m0.mail.sina.cn>([<10.41.14.99>])
 by <mda-14-117.mda.fmail.dbl.sinanode.com> with LMTP id <61657>
 Jul 25 2018 17:13:47 +0800 (CST)

X-Sina-MID:048C4D2E2B86A0144521F7F2A9B632DA3B00000000000006

X-Sina-Attnum:0


Date: Wed, 25 Jul 2018 17:13:47 +0800 
From: =?UTF-8?B?5paw5rWq6YKu566x5Zui6Zif?= <webcn@staff.sina.com.cn>
Subject: =?UTF-8?B?5bCP5b+D77yB5paw5rWq6YKu566x5o+Q6YaS5oKo6Ziy6IyD5qy66K+I6YKu5Lu2?=
To: xie_xiaobo_vip@sina.com


Mime-Version: 1.0

Content-type: text/html; charset=utf-8

Content-transfer-encoding: 8BIT

X-Mailer: SinaMail 3.0

-- <以下是完整的HTML代码> --

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
    <省略...>
</html>



In [127]:
# Show the hand-prettified Aliyun sample; explicit encoding so the result does
# not depend on the platform default.
with open('sample/raw_prettify_aliyun.eml', 'r', encoding='utf-8') as f:
    print(f.read())

X-Alimail-AntiSpam:AC = PASS;
BC = -1|-1;
BR = 01201311R521e2;
CH = green;
FP = 0|-1|-1|-1|0|-1|-1|-1;
HT = e01f04452;
MF = solomonxie@aliyun.com;
NM = 1;
PH = DW;
RN = 1;
RT = 1;
SR = 0;
TI = W4_5305839_DEFAULT_0AC264D3_1532690071843_o7001c65i;
Received: from WS-web (solomonxie@aliyun.com[W4_5305839_DEFAULT_0AC264D3_1532690071843_o7001c65i]) by e01e07484.eu6 at Fri, 27 Jul 2018 19:14:31 +0800 



Date: Fri, 27 Jul 2018 19:14:31 +0800 
From: "solomonxie" 
Return-Path: "solomonxie" 
To: "Solomon Xie" 
Reply-To: "solomonxie" 
Message-ID: <9e79be3e-9236-4f72-9016-28584981c6a2.solomonxie@aliyun.com> 
Subject:  = ?UTF-8?B?Rnc6IOS4reWbveW3peWVhumTtuihjOWuouaIt+Wvuei0puWNlShJQ0JDIFBlb255IENhcmQg? =   = ?UTF-8?B?QmFuayBTdGF0ZW1lbnQp? =  



X-Mailer: [Alimail-Mailagent revision 7][W4_5305839][DEFAULT][Chrome] 
MIME-Version: 1.0 
Return-Path: 
References: <1032333284.502091.1530429207702.JavaMail.wasup@pdccsumspapp077>,<28936236-c12b-49ed-941a-8a7d67347998.solomonxie@aliyun.com>, 
In-Reply-To: 

# 根据日期发送邮件

In [22]:
import csv
import os
import json
import yagmail

# Load content: the first row of the review list supplies the mail subject
# (column index 4) and the path of the file holding the body (column index 5).
# csv.reader is used instead of splitting on ',' and '\n' by hand so quoted
# fields and CRLF line endings are parsed correctly.
with open('sample/review_list.csv', 'r', newline='', encoding='utf-8') as f:
    file_list = list(csv.reader(f))

title = file_list[0][4]
content_path = file_list[0][5]

with open(content_path, 'r', encoding='utf-8') as f:
    content = f.read()


# Load the sender/server settings. The settings file may be a symlink;
# realpath() resolves it to the real file. (os.readlink() returns the target
# relative to the link's own directory, which may not open from the CWD.)
servers_path = os.path.realpath('.local/email-servers.json')
with open(servers_path, 'r', encoding='utf-8') as f:
    servers = json.load(f)

# Choose the account the mail is sent from (first configured sender).
_sender = servers['senders'][0]


# Log in and send the mail; always close the SMTP connection afterwards,
# including when send() raises.
yag = yagmail.SMTP(_sender['email'], _sender['password'], host=_sender['server'])
contents = [
    content
]
try:
    yag.send('solomonxie@outlook.com', title, contents)
finally:
    yag.close()

print(title, '[OK]')

[OK]
