-
Notifications
You must be signed in to change notification settings - Fork 4
/
EMLParser.py
162 lines (128 loc) · 3.9 KB
/
EMLParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
import re
import sys
import os
import email
import mail
import json
import hashlib
from bs4 import BeautifulSoup
from lxml.html import tostring, html5parser
from HTMLParser import HTMLParser
class Parser:
    """Marker class whose constructor only announces that it was called.

    The constructor was previously spelled ``__init``; Python name-mangles
    that to ``_Parser__init``, so it never ran when ``Parser()`` was
    instantiated. It is spelled ``__init__`` here so the message is actually
    printed on construction.
    """

    def __init__(self):
        """Print a trace message when a Parser is created."""
        print("Parser Class Called")
def header():
    """Copy the header lines of the e-mail file named by ``sys.argv[1]``.

    The input is read line by line until the first empty line, which ends
    the header section. Every ``Name: value`` line is echoed to stdout and
    written to ``<input path>.txt``; that report file is created or
    truncated and starts with a "HEADER PART" banner.

    Lines that begin with a space, tab or carriage return (folded
    continuation lines) are skipped, as before. A line without a colon stops
    the scan after printing a short diagnostic.

    Returns:
        dict: header name -> raw value text (everything after the first
        colon, including the trailing newline). A repeated header name
        keeps only its last value. The dict holds whatever was parsed
        before a failure, and may be empty.

    Errors (missing argument, unreadable file, ...) are printed rather than
    raised, so the script's later steps still get a chance to run.
    """
    headers = {}
    try:
        source_path = sys.argv[1]
        with open(source_path, 'r') as eml:
            lines = eml.readlines()
        with open(source_path + ".txt", "w") as report:
            print("\n---------------HEADER PART------------------\n")
            report.write("-------------------HEADER PART--------------------\n")
            for line in lines:
                print(line)
                # An empty line (LF or CRLF terminated) ends the headers.
                # The old test `len(line) == '\n'` compared an int with a
                # str and could never be true.
                if line.rstrip('\r\n') == '':
                    break
                # Folded continuation lines are not copied to the report.
                if line[0] in ' \t\r':
                    continue
                hs = line.split(':', 1)
                if len(hs) != 2:
                    # Not a "Name: value" line: show it and stop scanning.
                    print(hs, ord(line[0]))
                    break
                headers[hs[0]] = hs[1]
                report.write(hs[0] + ':' + hs[1])
    except Exception as e:
        # Deliberately best-effort: report the problem and carry on.
        print(str(e))
    return headers
def _html_to_text(markup):
    """Return the visible text of ``markup`` with blank chunks removed.

    The markup is parsed with BeautifulSoup (lxml backend), ``<script>`` and
    ``<style>`` elements are dropped, and the remaining text is split into
    stripped, non-empty chunks joined by newlines.
    """
    soup = BeautifulSoup(markup, "lxml")
    # Script and style elements contribute no readable text.
    for element in soup(["script", "style"]):
        element.extract()
    text = soup.get_text()
    stripped_lines = (line.strip() for line in text.splitlines())
    # NOTE(review): splitting on a single space puts every word on its own
    # line; the common form of this idiom splits on two spaces. Kept as
    # found -- confirm the intended separator.
    chunks = (phrase.strip() for line in stripped_lines for phrase in line.split(" "))
    return '\n'.join(chunk for chunk in chunks if chunk)


def body():
    """Append the text body of the e-mail in ``sys.argv[1]`` to its report.

    The message is parsed with ``email.message_from_string`` and the result
    is appended to ``<input path>.txt`` under a "BODY PART" banner:

    * multipart message: the parts are walked in order and the first
      ``text/plain`` or ``text/html`` part found is written (plain text as
      is, HTML reduced to text), then the walk stops;
    * non-multipart message: the decoded payload is always run through the
      HTML-to-text reduction before being written.

    Both files are now closed on every path (the input handle used to leak
    and the report stayed open after an error), and the input is opened
    read-only instead of ``'r+'``. Errors are printed, not raised.
    """
    try:
        source_path = sys.argv[1]
        with open(source_path, 'r') as eml:
            message = email.message_from_string(eml.read())
        with open(source_path + ".txt", "a") as report:
            print("\n----------------BODY PART--------------\n")
            report.write('\n' + "----------------BODY PART-----------------" + '\n')
            if message.is_multipart():
                for part in message.walk():
                    ctype = part.get_content_type()
                    print("ctype=", ctype)
                    if ctype == 'text/plain':
                        payload = part.get_payload(decode=True)  # undo base64/QP
                        print(payload)
                        print("Plain Text Loop")
                        report.write(payload)
                        report.write('\n')
                        break
                    elif ctype == 'text/html':
                        print("HTML Text Loop1")
                        text = _html_to_text(part.get_payload(decode=True))
                        print(text)
                        report.write(text.encode('utf8'))
                        break
            else:
                text = _html_to_text(message.get_payload(decode=True))
                print(text)
                report.write(text.encode('utf8'))
                print("NOT MULTIPART")
    except Exception as e:
        # Deliberately best-effort: report the problem and carry on.
        print(str(e))
def attachment():
    """Save the attachments of the e-mail in ``sys.argv[1]`` to ./attachments.

    Every non-multipart part that has a ``Content-Disposition`` header and a
    filename is decoded and written to ``./attachments/<filename>``. The
    directory is created when missing, and a file that already exists is
    left untouched.

    Changes from the previous version:

    * only the final path component of the filename is used, so a name such
      as ``../../x`` taken from the message cannot write outside
      ``./attachments``;
    * files are opened in ``'wb'`` (the old mode string ``'wa'`` is not a
      valid mode) and receive exactly the decoded payload -- the text
      banner that used to be appended to each saved file is gone, since it
      altered the attachment's contents;
    * parts whose payload cannot be decoded (``get_payload`` returns None)
      are skipped instead of aborting the remaining attachments.

    Errors are printed, not raised.
    """
    try:
        with open(sys.argv[1], 'r') as eml:
            message = email.message_from_string(eml.read())
        print("\n-------ATTACHMENT PART-------\n")
        detach_dir = '.'
        target_dir = os.path.join(detach_dir, 'attachments')
        if not os.path.isdir(target_dir):
            os.mkdir(target_dir)
        for part in message.walk():
            if part.get_content_maintype() == 'multipart':
                continue  # containers carry no payload of their own
            if part.get('Content-Disposition') is None:
                continue
            fileName = part.get_filename()
            if not fileName:
                continue
            # The name comes from the message, i.e. untrusted input: drop any
            # directory components (either slash style) before using it.
            safe_name = os.path.basename(fileName.replace('\\', '/'))
            if not safe_name or safe_name in ('.', '..'):
                continue
            payload = part.get_payload(decode=True)
            if payload is None:
                continue
            filePath = os.path.join(target_dir, safe_name)
            if os.path.isfile(filePath):
                continue  # never overwrite an existing file
            print(safe_name)
            with open(filePath, 'wb') as fp:
                fp.write(payload)
            # Turkish status message: "<name> has been written to file."
            print(safe_name + ' Dosyaya Yazilmistir.')
    except Exception as detail:
        # Deliberately best-effort: report the problem and carry on.
        print(detail)
if __name__ == "__main__":
    # Every step reads the message path from sys.argv[1]. Without it each
    # step would just print "list index out of range", so fail early with a
    # usage line instead.
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s <message.eml>\n" % sys.argv[0])
        sys.exit(2)
    # Order matters: header() creates/truncates "<path>.txt" and body()
    # appends to it.
    header()
    body()
    attachment()