In [1]:
import pandas as pd
import email
from email.parser import HeaderParser
from urlextract import URLExtract
import base64


def get_email_as_dict(msg):
    
    extractor = URLExtract()

    parser = email.parser.HeaderParser()
    headers = parser.parsestr(msg.as_string())

    email_headers = {}
    all_urls = []
    url_count = 1
    for h in headers.items():
        urls = extractor.find_urls(h[1])
        all_urls = all_urls + urls
        #base64 decode
        if isBase64(h[1]):
            try:
                b64_h1= base64.b64decode(h[1]).decode("utf-8")
                urls = extractor.find_urls(str(b64_h1))
                all_urls = all_urls + urls

            except Exception as e:
                b64_h1 = ""
        email_headers[h[0]] = [h[1]]
    
    body = parse_body(msg)
    attchments = get_attachments(msg)
    str_body = str(body)

    urls = extractor.find_urls(str(body))
    all_urls = all_urls + urls
    dict_row = get_as_row(all_urls)
    email_headers.update(dict_row)
    if attchments:
        email_headers.update(attchments)
    
    return email_headers, str_body



In [2]:
def get_email_as_df(msg):
    email_dict, body = get_email_as_dict(msg)
    email_df = pd.DataFrame.from_dict(email_dict)
    return email_df,body
    

In [3]:
def isBase64(sb):
    try:
        if isinstance(sb, str):
                # If there's any unicode here, an exception will be thrown and the function will return false
                sb_bytes = bytes(sb, 'ascii')
        elif isinstance(sb, bytes):
                sb_bytes = sb
        else:
                raise ValueError("Argument must be string or bytes")
        return base64.b64encode(base64.b64decode(sb_bytes)) == sb_bytes
    except Exception:
                return False

In [4]:
def get_as_row(list_urls):
    row = {}
    for index,url in enumerate(list_urls):
        row["url_" + str(index+1)] = url
    return row

In [5]:
# Reference: 
# https://stackoverflow.com/questions/17874360/python-how-to-parse-the-body-from-a-raw-email-given-that-raw-email-does-not
#body = ""
def parse_body(email_msg):
    body = ""  
    for part in email_msg.walk():
        ctype = part.get_content_type()
        cdispo = str(part.get('Content-Disposition'))
        if ctype == 'text/plain' and 'attachment' not in cdispo:
            body = part.get_payload(decode=True)  
        else:
            body = email_msg.get_payload(decode=True)
    return body

In [6]:
from bs4 import BeautifulSoup
from bs4.element import Comment

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)

In [7]:
from scipy.stats import entropy
import pickle

def get_attachments(email_msg):
    attachments={}
    if email_msg == None:
        return ""
    attachment_count = 0
    for part in email_msg.walk():
        content_dis = part.get_content_disposition()
        if content_dis == "attachment":
            att_filename = part.get_filename()
            payload = bytearray(part.get_payload(decode=True))
            att_entropy= entropy(payload,base=2)
            att_size = len(payload)
            prefix = "attachment" + str(attachment_count)
            attachment = {}
            attachment[prefix + "_filename"] = att_filename
            attachment[prefix + "_entropy"] = att_entropy
            attachment[prefix + "_size"] = att_size
            attachments.update(attachment)
            attachment_count+=1
    attachment_count = {}
    attachment_count["attachment_count"] = len(attachments)//3
    attachments.update(attachment_count)
    return attachments


In [8]:
import os
data_path = "/Users/ravis/Library/CloudStorage/OneDrive-SouthernMethodistUniversity/CapstoneA/Data/phishingdata/traindata"
sub_folders = [x[0] for x in os.walk(data_path) if x[0] != data_path]
metadata_df = pd.DataFrame()
body_list = []
for folder in sub_folders:
    files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]
    for file in files:
        with open(f"{folder}/{file}", encoding="latin1") as f:
            f_realpath = os.path.realpath(f.name)
            is_spam = "spam" in f_realpath
            x = email.message_from_file(f)
            try:
                email_df, body = get_email_as_df(x)
                attachments = get_attachments(x)
                body_text = text_from_html(body[2:-1]).replace("\\n","").replace("\\t","")
                body_list.append([body_text, attachments['attachment_count'],1 if is_spam is True else 0])
                if len(metadata_df) == 0:
                    metadata_df = email_df
                else:
                    metadata_df = pd.concat([metadata_df,email_df], ignore_index=True)
            except Exception as e:
                print(e)
                continue
                
body_df = pd.DataFrame(body_list, columns = ["body", "attachment_count","target"])             

If using all scalar values, you must pass an index
If using all scalar values, you must pass an index
'utf-8' codec can't decode byte 0xda in position 15: invalid continuation byte
'utf-8' codec can't decode byte 0xda in position 15: invalid continuation byte
If using all scalar values, you must pass an index
'utf-8' codec can't decode byte 0xda in position 15: invalid continuation byte




If using all scalar values, you must pass an index
If using all scalar values, you must pass an index
cannot convert 'NoneType' object to bytearray


In [9]:
metadata_df

Unnamed: 0,Return-Path,Delivered-To,Received,Message-Id,To,From,Subject,Date,MIME-Version,Content-Type,...,X-Best-Window-Manager,X-Designation,X-Location,X-Uptime,X-Gnupg-Keyid,X-Gnupg-Fingerprint,X-Scanner,X-PGP-Fingerprint,X-Kernel-Version,X-Editor
0,<pamela4701@eudoramail.com>,zzzz@localhost.spamassassin.taint.org,from 210.214.94.76 (unverified) by mailsweeper...,<00005cd5540a$00004a9b$00007fa8@mx1.eudoramail...,<Undisclosed.Recipients@smtp-ft1.fr.colt.net>,pamela4701@eudoramail.com,Let us find the right mortgage lender for you ...,"Mon, 09 Sep 2002 14:36:18 -0700",1.0,"text/plain; charset=""Windows-1252""",...,,,,,,,,,,
1,<102192086381143-17090200005-example.com?zzzz@...,zzzz@localhost.jmason.org,from sonic1.tilw.net (sonic1.tilw.net [209.164...,<17090200005$102192086381143$1159552220$0@soni...,zzzz@example.com,CopyYourDVD <atomica2020@hotmail.com>,"Friend, Copy ANY DVD or Playstation Game with ...","Tue, 17 Sep 2002 09:15:32 PST",1.0,"multipart/alternative; boundary=""------------1...",...,,,,,,,,,,
2,<sh@insiq.us>,zzzz@localhost.jmason.org,from mail pickup service by mail1.insuranceiq....,<a3edf01c26032$aece4460$6b01a8c0@insuranceiq.com>,<zzzz@jmason.org>,"""IQ - Safe Harbor"" <sh@insiq.us>",5% Guaranteed for Eight Years,"Thu, 19 Sep 2002 19:17:11 -0400",1.0,"multipart/alternative; boundary=""----=_NextPar...",...,,,,,,,,,,
3,<OWNER-NOLIST-SGODAILY*JM**NETNOTEINC*-COM@SMT...,zzzz@localhost.jmason.org,from TIPUTIL2 (tiputil2.corp.tiprelease.com) b...,<200209252259.XAA28260@webnote.net>,JM@NETNOTEINC.COM,Customer Service <greatoffers@sendgreatoffers....,Congratulations! You Get a Free Handheld Organ...,"Wed, 25 Sep 2002 17:23:03 -0500",1.0,"text/html; charset=""us-ascii""",...,,,,,,,,,,
4,<apf@wu-wien.ac.at>,zzzz@localhost.spamassassin.taint.org,from mx.univie.ac.at (193.67.157.75 [193.67.15...,<200209191031.LAA01417@webnote.net>,<C:`Bulk.AdzNortonNorton.txt@webnote.net>,"""don"" <apf@wu-wien.ac.at>",PROTECT YOUR INFORMATION AND YOUR COMPUTER,"Thu, 19 Sep 2002 05:20:18 -0500",1.0,"text/plain; charset=""Windows-1252""",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9715,<razor-users-admin@example.sourceforge.net>,yyyy@localhost.netnoteinc.com,from med-core07.med.wayne.edu ([146.9.19.23]) ...,<D79A56AD131896448D0860DEE07CBE1FE15A@med-core...,"""Jordan Ritter"" <jpr5@darkridge.com>,\n\t<razo...","""Rose, Bobby"" <brose@med.wayne.edu>",RE: [Razor-users] What's wrong with the Razor ...,"Thu, 8 Aug 2002 17:17:05 -0400",1.0,"text/plain; charset=""us-ascii""",...,,,,,,,,,,
9716,<fork-admin@xent.com>,fork@spamassassin.taint.org,"by argote.ch (Postfix, from userid 500) id 9B3...",<20020802221113.9B346C44E@argote.ch>,fork@spamassassin.taint.org,harley@argote.ch (Robert Harley),Re: W3C approves HTML 4 'emotitags' [...],"Sat, 3 Aug 2002 00:11:13 +0200 (CEST)",,,...,,,,,,,,,,
9717,<rpm-zzzlist-admin@freshrpms.net>,yyyy@localhost.netnoteinc.com,from stumpy.se7en.org ([10.0.0.5]\n ident=[...,<1027281818.12983.3.camel@localhost.localdomain>,rpm-zzzlist@freshrpms.net,Mark Derricutt <mark@talios.com>,Re: Ximian apt repos?,22 Jul 2002 08:03:32 +1200,1.0,text/plain,...,,,,,,,,,,
9718,<rpm-zzzlist-admin@freshrpms.net>,yyyy@localhost.netnoteinc.com,from python (80-24-132-206.uc.nombres.ttd.es [...,<20020730225237.564ca6f8.matthias@egwn.net>,rpm-zzzlist@freshrpms.net,Matthias Saou <matthias@egwn.net>,Re: Installing RPM,"Tue, 30 Jul 2002 22:52:37 +0200",1.0,text/plain; charset=US-ASCII,...,,,,,,,,,,


In [15]:
metadata_df.to_csv("~/Downloads/metadata.csv")

In [14]:
body_df.body.to_csv("~/Downloads/newbody.csv")


In [12]:
mytext = "this contains ham"

is_spam = "spam" in mytext

print(1 if is_spam is True else 0)

0


In [13]:
body_df

Unnamed: 0,body,attachment_count,target
0,,0,1
1,,0,1
2,,0,1
3,"Dear Friend, I have your Personal D...",0,1
4,Don\'t fall prey to destructive viruses or hac...,0,1
...,...,...,...
9715,,0,0
9716,,0,0
9717,http://lists.freshrpms.net/mailman/listinfo/rp...,0,0
9718,and the related links they> give but they all ...,0,0
