# 데이터 셋 내려받기

In [3]:
import tarfile
from pathlib import Path
import urllib.request

def fetch_spam_data():
    spam_root="http://spamassassin.apache.org/old/publiccorpus/"
    ham_url=spam_root+"20030228_easy_ham.tar.bz2"
    spam_url=spam_root+"20030228_spam.tar.bz2"
    
    spam_path=Path()/"datasets"/"spam"
    spam_path.mkdir(parents=True, exist_ok=True)
    for dir_name, tar_name, url in (("easy_ham","ham",ham_url),
                                   ("spam","spam",spam_url)):
        if not (spam_path/dir_name).is_dir():
            path=(spam_path/tar_name).with_suffix(".tar.bz2")
            print("Downloading",path)
            urllib.request.urlretrieve(url, path)
            tar_bz2_file=tarfile.open(path)
            tar_bz2_file.extractall(path=spam_path)
            tar_bz2_file.close()
    return [spam_path/dir_name for dir_name in ("easy_ham","spam")]

In [4]:
ham_dir, spam_dir = fetch_spam_data()

In [5]:
#모든 이메일 읽어들이기
ham_filenames=[f for f in sorted(ham_dir.iterdir()) if len(f.name)>20]
spam_filenames=[f for f in sorted(spam_dir.iterdir()) if len(f.name)>20]

In [6]:
len(ham_filenames)

2500

In [7]:
len(spam_filenames)

500

In [8]:
#list에 PosixPath가 담겨있는 형태
ham_filenames[:5]

[PosixPath('datasets/spam/easy_ham/00001.7c53336b37003a9286aba55d2945844c'),
 PosixPath('datasets/spam/easy_ham/00002.9c4069e25e1ef370c078db7ee85ff9ac'),
 PosixPath('datasets/spam/easy_ham/00003.860e3c3cee1b42ead714c5c874fe25f7'),
 PosixPath('datasets/spam/easy_ham/00004.864220c5b6930b209cc287c361c99af1'),
 PosixPath('datasets/spam/easy_ham/00005.bf27cdeaf0b8c4647ecd61b1d09da613')]

In [9]:
#모듈을 사용해 이메일 파싱하기
import email
import email.policy

def load_email(filepath):
    with open(filepath, "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [10]:
ham_emails=[load_email(filepath) for filepath in ham_filenames]
spam_emails=[load_email(filepath) for filepath in spam_filenames]

In [11]:
#list에 각 이메일이 저장되어 있음
ham_emails[:5]

[<email.message.EmailMessage at 0x7f95c80f0ca0>,
 <email.message.EmailMessage at 0x7f95c80f2280>,
 <email.message.EmailMessage at 0x7f95c80f2a00>,
 <email.message.EmailMessage at 0x7f95c80f2d60>,
 <email.message.EmailMessage at 0x7f95c80f2700>]

In [12]:
#email 내용 오픈을 위해선 get_contnt 함수 사용 필수
print(ham_emails[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


어떤 이메일은 이미지나 첨부파일을 가진 멀티파트로 구성

In [13]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload=email.get_payload()
    if isinstance(payload, list):
        multipart=",".join([get_email_structure(sub_email)
                           for sub_email in payload])
        return f"multipart({multipart})"
    else:
        return email.get_content_type()

In [14]:
from collections import Counter

def structures_counter(emails):
    structures=Counter()
    for email in emails:
        structure=get_email_structure(email)
        structures[structure]+=1
    return structures

In [15]:
structures_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain,application/pgp-signature)', 66),
 ('multipart(text/plain,text/html)', 8),
 ('multipart(text/plain,text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain,application/octet-stream)', 2),
 ('multipart(text/plain,text/enriched)', 1),
 ('multipart(text/plain,application/ms-tnef,text/plain)', 1),
 ('multipart(multipart(text/plain,text/plain,text/plain),application/pgp-signature)',
  1),
 ('multipart(text/plain,video/mng)', 1),
 ('multipart(text/plain,multipart(text/plain))', 1),
 ('multipart(text/plain,application/x-pkcs7-signature)', 1),
 ('multipart(text/plain,multipart(text/plain,text/plain),text/rfc822-headers)',
  1),
 ('multipart(text/plain,multipart(text/plain,text/plain),multipart(multipart(text/plain,application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain,application/x-java-applet)', 1)]

In [16]:
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain,text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain,image/jpeg)', 3),
 ('multipart(text/html,application/octet-stream)', 2),
 ('multipart(text/plain,application/octet-stream)', 1),
 ('multipart(text/html,text/plain)', 1),
 ('multipart(multipart(text/html),application/octet-stream,image/jpeg)', 1),
 ('multipart(multipart(text/plain,text/html),image/gif)', 1),
 ('multipart/alternative', 1)]

각각 구성 내용을 살펴보자면 햄 이메일들은 대부분 plain text이고 스팸 메일들은 html이 차지하는 비중이 큼

햄 이메일들은 서명을 포함하는 경우가 종종 있음

In [17]:
#이메일 헤더 살펴보기
for header, value in spam_emails[0].items():
    print(header, ":", value)

Return-Path : <12a1mailbot1@web.de>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From : 12a1mailbot1@web.de
Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To : dcek1a1@netsgo.com
Subject : Life Insurance - Why Pay More?
Date : Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version : 1.0
Message-ID : <0103c1042001882DD_IT7@dd_it7>
Content-Type : text/html; charset="iso-8859-1"
Content-Transfer-Encoding : qu

In [18]:
spam_emails[0]["Subject"]

'Life Insurance - Why Pay More?'

# 훈련 세트와 테스트 세트로 분리하기

In [19]:
import pandas as pd
import numpy as np

data_set=np.array(ham_emails+spam_emails,dtype=object)
data_label=np.array([0]*len(ham_emails)+[1]*len(spam_emails))

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test=train_test_split(data_set, data_label,test_size=0.1,random_state=42)

전처리 함수 작성: HTML을 일반 텍스트로 변환하는 함수 
- head 섹션을 삭제하고 모든 a 태그를 HYPERLINK 문자로 바꿈
- 모든 HTML 태그를 제거하고 텍스트만 남김
- 보기 편하게 여러개의 개행 문자를 하나로 만들고 html 엔티티를 복원

In [21]:
import re
from html import unescape

def html_to_plain_text(html):
    text=re.sub('<head.*?>.*</head>','', html, flags=re.M|re.S|re.I)
    text=re.sub('<a\s.*?>','HYPERLINK', text, flags=re.M|re.S|re.I)
    text=re.sub('<.*?>','', text, flags=re.M|re.S)
    text=re.sub(r'(\s*\n)+','\n', text, flags=re.M|re.S)
    return unescape(text)

In [22]:
html_spam_emails=[email for email in X_train[y_train==1]
                 if get_email_structure(email)=="text/html"]

In [23]:
sample_html_spam=html_spam_emails[5]
print(sample_html_spam.get_content().strip()[:1000],"...")

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML><HEAD>
<META http-equiv=Content-Type content="text/html; charset=windows-1252">
<META content="MSHTML 6.00.2716.2200" name=GENERATOR></HEAD>
<BODY>
<CENTER>
<TABLE borderColor=black cellSpacing=0 cellPadding=10 border=1>
  <TBODY>
  <TR>
    <TD><FONT size=2><A
      href="http://theadmanager.com/server/c.asp?ad_key=BBBIJJJNSUMV&ext=1"
      target=""><IMG height=71 alt=""
      src="http://www.findanopportunity.com/fgoffer/htmlemail2/top_01.gif"
      width=549 border=0><BR><IMG height=73 alt=""
      src="http://www.findanopportunity.com/fgoffer/htmlemail2/top_02.gif"
      width=549 border=0><BR><IMG height=115 alt=""
      src="http://www.findanopportunity.com/fgoffer/htmlemail2/collage_01.jpg"
      width=188 border=0><IMG height=115 alt=""
      src="http://www.findanopportunity.com/fgoffer/htmlemail2/collage_02.jpg"
      width=171 border=0><IMG height=115 alt=""
      src="http://www.findanopportunity.com/fgof

In [24]:
print(html_to_plain_text(sample_html_spam.get_content())[:1000],"...")


    HYPERLINK
      You've been hand selected to access this exclusive
      work-at-home information for HYPERLINKFREE!
      HYPERLINK
You are receiving this mailing because you are a
member of SendGreatOffers.com and subscribed as:JM@NETNOTEINC.COM
To unsubscribe HYPERLINK
Click Here
(http://admanmail.com/subscription.asp?em=JM@NETNOTEINC.COM&l=SGO)
or reply to this email with REMOVE in the subject line - you must
also include the body of this message to be unsubscribed. Any correspondence about
the products/services should be directed to
the company in the ad.
%EM%JM@NETNOTEINC.COM%/EM%
 ...


In [25]:
def email_to_text(email):
    html=None
    
    #walk: 메세지 객체 트리의 모든 파트와 서브 파트를 이터레이트하는데 사용되는 범용 제너레이터
    for part in email.walk(): 
        ctype=part.get_content_type()
        if not ctype in ("text/plain","text/html"):
            continue
        try:
            content=part.get_contnet()
        except: #encoding issue 발생시
            content=str(part.get_payload())
        if ctype=="text/plain":
            return content
        else:
            html=content
    if html:
        return html_to_plain_text(html)

In [26]:
print(email_to_text(sample_html_spam)[:100],"...")


    HYPERLINK
      You've been hand selected to access this exclusive
      work-at-home informati ...


In [27]:
#자연어 처리 툴킷 NLTK를 이용해 어간 추출
import nltk

stemmer=nltk.PorterStemmer()
for word in ("Computations","Computation","Computing","Computed","Compute",
            "Compulsive"):
    print(word, "=>", stemmer.stem(word))

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


In [28]:
%pip install --upgrade pip

[0mNote: you may need to restart the kernel to use updated packages.


In [29]:
%pip install -q -U urlextract

[0mNote: you may need to restart the kernel to use updated packages.


In [30]:
#urlextract 라이브러리를 사용해 인터넷 주소를 "URL" 문자로 바꾸기
import urlextract

url_extractor=urlextract.URLExtract()
some_text="Will it detect github.com and https://youtu.be"
print(url_extractor.find_urls(some_text))

['github.com', 'https://youtu.be']


In [32]:
#위 작업을 하나의 변환기로 연결하여 이메일을 단어 카운트로 바꾸기
from skleanr.base import BaseEstimator, TransformerMixin

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_header=True, lower_case=True,
                remove_punctuations=True, replace_urls=True,
                replace_numbers=True, stemming=True):
        self.strip_headers=strip_headers
        self.lower_case=lower_case
        self.remove_punctuation=remove_pucnctuations
        self.replace_urls=replace_urls
        self.replace_numbers=replace_numbers
        self.stemming=stemming

    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed=[]
        for email in X:
            text=eamil_to_text(email) or ""
            if self.lower_case:
                text=text.lowe()
            if self.replace_urls and url_extractor is not None:
                urls=list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url:len(url), reverse=True)
                for url in urls:
                    text=text.replace(url," URL ")
            if self.replace_numbers:
                text=re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?','NUMBER',text)
            if self.remove_punctuation:
                text=re.sub(r'\W+','', text,flags=re.M)
            word_count=Counter(text.split())

SyntaxError: invalid syntax (1579096085.py, line 28)