In [1]:
# 스팸 분류기를 만들어 보세요.

In [14]:
# pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import tarfile
import urllib
import requests
from bs4 import BeautifulSoup
import re
import shutil
from tempfile import mkdtemp
from sklearn.datasets import load_files
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, cross_val_predict
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import html2text
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier, LogisticRegression, RidgeClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import time
# nltk.download()


In [150]:
"""

# 데이터셋 다운로드

download_root = "https://spamassassin.apache.org/old/publiccorpus/"  # 파일이 위치해있는 URL

response = requests.get(download_root)    # requests 를 통한 URL 에 있는 'a' tag 추출
soup = BeautifulSoup(response.text, 'html.parser') 
tags = soup.select('a')

for i, tag in enumerate(tags):    # 'a' 태그 텍스트 추출
    tags[i] = tag.get_text()
    
# 정규표현식을 활용한 파일 이름 추출

regex = re.compile(r"^[0-9]{8}.*")
file_names = []

for i in range(len(tags)):
    result = regex.search(tags[i])
    if result:
        file_names.append(result.group())
                
def fetch_mail_data(file_names=file_names, download_root=download_root):
    os.makedirs("datasets/hams", exist_ok=True)
    os.makedirs('datasets/spams', exist_ok=True)
    ham_number, spam_number = (0, 0)
    
    for name in file_names:
        file_url = download_root + name
        bz2_path = os.path.join('datasets', name)
        urllib.request.urlretrieve(file_url, bz2_path)
            
        with tarfile.open(bz2_path, 'r') as tar:
            tar.extractall(path='datasets')    # datasets 디렉토리에 바로 압축 해제
        
        middle = re.search('_(\w+).tar.bz2', name).group(1)    
        # 압축 해제시, 20021010_hard_ham.tar.bz 를 예시로 들면, hard_ham 디렉토리가 생성되고, 그 안에 데이터가 존재함.
        # 데이터를 꺼내 hams or spams 디렉토리에 넣기 위해, hard_ham 같은 middle 문자열을 추출
        
        files = os.listdir(os.path.join('datasets', middle))
            
        if 'ham' in name:
            for file in files:
                shutil.move(os.path.join('datasets', middle, file), os.path.join('datasets', 'hams', file))
                os.rename(os.path.join('datasets', 'hams', file), os.path.join('datasets', 'hams', str(ham_number) + '.txt'))
                ham_number += 1
        else:
            for file in files:
                shutil.move(os.path.join('datasets', middle, file), os.path.join('datasets', 'spams', file))
                os.rename(os.path.join('datasets', 'spams', file), os.path.join('datasets', 'spams', str(spam_number) + '.txt'))
                spam_number += 1
                 
        shutil.rmtree(os.path.join('datasets', middle))
        
"""

In [151]:
# fetch_mail_data(file_names)

In [148]:
# shutil.rmtree('datasets')

In [2]:
# Start a wall-clock timer; the elapsed time (end - start) is printed after the
# final evaluation further down the notebook.
start = time.time()

# Load every '.txt' file found under the local 'datasets' directory.
# NOTE(review): presumably the 'hams' / 'spams' sub-folders produced by the
# disabled download cell above supply the class labels -- confirm on disk.
data = load_files('datasets', allowed_extensions=['.txt'])

In [3]:
# Decode every raw e-mail (bytes) to text. ISO-8859-1 assigns a character to
# each possible byte value, so decoding cannot raise.
# The texts are stored in an object array of Python str on purpose: a
# fixed-width '<U' NumPy array (what np.char.decode returns) silently truncates
# any longer string that is later assigned into one of its slots, and it
# reserves the length of the longest e-mail for every row.
X = np.array([raw.decode('ISO-8859-1') for raw in data["data"]], dtype=object)
y = data["target"]

In [85]:
# X, y = np.char.decode(np.array(data["data"])[:2000], encoding='ISO-8859-1'), data["target"][:2000]

In [4]:
# Split into training and test data (stratified, 20 % held out for testing).
# random_state is fixed so that the split -- and therefore every score reported
# below -- is reproducible after "Restart kernel & Run all".
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(X, y))

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

In [5]:
# Remove the e-mail header

class remove_header(BaseEstimator):
    """Drop the header block of every e-mail.

    The header is everything before the first blank line ("\\n\\n"). The blank
    line itself and the body that follows are kept.

    Parameters
    ----------
    perform : bool, default True
        When False, ``transform`` returns its input unchanged.
    """

    def __init__(self, perform = True):
        self.perform = perform

    def fit(self, X, y=None):
        """Nothing to learn; present for the estimator API. Returns self."""
        return self

    def transform(self, X):
        """Return a new object array with the header removed from each text.

        ``X`` itself is left untouched, so calling ``transform`` again on the
        same raw data gives the same result. Building a fresh object array
        also avoids assigning into a fixed-width NumPy string array.
        """
        if not self.perform:
            return X
        return np.array([self._strip_header(mail) for mail in X], dtype=object)

    @staticmethod
    def _strip_header(mail):
        """Return ``mail`` without the text preceding its first blank line."""
        # partition() removes only the leading header. A str.replace() on the
        # header text would also delete any later repetition of that text
        # inside the body.
        _header, separator, body = mail.partition("\n\n")
        return separator + body

In [6]:
# Replace URL addresses with the token "URL"

class change_url(BaseEstimator):
    """Replace every http:// or https:// address with the literal token "URL".

    Parameters
    ----------
    perform : bool, default True
        When False, ``transform`` returns its input unchanged.
    """

    def __init__(self, perform = True):
        self.perform = perform
        # Compiled once and reused for every document.
        self.url_changer = re.compile(r"\b(https|http):\/\/\S+\b", re.DOTALL)

    def fit(self, X, y=None):
        """Nothing to learn; present for the estimator API. Returns self."""
        return self

    def transform(self, X):
        """Return a new object array with URLs substituted in each text.

        ``X`` itself is left untouched, so the caller's raw data can be
        transformed again with the same result. Building a fresh object array
        also avoids assigning into a fixed-width NumPy string array.
        """
        if not self.perform:
            return X
        return np.array(
            [self.url_changer.sub("URL", mail) for mail in X], dtype=object
        )


In [7]:
# Replace every number with the token "NUMBER"

class change_number(BaseEstimator):
    """Replace each run of digits with the literal token "NUMBER".

    Parameters
    ----------
    perform : bool, default True
        When False, ``transform`` returns its input unchanged.
    """

    def __init__(self, perform = True):
        self.perform = perform
        # Compiled once and reused for every document.
        self.number_changer = re.compile(r"\d+")

    def fit(self, X, y=None):
        """Nothing to learn; present for the estimator API. Returns self."""
        return self

    def transform(self, X):
        """Return a new object array with digit runs substituted in each text.

        "NUMBER" is usually longer than the digits it replaces. Writing the
        result back into a fixed-width NumPy string array could therefore cut
        the text off, so a fresh object array is built and ``X`` is left
        untouched.
        """
        if not self.perform:
            return X
        return np.array(
            [self.number_changer.sub("NUMBER", mail) for mail in X], dtype=object
        )

In [8]:
# Remove HTML tags

class remove_html(BaseEstimator):
    """Convert each e-mail to plain text with ``html2text``.

    Parameters
    ----------
    perform : bool, default True
        When False, ``transform`` returns its input unchanged.
    """

    def __init__(self, perform = True):
        self.perform = perform
        # One converter is configured here and reused for every document:
        # emphasis markup is ignored and bold text gets no marker.
        self.tag_remover = html2text.HTML2Text()
        self.tag_remover.ignore_emphasis = True
        self.tag_remover.strong_mark = ''

    def fit(self, X, y=None):
        """Nothing to learn; present for the estimator API. Returns self."""
        return self

    def transform(self, X):
        """Return a new object array holding the converted text of each e-mail.

        ``X`` itself is left untouched, so the caller's raw data can be
        transformed again with the same result. Building a fresh object array
        also avoids assigning into a fixed-width NumPy string array.
        """
        if not self.perform:
            return X
        return np.array(
            [self.tag_remover.handle(mail) for mail in X], dtype=object
        )


In [9]:
# Stemming

class stemmer(BaseEstimator):
    """Tokenize each e-mail, drop over-long tokens and stem the rest.

    Parameters
    ----------
    perform : bool, default True
        When False, ``transform`` returns its input unchanged.
    word_length : int, default 20
        Tokens whose length is greater than or equal to this value are
        discarded before stemming.
    """

    def __init__(self, perform = True, word_length = 20):
        self.perform = perform
        self.stemmer = PorterStemmer()
        self.word_length = word_length

    def fit(self, X, y=None):
        """Nothing to learn; present for the estimator API. Returns self."""
        return self

    def transform(self, X):
        """Return a new object array of space-joined stemmed tokens.

        ``X`` itself is left untouched, so the caller's raw data can be
        transformed again with the same result. Building a fresh object array
        also avoids assigning into a fixed-width NumPy string array.
        """
        if not self.perform:
            return X
        return np.array([self._stem_document(mail) for mail in X], dtype=object)

    def _stem_document(self, mail):
        """Stem every sufficiently short token of ``mail`` and re-join them."""
        return " ".join(
            self.stemmer.stem(token)
            for token in word_tokenize(mail)
            if len(token) < self.word_length
        )

In [10]:
# Stop-word removal

class stopword(BaseEstimator):
    """Remove English stop words (NLTK list) from each e-mail.

    Parameters
    ----------
    perform : bool, default True
        When False, ``transform`` returns its input unchanged.
    """

    def __init__(self, perform = True):
        self.perform = perform
        # A set gives constant-time membership tests.
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        """Nothing to learn; present for the estimator API. Returns self."""
        return self

    def transform(self, X):
        """Return a new object array with stop words removed from each text.

        ``X`` itself is left untouched, so the caller's raw data can be
        transformed again with the same result. Building a fresh object array
        also avoids assigning into a fixed-width NumPy string array.
        """
        if not self.perform:
            return X
        return np.array([self._drop_stop_words(mail) for mail in X], dtype=object)

    def _drop_stop_words(self, mail):
        """Tokenize ``mail`` and re-join the tokens that are not stop words."""
        # The comparison is case-insensitive: the NLTK list is lowercase, so a
        # plain membership test would let "The", "And", ... through.
        kept = [
            token
            for token in word_tokenize(mail)
            if token.lower() not in self.stop_words
        ]
        return " ".join(kept)

In [11]:
# 특성벡터 변환 (소문자 변환, 구두점 제거도 수행)

# vectorizer = TfidfVectorizer(encoding='ISO-8859-1')
# X_train = vectorizer.fit_transform(X_train)

In [12]:
# Preprocessing pipeline: raw e-mail text goes in, TF-IDF features come out.
# (The vectorizer step also takes care of lowercasing and punctuation.)
preprocessing_pipeline = Pipeline(
    steps=[
        # 1. strip the e-mail header
        ('remove_header', remove_header(perform=True)),
        # 2. collapse every http(s) address into the token "URL"
        ('change_url', change_url(perform=True)),
        # 3. collapse every run of digits into the token "NUMBER"
        ('change_number', change_number(perform=True)),
        # 4. strip HTML markup
        ('remove_html', remove_html(perform=True)),
        # 5. tokenize, drop tokens of 20+ characters, stem the rest
        ('stemmer', stemmer(perform=True, word_length=20)),
        # 6. remove English stop words
        ('stopword', stopword(perform=True)),
        # 7. turn the cleaned text into TF-IDF vectors
        ('vectorizer', TfidfVectorizer(encoding='ISO-8859-1')),
    ]
)

In [13]:
# Fit the preprocessing pipeline on the training e-mails and keep its output.
# NOTE: X_train is rebound from raw text to the pipeline's output, so this cell
# must not be re-run without first re-running the train/test split cell.
X_train = preprocessing_pipeline.fit_transform(X_train)

In [14]:
# Train the classifier on the vectorized training set (fit returns the estimator).
model = LinearSVC(C=10).fit(X_train, y_train)

# Push the held-out e-mails through the already-fitted preprocessing pipeline,
# then classify them.
X_test = preprocessing_pipeline.transform(X_test)
final_prediction = model.predict(X_test)

# Share of test e-mails that were classified correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=final_prediction)
print(accuracy)

# Wall-clock time elapsed since the timer was started before loading the data.
end = time.time()
print(f"{end - start:.5f} sec")




0.9934913993491399
134.64524 sec


In [None]:
# -------------------------------------- 여기까지는 문제 없음----------------------------------------

In [133]:
# 모델 초기 평가 함수

"""

def first_model_evaluation(X, y, cv, scoring, models):
    for model in models:
        model.fit(X, y)
        print(str(model) + "\n\n\n")
        print(cross_val_score(model, X, y, cv=cv, scoring=scoring).mean())
        print("\n\n\n")        
        
models = [SGDClassifier(), LogisticRegression(), RidgeClassifier(),
          MultinomialNB(), RandomForestClassifier(),
          LinearSVC(), NuSVC(), SVC()]

models = [SGDClassifier(), LinearSVC()]

first_model_evaluation(X_train, y_train, 3, "f1", models)

# LinearSVC 로 선택.

"""

SGDClassifier()



0.9626468526034381




LinearSVC()



0.9674748465526207








In [12]:
"""

total_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('linearSVC', LinearSVC())
    ])

# 전처리 파이프라인에 예측 모델을 합치고, 하이퍼 파라미터 정밀 튜닝을 실시한다.
# 최적 예측기를 통해 X_test 를 예측하려 하면, X_test 가 파이프라인을 따라 변환되지 않는다.
"""

In [12]:
# 하이퍼파라미터 정밀 튜닝
"""


param = {
    'linearSVC__C':[0.01, 0.1, 1, 10, 100]
}

gs = GridSearchCV(total_pipeline, param_grid=param, scoring="f1", cv=3, verbose=3)
gs.fit(X_train, y_train)
"""

Fitting 3 folds for each of 5 candidates, totalling 15 fits




[CV 1/3] END .................linearSVC__C=0.01;, score=0.857 total time= 1.9min




[CV 2/3] END .................linearSVC__C=0.01;, score=0.860 total time= 1.7min




[CV 3/3] END .................linearSVC__C=0.01;, score=0.849 total time= 1.7min




[CV 1/3] END ..................linearSVC__C=0.1;, score=0.953 total time= 1.8min




[CV 2/3] END ..................linearSVC__C=0.1;, score=0.956 total time= 1.8min




[CV 3/3] END ..................linearSVC__C=0.1;, score=0.956 total time= 1.7min




[CV 1/3] END ....................linearSVC__C=1;, score=0.971 total time= 1.7min




[CV 2/3] END ....................linearSVC__C=1;, score=0.981 total time= 1.7min




[CV 3/3] END ....................linearSVC__C=1;, score=0.978 total time= 1.6min




[CV 1/3] END ...................linearSVC__C=10;, score=0.970 total time= 1.7min




[CV 2/3] END ...................linearSVC__C=10;, score=0.985 total time= 1.7min




[CV 3/3] END ...................linearSVC__C=10;, score=0.979 total time= 1.6min




[CV 1/3] END ..................linearSVC__C=100;, score=0.970 total time= 1.8min




[CV 2/3] END ..................linearSVC__C=100;, score=0.981 total time= 1.8min




[CV 3/3] END ..................linearSVC__C=100;, score=0.979 total time= 1.6min




In [13]:
# gs.best_params_

{'linearSVC__C': 10}

In [14]:
# final_model = gs.best_estimator_

In [17]:
"""
final_prediction = final_model.predict(X_test)

accuracy = accuracy_score(y_test, final_prediction)
print(accuracy)
"""

0.6466759646675965


In [26]:
# len(data["data"])

10751