In [1]:
import os
import glob
import numpy as np
import email

In [2]:
path = './datasets/spam/' 

In [3]:
easy_ham_paths = glob.glob(path+'easy_ham/*')   #datasets/spam 폴더 안에있는 easy ham파일과 spam파일을 불러옵니다.
spam_paths = glob.glob(path+'spam/*')

In [4]:
def get_email_content(email_path):          #e-mail중 text/plain type을 가진 메일만 text content로 저장합니다.
    file = open(email_path,encoding='latin1')
    try:
        msg = email.message_from_file(file)
        for part in msg.walk():
            if part.get_content_type() == 'text/plain':
                return part.get_payload() 
    except Exception as e:
        print(e)
        
        
def get_email_content_bulk(email_paths):     #path에 들어있는 모든 메일을 text content로 바꿔줍니다.
    email_contents = [get_email_content(o) for o in email_paths]
    return email_contents

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
ham_path=[easy_ham_paths]     #ham 과 spam의 경로를 지정합니다.
spam_path=[spam_paths]

In [7]:
ham_sample = np.array([train_test_split(o) for o in ham_path])   

  """Entry point for launching an IPython kernel.


In [8]:
ham_train = np.array([])    #ham path안의 메일을 train set과 test set으로 각각 분리합니다.
ham_test = np.array([])
for o in ham_sample:
    ham_train = np.concatenate((ham_train,o[0]))
    ham_test = np.concatenate((ham_test,o[0]))

In [9]:
ham_train.shape, ham_test.shape #ham_train과 ham_test의 갯수를 확인합니다.

((1875,), (1875,))

In [10]:
spam_sample = np.array([train_test_split(o) for o in spam_path])

  """Entry point for launching an IPython kernel.


In [11]:
spam_train = np.array([])            #spam path안의 메일을 train set과 test set으로 각각 분리합니다.
spam_test = np.array([])
for o in spam_sample:
    spam_train = np.concatenate((spam_train,o[0]))
    spam_test = np.concatenate((spam_test,o[1]))

In [12]:
spam_train.shape, spam_test.shape       #spam_train과 spam_test의 갯수를 확인합니다.

((375,), (126,))

In [13]:
ham_train_label = [0]*ham_train.shape[0]  #x_train에는 ham메일과 spam메일을 넣고, y_train에는 ham은 0 spam은 1로 설정한 값을 넣습니다.
spam_train_label = [1]*spam_train.shape[0]
x_train = np.concatenate((ham_train,spam_train))
y_train = np.concatenate((ham_train_label,spam_train_label))

In [14]:
ham_test_label = [0]*ham_test.shape[0] #위의 방법과 마찬가지로 x_test와 y_test를 설정합니다.
spam_test_label = [1]*spam_test.shape[0]
x_test = np.concatenate((ham_test,spam_test))
y_test = np.concatenate((ham_test_label,spam_test_label))

In [15]:
train_shuffle_index = np.random.permutation(np.arange(0,x_train.shape[0])) 
test_shuffle_index = np.random.permutation(np.arange(0,x_test.shape[0]))

In [16]:
x_train = x_train[train_shuffle_index] #x_train과 y_train을 섞어줍니다.
y_train = y_train[train_shuffle_index]

In [17]:
x_test = x_test[test_shuffle_index] #x_test와 y_test를 섞어줍니다.
y_test = y_test[test_shuffle_index]

In [18]:
x_train = get_email_content_bulk(x_train)  #x_train과 x_test에 위에서 선언한 get_email_content_bulk함수를 적용합니다.
x_test = get_email_content_bulk(x_test)

In [19]:
def remove_null(datas,labels):    #공백(null)을 제거하는 함수를 선언합니다.
    not_null_idx = [i for i,o in enumerate(datas) if o is not None]
    return np.array(datas)[not_null_idx],np.array(labels)[not_null_idx]

In [20]:
x_train,y_train = remove_null(x_train,y_train) #x_train, y_train, x_test, y_test의 null을 제거합니다.
x_test,y_test = remove_null(x_test,y_test)

In [21]:
import re                                    
import string
from nltk.tokenize import word_tokenize
def remove_hyperlink(word):                  #http가 붙은 hyperlink를 제거하는 함수를 선언합니다.
    return  re.sub(r"http\S+", "", word)
def remove_number(word):                     #숫자를 제거하는 함수를 선언합니다.
    result = re.sub(r'\d+', '', word)
    return result
def remove_whitespace(word):                 #whitespace를 제거하는 함수를 선언합니다.
    result = word.strip()
    return result
def replace_newline(word):                   #\n을 ''으로 바꿔주는 함수를 선언합니다.
    return word.replace('\n','')
def clean_up_pipeline(sentence):             #위의 4개의 함수를 한번에 실행해주는 함수를 선언합니다.
    cleaning_utils = [remove_hyperlink,replace_newline,remove_number,remove_whitespace]
    for o in cleaning_utils:
        sentence = o(sentence)
    return sentence

In [22]:
x_train = [clean_up_pipeline(o) for o in x_train] #x_train과 x_test에 위에서 선언한 함수들을 적용합니다.
x_test = [clean_up_pipeline(o) for o in x_test]

In [23]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer



In [24]:
stemmer = PorterStemmer()    #nltk의 porterstemmer를 통해 문장의 어간(stem)을 추출합니다.
lemmatizer = WordNetLemmatizer()    #nltk의 wordnetlemmatizer를 통해 문장의 표제어(Lemma)를 추출합니다.

In [25]:
x_train = [word_tokenize(o) for o in x_train]   #x_train에 들어있는 메일들을 단어로 쪼갭니다.
x_test = [word_tokenize(o) for o in x_test]     #x_test에 들어있는 메일들을 단어로 쪼갭니다.

In [26]:
def remove_stop_words(words):    #word_tokenize로 쪼갠 단어들중 불용어(stop word)를 지워주는 함수를 선언합니다.
    result = [i for i in words if i not in ENGLISH_STOP_WORDS]
    return result

In [27]:
def word_stemmer(words):      #word_tokenize로 쪼갠 단어들중 어간(stem)을 추출하는 함수를 선언합니다.
    return [stemmer.stem(o) for o in words]

In [28]:
def word_lemmatizer(words):   #word_tokenize로 쪼갠 단어들중 표제어(Lemma)를 추출하는 함수를 선언합니다.
    return [lemmatizer.lemmatize(o) for o in words]

In [29]:
def clean_token_pipeline(words):   #위의 세함수를 한번에 적용시켜주는 함수를 선언합니다.
    cleaning_utils = [remove_stop_words, word_lemmatizer, word_stemmer]
    for o in cleaning_utils:
        words = o(words)
    return words

In [30]:
x_train = [clean_token_pipeline(o) for o in x_train] #x_train과 x_test에 세개의 함수를 한번에 적용시킵니다.
x_test = [clean_token_pipeline(o) for o in x_test]

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer  

In [32]:
vectorizer = TfidfVectorizer()
raw_sentences = [' '.join(o) for o in x_train]
vectorizer.fit(raw_sentences)

TfidfVectorizer()

In [33]:
def convert_to_feature(raw_tokenize_data):      #텍스트에서 단어별 등장횟수를 카운팅하여 수치벡터화 시켜주는 함수를 선언합니다.
    raw_sentences = [' '.join(o) for o in raw_tokenize_data]
    return vectorizer.transform(raw_sentences)

In [34]:
x_train_features = convert_to_feature(x_train) #x_train과 x_test에 위 함수를 적용시켜 줍니다.
x_test_features = convert_to_feature(x_test)

In [35]:
from sklearn.linear_model import LogisticRegression   #지도학습중 regression인 liner regression을 적용합니다.
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
score = cross_val_score(log_clf, x_train_features.toarray(), y_train, cv=3, verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] .................................... , score=0.933, total=   1.4s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s


[CV] .................................... , score=0.914, total=   1.4s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.8s remaining:    0.0s


[CV] .................................... , score=0.928, total=   1.5s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.3s finished


0.9252040329722733

In [36]:
log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=2)
score = cross_val_score(log_clf, x_test_features.toarray(), y_test, cv=3, verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] .................................... , score=0.966, total=   1.1s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV] .................................... , score=0.966, total=   1.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.1s remaining:    0.0s


[CV] .................................... , score=0.967, total=   0.9s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.0s finished


0.9664953592272344

In [37]:
from sklearn.svm import LinearSVC     #지도학습중 calssification인 svm을 적용합니다.

model = LinearSVC(C=10)
model.fit(x_train_features, y_train)

score = model.score(x_train_features.toarray(), y_train)
print(score)

1.0


In [38]:
from sklearn.svm import LinearSVC

model = LinearSVC(C=10)
model.fit(x_test_features, y_test)

score = model.score(x_test_features.toarray(), y_test)
print(score)

0.9969072164948454


In [39]:
from sklearn.naive_bayes import GaussianNB  #지도학습중 calssification인 Naive Bayes를 적용합니다.
nb = GaussianNB()
nb.fit(x_train_features.toarray(),y_train)
nb.score(x_test_features.toarray(),y_test)

0.9809278350515463

In [40]:
nb.score(x_train_features.toarray(),y_train)

0.9895188184849929