In [1]:
import pandas as pd

# Raw GitHub URL of the spam-mail dataset
url = 'https://raw.githubusercontent.com/sigirace/sigirace.github.io/master/_posts/kang_lecture/python_preproc/data/spam_mail.csv'

# Load the CSV straight from the URL
data = pd.read_csv(url)

# Keep a local copy. index=False stops pandas from writing the RangeIndex as an
# additional unnamed column, which would otherwise show up as yet another
# "Unnamed: ..." column when the local file is read back.
data.to_csv('spam_mail_data.csv', index=False)

# Preview the first 10 rows
data.head(10)

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291 thi...
1,ham,"Subject: hpl nom for january 9 , 2001 ( see at..."
2,ham,"Subject: neon retreat ho ho ho , we ' re aroun..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs this deal is to b...
5,ham,Subject: ehronline web address change this mes...
6,ham,Subject: spring savings certificate - take 30 ...
7,spam,Subject: looking for medication ? we ` re the ...
8,ham,Subject: noms / actual flow for 2 / 26 we agre...
9,ham,"Subject: nominations for oct . 21 - 23 , 2000 ..."


## 1. 룰 기반 필터링

In [2]:
import re

class Rule(object):
    """Keyword-based spam filter.

    A mail is flagged as spam when at least one configured keyword occurs in
    it as a whole word. Matching is case-insensitive on both sides.
    """

    def __init__(self, spam_keywords):
        """Store the keywords to look for.

        Args:
            spam_keywords: Iterable of single-word keyword strings.
        """
        self.spam_keywords = spam_keywords

    def check(self, mail_text):
        """Report whether ``mail_text`` contains any spam keyword.

        Prints "스팸입니다." ("This is spam.") or "스팸이 아닙니다."
        ("This is not spam.") as a side effect.

        Args:
            mail_text: The mail body as a string.

        Returns:
            True if at least one keyword appears as a whole word, else False.
        """
        words = set(re.findall(r'\b\w+\b', mail_text.lower()))
        # The mail is lower-cased above, so the keywords must be lower-cased
        # too; otherwise a keyword such as "Photoshop" could never match.
        keywords = {keyword.lower() for keyword in self.spam_keywords}

        if words.isdisjoint(keywords):
            print("스팸이 아닙니다.")
            return False

        print("스팸입니다.")
        return True

### 1.1 룰 지정

In [3]:
# Keywords handed to the rule-based filter.
spam_words = [
    'photoshop',
    'medication',
    'erection',
    'pills',
]

### 1.2 필터 생성 (프로그래밍)

In [4]:
# Build the rule-based filter from the keyword list defined in the previous cell.
rule_filter = Rule(spam_words)

### 1.3 신규 메일 필터링 (테스트)

In [5]:
# Sample promotional mail. It contains none of the words in spam_words,
# so the keyword rule reports it as not spam.
mail = "Dear Valued Customer, We are pleased to inform you that you have been selected to receive an exclusive $1,000 gift card from our company! This is a limited-time offer, and we are excited to reward loyal customers like you. To claim your gift card, simply click on the link below and fill out a short survey. It’s that easy! Claim Your $1,000 Gift Card Now"
rule_filter.check(mail)

스팸이 아닙니다.


False

## 2. 모델 기반 필터링

In [6]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Download the NLTK stopword corpus; Model reads the English list from it.
nltk.download('stopwords')

class Model:
    """Bag-of-words Multinomial Naive Bayes spam classifier.

    Expects a DataFrame with a ``text`` column (mail body) and a ``label``
    column whose value ``'spam'`` marks spam rows; every other label is
    treated as non-spam.
    """

    # Translation table that deletes every ASCII punctuation character.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, df):
        """Store the training frame and set up the text-processing tools.

        Args:
            df: DataFrame with ``text`` and ``label`` columns. It is kept by
                reference; ``preprocess`` adds a ``label_num`` column to it.
        """
        self.df = df
        self.corpus = []
        self.model = None
        self.stemmer = PorterStemmer()
        self.stopwords_set = set(stopwords.words('english'))
        # Created once here so the attribute always exists, instead of being
        # re-created on every row inside preprocess().
        self.vectorizer = CountVectorizer()

    def _clean_text(self, text):
        """Lower-case, strip ASCII punctuation, drop stopwords, then stem.

        Shared by ``preprocess`` and ``check`` so training and inference use
        exactly the same normalization.
        """
        tokens = text.lower().translate(self._PUNCT_TABLE).split()
        return ' '.join(
            self.stemmer.stem(token)
            for token in tokens
            if token not in self.stopwords_set
        )

    def preprocess(self):
        """Build ``self.corpus`` and the numeric ``label_num`` column.

        The corpus is rebuilt from scratch on every call. Appending instead
        would duplicate documents when ``train`` is run more than once and
        make the feature matrix longer than the label vector.
        """
        self.corpus = [self._clean_text(text) for text in self.df['text']]

        # 1 for spam, 0 for everything else.
        self.df['label_num'] = (self.df['label'] == 'spam').astype(int)

    def train(self):
        """Fit the vectorizer and classifier, then print hold-out accuracy.

        Uses an 80/20 train/test split with a fixed random seed and prints
        the accuracy measured on the 20% test part.
        """
        self.preprocess()

        # Keep the count matrix sparse: MultinomialNB accepts sparse input,
        # and a dense copy of a documents x vocabulary matrix wastes memory.
        X = self.vectorizer.fit_transform(self.corpus)
        y = self.df['label_num']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        mnb = MultinomialNB()
        mnb.fit(X_train, y_train)
        self.model = mnb

        score = mnb.score(X_test, y_test)
        print("모델 성능: ", score)

    def check(self, mail_text):
        """Classify a single mail.

        Prints "스팸입니다." ("This is spam.") or "스팸이 아닙니다."
        ("This is not spam.") as a side effect.

        Args:
            mail_text: The mail body as a string.

        Returns:
            True if the model predicts spam (class 1), else False.

        Raises:
            sklearn.exceptions.NotFittedError: If ``train`` has not been
                called yet. This type subclasses AttributeError, which is
                what an untrained model raised before.
        """
        if self.model is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "Model is not trained yet; call train() before check()."
            )

        X_email = self.vectorizer.transform([self._clean_text(mail_text)])
        prediction = self.model.predict(X_email)

        if prediction[0] == 1:
            print("스팸입니다.")
            return True

        print("스팸이 아닙니다.")
        return False

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### 2.1 모델 생성

In [7]:
# Wrap the loaded DataFrame in the model-based filter; nothing is trained yet.
m=Model(data)

### 2.2 모델 훈련

In [8]:
# Preprocess the mails, fit the classifier, and print accuracy on the held-out 20% split.
m.train()

모델 성능:  0.9748792270531401


### 2.3 신규 메일 필터링 (테스트)

In [9]:
# Same sample mail as in the rule-based test; in the recorded run the trained
# model classifies it as spam.
mail = "Dear Valued Customer, We are pleased to inform you that you have been selected to receive an exclusive $1,000 gift card from our company! This is a limited-time offer, and we are excited to reward loyal customers like you. To claim your gift card, simply click on the link below and fill out a short survey. It’s that easy! Claim Your $1,000 Gift Card Now"
m.check(mail)

스팸입니다.


True