# Spam 메일 분류

In [1]:
import numpy as np
import pandas as pd

In [2]:
from google.colab import files

# Open Colab's upload widget; the result maps each uploaded file name to its contents.
uploaded = files.upload()

# Only one file is expected, so keep the first uploaded name for the loading step.
filename = list(uploaded)[0]


Saving spam.csv to spam.csv


In [3]:
# Read the uploaded CSV, decoding it as latin1, and preview the first rows.
df = pd.read_csv(filename, encoding="latin1")
df.head(n=3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


## 데이터 전처리


In [4]:
# Remove the three unnamed trailing columns (they show no values in the preview above).
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

# Binary target: 1 when v1 is 'spam', 0 for anything else.
df['label'] = df['v1'].apply(lambda tag: int(tag == 'spam'))
df.head(n=3)

Unnamed: 0,v1,v2,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1


In [None]:
# Print the frame's column summary (dtypes and non-null counts).
# NOTE(review): this cell has no execution count or saved output — re-run it or remove it.
df.info()

## 중복 제거 및 텍스트 정제

In [5]:
# Check for duplicates: count the distinct message texts in v2.
df['v2'].nunique()

5169

In [6]:
# Remove duplicated messages in place, keeping the first occurrence of each v2 text.
df.drop_duplicates(subset=['v2'], keep='first', inplace=True)

In [7]:
# Remove punctuation: delete every character that is not an ASCII letter or a
# space (this also strips digits), storing the cleaned text in `content`.
#
# regex=True must be passed explicitly. Since pandas 2.0 the default for
# Series.str.replace is regex=False, which would treat the character class as a
# literal string and leave the text unchanged; later 1.x releases emit a
# FutureWarning when the flag is omitted for a pattern like this one.
df['content'] = df.v2.str.replace('[^A-Za-z ]', '', regex=True)
df.head(3)

Unnamed: 0,v1,v2,label,content
0,ham,"Go until jurong point, crazy.. Available only ...",0,Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,0,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry in a wkly comp to win FA Cup final...


In [8]:
# Lowercase the cleaned text. Not required when a Vectorizer performs the
# conversion itself.
df['content'] = df['content'].apply(str.lower)

In [9]:
# Ham vs. spam distribution after deduplication.
df['v1'].value_counts()

ham     4516
spam     653
Name: v1, dtype: int64

## 훈련/테스트 데이터 셋으로 분리하고 DTM 변환


In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing. Stratifying on the label keeps the
# ham/spam ratio the same in both splits; the fixed seed makes the split repeatable.
str_train, str_test, y_train, y_test = train_test_split(
    df['content'],
    df['label'],
    test_size=0.2,
    random_state=2021,
    stratify=df['label'],
)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary (unigrams and bigrams, English stop words removed) from
# the training texts only, so nothing from the test split leaks into it.
cvect = CountVectorizer(ngram_range=(1, 2), stop_words='english').fit(str_train)
cvect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [12]:

# Turn both splits into document-term matrices using the vocabulary fitted on the training set.
X_train, X_test = map(cvect.transform, (str_train, str_test))

## 훈련/예측/평가


In [13]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the training matrix,
# then predict labels for the held-out messages.
lr = LogisticRegression().fit(X_train, y_train)
pred = lr.predict(X_test)

In [14]:
from sklearn.metrics import accuracy_score

# Fraction of held-out messages whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=pred)


0.9777562862669246