In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the data: a tab-separated file with no header row, so the two
# column names are supplied explicitly.
# NOTE(review): default CSV quoting is active here; messages containing
# double quotes may get merged — confirm the row count against the raw file.
message_df = pd.read_csv(
    'data-files/SMSSpamCollection',
    sep="\t",
    header=None,
    names=['label', 'message'],
)

In [3]:
# Inspect the loaded data: column summary first, then the first five rows
message_df.info()
message_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Encode the target column as integer class codes

from sklearn.preprocessing import LabelEncoder

# Class distribution before encoding (string labels)
print(message_df['label'].value_counts())

# Hand-written alternative: message_df['label'].map({'ham': 0, 'spam': 1})
label_encoder = LabelEncoder()
message_df['label'] = label_encoder.fit_transform(message_df['label'])

# Class distribution after encoding (integer codes)
print(message_df['label'].value_counts())

ham     4825
spam     747
Name: label, dtype: int64
0    4825
1     747
Name: label, dtype: int64


In [5]:
# Message preprocessing: convert upper case to lower case

# Equivalent element-wise form:
# message_df['message'].map(lambda v: v.lower())

# Vectorized string accessor; the column holds only non-null strings
message_df['message'] = message_df['message'].str.lower()

In [6]:
# Message preprocessing: verify the lower-casing result

message_df.head(n=5)

Unnamed: 0,label,message
0,0,"go until jurong point, crazy.. available only ..."
1,0,ok lar... joking wif u oni...
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor... u c already then say...
4,0,"nah i don't think he goes to usf, he lives aro..."


In [7]:
# Preview of punctuation removal; the result is only displayed, message_df
# itself is not modified in this cell.
# Python's built-in str.replace matches literal text only, so
# v.replace('[^\w\s]', '') cannot strip by pattern; re.sub is used instead.

import re

# Raw string literal: \w and \s are regex character classes, not Python
# string escapes. A non-raw literal triggers an invalid-escape-sequence
# warning on recent Python versions.
message_df['message'].map(lambda v: re.sub(r'[^\w\s]', '', v))

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in 2 a wkly comp to win fa cup fina...
3             u dun say so early hor u c already then say
4       nah i dont think he goes to usf he lives aroun...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                  will ü b going to esplanade fr home
5569    pity  was in mood for that soany other suggest...
5570    the guy did some bitching but i acted like id ...
5571                            rofl its true to its name
Name: message, Length: 5572, dtype: object

In [8]:
# Message preprocessing: remove special characters (anything that is neither
# a word character nor whitespace)

import re

# Equivalent element-wise form:
# message_df['message'].map(lambda v: re.sub(r'[^\w\s]', '', v))

# regex=True must be explicit: from pandas 2.0 on, Series.str.replace treats
# the pattern as literal text by default, which would leave every message
# unchanged. The raw string keeps \w and \s from being read as (invalid)
# Python string escapes.
message_df['message'] = message_df['message'].str.replace(r'[^\w\s]', '', regex=True)

In [9]:
# Message preprocessing: verify the special-character removal

message_df.head(n=5)

Unnamed: 0,label,message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


In [10]:
# Set up the word-tokenization tooling

import nltk
# nltk.download('punkt') # Only needs to be run once.

In [11]:
# Message preprocessing: turn each sentence into a list of word tokens

# Single-row example: nltk.word_tokenize(message_df['message'][0])

# Each cell becomes the list returned by nltk.word_tokenize for that message
message_df['message'] = message_df['message'].apply(nltk.word_tokenize)

In [12]:
# Message preprocessing: verify the sentence -> word list conversion
message_df.head(n=5)

Unnamed: 0,label,message
0,0,"[go, until, jurong, point, crazy, available, o..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,0,"[u, dun, say, so, early, hor, u, c, already, t..."
4,0,"[nah, i, dont, think, he, goes, to, usf, he, l..."


In [13]:
# Message preprocessing: stemming (reduce every token to its stem)

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def _stem_tokens(tokens):
    """Return the stem of every token in ``tokens``, preserving order."""
    return [stemmer.stem(token) for token in tokens]


message_df['message'] = message_df['message'].map(_stem_tokens)

In [14]:
# Syntax check: apply the stemmer to the token list of the first row only
vx = message_df.loc[0, 'message']
list(map(stemmer.stem, vx))

['go',
 'until',
 'jurong',
 'point',
 'crazi',
 'avail',
 'onli',
 'in',
 'bugi',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'there',
 'got',
 'amor',
 'wat']

In [15]:
# Message preprocessing: verify the stemming result

message_df.head(n=5)

Unnamed: 0,label,message
0,0,"[go, until, jurong, point, crazi, avail, onli,..."
1,0,"[ok, lar, joke, wif, u, oni]"
2,1,"[free, entri, in, 2, a, wkli, comp, to, win, f..."
3,0,"[u, dun, say, so, earli, hor, u, c, alreadi, t..."
4,0,"[nah, i, dont, think, he, goe, to, usf, he, li..."


In [16]:
# Message preprocessing: collapse each row's word list back into one sentence

# Single-row example: ' '.join(message_df['message'][0])

# The bound method ' '.join is applied to every token list
message_df['message'] = message_df['message'].map(' '.join)

In [17]:
# Message preprocessing: verify the word list -> single sentence conversion

message_df.head(n=5)

Unnamed: 0,label,message
0,0,go until jurong point crazi avail onli in bugi...
1,0,ok lar joke wif u oni
2,1,free entri in 2 a wkli comp to win fa cup fina...
3,0,u dun say so earli hor u c alreadi then say
4,0,nah i dont think he goe to usf he live around ...


In [18]:
# Message preprocessing: convert words to numbers (absolute frequency counts)

from sklearn.feature_extraction.text import CountVectorizer

corpus = message_df['message']

# Learn the vocabulary and build the document-term count matrix in one step
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(corpus)

# Printed as (row, column) count entries of the sparse matrix
print(counts)

  (0, 3336)	1
  (0, 7497)	1
  (0, 4128)	1
  (0, 5635)	1
  (0, 2248)	1
  (0, 1340)	1
  (0, 5292)	1
  (0, 3872)	1
  (0, 1750)	1
  (0, 3425)	1
  (0, 7925)	1
  (0, 4273)	1
  (0, 1748)	1
  (0, 2029)	1
  (0, 7130)	1
  (0, 3388)	1
  (0, 1146)	1
  (0, 7715)	1
  (1, 5257)	1
  (1, 4308)	1
  (1, 4094)	1
  (1, 7835)	1
  (1, 5289)	1
  (2, 3872)	1
  (2, 3148)	1
  :	:
  (5570, 7754)	1
  (5570, 1160)	1
  (5570, 3823)	1
  (5570, 4396)	1
  (5570, 6587)	1
  (5570, 3105)	1
  (5570, 7109)	1
  (5570, 1463)	1
  (5570, 5048)	1
  (5570, 2492)	1
  (5570, 7534)	1
  (5570, 1773)	1
  (5570, 6596)	1
  (5570, 1777)	1
  (5570, 2760)	1
  (5570, 3255)	1
  (5570, 3477)	1
  (5570, 3940)	1
  (5570, 983)	1
  (5570, 1563)	1
  (5571, 7236)	1
  (5571, 3987)	2
  (5571, 4970)	1
  (5571, 7366)	1
  (5571, 6114)	1


In [19]:
# Message preprocessing: convert words to numbers, part 2
# (relative weights computed from the raw counts via TF-IDF)

from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer()

# Fit on the count matrix, then re-weight that same matrix
counts2 = transformer.fit(counts).transform(counts)
print(counts2)

  (0, 7925)	0.22378642176936625
  (0, 7715)	0.18293604147358436
  (0, 7497)	0.232012730496152
  (0, 7130)	0.15808501470085967
  (0, 5635)	0.22485506312666312
  (0, 5292)	0.1588008730270491
  (0, 4273)	0.2781965206152583
  (0, 4128)	0.32930301835453774
  (0, 3872)	0.10860920003212803
  (0, 3425)	0.18328548053939198
  (0, 3388)	0.15280952404957904
  (0, 3336)	0.132266862568599
  (0, 2248)	0.255022519528138
  (0, 2029)	0.2781965206152583
  (0, 1750)	0.2781965206152583
  (0, 1748)	0.31435532599420324
  (0, 1340)	0.2504083119963028
  (0, 1146)	0.32930301835453774
  (1, 7835)	0.44483654514496557
  (1, 5289)	0.5633498837724461
  (1, 5257)	0.2825014776211812
  (1, 4308)	0.42081977871680865
  (1, 4094)	0.4773478663822099
  (2, 7883)	0.18653623125647448
  (2, 7848)	0.14242759355834578
  :	:
  (5570, 6587)	0.19054252105358732
  (5570, 5048)	0.21643786562194572
  (5570, 4396)	0.16284308112975754
  (5570, 3987)	0.11780359009346424
  (5570, 3940)	0.27149395792904457
  (5570, 3872)	0.1156240697440695

In [20]:
# Separate the features (TF-IDF matrix) from the target (encoded label column)
X, y = counts2, message_df['label']

In [21]:
# Split into training data and test data

from sklearn.model_selection import train_test_split

# NOTE(review): the CountVectorizer and TfidfTransformer were fit on all rows
# before this split, so vocabulary/IDF statistics include the test messages.
# Consider splitting first and fitting them on the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # hold out 20% of the rows for testing
    random_state=42,  # fixed seed so the split is reproducible
    stratify=y,       # keep the class ratio the same in both parts
)

In [22]:
# Check the data: shape of the training feature matrix
X_train.shape

(4457, 8169)

In [23]:
# Model training: LogisticRegression

from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training chain
logreg = LogisticRegression().fit(X_train, y_train)
logreg

LogisticRegression()

In [24]:
# Model evaluation: (training score, test score) of the logistic regression
tuple(
    logreg.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

(0.9706080323087278, 0.9587443946188341)

In [25]:
# Model training: BernoulliNB

from sklearn.naive_bayes import BernoulliNB

# fit() returns the estimator itself, so construction and training chain
bnb = BernoulliNB().fit(X_train, y_train)
bnb

BernoulliNB()

In [27]:
# Model evaluation: (training score, test score) of the Bernoulli naive Bayes model
tuple(
    bnb.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

(0.9851918330715728, 0.9730941704035875)