In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('data-files/SMSSpamCollection.tsv', sep="\t", 
                 header=None, names=['target', 'message'])
df.head()

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df[['target']].values

array([['ham'],
       ['ham'],
       ['spam'],
       ...,
       ['ham'],
       ['ham'],
       ['ham']], dtype=object)

In [4]:
# 범주형 -> 수치형
from sklearn.preprocessing import LabelEncoder

df['label'] = LabelEncoder().fit_transform(df['target'])
df.head()

Unnamed: 0,target,message,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [5]:
# message 전처리 1 (영숫자, 공백을 제외한 나머지 문자 제거)

import re #  정규표현식 도구 12ab, a332, 135d -> [0-9a-zA-Z]{4,10}
df['message2'] = df['message'].map(lambda v: re.sub("[^\w\s]", '', v))
df

  df['message2'] = df['message'].map(lambda v: re.sub("[^\w\s]", '', v))


Unnamed: 0,target,message,label,message2
0,ham,"Go until jurong point, crazy.. Available only ...",0,Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,0,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,0,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,Nah I dont think he goes to usf he lives aroun...
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?,0,Will ü b going to esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",0,Pity was in mood for that Soany other suggest...
5570,ham,The guy did some bitching but I acted like i'd...,0,The guy did some bitching but I acted like id ...


In [6]:
# message 전처리 2 : 모두 소문자로 변경
df['message3'] = df['message2'].str.lower()
df.head()

Unnamed: 0,target,message,label,message2,message3
0,ham,"Go until jurong point, crazy.. Available only ...",0,Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,0,Ok lar Joking wif u oni,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,0,U dun say so early hor U c already then say,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,Nah I dont think he goes to usf he lives aroun...,nah i dont think he goes to usf he lives aroun...


In [7]:
# !pip install nltk

In [10]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\human\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\human\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [11]:
# message 전처리 3 : 문장을 토큰의 리스트로 변환
# df['message3'].map(lambda v: nltk.word_tokenize(v))
df['message4'] = df['message3'].map(nltk.word_tokenize)
df.head()

Unnamed: 0,target,message,label,message2,message3,message4
0,ham,"Go until jurong point, crazy.. Available only ...",0,Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,0,Ok lar Joking wif u oni,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,0,U dun say so early hor U c already then say,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,Nah I dont think he goes to usf he lives aroun...,nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l..."


In [14]:
# message 전처리 4 : 어근 추출 (단어의 활용 표준화)

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
print(stemmer.stem('apples'), stemmer.stem('apple'))
df['message5'] = df['message4'].map(lambda words: [ stemmer.stem(w) for w in words])
df.head()

appl appl


Unnamed: 0,target,message,label,message2,message3,message4,message5
0,ham,"Go until jurong point, crazy.. Available only ...",0,Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, until, jurong, point, crazi, avail, onli,..."
1,ham,Ok lar... Joking wif u oni...,0,Ok lar Joking wif u oni,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entri, in, 2, a, wkli, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,0,U dun say so early hor U c already then say,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, so, earli, hor, u, c, alreadi, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,Nah I dont think he goes to usf he lives aroun...,nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, i, dont, think, he, goe, to, usf, he, li..."


In [18]:
' '.join(['abc', 'def', '123'])

'abc def 123'

In [19]:
# message 전처리 5 : 단어 리스트 -> 단일 문장

df['message6'] = df['message5'].map(lambda words: ' '.join(words))
df.head()

Unnamed: 0,target,message,label,message2,message3,message4,message5,message6
0,ham,"Go until jurong point, crazy.. Available only ...",0,Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, until, jurong, point, crazi, avail, onli,...",go until jurong point crazi avail onli in bugi...
1,ham,Ok lar... Joking wif u oni...,0,Ok lar Joking wif u oni,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]",ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entri, in, 2, a, wkli, comp, to, win, f...",free entri in 2 a wkli comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,0,U dun say so early hor U c already then say,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, so, earli, hor, u, c, alreadi, t...",u dun say so earli hor u c alreadi then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,Nah I dont think he goes to usf he lives aroun...,nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, i, dont, think, he, goe, to, usf, he, li...",nah i dont think he goe to usf he live around ...


In [40]:
# message 전처리 6 : 단어 -> 숫자, 문장 -> 숫자 조합

from sklearn.feature_extraction.text \
    import CountVectorizer, TfidfTransformer, TfidfVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(df['message6'])
# print( vectorizer.vocabulary_ )

sparse_word_counts = vectorizer.transform(df['message6'])
dense_word_counts = sparse_word_counts.toarray()

tvectorizer = TfidfVectorizer()
tvectorizer.fit(df['message6'])
sparse_word_counts2 = tvectorizer.transform(df['message6'])
dense_word_counts2 = sparse_word_counts2.toarray()

In [41]:
# 훈련 데이터, 테스트 데이터 준비
X_train, X_test, y_train, y_test = train_test_split(dense_word_counts, df['label'])
X_train2, X_test2, y_train2, y_test2 = train_test_split(dense_word_counts2, df['label'])

In [37]:
# 모델 훈련
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

mnb = MultinomialNB()
mnb.fit(X_train, y_train)

mnb.score(X_train, y_train), mnb.score(X_test, y_test)

(0.9904283321368749, 0.9798994974874372)

In [45]:
# 모델 훈련 2

gnb = BernoulliNB()
gnb.fit(X_train2, y_train2)

gnb.score(X_train2, y_train2), gnb.score(X_test2, y_test2)

(0.9844460397224216, 0.9712849964106246)

In [38]:
vectorizer.vocabulary_

{'go': 3336,
 'until': 7497,
 'jurong': 4128,
 'point': 5635,
 'crazi': 2248,
 'avail': 1340,
 'onli': 5292,
 'in': 3872,
 'bugi': 1750,
 'great': 3425,
 'world': 7925,
 'la': 4273,
 'buffet': 1748,
 'cine': 2029,
 'there': 7130,
 'got': 3388,
 'amor': 1146,
 'wat': 7715,
 'ok': 5257,
 'lar': 4308,
 'joke': 4094,
 'wif': 7835,
 'oni': 5289,
 'free': 3148,
 'entri': 2802,
 'wkli': 7883,
 'comp': 2125,
 'to': 7236,
 'win': 7848,
 'fa': 2920,
 'cup': 2295,
 'final': 3023,
 'tkt': 7226,
 '21st': 446,
 'may': 4669,
 '2005': 433,
 'text': 7075,
 '87121': 861,
 'receiv': 5961,
 'questionstd': 5872,
 'txt': 7410,
 'ratetc': 5918,
 'appli': 1220,
 '08452810075over18': 71,
 'dun': 2676,
 'say': 6234,
 'so': 6572,
 'earli': 2694,
 'hor': 3718,
 'alreadi': 1118,
 'then': 7124,
 'nah': 4964,
 'dont': 2597,
 'think': 7151,
 'he': 3559,
 'goe': 3348,
 'usf': 7541,
 'live': 4427,
 'around': 1260,
 'here': 3609,
 'though': 7168,
 'freemsg': 3156,
 'hey': 3625,
 'darl': 2350,
 'it': 3987,
 'been': 1486,

In [42]:
dense_word_counts[0][[3336, 7497, 4128, 5635, 2248]]
dense_word_counts2[0][[3336, 7497, 4128, 5635, 2248]]

array([0.13226686, 0.23201273, 0.32930302, 0.22485506, 0.25502252])