In [1]:
import pandas as pd

In [2]:
# Load the SMS corpus: a tab-separated file without a header row, so the two
# column names are supplied here.
# NOTE(review): free-text messages may contain stray double quotes; with the
# default quoting rules pandas can merge such rows — confirm the row count
# against the raw file.
data = pd.read_csv(
    'data-files/SMSSpamCollection.tsv',
    sep='\t',
    header=None,
    names=["label", "message"],
)

In [3]:
# Peek at the first three rows to confirm the columns were parsed as expected.
data.head(n=3)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [4]:
# List the distinct values of the target column before encoding it.
data.loc[:, "label"].unique()

array(['ham', 'spam'], dtype=object)

In [7]:
# Encode the target as integers: ham -> 0, spam -> 1.
LABEL_CODES = {'ham': 0, 'spam': 1}

# Guard against re-running this cell: once the column already holds 0/1,
# mapping it again would find no matching keys and overwrite every label
# with NaN.
if not pd.api.types.is_numeric_dtype(data['label']):
    data['label'] = data['label'].map(LABEL_CODES)

In [8]:
# Show the first five rows to check that the labels are now 0/1.
data.head(n=5)

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
# Normalise case so that e.g. "Free" and "free" become the same token later on.
# Series.str.lower() is the vectorised equivalent of applying str.lower to
# every element; unlike .map(lambda x: x.lower()) it leaves missing values
# as NaN instead of raising AttributeError.
data['message2'] = data['message'].str.lower()
data.head()

Unnamed: 0,label,message,message2
0,0,"Go until jurong point, crazy.. Available only ...","go until jurong point, crazy.. available only ..."
1,0,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,0,U dun say so early hor... U c already then say...,u dun say so early hor... u c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro...","nah i don't think he goes to usf, he lives aro..."


In [28]:
# Strip punctuation: remove every character that is neither a word character
# (\w) nor whitespace (\s).
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would turn this line into a
# silent no-op. The raw string keeps \w and \s from being parsed as (invalid)
# Python string escape sequences.
data['message2'] = data['message2'].str.replace(r'[^\w\s]', '', regex=True)

In [30]:
import nltk

# Fetch the Punkt tokenizer data used by nltk.word_tokenize.
# NOTE(review): some newer NLTK releases may require the 'punkt_tab' resource
# instead — confirm against the installed version.
nltk.download('punkt')

# Split each cleaned message into a list of word tokens.
data['message3'] = data['message2'].map(nltk.word_tokenize)
data['message3']

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shoseo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0       [go, until, jurong, point, crazy, available, o...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, in, 2, a, wkly, comp, to, win, f...
3       [u, dun, say, so, early, hor, u, c, already, t...
4       [nah, i, dont, think, he, goes, to, usf, he, l...
                              ...                        
5567    [this, is, the, 2nd, time, we, have, tried, 2,...
5568         [will, ü, b, going, to, esplanade, fr, home]
5569    [pity, was, in, mood, for, that, soany, other,...
5570    [the, guy, did, some, bitching, but, i, acted,...
5571                     [rofl, its, true, to, its, name]
Name: message3, Length: 5572, dtype: object

In [35]:
from nltk.stem import PorterStemmer

# Reduce every token to its Porter stem (e.g. "joking" -> "joke") so that
# inflected forms of a word share one vocabulary entry.
stemmer = PorterStemmer()

data['message4'] = data['message3'].map(
    lambda tokens: list(map(stemmer.stem, tokens))
)
data['message4']

0       [go, until, jurong, point, crazi, avail, onli,...
1                            [ok, lar, joke, wif, u, oni]
2       [free, entri, in, 2, a, wkli, comp, to, win, f...
3       [u, dun, say, so, earli, hor, u, c, alreadi, t...
4       [nah, i, dont, think, he, goe, to, usf, he, li...
                              ...                        
5567    [thi, is, the, 2nd, time, we, have, tri, 2, co...
5568             [will, ü, b, go, to, esplanad, fr, home]
5569    [piti, wa, in, mood, for, that, soani, other, ...
5570    [the, guy, did, some, bitch, but, i, act, like...
5571                       [rofl, it, true, to, it, name]
Name: message4, Length: 5572, dtype: object

In [36]:
# Re-assemble each list of stems into one space-separated string.
# Reminder: 'xxx'.join(['1', '2', '3']) -> '1xxx2xxx3'
data['message5'] = data['message4'].map(' '.join)
data['message5']

0       go until jurong point crazi avail onli in bugi...
1                                   ok lar joke wif u oni
2       free entri in 2 a wkli comp to win fa cup fina...
3             u dun say so earli hor u c alreadi then say
4       nah i dont think he goe to usf he live around ...
                              ...                        
5567    thi is the 2nd time we have tri 2 contact u u ...
5568                      will ü b go to esplanad fr home
5569         piti wa in mood for that soani other suggest
5570    the guy did some bitch but i act like id be in...
5571                              rofl it true to it name
Name: message5, Length: 5572, dtype: object

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the bag-of-words matrix: one row per message, one column per
# vocabulary term, each cell holding that term's count in the message.
# NOTE(review): the vocabulary is learned from all messages, before the
# train/test split performed later — consider fitting on the training
# portion only; confirm whether that matters for this analysis.
cv = CountVectorizer()
word_counts = cv.fit_transform(data['message5'])  # learn vocabulary and count in one pass

print(word_counts.shape)
# Printing the sparse matrix lists its non-zero entries as "(row, column) count";
# call word_counts.toarray() for a dense view.
print(word_counts)

(5572, 8169)
  (0, 1146)	1
  (0, 1340)	1
  (0, 1748)	1
  (0, 1750)	1
  (0, 2029)	1
  (0, 2248)	1
  (0, 3336)	1
  (0, 3388)	1
  (0, 3425)	1
  (0, 3872)	1
  (0, 4128)	1
  (0, 4273)	1
  (0, 5292)	1
  (0, 5635)	1
  (0, 7130)	1
  (0, 7497)	1
  (0, 7715)	1
  (0, 7925)	1
  (1, 4094)	1
  (1, 4308)	1
  (1, 5257)	1
  (1, 5289)	1
  (1, 7835)	1
  (2, 71)	1
  (2, 433)	1
  :	:
  (5570, 1777)	1
  (5570, 2492)	1
  (5570, 2760)	1
  (5570, 3105)	1
  (5570, 3148)	1
  (5570, 3255)	1
  (5570, 3477)	1
  (5570, 3559)	1
  (5570, 3823)	1
  (5570, 3872)	1
  (5570, 3940)	1
  (5570, 3987)	1
  (5570, 4396)	1
  (5570, 5048)	1
  (5570, 6587)	1
  (5570, 6596)	1
  (5570, 7109)	1
  (5570, 7236)	1
  (5570, 7534)	1
  (5570, 7754)	1
  (5571, 3987)	2
  (5571, 4970)	1
  (5571, 6114)	1
  (5571, 7236)	1
  (5571, 7366)	1


In [43]:
from sklearn.model_selection import train_test_split
# Features are the term-count matrix; the target is the 0/1 label column.
X = word_counts
y = data['label']

# Stratify on y so both partitions keep the same ham/spam ratio; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [44]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the term counts.
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Mean accuracy on the training and the held-out partitions, shown as a pair.
train_accuracy = nb.score(X_train, y_train)
test_accuracy = nb.score(X_test, y_test)
(train_accuracy, test_accuracy)

(0.990667623833453, 0.9777458722182341)