In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so the data file can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Seed NumPy's global random generator with a fixed value.
np.random.seed(42)

## 스팸 or 햄 문자 데이터


`spam.csv`: 스팸 문자 내용에 관한 데이터
- text: 문자 내용
- type: 스팸 여부 (`ham`: 정상 문자, `spam`: 스팸 문자)

In [3]:
# Load the spam/ham message data set from the mounted Drive folder.
# NOTE(review): the rendered frame contains an `Unnamed: 0` column — presumably an index
# that was written into the CSV; `index_col=0` would likely absorb it. Confirm before changing.
spam = pd.read_csv("/content/drive/MyDrive/06. DX 캠프 코드/4강_분류 모델/Naive Bayes/spam.csv")
# Last expression of the cell: display the loaded frame.
spam

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...
5570,ham,Will ü b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Separate the message text (features) from the spam/ham label (target).
X, y = spam["text"], spam["type"]

# Show the first message followed by its label, one per line.
print(X[0], y[0], sep="\n")

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham


In [5]:
# Count how many messages carry each label.
y.value_counts()

ham     4827
spam     747
Name: type, dtype: int64

In [6]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
# The mapping is applied to the original `spam["type"]` column rather than to `y`
# itself, so re-running this cell gives the same result; mapping an already
# encoded `y` a second time would turn every label into NaN.
y = spam["type"].map({"ham": 0, "spam": 1})

# `Series.map` silently yields NaN for any label missing from the dict, so fail loudly.
assert y.notna().all(), "unexpected label in spam['type']"

y.value_counts()

0    4827
1     747
Name: type, dtype: int64

In [7]:
# Text normalization, step 1: delete every character that is not an English letter,
# a digit, or a space.
# Written as a raw string: a plain literal containing `\ ` is an invalid string escape
# and triggers a warning on recent Python versions. Inside a character class a space
# needs no escaping, so the pattern matches exactly the same characters.
re_pattern = r"[^a-zA-Z0-9 ]"
# Show the first message before any cleaning is applied.
X[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [8]:
# Try the pattern on the first message only, before applying it to the whole column.
X.iloc[:1].str.replace(re_pattern, "", regex=True)[0]

'Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat'

In [9]:
# Apply the cleanup to every message, then display the first one to confirm.
X = X.str.replace(re_pattern, "", regex=True)
X[0]

'Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat'

In [10]:
# Convert all uppercase letters to lowercase (previewed here on the first message only).
X.iloc[:1].str.lower()[0]

'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'

In [11]:
# Lowercase every message, then display the first one to confirm.
X = X.str.lower()
X[0]

'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'

In [12]:
# Hold out part of the data for evaluation.
from sklearn.model_selection import train_test_split

# 70% of the rows go to training; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

# Report each subset's row count and its share of the full data set.
print(
    f"train_data size: {len(X_train)}, {len(X_train)/len(X):.2f}",
    f"test_data size: {len(X_test)}, {len(X_test)/len(X):.2f}",
    sep="\n",
)

train_data size: 3901, 0.70
test_data size: 1673, 0.30


## Count Vectorize

이제 Naive Bayes를 학습시키기 위해서 각 문장을 단어별 등장 횟수(count vector)로 변환해야 한다.

In [13]:
# Word tokenization: `word_tokenize` from the `nltk` package splits a sentence into words.
import nltk
from nltk import word_tokenize

# punkt: a pre-trained sentence-structure model that has learned, for example, which "."
# belongs to an abbreviation (e.g. Ph.D.) and which one is a full stop.
# NOTE(review): newer NLTK releases may look for the `punkt_tab` resource instead —
# confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
# Look at the first training message.
X_train.iloc[0]

'you are gorgeous keep those pix cumming  thank you'

In [15]:
# Split the first training message into word tokens.
word_tokenize(X_train.iloc[0])

['you', 'are', 'gorgeous', 'keep', 'those', 'pix', 'cumming', 'thank', 'you']

In [16]:
# Count vectorization: `CountVectorizer` from `sklearn.feature_extraction.text` turns
# each sentence into a vector of word counts.
from sklearn.feature_extraction.text import CountVectorizer

# Fit on only the first two training sentences so the vocabulary is small enough to inspect.
# token_pattern=None: the custom tokenizer replaces the default token regex, and leaving
# the default in place makes scikit-learn warn that 'token_pattern' will not be used.
cnt_vectorizer = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
cnt_vectorizer.fit(X_train.iloc[:2])

# Vocabulary learned from the two sentences: word -> column index.
cnt_vectorizer.vocabulary_

  "The parameter 'token_pattern' will not be used"


{'you': 23,
 'are': 5,
 'gorgeous': 10,
 'keep': 12,
 'those': 20,
 'pix': 14,
 'cumming': 9,
 'thank': 19,
 'have': 11,
 'won': 22,
 '1000': 0,
 'cash': 7,
 'or': 13,
 'a': 4,
 '2000': 2,
 'prize': 15,
 'to': 21,
 'claim': 8,
 'call09050000327': 6,
 'tc': 18,
 'rstm': 16,
 'sw7': 17,
 '3ss': 3,
 '150ppm': 1}

In [17]:
# List the vocabulary words in column order, i.e. sorted by the index each word maps to.
vocab = sorted(cnt_vectorizer.vocabulary_, key=cnt_vectorizer.vocabulary_.get)
vocab

['1000',
 '150ppm',
 '2000',
 '3ss',
 'a',
 'are',
 'call09050000327',
 'cash',
 'claim',
 'cumming',
 'gorgeous',
 'have',
 'keep',
 'or',
 'pix',
 'prize',
 'rstm',
 'sw7',
 'tc',
 'thank',
 'those',
 'to',
 'won',
 'you']

In [18]:
# Transform the first two training sentences into a dense array of word counts.
sample_cnt_vector = cnt_vectorizer.transform(X_train.iloc[:2]).toarray()
sample_cnt_vector

array([[0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
        0, 2],
       [1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
        1, 1]])

In [19]:
# The two raw sentences, for comparison with the count vectors.
X_train.iloc[:2].values

array(['you are gorgeous keep those pix cumming  thank you',
       'you have won 1000 cash or a 2000 prize to claim call09050000327 tc rstm sw7 3ss 150ppm'],
      dtype=object)

In [20]:
# Label each count column with its vocabulary word.
pd.DataFrame(sample_cnt_vector, columns=vocab)

Unnamed: 0,1000,150ppm,2000,3ss,a,are,call09050000327,cash,claim,cumming,...,pix,prize,rstm,sw7,tc,thank,those,to,won,you
0,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,1,1,0,0,2
1,1,1,1,1,1,0,1,1,1,0,...,0,1,1,1,1,0,0,1,1,1


## 모델 학습

In [22]:
# Fit the vectorizer on the whole training split so its vocabulary covers every training word.
# token_pattern=None: the custom tokenizer replaces the default token regex, and leaving
# the default in place makes scikit-learn warn that 'token_pattern' will not be used.
cnt_vectorizer = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
cnt_vectorizer.fit(X_train)

# Report the vocabulary size.
print(f"전체 단어 수: {len(cnt_vectorizer.vocabulary_)}")

전체 단어 수: 7881


In [23]:
# Encode both splits as count matrices using the vocabulary fitted on the training split.
train_matrix = cnt_vectorizer.transform(X_train)
test_matrix = cnt_vectorizer.transform(X_test)

존재하지 않는 단어가 들어올 경우?  
=> CountVectorizer는 학습한 단어장에 존재하지 않는 단어가 들어오면 해당 단어를 무시한다.

In [24]:
# Transform a made-up word and sum the resulting counts; the recorded result is 0.
cnt_vectorizer.transform(["notavailblewordforcnt"]).toarray().sum()

0

## Naive Bayes 모델

In [25]:
from sklearn.naive_bayes import BernoulliNB

# Fit a Bernoulli Naive Bayes classifier with default settings on the training count matrix.
# NOTE(review): BernoulliNB models binary features — presumably the counts are reduced to
# present/absent by its default `binarize` setting; confirm this is intended rather than
# MultinomialNB, which uses the counts themselves.
naive_bayes = BernoulliNB()
naive_bayes.fit(train_matrix, y_train)

BernoulliNB()

예측 및 평가

In [26]:
from sklearn.metrics import accuracy_score

# Training split: predict, then score against the true labels.
train_pred = naive_bayes.predict(train_matrix)
train_acc = accuracy_score(y_train, train_pred)

# Held-out split: the same two steps.
test_pred = naive_bayes.predict(test_matrix)
test_acc = accuracy_score(y_test, test_pred)

# Print both accuracies, one per line, to four decimal places.
print(
    f"Train Accuracy is {train_acc:.4f}",
    f"Test Accuracy is {test_acc:.4f}",
    sep="\n",
)

Train Accuracy is 0.9844
Test Accuracy is 0.9737
