### SMS Spam 분류

In [1]:
import pandas as pd

In [2]:
# Fetch the SMS spam CSV from GitHub; the file is decoded as latin1.
url = 'https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/10.%20RNN%20Text%20Classification/dataset/spam.csv'
df = pd.read_csv(filepath_or_buffer=url, encoding='latin1')
# Preview the first three rows.
df.head(n=3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


- 데이터 전처리

In [8]:
# Keep only the label column (v1) and the message text column (v2).
df = df.loc[:, ['v1', 'v2']]
df.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Count missing cells across the whole frame.
df.isnull().to_numpy().sum()

0

In [13]:
# Compare the frame's shape with the number of distinct messages to spot duplicates.
(df.shape, df['v2'].nunique())

((5169, 2), 5169)

In [14]:
# Keep only the first occurrence of each message text. `df` is rebound rather
# than mutated with inplace=True, so the frame obtained by column selection is
# never modified in place.
df = df.drop_duplicates(subset=['v2'])
# Sanity check: every remaining message text is distinct.
assert df['v2'].is_unique
df.shape

(5169, 2)

In [15]:
# Encode labels as integers: 'ham' -> 0, 'spam' -> 1.
# (Replacing with the strings '0'/'1' would leave the column as text, which
# does not match the intended 0/1 encoding.)
label_to_id = {'ham': 0, 'spam': 1}
# Values not present in the mapping are passed through unchanged, so re-running
# this cell on already-encoded labels is harmless; astype(int) then raises on
# any label that cannot be interpreted as an integer.
df['v1'] = df['v1'].map(lambda label: label_to_id.get(label, label)).astype(int)
df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
# Class balance: number of messages per label.
df['v1'].value_counts()

0    4516
1     653
Name: v1, dtype: int64

- Text 전처리

In [18]:
# Remove punctuation and digits: every character that is not an ASCII letter
# is replaced with a space.
# regex=True must be passed explicitly: from pandas 2.0 on, str.replace treats
# the pattern as a literal string by default, which would leave the text
# unchanged.
df['v2'] = df['v2'].str.replace('[^A-Za-z]', ' ', regex=True)
df.head()

  df.v2 = df.v2.str.replace('[^A-Za-z]', ' ')


Unnamed: 0,v1,v2
0,0,Go until jurong point crazy Available only ...
1,0,Ok lar Joking wif u oni
2,1,Free entry in a wkly comp to win FA Cup fina...
3,0,U dun say so early hor U c already then say
4,0,Nah I don t think he goes to usf he lives aro...


- 데이터셋 분리

In [24]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of message texts (v2) and labels (v1) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df['v2'].values,
    df['v1'].values,
    test_size=0.2,
    stratify=df['v1'].values,
    random_state=2023,
)

- Pipeline으로 베스트 파라미터 찾기
     - CountVectorizer + RandomForestClassifier

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [33]:
# Hyper-parameter grid for the search; each key is '<step name>__<parameter>'.
params = {
    # unigrams only vs. unigrams + bigrams
    'CVECT__ngram_range': [(1, 1), (1, 2)],
    # tree depth limits to try
    'RFC__max_depth': [2, 10, 30],
    # forest sizes to try
    'RFC__n_estimators': [100, 200, 300],
}

In [34]:
cvect = CountVectorizer(stop_words='english')
rfc = RandomForestClassifier(random_state=2023)
pipeline = Pipeline([('CVECT', cvect), ('RFC', rfc)])
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)
%time grid_pipe.fit(X_train, y_train)

CPU times: total: 1min 13s
Wall time: 1min 14s


In [36]:
# Show the parameter combination selected by the grid search.
grid_pipe.best_params_

{'CVECT__ngram_range': (1, 1), 'RFC__max_depth': 30, 'RFC__n_estimators': 300}

In [39]:
# Evaluate the best pipeline found by the grid search on the held-out X_test / y_test split.
grid_pipe.best_estimator_.score(X_test, y_test)

0.9526112185686654

In [40]:
# Second search: larger depth limits and more trees; the n-gram range is not
# searched here, so the vectorizer keeps its configured setting.
params = {
    'RFC__max_depth': [60, 80, 100],
    'RFC__n_estimators': [300, 400, 500],
}
grid_pipe = GridSearchCV(estimator=pipeline, param_grid=params, scoring='accuracy', cv=3)
grid_pipe.fit(X_train, y_train)

In [43]:
# Evaluate the best pipeline from the second grid search on the held-out X_test / y_test split.
grid_pipe.best_estimator_.score(X_test, y_test)

0.9671179883945842