### SMS Spam 분류

In [1]:
import pandas as pd

# Raw CSV of the SMS spam dataset, fetched directly from GitHub.
url = (
    'https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/'
    '10.%20RNN%20Text%20Classification/dataset/spam.csv'
)

In [2]:
# Read the CSV straight from the URL, decoding it as latin1.
df = pd.read_csv(filepath_or_buffer=url, encoding='latin1')
# Preview the first three rows.
df.head(n=3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


- 데이터 전처리

In [4]:
# Keep only the label column (v1) and the message text column (v2);
# the remaining "Unnamed" columns are dropped.
# .copy() makes the result an independent frame, so the edits applied to it in
# later cells do not act on a selection of the originally loaded frame
# (which pandas may flag with SettingWithCopyWarning).
df = df[['v1','v2']].copy()
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Total number of missing values across every column of the frame.
df.isna().sum().sum()

0

In [10]:
# Check for duplicates: compare the frame's shape with the number of distinct messages in v2.
df.shape, df['v2'].nunique()

((5572, 2), 5169)

In [13]:
# Remove rows whose message text (v2) repeats an earlier row; the first
# occurrence of each message is kept (drop_duplicates default).
# The result is rebound to `df` instead of using inplace=True, so the cell is an
# explicit transformation that gives the same frame when it is re-run.
df = df.drop_duplicates(subset=['v2'])

In [14]:
# Re-check after de-duplication: the row count should now equal the number of distinct messages.
df.shape, df['v2'].nunique()

((5169, 2), 5169)

In [16]:
# Encode the labels as integers: 'ham' -> 0, 'spam' -> 1.
# The explicit astype pins the column to int64 instead of relying on replace()
# to downcast the object column implicitly (deprecated in recent pandas), and
# it raises right here if an unexpected label is left unmapped.
# Re-running the cell is harmless: an already-encoded column passes through unchanged.
df['v1'] = df['v1'].replace({'ham': 0, 'spam': 1}).astype('int64')

In [18]:
# Preview the first rows to confirm v1 now holds numeric labels.
df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [21]:
# Class balance: number of messages per label (0 = ham, 1 = spam).
df['v1'].value_counts()

v1
0    4516
1     653
Name: count, dtype: int64

In [22]:
# Pull the message texts and the labels out as arrays.
# NOTE(review): x and y are not referenced by the later cells visible here — confirm they are still needed.
x, y = df['v2'].values, df['v1'].values


- 텍스트 전처리

In [24]:
# Replace every character that is not an ASCII letter (punctuation, digits, ...) with a space.
# NOTE(review): the cleaned Series is only displayed, not assigned back to df,
# so the following cells still operate on the original text — confirm this is intended.
df['v2'].str.replace(pat='[^A-Za-z]', repl=' ', regex=True)

0       Go until jurong point  crazy   Available only ...
1                           Ok lar    Joking wif u oni   
2       Free entry in   a wkly comp to win FA Cup fina...
3       U dun say so early hor    U c already then say   
4       Nah I don t think he goes to usf  he lives aro...
                              ...                        
5567    This is the  nd time we have tried   contact u...
5568                Will    b going to esplanade fr home 
5569    Pity    was in mood for that  So   any other s...
5570    The guy did some bitching but I acted like i d...
5571                           Rofl  Its true to its name
Name: v2, Length: 5169, dtype: object

In [25]:
# Show the message stored under index label 0.
df.loc[0, 'v2']

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

- 데이터셋 분리

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing. Stratifying on the label keeps the
# ham/spam proportions close to equal in both splits; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['v2'].values,
    df['v1'].values,
    test_size=0.2,
    stratify=df['v1'].values,
    random_state=2023,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4135,), (1034,), (4135,), (1034,))

- pipeline으로 best 파라미터 찾기 : CountVectorizer, RandomForestClassifier

In [31]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [51]:
# Search grid for the vectorizer + random forest pipeline.
# Each key has the form '<pipeline step name>__<parameter name>'.
params = dict(
    CVECT__ngram_range=[(1, 1), (1, 2)],
    RFC__max_depth=[2, 10, 30],
)

In [52]:
cvect = CountVectorizer(stop_words='english')
rfc = RandomForestClassifier(random_state=2023)
pipeline = Pipeline([('CVECT',cvect),('RFC',rfc)])
grid_pipe = GridSearchCV(pipeline,params,scoring='accuracy',cv=3)
%time grid_pipe.fit(X_train,y_train)

CPU times: total: 17 s
Wall time: 17.4 s


In [53]:
# Parameter combination that scored best in the grid search.
grid_pipe.best_params_

{'CVECT__ngram_range': (1, 1), 'RFC__max_depth': 30}

In [57]:
# Second pass: the largest depth tried in the first search (30) was selected,
# so search deeper trees. ngram_range is not part of this grid, so the
# vectorizer keeps the setting it was constructed with.
params = {'RFC__max_depth': [100, 110, 115]}
grid_pipe = GridSearchCV(estimator=pipeline, param_grid=params, scoring='accuracy', cv=3)
grid_pipe.fit(X_train, y_train)
grid_pipe.best_params_

{'RFC__max_depth': 100}

In [58]:
# Score the best pipeline found by the search on the held-out test messages.
grid_pipe.best_estimator_.score(X_test,y_test)

0.9709864603481625

In [62]:
# Learn the vocabulary from the training texts only, then encode both splits with it.
X_train_cv = cvect.fit_transform(X_train)
X_test_cv = cvect.transform(X_test)
X_train_cv.shape, X_test_cv.shape

((4135, 7336), (1034, 7336))

In [63]:
from sklearn.linear_model import LogisticRegression

# Logistic regression trained on the count-vectorized training texts,
# with the iteration limit set to 500.
lrc = LogisticRegression(max_iter=500, random_state=2023)
lrc.fit(X_train_cv, y_train)

In [64]:
# Score the logistic regression on the count-vectorized test texts.
lrc.score(X_test_cv,y_test)

0.9709864603481625

In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# TF-IDF over unigrams and bigrams (English stop words removed) feeding multinomial naive Bayes.
tvect = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
nb = MultinomialNB()
pipeline2 = Pipeline(steps=[('TVECT', tvect), ('NB', nb)])

In [81]:
# Fit the TF-IDF + naive Bayes pipeline on the raw training texts.
pipeline2.fit(X_train,y_train)

In [82]:
# Score the TF-IDF + naive Bayes pipeline on the raw test texts.
pipeline2.score(X_test,y_test)

0.9429400386847195

In [87]:
from sklearn.svm import SVC

# Support vector classifier tuned over its regularization parameter C
# with a 3-fold cross-validated grid search ranked by accuracy.
svc = SVC(random_state=2023)
params = {'C': [0.01, 0.1, 1, 10, 100]}
grid_svc = GridSearchCV(estimator=svc, param_grid=params, scoring='accuracy', cv=3)

In [93]:
# Refine the search to a narrower set of C values.
# GridSearchCV holds on to the grid object it was constructed with, so rebinding
# the name `params` alone would leave the search running on the earlier grid.
# A new search object is built here so this refined grid is the one evaluated.
params = {'C': [8, 10, 13]}
grid_svc = GridSearchCV(svc, params, scoring='accuracy', cv=3)
grid_svc.fit(X_train_cv, y_train)

In [94]:
# Value of C that scored best in the SVC grid search.
grid_svc.best_params_

{'C': 10}

In [97]:
# Score the best SVC found by the search on the count-vectorized test texts.
grid_svc.best_estimator_.score(X_test_cv,y_test)

0.9738878143133463