In [1]:
import pandas as pd

In [2]:
url = 'https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/10.%20RNN%20Text%20Classification/dataset/spam.csv'
df = pd.read_csv(url, encoding='latin1')
df.head(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


In [3]:
# Selection
df = df[['v1', 'v2']]
# 결측치 확인
df.isna().sum().sum()
# 중복 데이터 확인
df.shape, df.v2.nunique()

((5572, 2), 5169)

In [4]:
df.drop_duplicates(subset=['v2'], inplace=True)
df.shape

(5169, 2)

In [5]:
# ['ham', 'spam'] => 0, 1
df.v1 = df.v1.replace('ham', '0').replace('spam', '1')
df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# 구둣점, 숫자 제거 --> 영어 이외의 문자 공백으로 변환
df.v2 = df.v2.str.replace('[^A-Za-z]', ' ')
df.head()

  df.v2 = df.v2.str.replace('[^A-Za-z]', ' ')


Unnamed: 0,v1,v2
0,0,Go until jurong point crazy Available only ...
1,0,Ok lar Joking wif u oni
2,1,Free entry in a wkly comp to win FA Cup fina...
3,0,U dun say so early hor U c already then say
4,0,Nah I don t think he goes to usf he lives aro...


In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.v2.values, df.v1.values, stratify=df.v1.values, test_size=0.2, random_state=2023
)

In [20]:
params = {
    'CVECT__ngram_range': [(1,1), (1,2)],
    'KNN__n_neighbors': [3,4,5]
}

In [21]:
cvect = CountVectorizer(stop_words='english')
knn = KNeighborsClassifier()
pipeline = Pipeline([('CVECT', cvect), ('KNN', knn)])
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)
grid_pipe.fit(X_train, y_train)

In [23]:
grid_pipe.best_params_

{'CVECT__ngram_range': (1, 1), 'KNN__n_neighbors': 3}

In [24]:
params = {
    'KNN__n_neighbors': [1,2,3]
}
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)
grid_pipe.fit(X_train, y_train)
grid_pipe.best_estimator_.score(X_test, y_test)

0.941972920696325

In [35]:
from sklearn.svm import SVC

svc = SVC(random_state=2023)
pipeline = Pipeline([('CVECT', cvect), ('SVC', svc)])
params = {
    'SVC__C': [8.5, 9, 9.5],
    'SVC__degree': [1, 2]
}
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)
grid_pipe.fit(X_train, y_train)
grid_pipe.best_estimator_.score(X_test, y_test)

0.9680851063829787

In [36]:
grid_pipe.best_params_

{'SVC__C': 8.5, 'SVC__degree': 1}