### 스팸 데이터 (https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/10.%20RNN%20Text%20Classification/dataset/spam.csv)에 대해서 다음을 답하세요.

##### 1. 데이터/텍스트 전처리를 하세요.(ham/spam 인코딩, 결측치, 중복데이터, 숫자 및 특수문자 제거 등)[20]

In [55]:
import warnings
warnings.filterwarnings('ignore')

In [1]:
import pandas as pd

url = 'https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/10.%20RNN%20Text%20Classification/dataset/spam.csv'

In [2]:
df = pd.read_csv(url, encoding='latin1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


- 데이터 전처리

In [4]:
df = df[['v1', 'v2']]
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# 결측치 확인
df.isna().sum().sum()

0

In [7]:
# 중복 데이터 확인
df.shape, df.v2.nunique()

((5572, 2), 5169)

In [8]:
# 중복 데이터 제거
df.drop_duplicates(subset=['v2'], inplace=True)
df.shape

(5169, 2)

In [9]:
# v1: ham->0, spam->1
df.v1 = df.v1.replace(['ham', 'spam'], [0, 1])
df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# ham/spam 분포
df.v1.value_counts()

v1
0    4516
1     653
Name: count, dtype: int64

In [11]:
# 구둣점, 숫자 제거
df.v2 = df.v2.str.replace('[^A-Za-z]', ' ', regex=True)
df.v2[0]

'Go until jurong point  crazy   Available only in bugis n great world la e buffet    Cine there got amore wat   '

In [22]:
# 데이터셋 분리
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.v2.values, df.v1.values, stratify=df.v1.values, test_size=0.2, random_state=2023
)

In [15]:
# Pipeline으로 베트스 파라메터 찾기
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [31]:
tvect = TfidfVectorizer(stop_words='english')
lr = LogisticRegression(random_state=2023)
params = {
    'tvect__ngram_range' : [(1, 1), (1, 2)], 
    'tvect__max_df' : [480, 481, 482, 483],
    'lr__C' : [ 14, 15, 16]
}
pipeline = Pipeline([('tvect', tvect), ('lr', lr) ])
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)

%time grid_pipe.fit(X_train, y_train)

print(grid_pipe.best_params_)


CPU times: total: 19.8 s
Wall time: 20.1 s
{'lrc__C': 15, 'tvect__max_df': 480, 'tvect__ngram_range': (1, 2)}


In [32]:
best_score = grid_pipe.best_estimator_.score(X_test, y_test)
print(f'분류 정확도 : {best_score:.4f}')

분류 정확도 : 0.9720


### 네이버 쇼핑 리뷰 (https://raw.githubusercontent.com/bab2min/corpus/master/sentiment/naver_shopping.txt) 데이터세트를 이용하여 다음에 답하세요.

##### 1. 데이터/텍스트 

In [41]:
import numpy as np

In [33]:
url = 'https://raw.githubusercontent.com/bab2min/corpus/master/sentiment/naver_shopping.txt'
df = pd.read_table(url, names=['score', 'review'])
df.head()

Unnamed: 0,score,review
0,5,배공빠르고 굿
1,2,택배가 엉망이네용 저희집 밑에층에 말도없이 놔두고가고
2,5,아주좋아요 바지 정말 좋아서2개 더 구매했어요 이가격에 대박입니다. 바느질이 조금 ...
3,2,선물용으로 빨리 받아서 전달했어야 하는 상품이었는데 머그컵만 와서 당황했습니다. 전...
4,5,민트색상 예뻐요. 옆 손잡이는 거는 용도로도 사용되네요 ㅎㅎ


In [34]:
# score의 분포
df.score.value_counts()

score
5    81177
2    63989
1    36048
4    18786
Name: count, dtype: int64

In [35]:
# 평점이 4, 5 점은 긍정(1), 나버지는 부정(0)
df.score = df.score.apply(lambda x: 1 if x >= 4 else 0)
df.head()

Unnamed: 0,score,review
0,1,배공빠르고 굿
1,0,택배가 엉망이네용 저희집 밑에층에 말도없이 놔두고가고
2,1,아주좋아요 바지 정말 좋아서2개 더 구매했어요 이가격에 대박입니다. 바느질이 조금 ...
3,0,선물용으로 빨리 받아서 전달했어야 하는 상품이었는데 머그컵만 와서 당황했습니다. 전...
4,1,민트색상 예뻐요. 옆 손잡이는 거는 용도로도 사용되네요 ㅎㅎ


In [36]:
# 결측치 확인
df.isna().sum()

score     0
review    0
dtype: int64

In [37]:
# 중복 데이터 확인
df.shape, df.review.nunique()

((200000, 2), 199908)

In [38]:
# 중복 데이터 제거
df.drop_duplicates(subset=['review'], inplace=True)
df.shape

(199908, 2)

In [39]:
# 한글 이외의 데이터는 제거
df.review = df.review.str.replace('[^-ㅎㅏ-ㅣ가-힣 ]', '', regex=True)

In [42]:
# 한글 이외의 데이터를 제거함으로써 발생하는 ''를 제거
df.review.replace('', np.nan, inplace=True)
df.isna().sum()

score     0
review    0
dtype: int64

In [43]:
# 한글 형태소 분석 및 불용어 제거

In [44]:
with open('../data/한글불용어.txt') as st:
    lines = st.readlines()
stop_words = [line.split('\t')[0] for line in lines]

In [45]:
from konlpy.tag import Okt
okt = Okt()

In [46]:
%%time
reviews = []
for review in df.review:
    morphs = okt.morphs(review, stem=True)
    tmp = [word for word in morphs if word not in stop_words]
    reviews.append(' '.join(tmp))

CPU times: total: 10min 49s
Wall time: 10min 35s


In [47]:
# 데이터셋 분리
X_train, X_test, y_train, y_test = train_test_split(
    reviews, df.score.values, stratify=df.score.values, random_state=2023
)

In [60]:
from sklearn.feature_extraction.text import CountVectorizer
cvect = CountVectorizer()
lr = LogisticRegression(random_state=2023)

In [61]:
params = {
    'cvect__ngram_range' : [(1, 2)],
    'cvect__max_df': [0.6, 0.7, 0.8],
    'lr__max_iter' : [95, 100, 105]
}
pipeline = Pipeline([('cvect', cvect), ('lr', lr)])
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)


%time grid_pipe.fit(X_train, y_train)

print(grid_pipe.best_params_)


CPU times: total: 7min 1s
Wall time: 6min 49s
{'cvect__max_df': 0.6, 'cvect__ngram_range': (1, 2), 'lr__max_iter': 95}


In [62]:
best_score = grid_pipe.best_estimator_.score(X_test, y_test)
print(f'분류 정확도 : {best_score:.4f}')

분류 정확도 : 0.8972
