In [2]:
import os
import sys
import itertools
sys.path.append('../preprocess')

import re
import joblib
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
from konlpy.tag import Komoran
from datetime import datetime as dt
from sklearn.pipeline import Pipeline
from datetime import timedelta as tmdt
from sklearn.pipeline import FeatureUnion
from gensim.sklearn_api import D2VTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

from text_handling import flatten
from text_handling import do_text_ma
from text_handling import make_ngram
from get_clf_eval import get_clf_eval
from text_handling import text_preproc
from text_handling import add_ngrams_to_ma
from d2v_input_transformer_2 import D2VInputTransformer

# `['실비', '실손', '혜택', '할인']` 및 `'실_손'` 키워드만으로 `kywrd_table` 만들어보기

In [3]:
# Read the pipe-delimited labelled dataset into a DataFrame.
# NOTE(review): open() is called without an explicit encoding, so the platform
# default is used — confirm it matches the encoding of the .pas file.
with open('백내장_레이블포함_ver1.pas') as f_pas:
    df_with_lbl = pd.read_csv(f_pas, sep="|")

# Print the (rows, columns) shape; the column index is shown as the cell output.
print(df_with_lbl.shape)
df_with_lbl.columns

(834, 27)


Index(['최초 No.', 'date', 'name', 'title', 'url', 'content', 'replies',
       'hsptl_name', 'hsptl_name_2', 'result_hsptl_name_2', '실손', '실비', '수수료',
       '수당', '할인', '페이백', '소개', '부수입', '호텔', '숙소', '실_손', '페이_백', '수_당', '분류자',
       '점검대상', '점검대상 선정 키워드', '부당행위 유/무'],
      dtype='object')

In [4]:
def lbl_filter(x):
    """Return True only when the raw label is exactly the string '0' or '1'."""
    return x == '0' or x == '1'


# Keep only the rows that carry a valid label. The explicit .copy() detaches the
# filtered frame from the one it was sliced from, so the column assignment below
# modifies an independent frame and does not raise SettingWithCopyWarning.
df_with_lbl = df_with_lbl.loc[df_with_lbl['부당행위 유/무'].map(lbl_filter)].copy()

# The filter above only lets the strings '0' / '1' through; store them as integers.
df_with_lbl['부당행위 유/무'] = df_with_lbl['부당행위 유/무'].astype(int)
print(df_with_lbl.shape)

(419, 27)


In [5]:
# Drop rows whose 'content' is missing, then drop rows whose content mentions
# '동물병원'. The missing-value filter must run first because the substring test
# would fail on NaN.
df_with_lbl = df_with_lbl.loc[~df_with_lbl['content'].isna()]
# The trailing .copy() makes the result an independent frame, so adding or
# overwriting columns on df_with_lbl afterwards does not raise
# SettingWithCopyWarning.
df_with_lbl = df_with_lbl.loc[df_with_lbl['content'].map(lambda x: '동물병원' not in x)].copy()
print(df_with_lbl.shape)

(418, 27)


In [6]:
# Load the stored document-embedding vectors from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# run this against a file from a trusted source.
with open('doc_embedded_vector.pkl', 'rb') as pkl_file:
    doc_embedded_vector = pickle.load(pkl_file)

# Wrap the vectors in a DataFrame and relabel its rows with the index of the
# filtered label frame. The relabelling is purely positional.
# NOTE(review): assumes the pickle holds one vector per row of df_with_lbl, in
# the same order — confirm against the code that produced the pickle.
doc_embd_df = pd.DataFrame(doc_embedded_vector)
doc_embd_df.index = df_with_lbl.index

In [7]:
# Keep a handle on the label column ('부당행위 유/무'); it is used as the target `y`
# when the train/test split is built.
orgn_lbls = df_with_lbl['부당행위 유/무']

In [8]:
# Count the (non-overlapping) occurrences of the keyword '혜택' in each post body
# and store the result as a new '혜택' feature column.
benefit_counts = df_with_lbl['content'].map(lambda text: text.count('혜택'))
df_with_lbl['혜택'] = benefit_counts

In [9]:
# Display the column index to confirm that the '혜택' column is now present.
df_with_lbl.columns

Index(['최초 No.', 'date', 'name', 'title', 'url', 'content', 'replies',
       'hsptl_name', 'hsptl_name_2', 'result_hsptl_name_2', '실손', '실비', '수수료',
       '수당', '할인', '페이백', '소개', '부수입', '호텔', '숙소', '실_손', '페이_백', '수_당', '분류자',
       '점검대상', '점검대상 선정 키워드', '부당행위 유/무', '혜택'],
      dtype='object')

In [10]:
# Keyword columns used as hand-crafted features.
# NOTE(review): '혜택' is an occurrence count computed in this notebook; the other
# columns come from the input file and are presumably counts too — confirm.
kywrd_cols = ['실손', '실비', '할인', '혜택', '실_손']
kywrd_table = df_with_lbl.loc[:, kywrd_cols]

# Preview the first three rows.
kywrd_table.head(3)

Unnamed: 0,실손,실비,할인,혜택,실_손
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0


# 학습 및 모델 저장

In [11]:
# Feature matrix: the keyword columns placed side by side with the document
# embedding columns (pd.concat aligns the two frames on their row index).
X_fin = pd.concat([kywrd_table, doc_embd_df], axis=1)
# Target: the 0/1 label column.
y = orgn_lbls

# Hold out a test set with train_test_split's default proportions. The fixed
# random_state makes the split — and therefore the reported metrics and the
# saved model — reproducible across kernel restarts.
X_fin_train, X_fin_test, y_train, y_test = train_test_split(X_fin, y, random_state=42)

In [12]:
%%time

# Single-step pipeline wrapping an XGBoost classifier that uses log-loss as its
# evaluation metric.
pipeline_clf = Pipeline(steps=[
    ('clf', xgb.XGBClassifier(eval_metric='logloss')),
])

# Candidate numbers of boosting rounds; the 'clf__' prefix routes the parameter
# to the pipeline step named 'clf'.
parameters_clf = dict(clf__n_estimators=[100, 110, 120])

# Exhaustive search over the grid using GridSearchCV's default cross-validation
# and scoring, fitted on the training split only.
cv_clf = GridSearchCV(estimator=pipeline_clf, param_grid=parameters_clf)
cv_clf.fit(X_fin_train, y_train)

CPU times: user 13.5 s, sys: 60.4 ms, total: 13.6 s
Wall time: 7.02 s


GridSearchCV(estimator=Pipeline(steps=[('clf',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      enable_categorical=False,
                                                      eval_metric='logloss',
                                                      gamma=None, gpu_id=None,
                                                      importance_type=None,
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None,
                                     

In [13]:
# cv_clf was built with GridSearchCV's default refit=True, so best_estimator_ has
# already been refitted on the whole training split with the winning parameters.
# Calling .fit() on it again with the same data would only repeat that work, so
# the fitted pipeline is taken as-is and displayed as the cell output.
best_model_clf = cv_clf.best_estimator_
best_model_clf

Pipeline(steps=[('clf',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, enable_categorical=False,
                               eval_metric='logloss', gamma=0, gpu_id=-1,
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=120,
                               n_jobs=2, num_parallel_tree=1, predictor='auto',
                               random_state=0, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=1, subsample=1,
                               tree_method='exact', validate_parameters=1,
                               verbosity=None))])

In [14]:
# Predict hard class labels for the held-out test split.
y_pred = best_model_clf.predict(X_fin_test)
    
# Print the evaluation report via the project helper get_clf_eval.
# NOTE(review): only hard class predictions are passed in, so any AUC the helper
# reports is computed from labels rather than predicted probabilities — check
# whether the helper accepts predict_proba scores.
get_clf_eval(y_test, y_pred)

오차행렬:
 [[65 10]
 [10 20]]

정확도: 0.8095
정밀도: 0.6667
재현율: 0.6667
F1: 0.6667
AUC: 0.7667


In [15]:
# Persist the fitted pipeline to disk; joblib.dump returns the list of file names
# it wrote, which is shown as the cell output.
joblib.dump(best_model_clf, filename='siu_clf_ver2.sav')

['siu_clf_ver2.sav']

---