In [2]:
import os
import sys
import itertools

sys.path.append('../util')
sys.path.append('../preprocess')

import re
import joblib
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
from konlpy.tag import Komoran
from datetime import datetime as dt
from sklearn.pipeline import Pipeline
from datetime import timedelta as tmdt
from sklearn.pipeline import FeatureUnion
from gensim.sklearn_api import D2VTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

from text_handling import flatten
from text_handling import do_text_ma
from text_handling import make_ngram
from get_clf_eval import get_clf_eval
from text_handling import text_preproc
from text_handling import add_ngrams_to_ma
from siu_data_preproc import trim_by_length
from siu_data_preproc import transform_date
from siu_data_preproc import extract_hashtags
from siu_data_preproc import hsptl_prsnl_split
from siu_data_preproc import extract_hsptl_name
from d2v_input_transformer_2 import D2VInputTransformer

# `['실비', '실손', '혜택', '할인']`만으로 `keyword_table` 만들어보기

---

## 2-0. 로드

In [3]:
dir_blog = '../crawling/data/blog'

# File names (not full paths) found in "<sub>/contents" for every entry of dir_blog
# whose name contains '백내장', keeping only the names that contain 'rdbl'.
dir_temp = [
    fn
    for sub in os.listdir(dir_blog) if '백내장' in sub
    for fn in os.listdir(f'{dir_blog}/{sub}/contents') if 'rdbl' in fn
]

# Rebuild the path of each file; the directory name is read back from the
# third '_'-separated field of the file name.
dir_files = [f"{dir_blog}/{name.split('_')[2]}/contents/{name}" for name in dir_temp]

In [4]:
# Read every crawl file first and concatenate once. Concatenating inside the loop
# re-copies the accumulated frame for each new file (quadratic in the number of files).
frames = [pd.read_csv(fn, index_col=0) for fn in dir_files]

# ignore_index=True gives the same fresh 0..n-1 index as reset_index(drop=True).
df = pd.concat(frames, axis=0, ignore_index=True)
print(df.shape)
df.head(5)

(21896, 6)


Unnamed: 0,date,name,title,url,keyword,content
0,2021.05.24.,암보험비교사이트순위,7대질병수술비 특약 잘 보장받는 방법,https://blog.naver.com/PostView.naver?blogId=k...,백내장+실손,안녕하세요오늘은 여러 질병수술비특약중에서&quot;7대질병수술비 특약&quot; 에...
1,2021.05.21.,슬기로운 보험이야기,1세대 구실손,https://blog.naver.com/PostView.naver?blogId=l...,백내장+실손,
2,2021.05.18.,미소의 히든카드,실손의료비 백내장 다초첨렌즈 삽입술 보험금 보상 유무 꼭...,https://blog.naver.com/PostView.naver?blogId=h...,백내장+실손,백내장 수술을 할때다초첨렌즈 삽입?단초첨렌즈 삽입?실손의료비 미리 확인하세요!!백내...
3,2021.06.08.,☞ 다이즐한의원 ☜,백내장수술비용보험 실손 적용으로,https://blog.naver.com/PostView.naver?blogId=y...,백내장+실손,백내장수술비용보험 실손 적용으로 노안교정술은 크게 백내장 여부에따라서 그 방법이 달...
4,2021.06.01.,포켓보험,착한실손 전환 하셔야합니다 수십만원감당안되세요,https://blog.naver.com/PostView.naver?blogId=h...,백내장+실손,착한실손 전환 하셔야합니다 수십만원 감당안되세요안녕하세요 늘 사실만 전달해드리는돌직...


## 2-1. 전처리

### - Filter out animal-related content

In [5]:
# Keep only rows that actually have content.
has_content = ~df['content'].isna()
df = df.loc[has_content]

# Drop every post whose content mentions '동물병원'.
not_animal_hospital = df['content'].map(lambda text: '동물병원' not in text)
df = df.loc[not_animal_hospital]
print(df.shape)

(18459, 6)


### - Cleansing

In [6]:
# Clean the 'name', 'title' and 'content' columns with the project helper text_preproc.
# kor_only=True presumably restricts the text to Korean characters — confirm in text_handling.
df_preproc = text_preproc(df, ['name', 'title', 'content'], kor_only=True)

### - Trimming from each side w.r.t. length of content

In [7]:
# Trim rows with the project helper trim_by_length.
# trim_low / trim_upp look like the fractions cut from the short and long ends of the
# content-length distribution (1% / 10%) — TODO confirm in siu_data_preproc.
df_preproc = trim_by_length(df_preproc, trim_low=.01, trim_upp=.1)
df_preproc.shape

(16424, 6)

### - Change date

In [8]:
# Crawl date passed to transform_date together with the raw 'date' column.
# Presumably it is the reference point for resolving relative dates — verify in siu_data_preproc.
crawl_dt = '2022.04.08.'
df_preproc['date'] = transform_date(df_preproc['date'], crawl_dt)

////////// Converting dates


### - Extract hashtags

In [9]:
# Regex matching any string that ends in '안과', '병원' or '의원'.
# It is handed to extract_hashtags, presumably to select which hashtags are kept — confirm.
hospital_re_target = '.*(안과|병원|의원)$'
# The result is stored in the new '#s' column.
df_preproc['#s'] = extract_hashtags(df_preproc['content'], hospital_re_target)

////////// Extracting hashtags


### - Split hospital and personal blogs

In [10]:
# Word lists passed to the project helper hsptl_prsnl_split, which returns two frames
# (hospital, personal). Presumably hsptl_words mark a blog as a hospital blog and
# excpt_words exclude matches — TODO confirm which column is inspected.
hsptl_words = ['병원', '의원', '외과']
excpt_words = ['동물']
df_hsptl, df_prsnl = hsptl_prsnl_split(df_preproc, hsptl_words, excpt_words)

////////// Data seperation; hospital and personal
Original: (16424, 7)
Blog written by hospital: (959, 7)
Blog written by personal: (15444, 7)


### - Filter date (after `2021.09`, i.e. from `2021.10` onward)

In [11]:
# Keep posts whose 'YYYY.MM' prefix sorts strictly after the cutoff month
# (plain string comparison on the first 7 characters of the date).
cutoff_month = '2021.09'
is_recent = df_hsptl['date'].map(lambda date_str: date_str[:7] > cutoff_month)
df_hsptl = df_hsptl.loc[is_recent]
df_hsptl.shape

(473, 7)

### - Run morphological analysis

In [12]:
%%time

# Text to analyse: the 'content' column of the hospital-blog subset.
X = df_hsptl['content']

# KoNLPy Komoran tagger instance used by do_text_ma.
tagger = Komoran()
# POS tags passed to do_text_ma — presumably the tags to keep; confirm in text_handling.
pos_list = ('NNG', 'NNP', 'NP', 'VV', 'VA')
# No stopwords are supplied.
stopwords = []

# is_morph='y' is a flag of the project helper; its exact effect is not visible here.
X_ma = do_text_ma(X, tagger, pos_list, stopwords, is_morph='y')

CPU times: user 1min 13s, sys: 1.42 s, total: 1min 14s
Wall time: 47.2 s


Save

In [13]:
# Persist the morphological-analysis result so it can be reloaded without re-running the tagger.
with open('X_ma_hsptl.pkl', 'wb') as f:
    pickle.dump(X_ma, f)

Load

In [14]:
# with open('X_ma_hsptl.pkl', 'rb') as f:
#     X_ma = pickle.load(f)

### - Load best hyperparameters for d2v

In [15]:
# Load the stored parameter mapping for the Doc2Vec pipeline.
# NOTE(review): pickle.load can execute arbitrary code — only load files produced by this project.
with open('best_params_d2v.pkl', 'rb') as f:
    best_params_d2v = pickle.load(f)

### - Make embedding vectors with best parameters

In [16]:
# Arguments for the project transformer that prepares the Doc2Vec input.
ngram_names = ['content_1gram_1', 'content_2gram', 'content_3gram']
ngram_ns = [1, 2, 3]
how_ngram = '1_2_3'

# Two-step pipeline: n-gram input preparation followed by the Doc2Vec embedding.
d2v_steps = [
    ('preproc', D2VInputTransformer(ngram_names, ngram_ns, how_ngram)),
    ('embed', D2VTransformer()),
]
pipeline_d2v_final = Pipeline(d2v_steps)

# Apply the loaded parameters; set_params returns the pipeline, which is displayed.
pipeline_d2v_final.set_params(**best_params_d2v)

Pipeline(steps=[('preproc',
                 D2VInputTransformer(how_ngram='1_2_3',
                                     ngram_names=['content_1gram_1',
                                                  'content_2gram',
                                                  'content_3gram'],
                                     ngram_ns=[1, 2, 3])),
                ('embed', D2VTransformer(window=7))])

In [17]:
%%time

# Fit the Doc2Vec pipeline on X_ma and embed the same documents in one step.
# NOTE(review): this trains a fresh embedding model on the data being scored. If the
# classifier applied later was trained on vectors from a different fitted model, the
# embedding dimensions are not comparable — confirm whether a fitted pipeline should be
# loaded and only .transform() called here.
doc_embedded_vector = pipeline_d2v_final.fit_transform(X_ma)

# Persist the embedding so it can be reloaded without re-fitting.
with open('doc_embedded_vector_hsptl.pkl', 'wb') as f:
    pickle.dump(doc_embedded_vector, f)

CPU times: user 16.3 s, sys: 79.9 ms, total: 16.4 s
Wall time: 11 s


Load

In [3]:
# Reload the saved embedding from disk.
# NOTE(review): the rows must still correspond one-to-one (same order) to the current
# df_hsptl; a stale pickle would silently misalign the downstream features — verify.
with open('doc_embedded_vector_hsptl.pkl', 'rb') as f:
    doc_embedded_vector = pickle.load(f)

In [5]:
# Sanity check on the type of the reloaded embedding object.
type(doc_embedded_vector)

numpy.ndarray

### - Make embedding dataframe

In [19]:
# One 'embed_<i>' column per embedding dimension.
embed_columns = [f'embed_{i}' for i in range(doc_embedded_vector.shape[1])]
doc_embedded_df = pd.DataFrame(doc_embedded_vector, columns=embed_columns)
doc_embedded_df.head(5)

Unnamed: 0,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,embed_9,...,embed_90,embed_91,embed_92,embed_93,embed_94,embed_95,embed_96,embed_97,embed_98,embed_99
0,0.542827,0.7459,0.023451,-0.439574,0.001806,0.506976,-0.3094,0.067186,-0.81276,0.231299,...,0.016092,-0.181778,-0.187428,0.119794,0.138912,-0.12813,0.095936,-0.12185,-0.214867,-0.138404
1,0.522133,0.697747,0.077942,-0.530826,0.066918,0.565048,-0.394347,0.085496,-0.878902,0.1777,...,0.017506,-0.276165,-0.155978,0.011234,0.143844,-0.112206,0.10383,-0.085031,-0.194712,-0.130108
2,0.189686,0.281768,-0.017316,-0.158676,-0.003215,0.199162,-0.103575,0.024215,-0.329302,0.122282,...,0.004071,-0.02824,-0.076116,0.073327,0.021756,-0.088236,0.03512,-0.075602,-0.09043,-0.051016
3,0.583685,0.809276,0.057902,-0.428562,-0.013372,0.540863,-0.29957,0.080982,-0.920348,0.332831,...,-0.04086,-0.049984,-0.18183,0.144617,0.092042,-0.142521,0.110799,-0.156674,-0.287912,-0.111643
4,0.771603,1.016767,0.243678,-0.726883,0.040944,0.7364,-0.595166,0.020481,-1.098224,0.228925,...,-0.049021,-0.21302,-0.005179,-0.034149,0.38798,-0.089223,0.149383,-0.004477,-0.24896,-0.124745


Save

In [20]:
# Persist the embedding dataframe for later reuse.
with open('doc_embedded_df_hsptl.pkl', 'wb') as f:
    pickle.dump(doc_embedded_df, f)

Load

In [21]:
# with open('doc_embedded_df_hsptl.pkl', 'rb') as f:
#     doc_embedded_df = pickle.load(f)

### - Make `ngram` and keyword count

In [22]:
# Sanity check: (number of documents, embedding dimensions).
doc_embedded_df.shape

(473, 100)

In [23]:
# Apply the project helper make_ngram with n=2 to every analysed document.
X_ngram = X_ma.map(lambda x: make_ngram(x, 2))

In [24]:
# Substring counts of each keyword in the post text, one column per keyword.
# The column order is kept exactly as the downstream model input expects it.
content_keywords = ['실손', '실비', '할인', '혜택']
keyword_counts = [
    df_hsptl['content'].map(lambda text, kw=kw: text.count(kw)).rename(kw)
    for kw in content_keywords
]

# Occurrences of '실_손' in each entry of X_ngram.
split_token_counts = X_ngram.map(lambda grams: grams.count('실_손')).rename('실_손')

kywrd_table_rl = pd.concat(keyword_counts + [split_token_counts], axis=1)

### - Make input table (`X_rl`)

In [25]:
# Take an explicit copy. pd.DataFrame(doc_embedded_df) does not guarantee a copy: in
# some pandas versions the new frame shares its internals with the original, so
# assigning .index below would also relabel doc_embedded_df.
doc_embd_df_rl = doc_embedded_df.copy()

# Embedding rows are matched to df_hsptl purely by position, so the row counts must agree.
assert len(doc_embd_df_rl) == len(df_hsptl), 'embedding rows do not match df_hsptl rows'
doc_embd_df_rl.index = df_hsptl.index
doc_embd_df_rl.shape

(473, 100)

In [26]:
# Model input: keyword-count columns followed by the embedding columns, aligned on the row index.
X_rl = pd.concat([kywrd_table_rl, doc_embd_df_rl], axis=1)

### - Get final result dataframe after doing prediction

In [28]:
# Load the previously saved classifier.
# NOTE(review): joblib.load unpickles the file — only load model files from a trusted source.
best_model_clf = joblib.load('siu_clf_ver2.sav')

In [29]:
# Predicted class label for every row of X_rl.
rslt = best_model_clf.predict(X_rl)

# Score = second column of predict_proba, taken with one array slice instead of a
# Python-level loop over rows. This is presumably the positive class (label 1) —
# confirm against best_model_clf.classes_.
rslt_proba = np.asarray(best_model_clf.predict_proba(X_rl))[:, 1]

In [30]:
# df_hsptl is itself a filtered selection of an earlier frame, so assigning columns in
# place can raise SettingWithCopyWarning. assign() returns a new frame with both columns
# added and keeps this cell safe to re-run.
df_hsptl = df_hsptl.assign(label=rslt, score=rslt_proba)
df_hsptl

Unnamed: 0,date,name,title,url,keyword,content,#s,label,score
261,2021.10.15.,더불어민주당 동구미추홀구갑 국회의원 허종식,허종식 민간보험 공공데이터 요구건보심평원도,https://blog.naver.com/PostView.naver?blogId=j...,백내장+실손,허종식 민간보험 공공데이터 요구건보심평원도 민간데이터 확보해야 민간보험사 데이터 요...,[],0,0.457059
663,2022.02.14.,동탄퍼스트안과의원,실손실비보험 적용 제한되는 백내장 갑상선 도수치료,https://blog.naver.com/PostView.naver?blogId=j...,백내장+실손,안녕하세요#동탄안과 #동탄퍼스트안과 입니다년 분기부터 #갑상선 #백내장 #도수치료 ...,"[#동탄퍼스트안과, #동탄퍼스트안과, #동탄퍼스트안과, #동탄퍼스트안과]",0,0.398708
709,2022.03.08.,동탄 사랑채움한의원에서 소개하는 침치료 연구들,현대해상 대신증권,https://blog.naver.com/PostView.naver?blogId=l...,백내장+실손,대신증권 년 가장 기대되는 손보주투자의견 목표주가 원으로 상향 업종 내 최선호주 ...,[],1,0.577218
751,2022.02.14.,고은경희한의원의 블로그,백내장 도수치료 실비 제한한다 고은경희한의원 실손보험,https://blog.naver.com/PostView.naver?blogId=g...,백내장+실손,#고은경희한의원 #구디역 #구로디지털단지역 #구디역한의원 #구로동한의원 #이마트구로...,"[#고은경희한의원, #구디역한의원, #구로동한의원]",0,0.327471
755,2022.02.22.,서울삼성안과의원 공식 블로그,백내장수술 비용 실비실손보험 고민되신다면 상담받아보세요,https://blog.naver.com/PostView.naver?blogId=s...,백내장+실손,백내장수술 고민되신다면서울삼성안과의원에서 상담받아보세요안녕하세요 성남 수진동에 위치...,[],1,0.875331
...,...,...,...,...,...,...,...,...,...
16333,2021.11.04.,님의블로그,통증치료 잘하는 곳 홍제동정형외과,https://blog.naver.com/PostView.naver?blogId=h...,백내장+소개,안녕하세요현대인들에게 허리통증은 뗄 수 없는 질환이죠 하루 종일 앉아 컴퓨터로 업무...,[],0,0.336690
16342,2021.10.26.,님의 블로그,하남백내장 치료 병원 알아보고 있다면,https://blog.naver.com/PostView.naver?blogId=w...,백내장+소개,하남백내장 치료 병원 알아보고 있다면나이가 들면서 백내장 때문에 수술을 고려하시는 ...,[],0,0.473991
16373,2021.11.04.,님의블로그,홍제역정형외과 이곳을 소개합니다,https://blog.naver.com/PostView.naver?blogId=k...,백내장+소개,안녕하세요 오늘은 유용하고 좋은 정보를 나누고싶은 파워블로거를 꿈꾸는미스왕 인사드립...,[],0,0.420807
16392,2021.11.05.,,대전백내장수술잘하는병원에서 꼼꼼한 진료받고 밝은세상,https://blog.naver.com/PostView.naver?blogId=m...,백내장+소개,대전백내장수술잘하는병원에서 꼼꼼한 진료받고 밝은세상 만나보자백내장은 대부분 나이가 ...,[],0,0.472702
